<a href="https://colab.research.google.com/github/douglasmmachado/MedicineConsumption/blob/master/notebooks/division_approach/6_Forecasting_Validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 6 - Forecasting and prediction validation



---



---



In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
import math as m

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

from sklearn.tree import export_graphviz
from subprocess import call
from IPython.display import Image


from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error,  mean_absolute_percentage_error
from sklearn.model_selection import RandomizedSearchCV

from sklearn.preprocessing import StandardScaler

from sklearn.utils import shuffle


# Raw-GitHub locations of the four clustered datasets (one CSV per df_h* frame).
# Each URL is written as two adjacent literals purely to keep lines short; the
# resulting strings are unchanged.
df_h1_url = (
    "https://raw.githubusercontent.com/douglasmmachado/ExploratoryDataAnalysis/"
    "master/datasets/division_approach/clustered/df_h1_clustered_9.csv"
)
df_h2_url = (
    "https://raw.githubusercontent.com/douglasmmachado/ExploratoryDataAnalysis/"
    "master/datasets/division_approach/clustered/df_h2_clustered_3.csv"
)
df_h3_url = (
    "https://raw.githubusercontent.com/douglasmmachado/ExploratoryDataAnalysis/"
    "master/datasets/division_approach/clustered/df_h3_clustered_8.csv"
)
df_h4_url = (
    "https://raw.githubusercontent.com/douglasmmachado/ExploratoryDataAnalysis/"
    "master/datasets/division_approach/clustered/df_h4_clustered_13.csv"
)

# Download and parse each CSV with pandas' default settings.
df_h1 = pd.read_csv(filepath_or_buffer=df_h1_url)
df_h2 = pd.read_csv(filepath_or_buffer=df_h2_url)
df_h3 = pd.read_csv(filepath_or_buffer=df_h3_url)
df_h4 = pd.read_csv(filepath_or_buffer=df_h4_url)

# Medicine codes of interest; the same codes show up as HOSPI_CODE_UCD values
# in the per-cluster listings printed further down the notebook.
medicines = [
    3400892088310,
    3400892075761,
    3400892203645,
    3400892065366,
    3400892052120,
    3400891996128,
    3400893826706,
    3400893736135,
    3400893875490,
    3400890837149,
    3400891235203,
    3400891225037,
    3400891191226,
    3400892729589,
    3400892745848,
    3400892697789,
    3400892761527,
    3400893022634,
    3400892761695,
    3400892669236,
    3400892508566,
]

# Candidate model columns, in a fixed order: the medicine code first, then the
# remaining numeric columns, the twelve one-hot month columns, and the cluster label.
numerical_features = [
    'HOSPI_CODE_UCD',
    'LIT_HC',
    'LIT_HP',
    'N_ETB',
    'N_UFS',
    'PN_MEDICAL',
    'POPULATION',
    'P_MEDICAL',
    'QUANTITY',
    'SEJ_HAD',
    'SEJ_MCO',
    'SEJ_PSY',
    'SEJ_SLD',
    'SEJ_SSR',
    # 'MONTH_1.0' ... 'MONTH_12.0'
    *(f'MONTH_{month}.0' for month in range(1, 13)),
    'CLUSTER',
]

In [30]:
# Per-frame feature lists: numerical_features minus the columns that do not
# appear in that frame's .info() listing.
# The lists are filtered in place of a set difference because set iteration
# order for strings changes between interpreter sessions; filtering keeps the
# order of numerical_features, so the column layout of df_hN[features_hN] is
# the same on every run (HOSPI_CODE_UCD first, CLUSTER last).
features_h1 = [col for col in numerical_features if col not in {"N_ETB", "SEJ_HAD", "SEJ_PSY"}]
features_h2 = [col for col in numerical_features if col not in {'SEJ_HAD', 'SEJ_SLD', 'N_ETB'}]
features_h3 = [col for col in numerical_features if col not in {'N_ETB'}]
features_h4 = [col for col in numerical_features if col not in {'SEJ_HAD', 'N_ETB'}]

In [None]:
# Sanity check after loading: list df_h1's columns, non-null counts and dtypes.
df_h1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1208 entries, 0 to 1207
Data columns (total 47 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   HOSPI_CODE_UCD     1208 non-null   int64  
 1   LIT_HC             1208 non-null   float64
 2   LIT_HP             1208 non-null   float64
 3   N_UFS              1208 non-null   float64
 4   PN_MEDICAL         1208 non-null   float64
 5   POPULATION         1208 non-null   float64
 6   P_MEDICAL          1208 non-null   float64
 7   QUANTITY           1208 non-null   float64
 8   QUANTITY_MA        1208 non-null   float64
 9   SEJ_MCO            1208 non-null   float64
 10  SEJ_SLD            1208 non-null   float64
 11  SEJ_SSR            1208 non-null   float64
 12  YEAR               1208 non-null   float64
 13  MONTH_1.0          1208 non-null   int64  
 14  MONTH_2.0          1208 non-null   int64  
 15  MONTH_3.0          1208 non-null   int64  
 16  MONTH_4.0          1208 

In [None]:
# Sanity check after loading: list df_h2's columns, non-null counts and dtypes.
df_h2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 697 entries, 0 to 696
Data columns (total 47 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   HOSPI_CODE_UCD     697 non-null    int64  
 1   LIT_HC             697 non-null    float64
 2   LIT_HP             697 non-null    float64
 3   N_UFS              697 non-null    float64
 4   PN_MEDICAL         697 non-null    float64
 5   POPULATION         697 non-null    float64
 6   P_MEDICAL          697 non-null    float64
 7   QUANTITY           697 non-null    float64
 8   QUANTITY_MA        697 non-null    float64
 9   SEJ_MCO            697 non-null    float64
 10  SEJ_PSY            697 non-null    float64
 11  SEJ_SSR            697 non-null    float64
 12  YEAR               697 non-null    float64
 13  MONTH_1.0          697 non-null    int64  
 14  MONTH_2.0          697 non-null    int64  
 15  MONTH_3.0          697 non-null    int64  
 16  MONTH_4.0          697 non

In [None]:
# Sanity check after loading: list df_h3's columns, non-null counts and dtypes.
df_h3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1397 entries, 0 to 1396
Data columns (total 49 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   HOSPI_CODE_UCD     1397 non-null   int64  
 1   LIT_HC             1397 non-null   float64
 2   LIT_HP             1397 non-null   float64
 3   N_UFS              1397 non-null   float64
 4   PN_MEDICAL         1397 non-null   float64
 5   POPULATION         1397 non-null   float64
 6   P_MEDICAL          1397 non-null   float64
 7   QUANTITY           1397 non-null   float64
 8   QUANTITY_MA        1397 non-null   float64
 9   SEJ_HAD            1397 non-null   float64
 10  SEJ_MCO            1397 non-null   float64
 11  SEJ_PSY            1397 non-null   float64
 12  SEJ_SLD            1397 non-null   float64
 13  SEJ_SSR            1397 non-null   float64
 14  YEAR               1397 non-null   float64
 15  MONTH_1.0          1397 non-null   int64  
 16  MONTH_2.0          1397 

In [None]:
# Sanity check after loading: list df_h4's columns, non-null counts and dtypes.
df_h4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 770 entries, 0 to 769
Data columns (total 48 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   HOSPI_CODE_UCD     770 non-null    int64  
 1   LIT_HC             770 non-null    float64
 2   LIT_HP             770 non-null    float64
 3   N_UFS              770 non-null    float64
 4   PN_MEDICAL         770 non-null    float64
 5   POPULATION         770 non-null    float64
 6   P_MEDICAL          770 non-null    float64
 7   QUANTITY           770 non-null    float64
 8   QUANTITY_MA        770 non-null    float64
 9   SEJ_MCO            770 non-null    float64
 10  SEJ_PSY            770 non-null    float64
 11  SEJ_SLD            770 non-null    float64
 12  SEJ_SSR            770 non-null    float64
 13  YEAR               770 non-null    float64
 14  MONTH_1.0          770 non-null    int64  
 15  MONTH_2.0          770 non-null    int64  
 16  MONTH_3.0          770 non

In [65]:
def test_2_clustering(df, df_scores, n_iter=1000, cv=5, random_state=42):
  """Tune one random forest per cluster and score it for every medicine in it.

  For each distinct ``CLUSTER`` value the matching rows are shuffled, the
  predictor columns are standardised, and a ``RandomForestRegressor`` is tuned
  with ``RandomizedSearchCV`` to predict ``QUANTITY``. The best estimator is
  then scored (R^2, MAE, MAPE, RMSE) on the rows of each ``HOSPI_CODE_UCD``
  found in the cluster; the metrics are printed and collected as one row per
  medicine.

  No hold-out split is made: the estimator is evaluated on the same rows it
  was fitted on, so the per-medicine metrics are in-sample figures. Only the
  printed "Best Score" comes from cross-validation.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain ``HOSPI_CODE_UCD``, ``QUANTITY`` and ``CLUSTER``. Every
      other column is used as a predictor, regardless of column position.
  df_scores : pandas.DataFrame
      Existing score table; the new rows are appended after its current rows.
  n_iter : int, default 1000
      Number of parameter settings sampled by the randomized search.
  cv : int, default 5
      Number of cross-validation folds used by the randomized search.
  random_state : int, default 42
      Seed shared by the row shuffle, the forest and the randomized search.

  Returns
  -------
  pandas.DataFrame
      ``df_scores`` followed by one row per (cluster, medicine) with columns
      ``HOSPI_CODE_UCD``, ``R2``, ``RMSE``, ``MAE``, ``MAPE`` and ``CLUSTER``.
      When ``df`` yields no rows to score, ``df_scores`` is returned as is.
  """
  target_col = 'QUANTITY'
  cluster_col = 'CLUSTER'
  medicine_col = 'HOSPI_CODE_UCD'

  # Pick predictors by name: the medicine code is an identifier, not a
  # feature, and must be excluded wherever it sits in the column order.
  feature_cols = [col for col in df.columns
                  if col not in (target_col, cluster_col, medicine_col)]

  # Search space for RandomizedSearchCV (identical for every cluster).
  param_distributions = {
      'max_depth': np.arange(2, 101, 1),
      'n_estimators': np.arange(2, 201, 1),
      'max_features': [1, 'sqrt', 'log2'],
      'min_samples_split': np.arange(2, 11, 1),
      'min_samples_leaf': np.arange(1, 5, 1)
  }

  # Score records are gathered here and turned into a frame once at the end,
  # instead of re-concatenating the whole table for every medicine.
  score_rows = []

  for cluster in df[cluster_col].unique():
    cluster_rows = df[df[cluster_col] == cluster]

    # Report how many observations each medicine contributes to this cluster.
    print()
    print("All data")
    for medicine in cluster_rows[medicine_col].unique():
      num_data_points = int((cluster_rows[medicine_col] == medicine).sum())
      print(f"Medicine: {medicine}, Data Points: {num_data_points}")
    print()

    # Shuffle once so the cross-validation folds mix the medicines.
    cluster_rows = shuffle(cluster_rows, random_state=random_state)
    X = cluster_rows[feature_cols].to_numpy(dtype=float)
    y = cluster_rows[target_col].to_numpy()
    medicine_codes = cluster_rows[medicine_col].to_numpy()

    # One scaler per cluster, fitted on the data the model is trained on.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    randomized_search = RandomizedSearchCV(estimator=RandomForestRegressor(random_state=random_state),
                                           param_distributions=param_distributions,
                                           n_iter=n_iter,
                                           cv=cv,
                                           verbose=1,
                                           random_state=random_state,
                                           n_jobs=-1)
    randomized_search.fit(X_scaled, y)
    best_estimator = randomized_search.best_estimator_

    for medicine in pd.unique(medicine_codes):
      in_medicine = medicine_codes == medicine

      # Reuse the cluster-level scaling: refitting a scaler on this subset
      # would hand the model inputs on a different scale than it was fitted on.
      y_true = y[in_medicine]
      y_pred = best_estimator.predict(X_scaled[in_medicine])

      r2 = r2_score(y_true, y_pred)
      mae = mean_absolute_error(y_true, y_pred)
      mape = mean_absolute_percentage_error(y_true, y_pred)
      rmse = np.sqrt(mean_squared_error(y_true, y_pred))

      # Print the best parameters, best score, and evaluation metrics
      print('Cluster:' + str(cluster))
      print('Medicine:' + str(medicine))
      print('Best Parameters:', randomized_search.best_params_)
      print('Best Score:', randomized_search.best_score_)
      print('R^2 Score:', round(r2, 3))
      print('MAE:', round(mae, 3))
      print('MAPE:', round(mape, 3))
      print('RMSE:', round(rmse, 3))
      print()

      score_rows.append({'HOSPI_CODE_UCD': 'CODE_UCD_' + str(int(medicine)),
                         'R2': r2,
                         'RMSE': rmse,
                         'MAE': mae,
                         'MAPE': mape,
                         'CLUSTER': cluster})

  if not score_rows:
    return df_scores

  new_scores = pd.DataFrame(score_rows,
                            columns=['HOSPI_CODE_UCD', 'R2', 'RMSE', 'MAE', 'MAPE', 'CLUSTER'])

  # Return the updated DataFrame
  return pd.concat([df_scores, new_scores], ignore_index=True)

In [None]:
# Run the per-cluster evaluation on df_h1, starting from an empty score table
# that already carries the expected column names.
df_prediction_scores_h1 = test_2_clustering(
    df_h1[features_h1],
    pd.DataFrame(columns=['HOSPI_CODE_UCD', 'R2', 'RMSE', 'MAE', 'MAPE', 'CLUSTER']),
)

# Bare last expression so the notebook renders the resulting table.
df_prediction_scores_h1


All data
Medicine: 3400891191226, Data Points: 7
Medicine: 3400891225037, Data Points: 7
Medicine: 3400891996128, Data Points: 7
Medicine: 3400892761695, Data Points: 7
Medicine: 3400892065366, Data Points: 7
Medicine: 3400892075761, Data Points: 7
Medicine: 3400892088310, Data Points: 7
Medicine: 3400892669236, Data Points: 7
Medicine: 3400892761527, Data Points: 6
Medicine: 3400892697789, Data Points: 6
Medicine: 3400893736135, Data Points: 6
Medicine: 3400892203645, Data Points: 6
Medicine: 3400893826706, Data Points: 6
Medicine: 3400890837149, Data Points: 6
Medicine: 3400893875490, Data Points: 6
Medicine: 3400892508566, Data Points: 4

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
