<a href="https://colab.research.google.com/github/douglasmmachado/MedicineConsumption/blob/causal_model/notebooks/unified_approach/5_Forecasting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 5 - Forecasting and prediction



---



---



In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
import math as m

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

from sklearn.tree import export_graphviz
from subprocess import call
from IPython.display import Image

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error,  mean_absolute_percentage_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.utils import shuffle



# Remote CSV sources. The three "clustered" files differ only by the numeric
# suffix in their file name (3, 4, 5).
# NOTE(review): df_url is served from the ExploratoryDataAnalysis repository,
# while the clustered files come from MedicineConsumption — confirm this is intended.
df_3_clusters_url = "https://raw.githubusercontent.com/douglasmmachado/MedicineConsumption/causal_model/datasets/unified_approach/clustered/df_clustered_3.csv"
df_4_clusters_url = "https://raw.githubusercontent.com/douglasmmachado/MedicineConsumption/causal_model/datasets/unified_approach/clustered/df_clustered_4.csv"
df_5_clusters_url = "https://raw.githubusercontent.com/douglasmmachado/MedicineConsumption/causal_model/datasets/unified_approach/clustered/df_clustered_5.csv"
df_url = "https://raw.githubusercontent.com/douglasmmachado/ExploratoryDataAnalysis/causal_model/datasets/unified_approach/df_ma.csv"

# Load the main frame first, then the three clustered variants.
df = pd.read_csv(df_url)
df_3_clusters, df_4_clusters, df_5_clusters = (
    pd.read_csv(cluster_url)
    for cluster_url in (df_3_clusters_url, df_4_clusters_url, df_5_clusters_url)
)

# HOSPI_CODE_UCD values of the medicines that get an individual model below.
medicines = [
    3400892088310,
    3400892075761,
    3400892203645,
    3400892065366,
    3400892052120,
    3400891996128,
    3400893826706,
    3400893736135,
    3400893875490,
    3400890837149,
    3400891235203,
    3400891225037,
    3400891191226,
    3400892729589,
    3400892745848,
    3400892697789,
    3400892761527,
    3400893022634,
    3400892761695,
    3400892669236,
    3400892508566,
]

In [8]:
# Print a summary of the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4074 entries, 0 to 4073
Data columns (total 48 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   HOSPI_CODE_UCD     4074 non-null   int64  
 1   LIT_HC             4074 non-null   float64
 2   LIT_HP             4074 non-null   float64
 3   N_ETB              4074 non-null   float64
 4   N_UFS              4074 non-null   float64
 5   PN_MEDICAL         4074 non-null   float64
 6   POPULATION         4074 non-null   float64
 7   P_MEDICAL          4074 non-null   float64
 8   QUANTITY           4074 non-null   float64
 9   QUANTITY_MA        4074 non-null   float64
 10  SEJ_HAD            4074 non-null   float64
 11  SEJ_MCO            4074 non-null   float64
 12  SEJ_PSY            4074 non-null   float64
 13  SEJ_SLD            4074 non-null   float64
 14  SEJ_SSR            4074 non-null   float64
 15  MONTH_1.0          4074 non-null   int64  
 16  MONTH_2.0          4074 

## 5.1 - New database composition based on clusters

## 5.2 - Building forecasting models based on clusters

In [47]:
from sklearn.model_selection import GridSearchCV

def test_1_baseline(df, medicine, df_scores):
  """Tune and score a RandomForestRegressor baseline for a single medicine.

  The rows of ``df`` whose ``HOSPI_CODE_UCD`` equals ``medicine`` are selected,
  ``QUANTITY`` is used as the target and every remaining column (except
  ``HOSPI_CODE_UCD``) as a feature. The data is shuffled, split 80/20, scaled
  with statistics learned on the training part only, and a grid search with
  5-fold cross-validation picks ``max_depth`` and ``n_estimators``. The best
  estimator is then evaluated on the held-out test part.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain ``HOSPI_CODE_UCD`` and ``QUANTITY``; all other columns are
      treated as features.
      NOTE(review): any column derived from QUANTITY that is left in ``df``
      will be used as a predictor — confirm this is intended by the caller.
  medicine
      Value compared against ``df['HOSPI_CODE_UCD']`` to select the rows.
  df_scores : pandas.DataFrame
      Score table to which one row (HOSPI_CODE_UCD, R2, RMSE, MAE, MAPE) is
      appended.

  Returns
  -------
  pandas.DataFrame
      A new frame: ``df_scores`` with the row for ``medicine`` appended and the
      index reset.

  Raises
  ------
  ValueError
      If ``df`` holds no rows for ``medicine`` (scikit-learn also raises
      ValueError when there are too few rows to split or cross-validate).
  """
  # Filter once and reuse the subset for both features and target.
  df_medicine = df[df['HOSPI_CODE_UCD'] == medicine]
  if df_medicine.empty:
    raise ValueError(f'No rows found for medicine {medicine}')

  X = df_medicine.drop(['QUANTITY', 'HOSPI_CODE_UCD'], axis=1).values
  y = df_medicine['QUANTITY'].values

  # Same shuffle/split seeds as before, so rows land in the same partitions;
  # only the point at which scaling happens has moved.
  X, y = shuffle(X, y, random_state=42)

  # Split the data into training and testing sets
  X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                      test_size = .2,
                                                      random_state = 42)

  # Fit the scaler on the training rows only: fitting it on the full data
  # would leak test-set statistics into the preprocessing step.
  scaler = StandardScaler()
  X_train = scaler.fit_transform(X_train)
  X_test = scaler.transform(X_test)

  print(f'Size of data set: {len(X)}')
  print(f'Size of training set: {len(X_train)}')
  print(f'Size of test set: {len(X_test)}')

  # Define the parameter grid for GridSearchCV. The upper bound for
  # n_estimators grows with the training set (10% of its size); the floor of 3
  # keeps at least one candidate (n_estimators=2) when the set is small,
  # where the unguarded range would be empty and the search would fail.
  n_estimators_stop = max(3, int(round(len(X_train)*0.1,0)))
  param_grid = {
      'max_depth': np.arange(2, 20, 1),
      'n_estimators': np.arange(2, n_estimators_stop, 1)
  }
  depth_len = param_grid['max_depth'].size
  estimators_len = param_grid['n_estimators'].size

  print(f'Size of grid search: {depth_len * estimators_len}')

  # Create the GridSearchCV object (exhaustive search over param_grid).
  # verbose=3 makes scikit-learn print the time and score of every fold.
  grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=42),
                             param_grid=param_grid,
                             scoring = 'r2',
                             cv = 5,
                             n_jobs = -1,
                             verbose = 3)

  # Fit the GridSearchCV object to the training data
  grid_search.fit(X_train, y_train)

  # Get the best estimator (refitted on the whole training set)
  best_estimator = grid_search.best_estimator_

  # Make predictions using the best estimator
  y_pred = best_estimator.predict(X_test)

  # Calculate R^2 score
  r2 = r2_score(y_test, y_pred)

  # Calculate MAE
  mae = mean_absolute_error(y_test, y_pred)

  # Calculate MAPE. scikit-learn returns a fraction (1.0 == 100%), and the
  # value becomes very large when some true quantities are close to zero.
  mape = mean_absolute_percentage_error(y_test, y_pred)

  # Calculate RMSE
  rmse = np.sqrt(mean_squared_error(y_test, y_pred))

  # Print the best parameters, best score, and evaluation metrics.
  # best_score_ is the mean cross-validated R^2 of the best candidate, i.e.
  # the value printed under the 'Training Score' label is a CV score.
  print('Medicine:' + str(medicine))
  print(f'Datapoints in test: {len(X_test)}')
  print('Best Parameters:', grid_search.best_params_)
  print('Training Score: R^2', round(grid_search.best_score_, 3))
  print('Test score: ')
  print('R^2 Score:', round(r2, 3))
  print('MAE:', round(mae, 3))
  print('MAPE:', round(mape, 3))
  print('RMSE:', round(rmse, 3))
  print()


  # Create the new row as a DataFrame
  new_row = pd.DataFrame({'HOSPI_CODE_UCD': ['CODE_UCD_'+str(medicine)],
                          'R2': [r2],
                          'RMSE': [rmse],
                          'MAE': [mae],
                          'MAPE': [mape]})

  # Append the new row to the DataFrame
  df_scores = pd.concat([df_scores, new_row], ignore_index=True)

  # Return the updated DataFrame
  return df_scores


In [28]:
# Columns passed to test_1_baseline: the medicine identifier (used for
# filtering), the QUANTITY target, the numeric predictors, and the twelve
# month indicator columns MONTH_1.0 ... MONTH_12.0.
features = [
    'HOSPI_CODE_UCD',
    'LIT_HC',
    'LIT_HP',
    'N_ETB',
    'N_UFS',
    'PN_MEDICAL',
    'POPULATION',
    'P_MEDICAL',
    'QUANTITY',
    'QUANTITY_MA',
    'SEJ_HAD',
    'SEJ_MCO',
    'SEJ_PSY',
    'SEJ_SLD',
    'SEJ_SSR',
    *(f'MONTH_{month}.0' for month in range(1, 13)),
]

In [46]:
# Empty score table; test_1_baseline returns it with one more row per call.
score_columns = ['HOSPI_CODE_UCD', 'R2', 'RMSE', 'MAE', 'MAPE']
df_prediction_scores = pd.DataFrame(columns=score_columns)

# The column selection does not depend on the medicine, so take it once.
df_model_input = df[features]

# Tune and evaluate one model per medicine, accumulating the test metrics.
for medicine in medicines:
  df_prediction_scores = test_1_baseline(df_model_input,
                                         medicine,
                                         df_prediction_scores)

# Last expression of the cell: display the collected scores.
df_prediction_scores

Size of data set: 171
Size of training set: 136
Size of test set: 35
Size of grid search: 216
Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Medicine:3400892088310
Datapoints in test: 35
Best Parameters: {'max_depth': 7, 'n_estimators': 3}
Training Score: R^2 0.8140414571379486
Test score: 
R^2 Score: 0.775
MAE: 607.882
MAPE: 8.533
RMSE: 860.776

Size of data set: 204
Size of training set: 163
Size of test set: 41
Size of grid search: 252
Fitting 5 folds for each of 252 candidates, totalling 1260 fits
Medicine:3400892075761
Datapoints in test: 41
Best Parameters: {'max_depth': 7, 'n_estimators': 15}
Training Score: R^2 0.6672976634986967
Test score: 
R^2 Score: 0.385
MAE: 371.478
MAPE: 0.149
RMSE: 590.036

Size of data set: 200
Size of training set: 160
Size of test set: 40
Size of grid search: 252
Fitting 5 folds for each of 252 candidates, totalling 1260 fits
Medicine:3400892203645
Datapoints in test: 40
Best Parameters: {'max_depth': 5, 'n_estimators': 14}
Training 

Unnamed: 0,HOSPI_CODE_UCD,R2,RMSE,MAE,MAPE
0,CODE_UCD_3400892088310,0.775266,860.776201,607.882137,8.533207
1,CODE_UCD_3400892075761,0.384943,590.036073,371.477666,0.149066
2,CODE_UCD_3400892203645,0.936251,875.124638,569.658408,0.686666
3,CODE_UCD_3400892065366,0.830707,769.829387,504.420878,0.325603
4,CODE_UCD_3400892052120,0.808439,370.278037,211.330776,0.354099
5,CODE_UCD_3400891996128,0.794586,15856.203777,8209.707325,1.041047
6,CODE_UCD_3400893826706,0.856255,954.863299,655.912796,1.986171
7,CODE_UCD_3400893736135,0.89554,713.645111,557.441821,0.332676
8,CODE_UCD_3400893875490,0.725888,2563.055639,1525.765244,5.094943
9,CODE_UCD_3400890837149,0.834535,2915.524717,1519.411433,13.672224
