# Random Forest Model

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.ensemble import BaggingRegressor,BaggingClassifier,RandomForestRegressor,RandomForestClassifier
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, \
accuracy_score, precision_score, confusion_matrix, mean_squared_error, r2_score

import itertools as it

#Libraries for visualizing trees
from sklearn.tree import export_graphviz 
from six import StringIO
from IPython.display import Image
import time as time

np.warnings.filterwarnings('ignore')

In [2]:
# Read the four pre-split CSV files (red/white x train/test) from the working directory.
red_train, red_test, white_train, white_test = map(
    pd.read_csv,
    ['red_train.csv', 'red_test.csv', 'white_train.csv', 'white_test.csv'],
)

In [3]:
# Build tagged copies of every split: `assign` returns a new frame with a constant
# `type` column, so the original red/white frames are left unmodified.
red_train_copy = red_train.assign(type='red')
red_test_copy = red_test.assign(type='red')
white_train_copy = white_train.assign(type='white')
white_test_copy = white_test.assign(type='white')

In [4]:
# Stack the red rows on top of the white rows for each split, then let get_dummies
# one-hot encode the non-numeric columns (this includes the `type` tag).
combined_train = pd.get_dummies(
    pd.concat([red_train_copy, white_train_copy], axis=0)
)
combined_test = pd.get_dummies(
    pd.concat([red_test_copy, white_test_copy], axis=0)
)

In [5]:
# Separate predictors (every column except `quality`) from the `quality` target
# for each of the six frames.
red_train_x, red_train_y = red_train.drop(columns='quality'), red_train['quality']
red_test_x, red_test_y = red_test.drop(columns='quality'), red_test['quality']

white_train_x, white_train_y = white_train.drop(columns='quality'), white_train['quality']
white_test_x, white_test_y = white_test.drop(columns='quality'), white_test['quality']

combined_train_x, combined_train_y = combined_train.drop(columns='quality'), combined_train['quality']
combined_test_x, combined_test_y = combined_test.drop(columns='quality'), combined_test['quality']

# Create Base RandomForest Models

In [19]:
#Red
# Baseline: forest with default hyper-parameters, fitted on the red training split.
# Seeded with random_state=1 (the same seed the tuned models use) so the baseline
# metrics are reproducible on a fresh kernel and comparable with the tuned ones.
# NOTE: numbers recorded from earlier unseeded runs will differ slightly.
base_model_red = RandomForestRegressor(random_state=1).fit(red_train_x, red_train_y)
y_pred = base_model_red.predict(red_test_x)

#RMSE on test data
print("RMSE:",np.sqrt(mean_squared_error(red_test_y, y_pred)))
#MAE on test data
print("MAE:",mean_absolute_error(red_test_y, y_pred))

RMSE: 0.559647433658013
MAE: 0.407175


In [20]:
#White
# Baseline: forest with default hyper-parameters, fitted on the white training split.
# Seeded with random_state=1 (the same seed the tuned models use) so the baseline
# metrics are reproducible on a fresh kernel and comparable with the tuned ones.
# NOTE: numbers recorded from earlier unseeded runs will differ slightly.
base_model_white = RandomForestRegressor(random_state=1).fit(white_train_x, white_train_y)
y_pred = base_model_white.predict(white_test_x)

#RMSE on test data
print("RMSE:",np.sqrt(mean_squared_error(white_test_y, y_pred)))
#MAE on test data
print("MAE:",mean_absolute_error(white_test_y, y_pred))

RMSE: 0.6382078223351668
MAE: 0.45294693877551023


In [21]:
#Combined
# Baseline: forest with default hyper-parameters, fitted on the combined training split.
# Seeded with random_state=1 (the same seed the tuned models use) so the baseline
# metrics are reproducible on a fresh kernel and comparable with the tuned ones.
# NOTE: numbers recorded from earlier unseeded runs will differ slightly.
base_model_combined = RandomForestRegressor(random_state=1).fit(combined_train_x, combined_train_y)
y_pred = base_model_combined.predict(combined_test_x)

#RMSE on test data
print("RMSE:",np.sqrt(mean_squared_error(combined_test_y, y_pred)))
#MAE on test data
print("MAE:",mean_absolute_error(combined_test_y, y_pred))

RMSE: 0.6238069721168067
MAE: 0.44387692307692306


# Tuning Red Model

In [18]:
# Grid search for the red-wine forest: 5-fold shuffled CV, scored by negated MAE.
n_samples, n_features = red_train_x.shape

# 2 x 3 x 2 = 12 candidate settings.
params = dict(
    n_estimators=[700, 850],
    max_features=[2, 4, 6],
    bootstrap=[True, False],
)

cv = KFold(n_splits=5, shuffle=True, random_state=1)
rf_regressor_grid = GridSearchCV(
    estimator=RandomForestRegressor(random_state=1, n_jobs=-1),
    param_grid=params,
    scoring='neg_mean_absolute_error',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
rf_regressor_grid.fit(red_train_x, red_train_y)
print('Best Parameters : ', rf_regressor_grid.best_params_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters :  {'bootstrap': False, 'max_features': 2, 'n_estimators': 850}


In [19]:
# Refit the red-wine forest with the parameters reported by the grid search
# and evaluate it on the held-out red test split.
optimal_model_red = RandomForestRegressor(
    n_estimators=850,
    max_features=2,
    bootstrap=False,
    random_state=1,
    n_jobs=-1,
)
optimal_model_red.fit(red_train_x, red_train_y)

y_pred = optimal_model_red.predict(red_test_x)

# Test-set errors: root-mean-squared first, then mean-absolute.
print("RMSE:", np.sqrt(mean_squared_error(red_test_y, y_pred)))
print("MAE:", mean_absolute_error(red_test_y, y_pred))

RMSE: 0.5496370586066515
MAE: 0.3714382352941176


In [22]:
# Manually adjusted red-wine forest: same settings as the grid-search model except
# max_features=7, a value outside the searched grid (2, 4, 6).
# NOTE(review): if 7 was picked by comparing test-set scores, the test error reported
# below is optimistically biased -- confirm how it was chosen.
intuition_model_red = RandomForestRegressor(
    n_estimators=850,
    max_features=7,
    bootstrap=False,
    random_state=1,
    n_jobs=-1,
)
intuition_model_red.fit(red_train_x, red_train_y)

y_pred = intuition_model_red.predict(red_test_x)

# Test-set errors: root-mean-squared first, then mean-absolute.
print("RMSE:", np.sqrt(mean_squared_error(red_test_y, y_pred)))
print("MAE:", mean_absolute_error(red_test_y, y_pred))

RMSE: 0.5441681184858873
MAE: 0.3612529411764706


# Tuning White Model

In [9]:
# Grid search for the white-wine forest: 5-fold shuffled CV, scored by negated MAE.
n_samples, n_features = white_train_x.shape

# 2 x 3 x 2 = 12 candidate settings.
params = dict(
    n_estimators=[700, 850],
    max_features=[2, 4, 6],
    bootstrap=[True, False],
)

cv = KFold(n_splits=5, shuffle=True, random_state=1)
rf_regressor_grid = GridSearchCV(
    estimator=RandomForestRegressor(random_state=1, n_jobs=-1),
    param_grid=params,
    scoring='neg_mean_absolute_error',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
rf_regressor_grid.fit(white_train_x, white_train_y)
print('Best Parameters : ', rf_regressor_grid.best_params_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters :  {'bootstrap': False, 'max_features': 2, 'n_estimators': 700}


In [24]:
# Refit the white-wine forest with the parameters reported by the grid search
# and evaluate it on the held-out white test split (no n_jobs is passed here).
optimal_model_white = RandomForestRegressor(
    n_estimators=700,
    max_features=2,
    bootstrap=False,
    random_state=1,
)
optimal_model_white.fit(white_train_x, white_train_y)

y_pred = optimal_model_white.predict(white_test_x)

# Test-set errors: root-mean-squared first, then mean-absolute.
print("RMSE:", np.sqrt(mean_squared_error(white_test_y, y_pred)))
print("MAE:", mean_absolute_error(white_test_y, y_pred))

RMSE: 0.616714316442351
MAE: 0.39846880466472306


# Tuning Combined Model

In [25]:
# Grid search for the combined (red + white) forest: 5-fold shuffled CV, scored by negated MAE.
n_samples, n_features = combined_train_x.shape

# 3 x 3 x 2 = 18 candidate settings.
params = dict(
    n_estimators=[700, 850, 900],
    max_features=[2, 4, 6],
    bootstrap=[True, False],
)

cv = KFold(n_splits=5, shuffle=True, random_state=1)
rf_regressor_grid = GridSearchCV(
    estimator=RandomForestRegressor(random_state=1, n_jobs=-1),
    param_grid=params,
    scoring='neg_mean_absolute_error',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
rf_regressor_grid.fit(combined_train_x, combined_train_y)
print('Best Parameters : ', rf_regressor_grid.best_params_)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Parameters :  {'bootstrap': False, 'max_features': 2, 'n_estimators': 850}


In [26]:
# Refit the combined forest with the parameters reported by the grid search
# and evaluate it on the held-out combined test split.
optimal_model_combined = RandomForestRegressor(
    n_estimators=850,
    max_features=2,
    bootstrap=False,
    random_state=1,
)
optimal_model_combined.fit(combined_train_x, combined_train_y)

y_pred = optimal_model_combined.predict(combined_test_x)

# Test-set errors: root-mean-squared first, then mean-absolute.
print("RMSE:", np.sqrt(mean_squared_error(combined_test_y, y_pred)))
print("MAE:", mean_absolute_error(combined_test_y, y_pred))

RMSE: 0.6049172178553129
MAE: 0.39452018099547514


In [39]:
# Manually adjusted combined forest: n_estimators=830 and max_features=3 are both
# outside the searched grid. Note that this rebinds `optimal_model_combined`,
# replacing the grid-search model fitted in the previous cell.
# NOTE(review): if these values were picked by comparing test-set scores, the test
# error reported below is optimistically biased -- confirm how they were chosen.
optimal_model_combined = RandomForestRegressor(
    n_estimators=830,
    max_features=3,
    bootstrap=False,
    random_state=1,
)
optimal_model_combined.fit(combined_train_x, combined_train_y)

y_pred = optimal_model_combined.predict(combined_test_x)

# Test-set errors: root-mean-squared first, then mean-absolute.
print("RMSE:", np.sqrt(mean_squared_error(combined_test_y, y_pred)))
print("MAE:", mean_absolute_error(combined_test_y, y_pred))

RMSE: 0.6040048809591836
MAE: 0.39270139017608896


## Summary

In [46]:
# red
# Baseline vs. tuned test MAE for the red model, copied from the evaluation cells.
print("Red Base MAE: 0.407175")
print("Red Tuned MAE: 0.3612529411764706")

# Relative MAE reduction, expressed in percent of the baseline.
improvement = (0.407175 - 0.3612529411764706) / 0.407175 * 100
print("Improvement:", round(improvement, 2), "%")

Red Base MAE: 0.407175
Red Tuned MAE: 0.3612529411764706
Improvement: 11.28 %


In [47]:
# white
# Baseline vs. tuned test MAE for the white model, copied from the evaluation cells.
print("White Base MAE: 0.45294693877551023")
print("White Tuned MAE: 0.39846880466472306")

# Relative MAE reduction, expressed in percent of the baseline.
improvement = (0.45294693877551023 - 0.39846880466472306) / 0.45294693877551023 * 100
print("Improvement:", round(improvement, 2), "%")

White Base MAE: 0.45294693877551023
White Tuned MAE: 0.39846880466472306
Improvement: 12.03 %


In [None]:
# combined
# Baseline vs. tuned test MAE for the combined model, taken from the combined
# evaluation cells (base model and the final manually tuned model). These must be
# the combined model's own metrics, not the white model's.
combined_base_mae = 0.44387692307692306
combined_tuned_mae = 0.39270139017608896
print("Combined Base MAE:", combined_base_mae)
print("Combined Tuned MAE:", combined_tuned_mae)

# Relative MAE reduction, expressed in percent of the combined baseline.
improvement = 100*((combined_base_mae-combined_tuned_mae)/combined_base_mae)
print("Improvement:", str(round(improvement, 2)), "%")