# Random Forest Model

In [14]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.ensemble import BaggingRegressor,BaggingClassifier,RandomForestRegressor,RandomForestClassifier
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, \
accuracy_score, precision_score, confusion_matrix, mean_squared_error, r2_score

import itertools as it

#Libraries for visualizing trees
from sklearn.tree import export_graphviz 
from six import StringIO
from IPython.display import Image
import time as time

# Silence library warnings for the whole notebook.
# np.warnings was only an incidental alias of the stdlib module and is not available
# in NumPy >= 1.24, so call the standard-library module directly.
import warnings
warnings.filterwarnings('ignore')

In [15]:
# Read the four pre-split CSV files (paths are relative to the working directory):
# one train/test pair per wine colour.
red_train, red_test = pd.read_csv('red_train.csv'), pd.read_csv('red_test.csv')
white_train, white_test = pd.read_csv('white_train.csv'), pd.read_csv('white_test.csv')

In [16]:
# Build labelled copies of every frame: `assign` returns a new DataFrame with a
# constant 'type' column appended, so the original frames stay untouched.
red_train_copy = red_train.assign(type='red')
red_test_copy = red_test.assign(type='red')
white_train_copy = white_train.assign(type='white')
white_test_copy = white_test.assign(type='white')

In [17]:
# Stack the red rows on top of the white rows, then one-hot encode the non-numeric
# columns (this includes the 'type' label added to the copies).
# NOTE(review): train and test are encoded independently, so their dummy columns only
# line up when both contain the same categories - true for 'type' as built here.
combined_train = pd.get_dummies(pd.concat([red_train_copy, white_train_copy], axis=0))
combined_test = pd.get_dummies(pd.concat([red_test_copy, white_test_copy], axis=0))

In [18]:
# Separate the predictors (*_x: every column except 'quality') from the target
# (*_y: the 'quality' column) for each train/test frame.
red_train_x, red_train_y = red_train.drop(columns='quality'), red_train['quality']
red_test_x, red_test_y = red_test.drop(columns='quality'), red_test['quality']

white_train_x, white_train_y = white_train.drop(columns='quality'), white_train['quality']
white_test_x, white_test_y = white_test.drop(columns='quality'), white_test['quality']

combined_train_x, combined_train_y = combined_train.drop(columns='quality'), combined_train['quality']
combined_test_x, combined_test_y = combined_test.drop(columns='quality'), combined_test['quality']

# Create Base RandomForest Models

In [19]:
#Red
# Baseline forest with default hyper-parameters, fit on the red training set.
# random_state is pinned (same seed as the tuned models) so the reported test
# metrics are reproducible from run to run.
base_model_red = RandomForestRegressor(random_state=1).fit(red_train_x, red_train_y)
y_pred = base_model_red.predict(red_test_x)

#RMSE on test data
print("RMSE:",np.sqrt(mean_squared_error(red_test_y, y_pred)))
#MAE on test data
print("MAE:",mean_absolute_error(red_test_y, y_pred))

RMSE: 0.559647433658013
MAE: 0.407175


In [20]:
#White
# Baseline forest with default hyper-parameters, fit on the white training set.
# random_state is pinned (same seed as the tuned models) so the reported test
# metrics are reproducible from run to run.
base_model_white = RandomForestRegressor(random_state=1).fit(white_train_x, white_train_y)
y_pred = base_model_white.predict(white_test_x)

#RMSE on test data
print("RMSE:",np.sqrt(mean_squared_error(white_test_y, y_pred)))
#MAE on test data
print("MAE:",mean_absolute_error(white_test_y, y_pred))

RMSE: 0.6382078223351668
MAE: 0.45294693877551023


In [21]:
#Combined
# Baseline forest with default hyper-parameters, fit on the stacked red + white
# training set. random_state is pinned (same seed as the tuned models) so the
# reported test metrics are reproducible from run to run.
base_model_combined = RandomForestRegressor(random_state=1).fit(combined_train_x, combined_train_y)
y_pred = base_model_combined.predict(combined_test_x)

#RMSE on test data
print("RMSE:",np.sqrt(mean_squared_error(combined_test_y, y_pred)))
#MAE on test data
print("MAE:",mean_absolute_error(combined_test_y, y_pred))

RMSE: 0.6238069721168067
MAE: 0.44387692307692306


# Tuning Red Model

In [46]:
# Exhaustive search over n_estimators x max_features for the red-wine forest.
# Every candidate is scored by its out-of-bag R-squared (model.oob_score_), so no
# separate validation split is carved out of the training data.
start_time = time.time()

# NOTE(review): not read anywhere in this cell - kept in case other cells use them.
n_samples = red_train_x.shape[0]
n_features = red_train_x.shape[1]

params = {'n_estimators': [700, 800, 900],
          'max_features': [5,6,7]}

# scikit-learn 1.0 renamed the "mae" split criterion to "absolute_error" and 1.2
# removed the old spelling, so pick the name the installed version accepts.
import sklearn
mae_criterion = "absolute_error" if int(sklearn.__version__.split('.')[0]) >= 1 else "mae"

# Build the grid from the keys by name instead of relying on dict ordering.
param_list = list(it.product(params['n_estimators'], params['max_features']))

# oob_score[k] is the OOB R-squared of param_list[k].
oob_score = []
for n_estimators, max_features in param_list:
    model = RandomForestRegressor(random_state=1, criterion=mae_criterion, oob_score=True,
                                  verbose=False, n_estimators=n_estimators,
                                  max_features=max_features,
                                  n_jobs=-1).fit(red_train_x, red_train_y)
    oob_score.append(model.oob_score_)

end_time = time.time()
print("time taken = ", (end_time-start_time)/60, " minutes")
print("Best params = ", param_list[np.argmax(oob_score)])
print("Best score (R-squared) = ", np.max(oob_score))

time taken =  1.629862650235494  minutes
Best params =  (700, 5)
Best score (R-squared) =  0.44921802284571466


In [47]:
# Red Model with optimal parameters
# (700, 5) is the (n_estimators, max_features) pair the OOB search reported as best.
# NOTE(review): the search scored candidates with criterion="mae", whereas this refit
# uses the estimator's default criterion - confirm the difference is intended.
optimal_model_red = RandomForestRegressor(
    n_estimators=700,
    max_features=5,
    oob_score=True,
    random_state=1,
    n_jobs=-1,
)
optimal_model_red.fit(red_train_x, red_train_y)

y_pred = optimal_model_red.predict(red_test_x)

# Test-set error: RMSE first, then MAE
test_rmse = np.sqrt(mean_squared_error(red_test_y, y_pred))
test_mae = mean_absolute_error(red_test_y, y_pred)
print("RMSE:", test_rmse)
print("MAE:", test_mae)

RMSE: 0.5490726690688286
MAE: 0.40185000000000004


In [48]:
#Further tuning red model with intuition
# Hand-picked setting (850 trees, 7 features per split) that is not on the searched grid.
# NOTE(review): this variant is judged on the test set below; if it is chosen because
# of that score, the test metrics are no longer an unbiased estimate.
intuition_model_red = RandomForestRegressor(
    n_estimators=850,
    max_features=7,
    oob_score=True,
    random_state=1,
    n_jobs=-1,
)
intuition_model_red.fit(red_train_x, red_train_y)

y_pred = intuition_model_red.predict(red_test_x)

# Test-set error: RMSE first, then MAE
test_rmse = np.sqrt(mean_squared_error(red_test_y, y_pred))
test_mae = mean_absolute_error(red_test_y, y_pred)
print("RMSE:", test_rmse)
print("MAE:", test_mae)

RMSE: 0.546359043260244
MAE: 0.3988794117647059


# Tuning White Model

In [None]:
# Exhaustive search over n_estimators x max_features for the white-wine forest.
# Every candidate is scored by its out-of-bag R-squared (model.oob_score_), so no
# separate validation split is carved out of the training data.
start_time = time.time()

# NOTE(review): not read anywhere in this cell - kept in case other cells use them.
n_samples = white_train_x.shape[0]
n_features = white_train_x.shape[1]

params = {'n_estimators': [700, 800, 900],
          'max_features': [5,6,7]}

# scikit-learn 1.0 renamed the "mae" split criterion to "absolute_error" and 1.2
# removed the old spelling, so pick the name the installed version accepts.
import sklearn
mae_criterion = "absolute_error" if int(sklearn.__version__.split('.')[0]) >= 1 else "mae"

# Build the grid from the keys by name instead of relying on dict ordering.
param_list = list(it.product(params['n_estimators'], params['max_features']))

# oob_score[k] is the OOB R-squared of param_list[k].
oob_score = []
for n_estimators, max_features in param_list:
    model = RandomForestRegressor(random_state=1, criterion=mae_criterion, oob_score=True,
                                  verbose=False, n_estimators=n_estimators,
                                  max_features=max_features,
                                  n_jobs=-1).fit(white_train_x, white_train_y)
    oob_score.append(model.oob_score_)

end_time = time.time()
print("time taken = ", (end_time-start_time)/60, " minutes")
print("Best params = ", param_list[np.argmax(oob_score)])
print("Best score (R-squared) = ", np.max(oob_score))

In [None]:
# White Model with optimal parameters
# Take the best (n_estimators, max_features) pair from the most recent OOB search
# (param_list / oob_score) rather than hard-coding values: the previous literals
# (700, 5) were the red model's optimum, not a result of the white search.
# NOTE(review): the search scores candidates with the MAE criterion, whereas this
# refit uses the estimator's default criterion - confirm the difference is intended.
best_n_estimators, best_max_features = param_list[np.argmax(oob_score)]
optimal_model_white = RandomForestRegressor(n_estimators=best_n_estimators, random_state=1,
                                          oob_score=True,n_jobs=-1,
                                          max_features=best_max_features).fit(white_train_x, white_train_y)

y_pred = optimal_model_white.predict(white_test_x)

#RMSE on test data
print("RMSE:",np.sqrt(mean_squared_error(white_test_y, y_pred)))
#MAE on test data
print("MAE:",mean_absolute_error(white_test_y, y_pred))

In [None]:
#Further tuning white model with intuition
# Hand-picked setting (850 trees, 7 features per split) that is not on the searched grid.
# NOTE(review): these are the same values used for the red "intuition" model -
# confirm they were chosen for the white data as well.
intuition_model_white = RandomForestRegressor(
    n_estimators=850,
    max_features=7,
    oob_score=True,
    random_state=1,
    n_jobs=-1,
)
intuition_model_white.fit(white_train_x, white_train_y)

y_pred = intuition_model_white.predict(white_test_x)

# Test-set error: RMSE first, then MAE
test_rmse = np.sqrt(mean_squared_error(white_test_y, y_pred))
test_mae = mean_absolute_error(white_test_y, y_pred)
print("RMSE:", test_rmse)
print("MAE:", test_mae)