In [1]:
#importing modules

import pandas as pd
import numpy as np

import graphviz 

from sklearn import tree

from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score


In [3]:
#Loading the dataframe

df_wine = pd.read_csv('winequality-red.csv', sep = ';')
df_wine.head(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,5
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,7
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,7
9,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,5


In [4]:
df_wine["quality"].unique()

array([5, 6, 7, 4, 8, 3])

In [5]:
#Binning the target 

group_names = ["bad", "decent", "excellent"]

df_wine["quality"] = pd.cut(df_wine["quality"], 3 , labels = group_names, include_lowest = True)

In [7]:
#converting target categorical data to numeric data by using Label encoder 
le = LabelEncoder()
df_wine["quality"] = le.fit_transform(df_wine["quality"])

In [8]:
df_wine.sample(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
1313,7.0,0.36,0.21,2.3,0.086,20.0,65.0,0.99558,3.4,0.54,10.1,1
789,8.6,0.63,0.17,2.9,0.099,21.0,119.0,0.998,3.09,0.52,9.3,1
1405,7.7,0.28,0.3,2.0,0.062,18.0,34.0,0.9952,3.28,0.9,11.3,2
247,8.2,0.6,0.17,2.3,0.072,11.0,73.0,0.9963,3.2,0.45,9.3,1
246,7.1,0.68,0.07,1.9,0.075,16.0,51.0,0.99685,3.38,0.52,9.5,1
491,9.2,0.41,0.5,2.5,0.055,12.0,25.0,0.9952,3.34,0.79,13.3,2
126,8.2,1.33,0.0,1.7,0.081,3.0,12.0,0.9964,3.53,0.49,10.9,1
710,10.6,1.025,0.43,2.8,0.08,21.0,84.0,0.9985,3.06,0.57,10.1,1
1218,8.2,0.4,0.31,1.9,0.082,8.0,24.0,0.996,3.24,0.69,10.6,1
453,10.4,0.33,0.63,2.8,0.084,5.0,22.0,0.9998,3.26,0.74,11.2,2


In [9]:
#Splitting data into training and testing set

redwine_features = df_wine.drop("quality", axis = 1)
redwine_target = df_wine["quality"]

redwine_training_data, redwine_test_data, redwine_training_target, redwine_test_target = train_test_split(
    redwine_features, redwine_target, test_size=0.20,random_state=42)


In [10]:
#Building a random forest classifier using default values and evaluating the accuracy

clf_1 = RandomForestClassifier()

clf_1.fit(redwine_training_data, redwine_training_target)

redwine_pred =clf_1.predict(redwine_test_data)

accuracy_default = accuracy_score(redwine_test_target, redwine_pred)
print('The accuracy of the Random Forest Classifier before tuning the hyperparamets is', accuracy_default)

The accuracy of the Random Forest Classifier before tuning the hyperparamets is 0.865625


In [18]:
#Using Grid Search to find the best parameters

n_estimators = [50, 100, 500, 1000]
max_depth = [10, 25, 50, 100, 200]
min_samples_split = [2, 5, 10, 20]
min_samples_leaf = [1, 2, 5, 10]

para = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,}

gs = GridSearchCV(clf_1, para, cv = None, n_jobs = 2)
gs.fit(redwine_training_data, redwine_training_target)
print(gs.best_params_)

{'max_depth': 25, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}


In [25]:
#Evaluating accuracy of the model using best parameters as defined by Grid Search
clf_tuned = RandomForestClassifier(max_depth = 25, min_samples_leaf = 1, min_samples_split = 2, n_estimators= 500)

clf_tuned.fit(redwine_training_data, redwine_training_target)
redwine_tuned_predict = clf_tuned.predict(redwine_test_data)


acc_tuned = accuracy_score(redwine_test_target,redwine_tuned_predict)
print('The accuracy of the Random Forest after tuning is', acc_tuned)

The accuracy of the Random Forest after tuning is 0.8625
