In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Read the red-wine quality table from a CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='winequality-red.csv')

# Data preparation

In [4]:
# Separate the label column from the predictors, then hold out 30% of the
# rows as a test set (fixed seed so the split is the same on every run).
Target = data['quality']
Var = data.drop(['quality'], axis='columns')
Var_train, Var_test, Target_train, Target_test = train_test_split(
    Var, Target, test_size=0.3, random_state=1
)

In [5]:
# Test-set accuracy of every fitted model, keyed by a display name.
accuracy = {}


def add_stat(name, algrthm):
    """Record the hold-out accuracy of a fitted model.

    Parameters
    ----------
    name : str
        Key under which the score is stored in the module-level ``accuracy``.
    algrthm : fitted estimator
        Any object exposing ``predict``; it is applied to ``Var_test``.

    Returns
    -------
    None
        The score is written to ``accuracy[name]`` as a side effect.
    """
    # accuracy_score is documented as (y_true, y_pred); keep that order so the
    # call stays correct if an asymmetric metric is ever swapped in.
    accuracy[name] = accuracy_score(Target_test, algrthm.predict(Var_test))
    

# Decision Tree

In [6]:
%%time
# Exhaustive search (cv=5) over tree depth, leaf size and the number of
# features considered per split.
config = {
    'max_depth': np.arange(6, 10),
    'min_samples_leaf': np.arange(1, 5),
    'max_features': np.arange(3, 8),
}
algrthm = DecisionTreeClassifier(random_state=1)
grid_sch = GridSearchCV(estimator=algrthm, param_grid=config, cv=5, n_jobs=-1)
grid_sch.fit(Var_train, Target_train)

# Display the winning parameter combination together with its CV score.
(grid_sch.best_params_, grid_sch.best_score_)

Wall time: 10.2 s


({'max_depth': 8, 'max_features': 7, 'min_samples_leaf': 4},
 0.6103699551569507)

In [7]:
%%time
# Refit the tuned tree on the whole training split. The seed matches the one
# used during the grid search: scikit-learn permutes the candidate features at
# every split, so an unseeded tree is a different model (and gives a different
# score) on each run.
dctree = DecisionTreeClassifier(random_state=1,
                                max_depth=8,
                                max_features=7,
                                min_samples_leaf=4)
dctree.fit(Var_train, Target_train)
add_stat("Decision Tree", dctree)
# Show the score add_stat just stored rather than predicting a second time.
accuracy["Decision Tree"]


Wall time: 6 ms


0.575

# kNN

In [8]:
%%time
# Search (cv=5) over the neighbourhood size k = 15..24.
config = {
    'n_neighbors': np.arange(15, 25),
}
algrthm = KNeighborsClassifier(n_jobs=-1)
grid_sch = GridSearchCV(estimator=algrthm, param_grid=config, cv=5, n_jobs=-1)
grid_sch.fit(Var_train, Target_train)

# Display the best k together with its CV score.
(grid_sch.best_params_, grid_sch.best_score_)

Wall time: 508 ms


({'n_neighbors': 20}, 0.529936739269699)

In [9]:
%%time
# Fit k-nearest-neighbours with the k selected by the grid search above.
kNNghbrs = KNeighborsClassifier(n_neighbors=20, n_jobs=-1)
kNNghbrs.fit(Var_train, Target_train)
add_stat("kNN", kNNghbrs)
# Show the score add_stat just stored; a second predict pass over the test
# set would only repeat the same neighbour search.
accuracy["kNN"]

Wall time: 246 ms


0.5041666666666667

# Logistic Regression

In [10]:
%%time
# Search (cv=5) over the inverse regularisation strength C.
algrthm = LogisticRegression(random_state=1, n_jobs=-1)
# C must be strictly positive, so the grid runs 0.1, 0.2, ..., 1.0. Starting
# at 0 makes every fit for that candidate fail (or raise) in scikit-learn.
config = {'C': np.linspace(0.1, 1, 10)}
grid_sch = GridSearchCV(algrthm, config, cv=5, n_jobs=-1)
grid_sch.fit(Var_train, Target_train)

# Display the best C together with its CV score.
grid_sch.best_params_, grid_sch.best_score_

Wall time: 558 ms


({'C': 1.0}, 0.5754964766175528)

In [11]:
%%time
# Fit logistic regression with the C selected by the grid search above.
LogRegrn = LogisticRegression(C=1, random_state=1, n_jobs=-1)
LogRegrn.fit(Var_train, Target_train)
add_stat("Logistic Regression", LogRegrn)
# Show the score add_stat just stored rather than predicting a second time.
accuracy["Logistic Regression"]

Wall time: 75 ms


0.6

# Linear Regression

In [12]:
%%time
# Ordinary least squares used as a classifier: the continuous predictions are
# rounded to the nearest whole number and compared with the quality labels.
LinRegrn = LinearRegression(n_jobs=-1)
LinRegrn.fit(Var_train, Target_train)
# Predict and round once, then reuse the result for both storing and display.
LinRegrn_pred = np.around(LinRegrn.predict(Var_test))
# accuracy_score is documented as (y_true, y_pred).
accuracy['Linear Regression'] = accuracy_score(Target_test, LinRegrn_pred)
accuracy['Linear Regression']


Wall time: 11 ms


0.6166666666666667

# Random Forest

In [13]:
%%time
# Search (cv=5) over forest size and per-tree depth, leaf size and features
# per split.
config = {
    'max_depth': np.arange(3, 20),
    'min_samples_leaf': np.arange(1, 5),
    'max_features': np.arange(1, 5),
    'n_estimators': np.arange(15, 20),
}
algrthm = RandomForestClassifier(random_state=1, n_jobs=-1)
grid_sch = GridSearchCV(estimator=algrthm, param_grid=config, cv=5, n_jobs=-1)
grid_sch.fit(Var_train, Target_train)

# Display the winning parameter combination together with its CV score.
(grid_sch.best_params_, grid_sch.best_score_)

Wall time: 1min 35s


({'max_depth': 15,
  'max_features': 2,
  'min_samples_leaf': 1,
  'n_estimators': 17},
 0.6764814221652786)

In [14]:
%%time
# Fit the random forest with the parameters selected by the grid search above
# (same seed as the search, so the evaluated model is the tuned one).
RndmFrst = RandomForestClassifier(n_jobs=-1,
                                  random_state=1,
                                  max_depth=15,
                                  max_features=2,
                                  min_samples_leaf=1,
                                  n_estimators=17)
RndmFrst.fit(Var_train, Target_train)
add_stat("Random Forest", RndmFrst)
# Show the score add_stat just stored rather than predicting a second time.
accuracy["Random Forest"]

Wall time: 324 ms


0.6541666666666667

# Gradient Boosting

In [15]:
%%time
# Search (cv=5) over the number of boosting stages and per-tree depth, leaf
# size and features per split.
config = {
    'max_depth': np.arange(15, 20),
    'min_samples_leaf': np.arange(1, 5),
    'max_features': np.arange(1, 5),
    'n_estimators': np.arange(15, 20),
}
algrthm = GradientBoostingClassifier(random_state=1)
grid_sch = GridSearchCV(estimator=algrthm, param_grid=config, cv=5, n_jobs=-1)
grid_sch.fit(Var_train, Target_train)

# Display the winning parameter combination together with its CV score.
(grid_sch.best_params_, grid_sch.best_score_)

Wall time: 2min 58s


({'max_depth': 17,
  'max_features': 4,
  'min_samples_leaf': 3,
  'n_estimators': 19},
 0.6720571748878924)

In [16]:
%%time
# Fit gradient boosting with the parameters selected by the grid search above
# (same seed as the search, so the evaluated model is the tuned one).
GrdBoost = GradientBoostingClassifier(random_state=1,
                                      max_depth=17,
                                      max_features=4,
                                      min_samples_leaf=3,
                                      n_estimators=19)
GrdBoost.fit(Var_train, Target_train)
add_stat("Gradient Boosting", GrdBoost)
# Show the score add_stat just stored rather than predicting a second time.
accuracy["Gradient Boosting"]

Wall time: 1.15 s


0.68125

# Total

In [20]:
# Build the comparison table without rebinding `accuracy`: the name stays the
# same object the model cells write into, so this cell can be re-run at any
# point and always reflects the current scores.
total = pd.DataFrame({'accuracy': pd.Series(accuracy)})
# Best model first.
total.sort_values(by='accuracy', ascending=False)

Unnamed: 0,accuracy
Gradient Boosting,0.68125
Random Forest,0.654167
Linear Regression,0.616667
Logistic Regression,0.6
Decision Tree,0.575
kNN,0.504167
