In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.inspection import DecisionBoundaryDisplay

from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings("ignore")

In [2]:
# import Housing.csv
df = pd.read_csv('data/wine.csv')
df

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,white,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,white,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,white,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,white,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,red,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
6493,red,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,,11.2,6
6494,red,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
6495,red,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [3]:
#drop rows with null values

df= df.dropna()
df

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,white,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,white,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,white,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,white,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6491,red,6.8,0.620,0.08,1.9,0.068,28.0,38.0,0.99651,3.42,0.82,9.5,6
6492,red,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
6494,red,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
6495,red,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [4]:
#show unique values of column 'quality'
df.quality.unique()

array([6, 5, 7, 8, 4, 3, 9], dtype=int64)

In [5]:
#count samples column 'quality'

df.groupby('quality').size()

quality
3      30
4     214
5    2128
6    2820
7    1074
8     192
9       5
dtype: int64

In [6]:
#convert the range of quality from 3,4,5,6,7,8,9 to 5,6,7
#3,4,5 mapped to 5, 6 to 6, 7,8,9 mapped to 7

df['quality']= df['quality'].map({ 3: 5, 4: 5, 5: 5, 6: 6, 7:7, 8: 7, 9: 7})


In [7]:
df.groupby('quality').size()

quality
5    2372
6    2820
7    1271
dtype: int64

In [8]:
#x = df[["fixed acidity","volatile acidity","chlorides","citric acid","residual sugar", "sulphates", "pH","density", "alcohol"]]
x = df[["sulphates", "pH"]]
y=df["quality"]

X = x.to_numpy().reshape(-1,2)
y = y.to_numpy().reshape(-1,1)

In [9]:
# Train Test split 
X_train, X_test, y_train, y_test = train_test_split(X,y, 
                                     train_size = 0.7, 
                                     test_size = 0.3, 
                                     random_state = 42)

In [10]:
#train decision tree

clf = DecisionTreeClassifier(max_depth=90) #clf to denote classifier
clf.fit(X_train,y_train)

# Model Evaluation- Training Data

In [11]:
#evaluation on training data

y_pred = clf.predict(X_train)

print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           5       0.70      0.84      0.77      1650
           6       0.80      0.76      0.78      1992
           7       0.84      0.62      0.71       882

    accuracy                           0.76      4524
   macro avg       0.78      0.74      0.75      4524
weighted avg       0.77      0.76      0.76      4524



# Model Evaluation- Test Data

In [12]:
#evaluation on test data

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           5       0.46      0.53      0.50       722
           6       0.52      0.50      0.51       828
           7       0.40      0.31      0.35       389

    accuracy                           0.47      1939
   macro avg       0.46      0.45      0.45      1939
weighted avg       0.47      0.47      0.47      1939



In [21]:
# Create grid parameters for hyperparameter tuning
#
params =  {
    
    'max_depth': [ 3, 5, 6,10, 20, 40, 80]
}
#
# Create gridsearch instance
#
grid = GridSearchCV(estimator=clf,
                    param_grid=params,
                    cv=10,
                    n_jobs=1,
                    verbose=2)
#
# Fit the model
#
grid.fit(X_train, y_train)
#
# Assess the score
#
grid.best_score_, grid.best_params_

Fitting 10 folds for each of 7 candidates, totalling 70 fits
[CV] END ........................................max_depth=3; total time=   0.0s
[CV] END ........................................max_depth=3; total time=   0.0s
[CV] END ........................................max_depth=3; total time=   0.0s
[CV] END ........................................max_depth=3; total time=   0.0s
[CV] END ........................................max_depth=3; total time=   0.0s
[CV] END ........................................max_depth=3; total time=   0.0s
[CV] END ........................................max_depth=3; total time=   0.0s
[CV] END ........................................max_depth=3; total time=   0.0s
[CV] END ........................................max_depth=3; total time=   0.0s
[CV] END ........................................max_depth=3; total time=   0.0s
[CV] END ........................................max_depth=5; total time=   0.0s
[CV] END .......................................

(0.46241819531539974, {'max_depth': 80})