### import important libraries

In [51]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

### Data overview

In [52]:
# load data and check data shape
data = pd.read_csv("wine.csv")
print('Total samples of wine dataset: %d' % data.shape[0])
print('Total variables of wine dataset: %d' % data.shape[1])


Total samples of wine dataset: 2200
Total variables of wine dataset: 14


In [53]:
# checking the column names and datatypes
df=data.copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   class                           2200 non-null   int64  
 1   Alcohol                         2200 non-null   float64
 2   Malic_acid                      2200 non-null   float64
 3   Ash                             2200 non-null   float64
 4   Alcalinity_of_ash               2200 non-null   float64
 5   Magnesium                       2200 non-null   float64
 6   Total_phenols                   2200 non-null   float64
 7   Flavanoids                      2200 non-null   float64
 8   Nonflavanoid_phenols            2200 non-null   float64
 9   Proanthocyanins                 2200 non-null   float64
 10  Color_intensity                 2200 non-null   float64
 11  Hue                             2200 non-null   float64
 12  OD280%2FOD315_of_diluted_wines  22

In [54]:
# view a sample of the data
data.sample(n=10, random_state=1)

Unnamed: 0,class,Alcohol,Malic_acid,Ash,Alcalinity_of_ash,Magnesium,Total_phenols,Flavanoids,Nonflavanoid_phenols,Proanthocyanins,Color_intensity,Hue,OD280%2FOD315_of_diluted_wines,Proline
1276,1,12.102024,1.978243,2.206944,16.830709,90.91818,3.175905,3.286019,0.313303,1.742839,8.531437,1.053424,2.776765,1195.569148
1446,3,13.57233,2.542908,2.696846,23.484424,99.259132,1.674558,2.595252,0.421765,1.12974,8.589199,0.887106,2.344734,549.436133
335,1,14.340415,1.924594,2.524905,14.91331,120.34598,3.247975,2.851522,0.265849,1.259319,10.056588,0.893063,2.10183,1207.292428
1458,3,13.157149,1.59526,1.956307,20.726283,90.66267,2.675248,1.513338,0.598651,3.084097,4.978823,0.982451,1.355483,625.185784
2038,1,13.014567,1.771221,1.87766,14.545069,93.27238,2.288577,0.558747,0.350538,1.958462,4.528011,1.019395,3.077627,1208.63177
1314,3,12.331843,3.215488,2.72125,24.354064,108.370083,2.960632,0.571487,0.565,1.59368,10.216497,0.623675,2.429772,881.61993
389,1,14.142197,2.039219,2.236748,17.509961,103.760235,2.825525,3.912876,0.298394,1.832097,5.950905,1.047704,2.097227,1079.104998
1639,3,12.425975,2.294901,2.924798,20.858693,100.67721,1.81499,2.762398,0.306179,1.143092,9.920537,0.597982,2.141342,838.086048
2004,3,12.74562,4.072534,2.415821,21.417589,100.328676,1.861831,4.243816,0.317223,2.608496,8.036917,0.614371,1.561459,563.890107
403,3,14.12093,2.972,1.974538,22.460124,99.29783,1.482434,0.499428,0.619503,1.283445,9.515219,0.629635,1.438051,509.489705


In [55]:
# checking for duplicate values
df.duplicated().sum()

0

In [56]:
# checking for missing values in the data
print(f"\nNull values in each feature/variable:\n{df.isnull().sum()}")


Null values in each feature/variable:
class                             0
Alcohol                           0
Malic_acid                        0
Ash                               0
Alcalinity_of_ash                 0
Magnesium                         0
Total_phenols                     0
Flavanoids                        0
Nonflavanoid_phenols              0
Proanthocyanins                   0
Color_intensity                   0
Hue                               0
OD280%2FOD315_of_diluted_wines    0
Proline                           0
dtype: int64


### Data processing and splitting into training and test data

In [57]:
X = df.iloc[:, 1:14]
y = df[['class']]
X = preprocessing.scale(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)

### Apply different supervised learning models

####Building a decision tree model

In [58]:
# decision tree without tuning
dt = DecisionTreeClassifier(random_state=1).fit(X_train,y_train)
y_pred = dt.predict(X_test)
print("Accuracy of decision tree without tuning:",accuracy_score(y_test, y_pred))

Accuracy of decision tree without tuning: 0.8409090909090909


In [59]:
# Tune hyperparameters of decision tree using GridSearchCV
max_depth = np.arange(1,16)
min_samples_leaf = np.arange(1,21)
dt_parameters = {'max_depth' : max_depth,'min_samples_leaf':min_samples_leaf}
dt= GridSearchCV(dt, param_grid=dt_parameters, cv=5,scoring='accuracy')
start = time.time()
dt.fit(X_train, y_train)
end = time.time()
y_pred = dt.predict(X_test)
print("Best parameter for decision tree of the wine dataset:")
print(dt.best_params_)
print('Finished training in %f seconds' % (end - start))
print("Best accuracy of decision tree:",accuracy_score(y_test, y_pred))
print(classification_report(y_pred,y_test))


Best parameter for decision tree of the wine dataset:
{'max_depth': 9, 'min_samples_leaf': 5}
Finished training in 18.229996 seconds
Best accuracy of decision tree: 0.8681818181818182
              precision    recall  f1-score   support

           1       0.88      0.88      0.88       277
           2       0.87      0.87      0.87       348
           3       0.85      0.85      0.85       255

    accuracy                           0.87       880
   macro avg       0.87      0.87      0.87       880
weighted avg       0.87      0.87      0.87       880



####Building a kNN model

In [60]:
# kNN networks without tuning
knn= KNeighborsClassifier().fit(X_train,y_train.values.ravel())
y_pred = knn.predict(X_test)
print("Accuracy of kNN without tuning:",accuracy_score(y_test, y_pred))

Accuracy of kNN without tuning: 0.9068181818181819


In [61]:
# Tune hyperparameters of kNN using GridSearchCV
n_neighbors = np.arange(1, 36)
p = [1, 2]
knn_parameters = {'n_neighbors' : n_neighbors ,'p':p}
knn= GridSearchCV(knn, param_grid=knn_parameters, cv=5,scoring='accuracy')
start = time.time()
knn.fit(X_train, y_train.values.ravel())
end = time.time()
y_pred = knn.predict(X_test)
print("Best parameter for kNN of the wine dataset:")
print(knn.best_params_)
print('Finished training in %f seconds' % (end - start))
print("Best accuracy of kNN:",accuracy_score(y_test, y_pred))
print(classification_report(y_pred,y_test))

Best parameter for kNN of the wine dataset:
{'n_neighbors': 30, 'p': 1}
Finished training in 7.074204 seconds
Best accuracy of kNN: 0.9227272727272727
              precision    recall  f1-score   support

           1       0.94      0.89      0.92       293
           2       0.92      0.93      0.92       342
           3       0.91      0.95      0.93       245

    accuracy                           0.92       880
   macro avg       0.92      0.92      0.92       880
weighted avg       0.92      0.92      0.92       880



#### Building a SVM model

In [62]:
# kNN networks without tuning
svm = svm.SVC(random_state = 1).fit(X_train, y_train.values.ravel())
y_pred = svm.predict(X_test)
print("Accuracy of wine dataset by SVM without tuning:",accuracy_score(y_test, y_pred))

Accuracy of wine dataset by SVM without tuning: 0.9159090909090909


In [63]:
# Tune hyperparameters of SVM using GridSearchCV
svm_parameters = {'C':np.linspace(0.01,1,20), 'gamma': np.linspace(0.01,0.1,10)}
svm = GridSearchCV(svm, param_grid=svm_parameters, cv=5,scoring='accuracy')
start = time.time()
svm.fit(X_train, y_train.values.ravel())
end = time.time()
y_pred = svm.predict(X_test)
print("Best parameter for SVM of the wine dataset:")
print(svm.best_params_)
print('Finished training in %f seconds' % (end - start))
print("Best accuracy of SVM:",accuracy_score(y_test, y_pred))
print(classification_report(y_pred,y_test))

Best parameter for SVM of the wine dataset:
{'C': 0.4268421052631579, 'gamma': 0.05000000000000001}
Finished training in 45.356519 seconds
Best accuracy of SVM: 0.9102272727272728
              precision    recall  f1-score   support

           1       0.91      0.91      0.91       278
           2       0.91      0.92      0.91       345
           3       0.91      0.90      0.91       257

    accuracy                           0.91       880
   macro avg       0.91      0.91      0.91       880
weighted avg       0.91      0.91      0.91       880



#### Building a Boosting model

In [64]:
# Boosting without tuning
boost = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1), random_state=1).fit(X_train, y_train.values.ravel())
y_pred = boost.predict(X_test)
print("Accuracy of wine dataset by Adaboost without tuning:",accuracy_score(y_test, y_pred))

Accuracy of wine dataset by Adaboost without tuning: 0.9011363636363636


In [65]:
# Tune hyperparameters of boosting using GridSearchCV
boost_parameters = {'n_estimators':[20,30,40,50,60,70,80,90,100,120], 'learning_rate': [0.01,0.04,0.07,0.1,0.2,0.3,0.4,0.5,0.6,0.8]}
boost = GridSearchCV(boost, param_grid=boost_parameters, cv=5,scoring='accuracy', error_score='raise')
start = time.time()
boost.fit(X_train, y_train.values.ravel())
end = time.time()
y_pred = boost.predict(X_test)
print("Best parameter for boosting of the wine dataset:")
print(boost.best_params_)
print('Finished training in %f seconds' % (end - start))
print("Best accuracy of boosting:",accuracy_score(y_test, y_pred))
print(classification_report(y_pred,y_test))

Best parameter for boosting of the wine dataset:
{'learning_rate': 0.2, 'n_estimators': 100}
Finished training in 136.633251 seconds
Best accuracy of boosting: 0.9125
              precision    recall  f1-score   support

           1       0.93      0.89      0.91       291
           2       0.92      0.91      0.92       350
           3       0.89      0.95      0.91       239

    accuracy                           0.91       880
   macro avg       0.91      0.91      0.91       880
weighted avg       0.91      0.91      0.91       880



#### Building a neural networks model

In [66]:
# neural networks without tuning
nn = MLPClassifier(random_state=1,max_iter=3000).fit(X_train, y_train.values.ravel())
y_pred = nn.predict(X_test)
print("Accuracy of neural networks without tuning:",accuracy_score(y_test, y_pred))

Accuracy of neural networks without tuning: 0.8965909090909091


In [67]:
# Tune hyperparameters of neural networks using GridSearchCV
nn = MLPClassifier(hidden_layer_sizes=(150, 100, 50),random_state=1,max_iter=3000).fit(X_train, y_train.values.ravel())
nn_parameters = {'alpha':np.linspace(5,15,12)}
nn = GridSearchCV(nn, param_grid=nn_parameters, cv=5,scoring='accuracy')
start = time.time()
nn.fit(X_train, y_train.values.ravel())
end = time.time()
y_pred = nn.predict(X_test)
print("Best parameter for neural networks of the wine dataset:")
print(nn.best_params_)
print('Finished training in %f seconds' % (end - start))
print("Best accuracy of neural networks:",accuracy_score(y_test, y_pred))
print(classification_report(y_pred,y_test))

Best parameter for neural networks of the wine dataset:
{'alpha': 14.09090909090909}
Finished training in 291.367849 seconds
Best accuracy of neural networks: 0.9090909090909091
              precision    recall  f1-score   support

           1       0.90      0.91      0.90       273
           2       0.91      0.91      0.91       350
           3       0.92      0.91      0.91       257

    accuracy                           0.91       880
   macro avg       0.91      0.91      0.91       880
weighted avg       0.91      0.91      0.91       880

