#### Importing needed Libraries

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

#### Importing Data

first the already classified data

In [2]:
wine_quality = pd.read_excel('train_excel.xlsx', sheet_name = 'train', index_col = 'index')

now import the data to classify

In [3]:
wine_test = pd.read_excel('test_excel.xlsx', sheet_name = 'test', index_col = 'index')

#### Exploring the Data

In [4]:
wine_quality.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2979 entries, 2737 to 610
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         2979 non-null   float64
 1   volatile acidity      2979 non-null   float64
 2   citric acid           2979 non-null   float64
 3   residual sugar        2979 non-null   float64
 4   chlorides             2979 non-null   float64
 5   free sulfur dioxide   2979 non-null   float64
 6   total sulfur dioxide  2979 non-null   float64
 7   density               2979 non-null   float64
 8   pH                    2979 non-null   float64
 9   sulphates             2979 non-null   float64
 10  alcohol               2979 non-null   float64
 11  quality               2979 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 302.6 KB


#### Modifying the data

creating new datasets, x and y for the independent and dependent variables respectively

In [5]:
x = wine_quality.iloc[:,:-1]
x

Unnamed: 0_level_0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2737,6.4,0.350,0.28,1.6,0.037,31.0,113.0,0.98779,3.12,0.40,14.2
1944,4.9,0.330,0.31,1.2,0.016,39.0,150.0,0.98713,3.33,0.59,14.0
2766,4.7,0.455,0.18,1.9,0.036,33.0,106.0,0.98746,3.21,0.83,14.0
283,5.8,0.320,0.20,2.6,0.027,17.0,123.0,0.98936,3.36,0.78,13.9
1828,5.8,0.240,0.28,1.4,0.038,40.0,76.0,0.98711,3.10,0.29,13.9
...,...,...,...,...,...,...,...,...,...,...,...
771,7.8,0.180,0.46,12.6,0.042,41.0,143.0,1.00000,3.24,0.76,8.5
1812,6.5,0.360,0.16,1.3,0.054,11.0,107.0,0.99398,3.19,0.39,8.5
1929,6.1,0.430,0.35,9.1,0.059,83.0,249.0,0.99710,3.37,0.50,8.5
2061,10.0,0.380,0.38,1.6,0.169,27.0,90.0,0.99914,3.15,0.65,8.5


In [6]:
y = wine_quality.iloc[:,-1]
y

index
2737    1
1944    1
2766    1
283     1
1828    1
       ..
771     1
1812    0
1929    0
2061    0
610     0
Name: quality, Length: 2979, dtype: int64

splitting the data into train and validation, where training contains 70% of observations

In [7]:
x_train, x_validation, y_train, y_validation = train_test_split(x, y,
                                                                train_size = 0.7,
                                                                shuffle = True,
                                                                stratify = y)

#### Modelling

creating the 'model' to use

In [8]:
model = RandomForestClassifier()

##### Hyperparameter Testing

try to play with the parameters below for a better model

In [9]:
param_grid = {
    'n_estimators': [100],
    'max_depth': [None],
    'min_samples_split': [5],
}

In [10]:
grid_search = GridSearchCV ( estimator = model, param_grid = param_grid, cv = 2, verbose = 2, n_jobs = 4)
grid_search.fit(x_train, y_train)

Fitting 2 folds for each of 1 candidates, totalling 2 fits


get the best params

In [11]:
grid_search.best_params_

{'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}

predict values

In [12]:
predictions_training = grid_search.predict(x_train)
predictions_training

array([1, 1, 0, ..., 0, 0, 1], dtype=int64)

In [13]:
predictions_validation = grid_search.predict(x_validation)

#### Assessment

mean accuracy in training dataset

In [14]:
grid_search.score(x_train, y_train)

0.9947242206235012

mean accuracy in validation dataset

In [15]:
grid_search.score(x, y)

0.9288351795904666

confusion matrix for both training and validation dataset for check ups

In [16]:
confusion_matrix(y_train, predictions_training)

array([[1000,    7],
       [   4, 1074]], dtype=int64)

In [17]:
confusion_matrix(y_validation, predictions_validation)

array([[340,  92],
       [109, 353]], dtype=int64)

#### Deployment

applying the model to the unclassified data

In [18]:
wine_test

Unnamed: 0_level_0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1279,8.8,0.350,0.49,1.0,0.036,14.0,56.0,0.99200,2.96,0.33,10.5
1280,7.8,0.645,0.00,2.0,0.082,8.0,16.0,0.99640,3.38,0.59,9.8
1281,8.9,0.300,0.35,4.6,0.032,32.0,148.0,0.99458,3.15,0.45,11.5
1282,7.4,0.410,0.24,1.8,0.066,18.0,47.0,0.99560,3.37,0.62,10.4
1283,7.4,0.390,0.23,7.0,0.033,29.0,126.0,0.99400,3.14,0.42,10.5
...,...,...,...,...,...,...,...,...,...,...,...
2552,6.6,0.705,0.07,1.6,0.076,6.0,15.0,0.99620,3.44,0.58,10.7
2553,6.4,0.140,0.28,7.9,0.057,21.0,82.0,0.99425,3.26,0.36,10.0
2554,6.1,0.360,0.58,15.0,0.044,42.0,115.0,0.99780,3.15,0.51,9.0
2555,6.4,0.570,0.12,2.3,0.120,25.0,36.0,0.99519,3.47,0.71,11.3


In [19]:
wine_test['quality'] = grid_search.predict(wine_test)

In [20]:
wine_test

Unnamed: 0_level_0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1279,8.8,0.350,0.49,1.0,0.036,14.0,56.0,0.99200,2.96,0.33,10.5,0
1280,7.8,0.645,0.00,2.0,0.082,8.0,16.0,0.99640,3.38,0.59,9.8,0
1281,8.9,0.300,0.35,4.6,0.032,32.0,148.0,0.99458,3.15,0.45,11.5,1
1282,7.4,0.410,0.24,1.8,0.066,18.0,47.0,0.99560,3.37,0.62,10.4,1
1283,7.4,0.390,0.23,7.0,0.033,29.0,126.0,0.99400,3.14,0.42,10.5,0
...,...,...,...,...,...,...,...,...,...,...,...,...
2552,6.6,0.705,0.07,1.6,0.076,6.0,15.0,0.99620,3.44,0.58,10.7,0
2553,6.4,0.140,0.28,7.9,0.057,21.0,82.0,0.99425,3.26,0.36,10.0,1
2554,6.1,0.360,0.58,15.0,0.044,42.0,115.0,0.99780,3.15,0.51,9.0,0
2555,6.4,0.570,0.12,2.3,0.120,25.0,36.0,0.99519,3.47,0.71,11.3,1


export the data to a csv

In [21]:
wine_test['quality'].to_csv('Submission2.csv')