In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import copy
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("winequalityN.csv")
df.quality = df.quality.astype("category")
df.drop_duplicates(inplace=True)

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline

In [4]:
def modelfit(models,X_train,Y_train):
    models = models.copy()
    modellocal = []
    for name, model in models:
        clf = model.fit(X_train,Y_train)
        modellocal.append((name,clf))
    return modellocal

In [5]:
def TrainModelCV(modelcv,cv,n_jobs,X_train,Y_train):
    modellocal = []
    # modelcv = modelcv.copy()
    for name,param, model in modelcv:
        gsv = RandomizedSearchCV(
        model,
        param,
        verbose = 1,
        cv=cv,
        n_jobs=n_jobs
        )
        # print(gsv.get_params().keys())
        gsv.fit(X_train,Y_train)
        modellocal.append((name,gsv))
    return modellocal


In [6]:
def Report(models,X_train,Y_train,X_test,Y_test):
    models = models.copy()
    for name, model in models:
        print(name)
        print("!!!!Train!!!!")
        print(classification_report(model.predict(X_train),Y_train))
        print("!!!!Test!!!!")
        print(classification_report(model.predict(X_test),Y_test))

In [7]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto','sqrt','log']
criterion = ['gini','entropy']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

In [8]:
leaf_size = list(range(1,50))
n_neighbors = list(range(1,30))
p=[1,2]
weights = ['uniform','distance']
algorithm = ['auto','ball_tree','kd_tree','brute']

In [9]:
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               'criterion' : criterion}
param_grid = {'leaf_size': leaf_size,
              'n_neighbors': n_neighbors,
              'p': p,
              'weights':weights,
              'algorithm':algorithm,
              }
random_DC = { 'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'criterion' : criterion
               }

## Train Test split  

In [10]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder

In [11]:
X = df.drop(["quality"],axis=1)
Y = df.quality

In [12]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y, test_size=0.2, random_state=42)

## Imputation

In [13]:
my_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
X_train = my_imputer.fit_transform(X_train)
X_test = my_imputer.transform(X_test)

In [14]:
X_train = pd.DataFrame(X_train,columns=df.columns.drop('quality'))
X_test = pd.DataFrame(X_test,columns=df.columns.drop('quality'))

In [15]:
X_train

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,white,6.0,0.16,0.22,1.6,0.042,36.0,106.0,0.9905,3.24,0.32,11.4
1,red,6.1,0.59,0.01,2.1,0.056,5.0,13.0,0.99472,3.52,0.56,11.4
2,red,7.9,0.4,0.3,1.8,0.157,2.0,45.0,0.99727,3.31,0.91,9.5
3,white,8.4,0.22,0.3,8.9,0.024,17.0,118.0,0.99456,3.16,0.5,9.5
4,red,6.9,0.67,0.06,2.1,0.08,8.0,33.0,0.99845,3.68,0.71,9.6
...,...,...,...,...,...,...,...,...,...,...,...,...
4381,white,6.1,0.28,0.23,4.2,0.038,13.0,95.0,0.98898,2.97,0.7,13.1
4382,red,7.9,0.66,0.0,1.4,0.096,6.0,13.0,0.99569,3.43,0.58,9.5
4383,red,7.5,0.755,0.0,1.9,0.084,6.0,12.0,0.99672,3.34,0.49,9.7
4384,red,7.7,0.54,0.26,1.9,0.089,23.0,147.0,0.99636,3.26,0.59,9.7


## Ordinal

In [16]:
ordinal_encoder = OrdinalEncoder()
label_X_train = X_train.copy()
label_X_test = X_test.copy()
label_X_train["type"] = ordinal_encoder.fit_transform(X_train["type"].values.reshape(-1,1))
label_X_test["type"] = ordinal_encoder.transform(X_test["type"].values.reshape(-1,1))

## Model

In [17]:
models = []
models.append(('RandomForest',random_grid, RandomForestClassifier()))

In [18]:
model_cv =TrainModelCV(models,cv=5,n_jobs=-1,X_train=label_X_train,Y_train=Y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


 0.56087667 0.55586294 0.55768449 0.56338496]


In [19]:
Report(model_cv,X_train=label_X_train,Y_train=Y_train,X_test=label_X_test,Y_test=Y_test)

RandomForest
!!!!Train!!!!
              precision    recall  f1-score   support

           3       0.69      1.00      0.82        18
           4       0.85      1.00      0.92       144
           5       1.00      0.98      0.99      1458
           6       1.00      0.98      0.99      1947
           7       0.98      0.99      0.99       703
           8       0.89      1.00      0.94       113
           9       0.75      1.00      0.86         3

    accuracy                           0.98      4386
   macro avg       0.88      0.99      0.93      4386
weighted avg       0.99      0.98      0.99      4386

!!!!Test!!!!
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.63      0.65      0.64       356
           6       0.76      0.56      0.65       656
           7       0.30      0.63      0.40        82
           8       0.07      0.67     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
