In [1]:
# Common imports
import numpy as np
import pandas as pd
import os

# Plotting imports (matplotlib and Plotly)
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import numpy as np

# Seed NumPy's global random number generator so that results are
# reproducible across runs of this notebook.
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Font sizes for axis labels (14) and for x/y tick labels (12).
mpl.rc('axes', labelsize=14)
mpl.rc(('xtick', 'ytick'), labelsize=12)

import seaborn as sns

In [2]:
# Load the wine dataset; the relative path is resolved against the
# notebook's current working directory.
wine_df = pd.read_csv('winequalityN.csv')

In [3]:
# Rename the 'type' column to 'white_wine'.
wine_df = wine_df.rename(columns={'type': 'white_wine'})

In [4]:
# Column groups: `categorical` is converted to the 'category' dtype and
# `numerical` is mean-imputed in the cleaning cells that follow.
categorical = ['white_wine']
numerical = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
    'quality',
]

In [5]:
# Store every categorical column with pandas' 'category' dtype.
# (The previous per-column `unique()` call was dropped: its result was never used.)
for col in categorical:
    wine_df[col] = wine_df[col].astype('category')

## Data Cleaning

In [6]:
# NOTE(review): this expression is not the last statement of the cell and its
# value is neither assigned nor printed, so it produces no visible output.
wine_df.isnull().sum()

def get_missing_data(dataFrame):
    """Report the percentage of null entries in each column.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by the column names of ``dataFrame``, with a single
        ``percent_missing`` column (null count * 100 / number of rows).
    """
    row_count = len(dataFrame)
    null_counts = dataFrame.isnull().sum()
    return pd.DataFrame({'percent_missing': null_counts * 100 / row_count})

# Percentage of missing values per column, before any imputation.
missing_values_df = get_missing_data(wine_df)
print(missing_values_df)

                      percent_missing
white_wine                   0.000000
fixed acidity                0.153917
volatile acidity             0.123134
citric acid                  0.046175
residual sugar               0.030783
chlorides                    0.030783
free sulfur dioxide          0.000000
total sulfur dioxide         0.000000
density                      0.000000
pH                           0.138525
sulphates                    0.061567
alcohol                      0.000000
quality                      0.000000


In [7]:
# Fill the missing values of every numerical column with that column's mean.
# NOTE(review): the means are computed on the full dataset, before the
# train/test split performed later in the notebook, so rows that end up in
# the test set contribute to the imputed values. `numerical` also contains
# the target column 'quality'.
for i in numerical:
  wine_df[i] = wine_df[i].fillna(wine_df[i].mean())

# Dtypes and non-null counts after imputation.
wine_df.info()

# Recompute the missing-value percentages after imputation.
missing_values_df = get_missing_data(wine_df)
print(missing_values_df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   white_wine            6497 non-null   category
 1   fixed acidity         6497 non-null   float64 
 2   volatile acidity      6497 non-null   float64 
 3   citric acid           6497 non-null   float64 
 4   residual sugar        6497 non-null   float64 
 5   chlorides             6497 non-null   float64 
 6   free sulfur dioxide   6497 non-null   float64 
 7   total sulfur dioxide  6497 non-null   float64 
 8   density               6497 non-null   float64 
 9   pH                    6497 non-null   float64 
 10  sulphates             6497 non-null   float64 
 11  alcohol               6497 non-null   float64 
 12  quality               6497 non-null   int64   
dtypes: category(1), float64(11), int64(1)
memory usage: 615.7 KB
                      percent_missing
white_win

## Wine Quality Modeling

In [8]:
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.ensemble import StackingClassifier
from sklearn import svm


In [9]:

# Work on a copy so the cleaned frame `wine_df` is left untouched.
wine_df_quality = wine_df.copy()
# 1. Split data
#    70% -> training and cross-validation
#    30% -> testing
# Target: the 'quality' column. Features: the first 12 columns (positions 0-11).
y = wine_df_quality["quality"]
X = wine_df_quality.iloc[:,0:12]
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=42)

In [10]:
#Encoding the wine type

def wine_bin(x):
    """Encode a wine type label as a binary flag.

    Returns 1 when ``x`` equals the string ``'white'`` and 0 for any other
    value.
    """
    return 1 if x == 'white' else 0


In [11]:
# Encode the wine type of the training rows as 1 (white) / 0 (anything else).
# The labels are read from the untouched feature frame `X` (matched on the
# row index) instead of from `X_tr` itself, so re-running this cell always
# gives the same result; applying wine_bin to an already-encoded column would
# silently turn every value into 0.
# `assign` returns a new frame rather than writing into the split result.
X_tr = X_tr.assign(white_wine=X.loc[X_tr.index, "white_wine"].apply(wine_bin))

In [12]:
# Separate copy of the training features; the scaling cell below modifies
# X_tr_2 and leaves X_tr unscaled.
X_tr_2 = X_tr.copy()

In [13]:
# Scale the training features with a min-max scaler.
# The scaler is always fitted on the unscaled `X_tr` and `X_tr_2` is rebuilt
# from scratch, so re-running this cell cannot refit `minmax` on data that
# has already been scaled (which would make later `minmax.transform` calls
# a no-op).

minmax = MinMaxScaler()

X_tr_2 = pd.DataFrame(
    minmax.fit_transform(X_tr),
    columns=X_tr.columns,
    index=X_tr.index,
)

X_tr_2

Unnamed: 0,white_wine,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
3761,1.0,0.190083,0.060000,0.168675,0.061290,0.026711,0.144330,0.255201,0.154942,0.372881,0.080000,0.449275
3740,1.0,0.264463,0.046667,0.168675,0.454839,0.065109,0.192440,0.396671,0.465688,0.203390,0.091429,0.144928
1359,1.0,0.223140,0.073333,0.192771,0.025806,0.046745,0.206186,0.349515,0.218817,0.542373,0.177143,0.405797
1230,1.0,0.289256,0.140000,0.180723,0.400000,0.051753,0.309278,0.643551,0.495037,0.288136,0.102857,0.101449
4751,1.0,0.289256,0.186667,0.373494,0.209677,0.035058,0.323024,0.496533,0.327579,0.355932,0.222857,0.376812
...,...,...,...,...,...,...,...,...,...,...,...,...
3772,1.0,0.206612,0.106667,0.174699,0.422581,0.038397,0.357388,0.355062,0.368580,0.381356,0.085714,0.376812
5191,0.0,0.256198,0.186667,0.150602,0.058065,0.143573,0.027491,0.027739,0.400086,0.584746,0.211429,0.304348
5226,0.0,0.793388,0.126667,0.373494,0.064516,0.116861,0.034364,0.041609,0.564091,0.372881,0.251429,0.246377
5390,0.0,0.421488,0.213333,0.307229,0.064516,0.066778,0.082474,0.058252,0.339663,0.508475,0.382857,0.782609


## ALL ML FOR STACKING

In [14]:
# Models configured with the best hyperparameters found in earlier tuning.
# NOTE(review): in the visible notebook only `gbt_clf` is fitted; the other
# estimators are referenced only by the disabled stacking code below.

knn_clf = KNeighborsClassifier(metric='manhattan', n_neighbors=6, weights='distance')

rdn_clf = RandomForestClassifier(max_leaf_nodes=None, n_estimators=75)

# AdaBoost built on depth-1 decision trees.
# NOTE(review): algorithm='SAMME.R' is deprecated in recent scikit-learn
# releases — confirm against the installed version.
clf = DecisionTreeClassifier(max_depth=1)
ada_clf = AdaBoostClassifier(clf, algorithm='SAMME.R', learning_rate=0.45, n_estimators=100)

gbt_clf = GradientBoostingClassifier(learning_rate=0.4, max_depth=6, max_features=4, n_estimators=150)

# NOTE(review): recent scikit-learn releases expect penalty=None instead of
# the string 'none' — confirm against the installed version.
logreg_clf = LogisticRegression(penalty='none', solver='sag')

# Stacking ensemble (currently disabled):
# models = [
#     ('knn', knn_clf),
#     ('rnd', rdn_clf),
#     ('ada', ada_clf),
#     ('grd', gbt_clf)
# ]

# sclf = StackingClassifier(estimators=models, final_estimator=logreg_clf, cv=5)

# classifiers = {"KNN": knn_clf,
#                "RDN": rdn_clf,
#                "ADA": ada_clf,
#                "GRD": gbt_clf,
#                "STCLF": sclf}


In [15]:
# Fit the gradient boosting classifier on the scaled training features.
gbt_clf.fit(X_tr_2, y_tr)

GradientBoostingClassifier(learning_rate=0.4, max_depth=6, max_features=4,
                           n_estimators=150)

## INTERACTIVE INPUT ##

In [16]:
from ipywidgets import interact, widgets, interactive

In [17]:
def wine_brew(white_wine, vol_ac, cit_ac, TSO2, pH, alch):
    """Predict the quality of one wine and print the result.

    The six arguments fill the matching feature columns; the six remaining
    features are fixed to the constants hard-coded below (presumably typical
    values of those columns — confirm). The single-row frame is scaled with
    the module-level ``minmax`` scaler and classified with the module-level
    ``gbt_clf`` model.

    Parameters
    ----------
    white_wine : value for the 'white_wine' column.
    vol_ac : value for the 'volatile acidity' column.
    cit_ac : value for the 'citric acid' column.
    TSO2 : value for the 'total sulfur dioxide' column.
    pH : value for the 'pH' column.
    alch : value for the 'alcohol' column.

    Returns
    -------
    None
        The message is printed, not returned. A prediction <= 4 is labelled
        "Low", 5 to 6 "Medium" and >= 7 "High"; any other value keeps the
        placeholder label 0.
    """
    features = pd.DataFrame({
        'white_wine': [white_wine],
        'fixed acidity': [7.216579],
        'volatile acidity': [vol_ac],
        'citric acid': [cit_ac],
        'residual sugar': [5.444326],
        'chlorides': [0.056042],
        'free sulfur dioxide': [30.525319],
        'total sulfur dioxide': [TSO2],
        'density': [0.994697],
        'pH': [pH],
        'sulphates': [0.531215],
        'alcohol': [alch],
    })

    # Apply the scaling already fitted on the training data.
    feature_names = features.columns
    features[feature_names] = minmax.transform(features[feature_names])

    ingred_pred = gbt_clf.predict(features)
    score = ingred_pred[0]

    # Map the predicted score onto a coarse label.
    qual = 0
    if score <= 4:
        qual = "Low"
    elif 5 <= score <= 6:
        qual = "Medium"
    elif score >= 7:
        qual = "High"

    # The whole prediction array is formatted, so the value prints as e.g. "[7]".
    print("The wine quality is %s, valued at %s." % (qual, ingred_pred))




# Build the interactive controls for wine_brew. With {'manual': True} the
# function is run only when the user presses the run button, not on every
# slider change. Each keyword is a (min, max, step) slider specification for
# the wine_brew argument of the same name.
w = interactive(wine_brew, {'manual': True},
                
         white_wine =  (0.0,1,1),
         vol_ac=(0,1.6,0.01),
         cit_ac=(0.0,1.6,0.01),
         TSO2=(6,440,10),
         pH=(2.7,4,0.01),
         alch=(8,14.9,0.01)
         
         )




In [18]:
# Display the interactive widget.
w

interactive(children=(FloatSlider(value=0.0, description='white_wine', max=1.0, step=1.0), FloatSlider(value=0…

**SAMPLE**

In [19]:
# Direct call with fixed inputs for a previously known wine (no widget involved).

wine_brew(1,0.1,0.28,116,3.20,11.5)

The wine quality is High, valued at [7].
