# Multiclass Voting Classifier to Predict Wine Quality Score

## Wine Data
Data from http://archive.ics.uci.edu/ml/datasets/Wine+Quality

### Citations
<pre>
Dua, D. and Karra Taniskidou, E. (2017). 
UCI Machine Learning Repository [http://archive.ics.uci.edu/ml/index.php]. 
Irvine, CA: University of California, School of Information and Computer Science.
</pre>

<pre>
P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. 
Modeling wine preferences by data mining from physicochemical properties.
In Decision Support Systems, Elsevier, 47(4):547-553. ISSN: 0167-9236.
</pre>

Available at:
- [@Elsevier](http://dx.doi.org/10.1016/j.dss.2009.05.016)
- [Pre-press (pdf)](http://www3.dsi.uminho.pt/pcortez/winequality09.pdf)
- [bib](http://www3.dsi.uminho.pt/pcortez/dss09.bib)

## Setup

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Load the two wine quality files from the chapter 10 data directory.
# The red file is read with pandas' default separator, while the white
# file is read as semicolon-delimited.
red_wine = pd.read_csv('../../ch_10/data/winequality-red.csv')
white_wine = pd.read_csv('../../ch_10/data/winequality-white.csv', sep=';')

## EDA

In [None]:
# Tag every row with the kind of wine it describes, then stack the white
# rows on top of the red rows in a single frame (original row labels kept).
labeled_white = white_wine.assign(kind='white')
labeled_red = red_wine.assign(kind='red')
wine = pd.concat([labeled_white, labeled_red])

# Peek at a reproducible random sample of the combined data
wine.sample(n=5, random_state=10)

In [None]:
def plot_quality_scores(df, kind):
    """
    Plot a horizontal bar chart of how many wines received each quality score.

    Each bar is annotated with the percentage of all rows in `df` that
    the bar's count represents.

    Parameters:
        - df: The dataframe of wines; must have a `quality` column.
        - kind: The kind of wine (e.g. 'red' or 'white'), used as the
                title prefix. Pass an empty string for no prefix.

    Returns:
        The matplotlib `Axes` object holding the plot.
    """
    # strip() keeps an empty `kind` from leaving a leading space in the title
    title = f'{kind.title()} Wine Quality Scores'.strip()
    ax = df.quality.value_counts().sort_index().plot.barh(
        title=title, figsize=(12, 3)
    )
    # flip the y-axis so the scores read in ascending order from top to bottom
    ax.invert_yaxis()

    total_wines = df.shape[0]
    for bar in ax.patches:
        # place the percentage label at the end of the bar, vertically centered
        ax.text(
            bar.get_width(),
            bar.get_y() + bar.get_height()/2,
            f'{bar.get_width()/total_wines:.1%}',
            verticalalignment='center'
        )

    # label through the Axes we own rather than pyplot's implicit current axes
    ax.set_xlabel('count of wines')
    ax.set_ylabel('quality score')

    for spine in ['top', 'right']:
        ax.spines[spine].set_visible(False)

    return ax

# Plot the score distribution for the combined (red + white) data; the empty
# `kind` means the title carries no wine-type prefix.
plot_quality_scores(wine, '')

In [None]:
# Print pandas' summary of the combined frame
wine.info()

In [None]:
# Summary statistics for the combined frame
wine.describe()

In [None]:
# Count how many rows of each wine kind ended up in the combined frame
wine.kind.value_counts()

## Train-test split

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target from the features without mutating `wine`, so this
# cell can be re-run safely and `wine` keeps its quality column.
wine_y = wine['quality']
wine_X = wine.drop(columns='quality')

# Hold out 25% of the rows for testing, stratifying on the quality score
X_train, X_test, y_train, y_test = train_test_split(
    wine_X, wine_y, test_size=0.25, random_state=0, stratify=wine_y
)

X_train.sample(5, random_state=0)

## Build models
### Random Forest

In [None]:
%%capture
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Scale every column except the last, and one-hot encode the last column
# (`kind`, which was appended to the frame by `assign`).
# NOTE(review): `sparse` was renamed to `sparse_output` in scikit-learn 1.2
# and later removed — confirm against the pinned scikit-learn version.
preprocessor = ColumnTransformer([
    ('scale', StandardScaler(), slice(0, -1)),
    ('encode', OneHotEncoder(sparse=False), [-1])
])
forest = RandomForestClassifier(n_estimators=100, random_state=0)
pipeline = Pipeline([('transformer', preprocessor), ('rf', forest)])

# Candidate tree depths: 5, 10, 15
search_space = {'rf__max_depth': np.arange(5, 20, 5)}

# 5-fold grid search scored on macro-averaged F1
rf = GridSearchCV(pipeline, search_space, scoring='f1_macro', cv=5)
rf.fit(X_train, y_train)

### Gradient Boosted Trees

In [None]:
%%capture
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Scale every column except the last, and one-hot encode the last column
# (`kind`, which was appended to the frame by `assign`).
preprocessor = ColumnTransformer([
    ('scale', StandardScaler(), slice(0, -1)),
    ('encode', OneHotEncoder(sparse=False), [-1])
])
boosted_trees = GradientBoostingClassifier(random_state=0)
pipeline = Pipeline([('transformer', preprocessor), ('gb', boosted_trees)])

# Candidate tree depths: 3, 6, 9
search_space = {'gb__max_depth': np.arange(3, 12, 3)}

# 5-fold grid search scored on macro-averaged F1
gb = GridSearchCV(pipeline, search_space, scoring='f1_macro', cv=5)
gb.fit(X_train, y_train)

### k-NN

In [None]:
%%capture
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Scale every column except the last, and one-hot encode the last column
# (`kind`, which was appended to the frame by `assign`).
preprocessor = ColumnTransformer([
    ('scale', StandardScaler(), slice(0, -1)),
    ('encode', OneHotEncoder(sparse=False), [-1])
])
neighbors = KNeighborsClassifier()
pipeline = Pipeline([('transformer', preprocessor), ('knn', neighbors)])

# Candidate neighbor counts: 1 through 4
search_space = {'knn__n_neighbors': np.arange(1, 5)}

# 5-fold grid search scored on macro-averaged F1
knn = GridSearchCV(pipeline, search_space, scoring='f1_macro', cv=5)
knn.fit(X_train, y_train)

### Logistic Regression

In [None]:
%%capture
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Scale every column except the last, and one-hot encode the last column
# (`kind`, which was appended to the frame by `assign`).
preprocessor = ColumnTransformer([
    ('scale', StandardScaler(), slice(0, -1)),
    ('encode', OneHotEncoder(sparse=False), [-1])
])
# NOTE(review): the `multi_class` argument is deprecated in recent
# scikit-learn releases — confirm against the pinned version.
logistic = LogisticRegression(random_state=0, multi_class='multinomial')
pipeline = Pipeline([('transformer', preprocessor), ('lr', logistic)])

# Candidate values for the `C` regularization parameter
search_space = {'lr__C': [0.1, 0.5, 1, 5]}

# 5-fold grid search scored on macro-averaged F1
lr = GridSearchCV(pipeline, search_space, scoring='f1_macro', cv=5)
lr.fit(X_train, y_train)

### Naive Bayes

In [None]:
%%capture
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Scale every column except the last, and one-hot encode the last column
# (`kind`, which was appended to the frame by `assign`).
preprocessor = ColumnTransformer([
    ('scale', StandardScaler(), slice(0, -1)),
    ('encode', OneHotEncoder(sparse=False), [-1])
])

# No grid search here: the pipeline is fit directly on the training data
nb = Pipeline([('transformer', preprocessor), ('nb', GaussianNB())])
nb.fit(X_train, y_train)

### Determine agreement between the models with Cohen's Kappa

In [None]:
import itertools
from sklearn.metrics import cohen_kappa_score

# Materialize the (name, model) pairs so `models` can be iterated more than
# once; a bare zip is exhausted after a single pass.
models = list(zip(
    ['random forest', 'gradient boosting', 'knn', 'logistic regression', 'naive bayes'],
    [rf, gb, knn, lr, nb]
))

def get_preds(model, test_X_data):
    """Return the predictions `model` makes for `test_X_data`."""
    return model.predict(test_X_data)

# Predict once per model rather than once per pairing
predictions = [(name, get_preds(model, X_test)) for name, model in models]

# Report the agreement between every pair of models
for (model_1_name, preds_1), (model_2_name, preds_2) in itertools.combinations(predictions, 2):
    score = cohen_kappa_score(preds_1, preds_2)
    print(f"Cohen's Kappa between {model_1_name} and {model_2_name} is: {score:.2f}")

## Voting Classifier with Majority Rules

In [None]:
%%capture
from sklearn.ensemble import VotingClassifier

# The four grid-searched models contribute their best estimators; naive
# Bayes was fit directly, so it is passed as-is.
estimators = [
    ('rf', rf.best_estimator_),
    ('gb', gb.best_estimator_),
    ('knn', knn.best_estimator_),
    ('lr', lr.best_estimator_),
    ('nb', nb),
]

# Hard voting with a full vote for each tuned model and half a vote for
# naive Bayes, so it can only tip ties among the other four.
majority_rules = VotingClassifier(estimators, voting='hard', weights=[1, 1, 1, 1, .5])
majority_rules.fit(X_train, y_train)

### Evaluate model

In [None]:
# Score the voting classifier on the held-out test set
majority_rules.score(X_test, y_test)

Get predictions:

In [None]:
# Keep the test-set predictions for the report and confusion matrix below
preds = majority_rules.predict(X_test)

Examine the classification report:

In [None]:
from sklearn.metrics import classification_report
# Compare the true test labels against the voting classifier's predictions
print(classification_report(y_test, preds))

Look at the confusion matrix:

In [None]:
from ml_utils.classification import confusion_matrix_visual

# The third argument is the sorted set of quality scores seen in the full data;
# presumably used as the class labels for the matrix — confirm against ml_utils.
confusion_matrix_visual(y_test, preds, np.sort(wine_y.unique()))

<hr>
<div>
    <a href="./exercise_4.ipynb">
        <button>&#8592; Previous Solution</button>
    </a>
    <a href="../../ch_11/1-EDA_unlabeled_data.ipynb">
        <button style="float: right;">Chapter 11 &#8594;</button>
    </a>
</div>
<hr>