# Module 03

## Session 08 Ensemble Methods

Analyze data white_wine.csv
* Apply voting classifier
    - target: quality (quality > 6 ---> y = 1)
    - features: density alcohol
* Validate the model using precision, recall and f1 score in 20% testing data
* Apply soft voting classifier method using these following methods:
    - Logistic regression
    - Decision tree: max depth 5
    - Knn: nearest neighbor 3
* Apply soft voting classifier method using these following methods:
    - 3rd degree polynomial features + logistic regression
    - Decision tree: max depth 5
    - Standard scaler + knn: nearest neighbor 3


# Library

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import classification_report

# Data

In [5]:
wine = pd.read_csv('./datasets/winequality-white.csv', delimiter=';')
wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [6]:
wine['alcohol'].fillna(wine['alcohol'].mean(), inplace=True)

In [7]:
y = np.where(wine['quality'] > 6, 1, 0)
X = wine[['alcohol', 'density']]

# Data Splitting

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=2020
)

# Model

## Ensemble 1

In [11]:
lr = LogisticRegression(solver='liblinear', C = 0.01)
dt = DecisionTreeClassifier(max_depth=5)
knn = KNeighborsClassifier(n_neighbors=3)

vc = VotingClassifier(
    [
        ('logistic', lr),
        ('tree', dt),
        ('knn', knn)
    ],
    voting='soft'
)

In [12]:
vc.fit(X_train, y_train)

VotingClassifier(estimators=[('logistic',
                              LogisticRegression(C=0.01, solver='liblinear')),
                             ('tree', DecisionTreeClassifier(max_depth=5)),
                             ('knn', KNeighborsClassifier(n_neighbors=3))],
                 voting='soft')

In [13]:
y_pred = vc.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.97      0.89       768
           1       0.66      0.20      0.30       212

    accuracy                           0.80       980
   macro avg       0.74      0.58      0.60       980
weighted avg       0.78      0.80      0.76       980



## Ensemble 2

In [14]:
poly = PolynomialFeatures(degree=3, interaction_only=False, include_bias=False)
lr = LogisticRegression(solver='liblinear', C = 0.01)
pipe_lr = Pipeline(
    [
        ('poly', poly),
        ('clf', lr)
    ]
)

dt = DecisionTreeClassifier(max_depth=5)

scaler = StandardScaler()
knn = KNeighborsClassifier(n_neighbors=3)
pipe_knn = Pipeline(
    [
        ('scaler', scaler),
        ('knn', knn)
    ]
)

vc2 = VotingClassifier(
    [
        ('logistic', pipe_lr),
        ('tree', dt),
        ('knn', pipe_knn)
    ],
    voting='soft'
)

In [15]:
vc2.fit(X_train, y_train)

VotingClassifier(estimators=[('logistic',
                              Pipeline(steps=[('poly',
                                               PolynomialFeatures(degree=3,
                                                                  include_bias=False)),
                                              ('clf',
                                               LogisticRegression(C=0.01,
                                                                  solver='liblinear'))])),
                             ('tree', DecisionTreeClassifier(max_depth=5)),
                             ('knn',
                              Pipeline(steps=[('scaler', StandardScaler()),
                                              ('knn',
                                               KNeighborsClassifier(n_neighbors=3))]))],
                 voting='soft')

In [16]:
y_pred = vc2.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.94      0.88       768
           1       0.59      0.31      0.40       212

    accuracy                           0.80       980
   macro avg       0.71      0.62      0.64       980
weighted avg       0.78      0.80      0.78       980

