## Python Exercise : Voting Classifier

Analyze data white_wine.csv<br>
- Apply Voting Classifier
    - Target : quality (quality > 6 -> y=1)
    - Features : density, alcohol
- Validate the model using precision, recall and f1 score in 20% testing data
- Apply soft voting classifier method, using these following method:
    - Logistic regression
    - Decision tree : max depth 5
    - KNN : nearest neighbor 3
- Apply soft voting classifier method, using these following method
    - 3rd degree polynomial features + logistic regression
    - Decision tree : max depth 5
    - Standard scaler + knn : nearest neighbor 3

In [1]:
import pandas as pd
import numpy as np

# preprocessing
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, PolynomialFeatures
import category_encoders as ce
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# from imblearn.pipeline import Pipeline

# model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# evaluation
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, f1_score, plot_precision_recall_curve, plot_roc_curve

# visualization
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
white_wine_df = pd.read_csv('data/white_wine.csv')
white_wine_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.0010,3.00,0.45,8.8,6.0
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.9940,3.30,0.49,9.5,6.0
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.9951,3.26,0.44,10.1,6.0
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.40,9.9,6.0
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.40,9.9,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...
515,6.1,0.31,0.26,2.2,0.051,28.0,167.0,0.9926,3.37,0.47,10.4,6.0
516,6.8,0.18,0.37,1.6,0.055,47.0,154.0,0.9934,3.08,0.45,9.1,5.0
517,7.4,0.15,0.42,1.7,0.045,49.0,154.0,0.9920,3.00,0.60,10.4,6.0
518,5.9,0.13,0.28,1.9,0.050,20.0,78.0,0.9918,3.43,0.64,10.8,6.0


In [3]:
white_wine_df.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      1
sulphates               1
alcohol                 1
quality                 1
dtype: int64

In [4]:
white_wine_df['quality'].value_counts()

6.0    232
5.0    167
7.0     82
4.0     18
8.0     16
3.0      4
Name: quality, dtype: int64

In [5]:
white_wine_df.fillna(white_wine_df['alcohol'].mean(), inplace=True)

In [6]:
# white_wine_df.fillna(6, inplace=True)

### Data Splitting

In [7]:
x = white_wine_df[['density','alcohol']]
y = np.where(white_wine_df['quality']>6, 1,0)

x_trainval, x_test, y_trainval, y_test = train_test_split(
    x,
    y,
    stratify=y,
    test_size=0.2,
    random_state=2020
)

### Ensemble 1

In [8]:
logreg = LogisticRegression(solver='liblinear',C=0.01)
tree = DecisionTreeClassifier(max_depth=5)
knn = KNeighborsClassifier(n_neighbors=3)

vc = VotingClassifier([
    ('logreg', logreg),
    ('tree', tree),
    ('knn', knn)
], voting='soft')

In [9]:
vc.fit(x_trainval, y_trainval)

VotingClassifier(estimators=[('logreg',
                              LogisticRegression(C=0.01, solver='liblinear')),
                             ('tree', DecisionTreeClassifier(max_depth=5)),
                             ('knn', KNeighborsClassifier(n_neighbors=3))],
                 voting='soft')

In [10]:
y_pred = vc.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97        84
           1       0.94      0.80      0.86        20

    accuracy                           0.95       104
   macro avg       0.95      0.89      0.92       104
weighted avg       0.95      0.95      0.95       104



### Ensemble 2 (Feature Engineering)

In [11]:
from sklearn.ensemble import RandomForestClassifier

In [12]:
x = white_wine_df[['density','alcohol']]
y = np.where(white_wine_df['quality']>6, 1,0)

x_trainval, x_test, y_trainval, y_test = train_test_split(
    x,
    y,
    stratify=y,
    test_size=0.2,
    random_state=2020
)

In [13]:
poly = PolynomialFeatures(degree=3, interaction_only=False, include_bias=False)
logreg = LogisticRegression(solver='liblinear',C=0.01)
pipe_logreg = Pipeline([
    ('poly', poly),
    ('clf',logreg)
])
tree = DecisionTreeClassifier(max_depth=5)

scaler = StandardScaler()
knn = KNeighborsClassifier(n_neighbors=3)
pipe_knn = Pipeline([
    ('scaler', scaler),
    ('knn',knn)
])

random_forest = RandomForestClassifier()

vc2 = VotingClassifier([
    ('logreg', pipe_logreg),
    ('tree', tree),
    ('knn', pipe_knn),
    ('rf', random_forest)
], voting='soft')

In [14]:
vc2.fit(x_trainval, y_trainval)

VotingClassifier(estimators=[('logreg',
                              Pipeline(steps=[('poly',
                                               PolynomialFeatures(degree=3,
                                                                  include_bias=False)),
                                              ('clf',
                                               LogisticRegression(C=0.01,
                                                                  solver='liblinear'))])),
                             ('tree', DecisionTreeClassifier(max_depth=5)),
                             ('knn',
                              Pipeline(steps=[('scaler', StandardScaler()),
                                              ('knn',
                                               KNeighborsClassifier(n_neighbors=3))])),
                             ('rf', RandomForestClassifier())],
                 voting='soft')

In [15]:
y_pred = vc2.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98        84
           1       0.94      0.85      0.89        20

    accuracy                           0.96       104
   macro avg       0.95      0.92      0.94       104
weighted avg       0.96      0.96      0.96       104



In [16]:
y_pred = vc.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97        84
           1       0.94      0.80      0.86        20

    accuracy                           0.95       104
   macro avg       0.95      0.89      0.92       104
weighted avg       0.95      0.95      0.95       104



## Python Exercise : Random Forest

Analyze data bankloan.csv
- Apply Random Forest:
    - Target : default
    - Features : age, employ, debtinc, creddebt, othdebt
- Splitting ratio 80:20 stratified
- Apply Random Forest n_estimator 20 max features 4 max depth 3
- Compute precision, recall, f1 score in test set

In [17]:
import pandas as pd
import numpy as np

# preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
import category_encoders as ce
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# model
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# evaluation
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, f1_score

# visualisation
import seaborn as sns
import matplotlib.pyplot as plt

In [18]:
bank_loan_df = pd.read_csv('data/bankloan.csv')
bank_loan_df

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,41,3,17,12,176,9.3,11.359392,5.008608,1
1,27,1,10,6,31,17.3,1.362202,4.000798,0
2,40,1,15,14,55,5.5,0.856075,2.168925,0
3,41,1,15,14,120,2.9,2.658720,0.821280,0
4,24,2,2,0,28,17.3,1.787436,3.056564,1
...,...,...,...,...,...,...,...,...,...
695,36,2,6,15,27,4.6,0.262062,0.979938,1
696,29,2,6,4,21,11.5,0.369495,2.045505,0
697,33,1,15,3,32,7.6,0.491264,1.940736,0
698,45,1,19,22,77,8.4,2.302608,4.165392,0


### Data Splitting

In [19]:
x = bank_loan_df[['employ','debtinc','creddebt','othdebt']]
y = bank_loan_df['default']

In [20]:
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    stratify = y,
    test_size=0.2,
    random_state = 2020
)

### Tree 

In [21]:
tree = DecisionTreeClassifier(max_depth=2)
tree.fit(x_train, y_train)

DecisionTreeClassifier(max_depth=2)

In [22]:
skf = StratifiedKFold(n_splits=5)
cv = cross_val_score(tree, x_train,y_train, scoring='f1')
print(cv, cv.mean(), cv.std())

[0.27777778 0.38461538 0.38888889 0.34782609 0.24242424] 0.3283064761325631 0.05856764386721204


In [23]:
y_pred = tree.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.74      0.97      0.84       103
           1       0.40      0.05      0.10        37

    accuracy                           0.73       140
   macro avg       0.57      0.51      0.47       140
weighted avg       0.65      0.73      0.64       140



### Random Forest

In [24]:
rf = RandomForestClassifier(n_estimators=300, max_features=2, max_depth=2, random_state=2020)
rf.fit(x_train, y_train)

RandomForestClassifier(max_depth=2, max_features=2, n_estimators=300,
                       random_state=2020)

In [25]:
skf = StratifiedKFold(n_splits=5)
cv = cross_val_score(rf, x_train,y_train, scoring='f1')
print(cv, cv.mean(), cv.std())

[0.45454545 0.3        0.45       0.27027027 0.4       ] 0.3749631449631449 0.07638165857931667


In [26]:
y_pred = rf.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.79      0.97      0.87       103
           1       0.77      0.27      0.40        37

    accuracy                           0.79       140
   macro avg       0.78      0.62      0.63       140
weighted avg       0.78      0.79      0.75       140



**Kesuksesan ensemble tergantung pada metode2 yg digabungkan.**
1. Random forest bisa meningkatkan performa model, tp tidak selalu harus dicoba2
    - diganti dari max_features harus diperkecil (max_feature dibanding jml feature yg digunakan / x)
    - n_estimator di perbesar
    - max_depth diperkecil agar tidak terlalu overfitting
    - bisa jadi random_state nya lg ambil yg jelek