# VARIOUS TYPE

# 1. Soft Voting

- Analyze data white_wine.csv
- Apply Voting Classifier
    - Target: Quality(quality > 6 --> Y = 1)
    - Features: density alcohol
- Validate the model using precision, recall and f1 score in 20% testing data
- Apply soft voting classifier method, using these following method:
    - Logistic Regression
    - Decision Tree: max depth 5
    - KNN : Nearest negohbor 3
- Apply soft voting classifier method, using these following method
    - 3rd degree polynomial features + logistic regression
    - Decision Tree: Max depth 5
    - Standard scaler + KNN: Nearest neighbor 3

## 1.1 Library

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import classification_report

import warnings
warnings

## 1.2 Data

In [2]:
white_wine = pd.read_csv(r'C:\Users\dheof\Desktop\Help\Purwadhika\Csv_Files\white_wine.csv')
white_wine

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.0010,3.00,0.45,8.8,6.0
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.9940,3.30,0.49,9.5,6.0
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.9951,3.26,0.44,10.1,6.0
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.40,9.9,6.0
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.40,9.9,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...
515,6.1,0.31,0.26,2.2,0.051,28.0,167.0,0.9926,3.37,0.47,10.4,6.0
516,6.8,0.18,0.37,1.6,0.055,47.0,154.0,0.9934,3.08,0.45,9.1,5.0
517,7.4,0.15,0.42,1.7,0.045,49.0,154.0,0.9920,3.00,0.60,10.4,6.0
518,5.9,0.13,0.28,1.9,0.050,20.0,78.0,0.9918,3.43,0.64,10.8,6.0


In [3]:
white_wine.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      1
sulphates               1
alcohol                 1
quality                 1
dtype: int64

In [6]:
white_wine['alcohol'].fillna(white_wine['alcohol'].median(), inplace = True)

In [7]:
white_wine['quality'].fillna(white_wine['quality'].median(), inplace = True)

## 1.3 Data Splitting

In [9]:
y = np.where(white_wine['quality'] > 6, 1, 0)
x = white_wine[['alcohol', 'density']]

In [14]:
pd.DataFrame(y).value_counts()

0    422
1     98
dtype: int64

In [15]:
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    stratify = y,
    test_size = 0.2,
    random_state = 2020
)

## 1.4 Essemble 1 (tanpa feature engineering)

In [18]:
logreg = LogisticRegression(solver = 'liblinear', C = 0.01)
tree = DecisionTreeClassifier(max_depth = 5)
knn = KNeighborsClassifier(n_neighbors = 3)

vc = VotingClassifier([
    ('logistic', logreg),
    ('tree', tree),
    ('knn', knn)
], voting = 'soft')

In [19]:
vc.fit(x_train, y_train)

VotingClassifier(estimators=[('logistic',
                              LogisticRegression(C=0.01, solver='liblinear')),
                             ('tree', DecisionTreeClassifier(max_depth=5)),
                             ('knn', KNeighborsClassifier(n_neighbors=3))],
                 voting='soft')

In [20]:
y_pred = vc.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98        84
           1       1.00      0.85      0.92        20

    accuracy                           0.97       104
   macro avg       0.98      0.93      0.95       104
weighted avg       0.97      0.97      0.97       104



## 1.5 Ensemble 2 (dengan feature engineering)

In [21]:
poly = PolynomialFeatures(degree = 3, interaction_only= False, include_bias = False)
logreg = LogisticRegression(solver = 'liblinear', C = 0.01)
pipe_logreg = Pipeline([
    ('poly', poly),
    ('clf', logreg)
])

tree = DecisionTreeClassifier(max_depth = 5)

scaler = StandardScaler()
knn = KNeighborsClassifier(n_neighbors = 3)
pipe_knn = Pipeline([
    ('scaler', scaler),
    ('knn', knn)
])

vc2 = VotingClassifier([
    ('logistic', pipe_logreg),
    ('tree', tree),
    ('knn', pipe_knn)
], voting = 'soft')

In [22]:
vc2.fit(x_train, y_train)

VotingClassifier(estimators=[('logistic',
                              Pipeline(steps=[('poly',
                                               PolynomialFeatures(degree=3,
                                                                  include_bias=False)),
                                              ('clf',
                                               LogisticRegression(C=0.01,
                                                                  solver='liblinear'))])),
                             ('tree', DecisionTreeClassifier(max_depth=5)),
                             ('knn',
                              Pipeline(steps=[('scaler', StandardScaler()),
                                              ('knn',
                                               KNeighborsClassifier(n_neighbors=3))]))],
                 voting='soft')

In [23]:
y_pred = vc2.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99        84
           1       1.00      0.90      0.95        20

    accuracy                           0.98       104
   macro avg       0.99      0.95      0.97       104
weighted avg       0.98      0.98      0.98       104



In [24]:
y_pred = vc.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98        84
           1       1.00      0.85      0.92        20

    accuracy                           0.97       104
   macro avg       0.98      0.93      0.95       104
weighted avg       0.97      0.97      0.97       104



# SIMILIAR TYPE

# 2. Random Forest

Analyze data bankloan.csv
- Apply Random Forest:
    - Target: Default
    - Features: age, employ, debtinc, creddebt, othdebt
- Splitting ratio 80:20 stratified
- Apply random forest n_estimator 20 max features 4 max depth 3
- Computer precision, recall, f1 score in test set

## 2.1 Library

In [27]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report


import warnings

## 2.2 Data

In [32]:
bank = pd.read_csv(r'C:\Users\dheof\Desktop\Help\Purwadhika\Csv_Files\bankloan.csv')
bank

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,41,3,17,12,176,9.3,11.359392,5.008608,1
1,27,1,10,6,31,17.3,1.362202,4.000798,0
2,40,1,15,14,55,5.5,0.856075,2.168925,0
3,41,1,15,14,120,2.9,2.658720,0.821280,0
4,24,2,2,0,28,17.3,1.787436,3.056564,1
...,...,...,...,...,...,...,...,...,...
695,36,2,6,15,27,4.6,0.262062,0.979938,1
696,29,2,6,4,21,11.5,0.369495,2.045505,0
697,33,1,15,3,32,7.6,0.491264,1.940736,0
698,45,1,19,22,77,8.4,2.302608,4.165392,0


## 2.3 Data Splitting

In [33]:
x = bank[['age', 'employ', 'debtinc', 'creddebt', 'othdebt']]
y = bank['default']

In [34]:
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    stratify = y,
    test_size = 0.2    
)

## 2.4 Tree

In [35]:
tree = DecisionTreeClassifier(max_depth = 3)
tree.fit(x_train, y_train)

DecisionTreeClassifier(max_depth=3)

In [36]:
tree_cv = cross_val_score(tree, x_train, y_train, scoring = 'f1')
print(tree_cv)
print(tree_cv.mean())
print(tree_cv.std())

[0.43478261 0.43478261 0.12121212 0.4        0.54545455]
0.38724637681159424
0.14176465805384922


In [37]:

y_pred = tree.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.95      0.84       103
           1       0.50      0.14      0.21        37

    accuracy                           0.74       140
   macro avg       0.63      0.54      0.53       140
weighted avg       0.69      0.74      0.68       140



## 2.5 Random Forest

In [38]:
rf = RandomForestClassifier(n_estimators = 300, max_features = 2, max_depth = 3)
rf.fit(x_train, y_train)

RandomForestClassifier(max_depth=3, max_features=2, n_estimators=300)

In [39]:
rf_cv = cross_val_score(rf, x_train, y_train, scoring = 'f1')
print(rf_cv)
print(rf_cv.mean())
print(rf_cv.std())

[0.35897436 0.42857143 0.53333333 0.34146341 0.57777778]
0.44802406265820904
0.09357042765409986


In [40]:
y_pred = rf.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.96      0.86       103
           1       0.69      0.24      0.36        37

    accuracy                           0.77       140
   macro avg       0.74      0.60      0.61       140
weighted avg       0.76      0.77      0.73       140

