In [1]:
## EDA Standard Library

import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.stats as ss

In [2]:
#ML Library

#ML Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
#ML TrainTest Split
from sklearn.model_selection import train_test_split
#ML Report
from sklearn.metrics import  accuracy_score

In [6]:
# Dataset folder is configurable through the WINE_DATA_DIR environment variable so the
# notebook can run on other machines; it falls back to the original local folder.
DATA_DIR = Path(os.environ.get('WINE_DATA_DIR', '/Users/Dwika/My Projects/DATASETS'))
wine = pd.read_csv(DATA_DIR / 'white_wine.csv')

In [8]:
# Summarize dtypes and non-null counts per column to check for missing values
wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 520 entries, 0 to 519
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         520 non-null    float64
 1   volatile acidity      520 non-null    float64
 2   citric acid           520 non-null    float64
 3   residual sugar        520 non-null    float64
 4   chlorides             520 non-null    float64
 5   free sulfur dioxide   520 non-null    float64
 6   total sulfur dioxide  520 non-null    float64
 7   density               520 non-null    float64
 8   pH                    519 non-null    float64
 9   sulphates             519 non-null    float64
 10  alcohol               519 non-null    float64
 11  quality               519 non-null    float64
dtypes: float64(12)
memory usage: 48.9 KB


In [9]:
# Discard every row that has at least one missing value
wine = wine.dropna(axis=0, how='any')

In [10]:
# Re-inspect the frame to confirm no column has missing values left
wine.info()

<class 'pandas.core.frame.DataFrame'>
Index: 519 entries, 0 to 518
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         519 non-null    float64
 1   volatile acidity      519 non-null    float64
 2   citric acid           519 non-null    float64
 3   residual sugar        519 non-null    float64
 4   chlorides             519 non-null    float64
 5   free sulfur dioxide   519 non-null    float64
 6   total sulfur dioxide  519 non-null    float64
 7   density               519 non-null    float64
 8   pH                    519 non-null    float64
 9   sulphates             519 non-null    float64
 10  alcohol               519 non-null    float64
 11  quality               519 non-null    float64
dtypes: float64(12)
memory usage: 52.7 KB


In [15]:
# Wines whose quality score is at or above this value are labelled 1, all others 0
QUALITY_THRESHOLD = 7

# Vectorized comparison instead of a row-wise apply(lambda ...); the cast keeps the
# label as a 0/1 integer column.
wine['quality_class'] = (wine['quality'] >= QUALITY_THRESHOLD).astype('int64')
# Show how many wines fall into each class
wine['quality_class'].value_counts()

quality_class
0    421
1     98
Name: count, dtype: int64

In [17]:
# Share of each quality class, expressed in percent
wine['quality_class'].value_counts(normalize=True).mul(100)

quality_class
0    81.117534
1    18.882466
Name: proportion, dtype: float64

> The quality classes are imbalanced: about 81% of the wines are class 0 and only about 19% are class 1.

In [18]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column
wine.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,quality_class
count,519.0,519.0,519.0,519.0,519.0,519.0,519.0,519.0,519.0,519.0,519.0,519.0,519.0
mean,6.811368,0.289162,0.338324,6.613102,0.04778,37.204239,146.748555,0.995376,3.210289,0.486936,10.119461,5.805395,0.188825
std,0.730952,0.106464,0.120543,5.308326,0.023028,16.880048,44.402195,0.002615,0.148275,0.101472,1.087835,0.880226,0.391747
min,5.0,0.1,0.0,0.8,0.02,3.0,47.0,0.9899,2.87,0.27,8.5,3.0,0.0
25%,6.3,0.225,0.27,1.6,0.038,25.5,113.0,0.9934,3.11,0.41,9.3,5.0,0.0
50%,6.8,0.27,0.34,5.2,0.045,36.0,147.0,0.9954,3.2,0.48,9.9,6.0,0.0
75%,7.2,0.33,0.4,10.45,0.052,48.0,174.5,0.9974,3.31,0.54,10.7,6.0,0.0
max,10.2,0.905,0.88,22.0,0.346,131.0,313.0,1.0033,3.72,0.85,12.9,8.0,1.0


## Create Schema

In [20]:
#Create Preprocessing Schema

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline


from imblearn.over_sampling import SMOTE

In [23]:
# Preprocessing step: apply RobustScaler to the listed feature columns; any column
# not listed is passed through unchanged.
transform = ColumnTransformer(
    transformers=[
        (
            'Scaler',
            RobustScaler(),
            [
                'fixed acidity',
                'volatile acidity',
                'citric acid',
                'residual sugar',
                'chlorides',
                'free sulfur dioxide',
                'total sulfur dioxide',
                'density',
                'pH',
                'sulphates',
                'alcohol',
            ],
        ),
    ],
    remainder='passthrough',
)

In [24]:
# Display the configured ColumnTransformer as the cell output
transform

## Train Test Split

In [25]:
# Separate features from target.
# Features: every column except the raw quality score and the label derived from it.
x = wine.drop(columns=['quality', 'quality_class'])
# Target: the binary quality label.
y = wine['quality_class']

In [32]:
from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score, RandomizedSearchCV

In [33]:
# Hold-out split with a fixed seed, stratified on y so both pieces keep the same
# class proportions. No test_size is given, so the library default applies.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    stratify=y,
    random_state=2023,
)

In [34]:
# Sanity-check the split: shapes of the train/test features and targets
xtrain.shape, xtest.shape, ytrain.shape, ytest.shape

((389, 11), (130, 11), (389,), (130,))

In [36]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report



## Loop through models with & without imbalance data handling

In [95]:
#Create Pipeline
# Cross-validate every model with every resampling strategy and rank the
# combinations by mean F1 score.

# imblearn's Pipeline is required here: it accepts resamplers as intermediate steps.
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler, NearMiss

# Seed for the stochastic resamplers so the cross-validation scores are repeatable;
# same value as the train/test split.
RESAMPLE_SEED = 2023

smote = SMOTE(random_state=RESAMPLE_SEED)
ROS = RandomOverSampler(random_state=RESAMPLE_SEED)
RUS = RandomUnderSampler(random_state=RESAMPLE_SEED)
NM = NearMiss()  # NearMiss takes no random_state argument

# NOTE(review): dtc, knn, svc, lr, vc and sc are not defined in any cell visible in
# this notebook; they must exist in the kernel before this cell runs. Restore the
# cell that builds them so the notebook survives Restart & Run All.
model_list = [dtc, knn, svc, lr, vc, sc]
resample = [smote, ROS, RUS, NM]

model_names = ['DT', 'KNN', 'SVC', 'LR', 'VC', 'SC']
resample_names = ['SMOTE', 'ROS', 'RUS', 'NM']

# Each label is paired with its object below, so the lists must line up.
assert len(model_names) == len(model_list), "model_names and model_list differ in length"
assert len(resample_names) == len(resample), "resample_names and resample differ in length"

# Raw per-fold F1 scores for every model/resampler combination
model_score = []
# One summary row per combination. Rows are collected here rather than in lists
# named f1_score/std so that sklearn.metrics.f1_score, imported earlier, is not
# overwritten by a list.
score_rows = []

#Loop through the model list
for model_name, model in zip(model_names, model_list):
    for resample_name, resampler in zip(resample_names, resample):
        model_pipe = Pipeline([
            ('prep', transform),
            ('imba', resampler),
            ('algo', model)
        ])

        pipecv = cross_val_score(model_pipe, xtrain, ytrain, cv=5, scoring='f1')
        model_score.append(pipecv)
        # The label is built next to its scores, so it cannot drift out of order
        score_rows.append({
            'Model': model_name + "-" + resample_name,
            'F1 Score': pipecv.mean(),
            'Std': pipecv.std()
        })

# Create Dataframe
# NOTE(review): the recorded output shows F1 scores of 1.0 for many combinations;
# verify that no target information leaks into xtrain before trusting the ranking.
scoresheet = pd.DataFrame(score_rows).sort_values(by='F1 Score', ascending=False)

scoresheet


Unnamed: 0,Model,F1 Score,Std
12,LR-SMOTE,1.0,0.0
13,LR-ROS,1.0,0.0
21,SC-ROS,1.0,0.0
20,SC-SMOTE,1.0,0.0
17,VC-ROS,1.0,0.0
9,SVC-ROS,1.0,0.0
15,LR-NM,1.0,0.0
14,LR-RUS,1.0,0.0
22,SC-RUS,0.993548,0.012903
18,VC-RUS,0.993548,0.012903


## Apply Best Model

In [96]:
# Final pipeline: preprocess with `transform`, resample the training data with SMOTE,
# then fit a logistic regression.
# SMOTE is seeded (same value as the train/test split) so refitting this pipeline
# generates the same synthetic samples each time.
best_model = Pipeline([
    ('prep', transform),
    ('imba', SMOTE(random_state=2023)),
    ('algo', LogisticRegression())
])

In [99]:
#Apply LogReg
# Fit the selected pipeline on the training split, predict the hold-out split and
# print per-class precision/recall/F1.
# NOTE(review): the recorded report is perfect on every metric; check for target
# leakage before relying on it.

pred = best_model.fit(xtrain, ytrain).predict(xtest)
print(classification_report(y_true=ytest, y_pred=pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       105
           1       1.00      1.00      1.00        25

    accuracy                           1.00       130
   macro avg       1.00      1.00      1.00       130
weighted avg       1.00      1.00      1.00       130



1.0