In [388]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report,make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE

In [389]:
df = pd.read_csv("Wine_Quality_Data.csv")

In [390]:
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()

df['color'] = label_encoder.fit_transform(df['color'])

df['color'].unique()

array([0, 1])

In [391]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed_acidity         6497 non-null   float64
 1   volatile_acidity      6497 non-null   float64
 2   citric_acid           6497 non-null   float64
 3   residual_sugar        6497 non-null   float64
 4   chlorides             6497 non-null   float64
 5   free_sulfur_dioxide   6497 non-null   float64
 6   total_sulfur_dioxide  6497 non-null   float64
 7   density               6497 non-null   float64
 8   pH                    6497 non-null   float64
 9   sulphates             6497 non-null   float64
 10  alcohol               6497 non-null   float64
 11  quality               6497 non-null   int64  
 12  color                 6497 non-null   int32  
dtypes: float64(11), int32(1), int64(1)
memory usage: 634.6 KB


In [392]:
df.drop_duplicates()
df.drop('total_sulfur_dioxide',axis=1,inplace=True)
df.drop('density',axis=1,inplace=True)


In [393]:
y= df['quality'].apply(lambda x: 1 if x>=7 else 0)
df.drop('quality',axis=1,inplace=True)


In [394]:
from imblearn.under_sampling import EditedNearestNeighbours
strategy = {0 : 0.6, 1 : 'not minority'}
undersample = EditedNearestNeighbours()

# Undersample the data
X_undersampled, y_undersampled = undersample.fit_resample(df, y)
y_undersampled.value_counts(normalize=True)

quality
0    0.746476
1    0.253524
Name: proportion, dtype: float64

Feature selection with randomForest and RFE

In [395]:
estimator = RandomForestClassifier(random_state=42)
selector = RFE(estimator, n_features_to_select=7, step=1)
selector = selector.fit(X_undersampled, y_undersampled)

In [396]:
selected_features_indices = selector.get_support(indices=True)
selected_features_names = X_undersampled.columns[selected_features_indices]
print(selected_features_names)

Index(['volatile_acidity', 'citric_acid', 'residual_sugar', 'chlorides',
       'free_sulfur_dioxide', 'pH', 'alcohol'],
      dtype='object')


In [397]:
df = df[selected_features_names]


In [398]:
# df.drop(['fixed_acidity','free_sulfur_dioxide','pH','color','sulphates','quality','citric_acid','density','residual_sugar'],axis=1,inplace=True)

In [399]:
df.head()

Unnamed: 0,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,pH,alcohol
0,0.7,0.0,1.9,0.076,11.0,3.51,9.4
1,0.88,0.0,2.6,0.098,25.0,3.2,9.8
2,0.76,0.04,2.3,0.092,15.0,3.26,9.8
3,0.28,0.56,1.9,0.075,17.0,3.16,9.8
4,0.7,0.0,1.9,0.076,11.0,3.51,9.4


In [400]:
from imblearn.under_sampling import EditedNearestNeighbours
undersample = EditedNearestNeighbours()

# Undersample the data
X_undersampled, y_undersampled = undersample.fit_resample(df, y)
y_undersampled.value_counts(normalize=True)

quality
0    0.748126
1    0.251874
Name: proportion, dtype: float64

In [401]:
X_train, X_test, y_train, y_test= train_test_split(X_undersampled,y_undersampled,test_size=0.2,random_state=42)

In [402]:
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.fit_transform(X_test)

In [403]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train,y_train)

In [404]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
print('Accuracy:', accuracy)
print("f1 score :", f1_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

Accuracy: 0.9063116370808678
f1 score : 0.810379241516966
              precision    recall  f1-score   support

           0       0.91      0.97      0.94       736
           1       0.91      0.73      0.81       278

    accuracy                           0.91      1014
   macro avg       0.91      0.85      0.87      1014
weighted avg       0.91      0.91      0.90      1014



In [405]:
hyperparameters = {
    'n_estimators': [50, 100, 200],
    'max_depth': [ 5, 10, 20],
    'min_samples_split': [2, 3, 4],
}

In [406]:
scoring = make_scorer(f1_score)
#perform cross validation on the model with the hyperparam grid
grid_search = GridSearchCV(model, hyperparameters,cv=5,scoring=scoring)
grid_search.fit(X_train, y_train)

print('Best Hyperparams : \n', grid_search.best_params_)
y_pred5 =grid_search.predict(X_test)
print('\nClassification Report:\n', classification_report(y_test,y_pred5))

Best Hyperparams : 
 {'max_depth': 20, 'min_samples_split': 3, 'n_estimators': 200}

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.97      0.94       736
           1       0.91      0.72      0.81       278

    accuracy                           0.91      1014
   macro avg       0.91      0.85      0.87      1014
weighted avg       0.91      0.91      0.90      1014



In [407]:
y_pred34 = model.predict(X_test)

In [408]:
model = grid_search.best_estimator_

In [409]:
import pickle

filename = 'model.sav'
pickle.dump(model, open(filename, 'wb'))

In [410]:
loaded_model = pickle.load(open('model.sav','rb'))

In [412]:
X_test_df = pd.DataFrame(X_test)
y_test_df = pd.DataFrame(y_test, columns=['quality'])

test_data = pd.concat([X_test_df, y_test_df], axis=1)

test_data.to_csv('test_data.csv', index=False)