## Using logistic regression to determine the class of a wine (its quality) based on data from a dataset


In [None]:
import pandas as pd
import seaborn as sns

In [None]:
# Wine-quality table, read straight from its raw GitHub URL.
WINE_CSV_URL = 'https://raw.githubusercontent.com/BriceNW/datasets_and_others/main/Wine_Quality.csv'

df_wine = pd.read_csv(WINE_CSV_URL)
df_wine

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [None]:
# Display summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the wine table.
df_wine.describe()

In [None]:
# Split the wines into two classes, the "good" and the "bad" ones, around the
# dataset's median quality.

# The series is passed by keyword: a positional argument is interpreted
# differently across seaborn releases (as `x` with a deprecation warning in
# older versions, as `data` in newer ones).
sns.boxplot(x=df_wine.quality)

# The boxplot puts the median at roughly 5.5, so the cut is made at 5:
# quality <= 5 is labelled '0', anything above is labelled '1'.
# The labels are strings, not integers.
df_wine['classes'] = df_wine.quality.apply(lambda x: '0' if x <= 5 else '1')

In [None]:
# Bin 'fixed acidity' into three bands after looking at its distribution.

def _acidity_band(value):
    """Return 'low' for value < 8, 'medium' for 8 <= value < 12, else 'high'."""
    if value < 8:
        return 'low'
    if value < 12:
        return 'medium'
    return 'high'


# The column is overwritten in place with string labels, so the work is only
# done while the column is still numeric. Without this guard, running the cell
# a second time would compare strings with integers and raise a TypeError.
if pd.api.types.is_numeric_dtype(df_wine['fixed acidity']):
    # Keyword argument: a positional series is read as `x` or as `data`
    # depending on the seaborn version.
    sns.boxplot(x=df_wine['fixed acidity'])

    df_wine['fixed acidity'] = df_wine['fixed acidity'].apply(_acidity_band)

In [None]:
# Show the wine table again to inspect its current contents.
df_wine

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,low,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,low,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,low,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,medium,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,low,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,low,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,low,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,low,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,low,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [None]:
# 'fixed acidity' now holds non-numeric labels, but the model needs numeric
# inputs, so the column is one-hot encoded with get_dummies.
#
# drop_first=True removes one of the three indicator columns: a 1 in either
# remaining column identifies the band, and a 0 in both means the row belongs
# to the band whose column was dropped.
#
# dtype=int pins the indicators to 0/1 integers; without it, recent pandas
# versions return booleans (True/False) instead.

df_dummies = pd.get_dummies(df_wine['fixed acidity'], drop_first=True, dtype=int)

df_dummies.head()

Unnamed: 0,low,medium
0,1,0
1,1,0
2,1,0
3,0,1
4,1,0


In [None]:
# Drop the columns that are no longer used and append the indicator columns,
# merging the two frames side by side.

columns_to_remove = ['fixed acidity', 'quality']
df_wine2 = pd.concat(
    [df_wine.drop(columns=columns_to_remove), df_dummies],
    axis=1,
)

In [None]:
# Features are every column except the label; the target is the 'classes' label.
X = df_wine2.drop(columns='classes')
y = df_wine2['classes']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Stratified 75/25 train/test split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    stratify=y,
    random_state=16,
)

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
# Fit the standardiser on the training split only; the test split is not used
# to compute the scaling statistics.
scaler = StandardScaler()
scaler.fit(X_train)

In [None]:
# Standardise both the train and the test features with the fitted scaler.
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Creating the logistic regression model

In [None]:
# Logistic regression fitted on the standardised training features, using
# scikit-learn's 'balanced' class-weight mode.
logistic_model = LogisticRegression(class_weight='balanced')
logistic_model.fit(X_train_scaled, y_train)

In [None]:
# Mean accuracy of the balanced logistic model on each split.
train_accuracy = logistic_model.score(X_train_scaled, y_train)
test_accuracy = logistic_model.score(X_test_scaled, y_test)

print('score sur le train: ', train_accuracy)
print('score sur le test: ', test_accuracy)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# The confusion matrices show where the model is wrong, which helps when
# adjusting it: depending on the use case we may want to avoid false positives
# or false negatives and tune the model accordingly.

from sklearn.metrics import confusion_matrix


y_train_pred = logistic_model.predict(X_train_scaled)
y_test_pred = logistic_model.predict(X_test_scaled)

matrix_train = confusion_matrix(y_train, y_train_pred)
matrix_test = confusion_matrix(y_test, y_test_pred)

fig, axes = plt.subplots(ncols=2, figsize=(15, 5))

ax1, ax2 = axes

# One annotated heatmap per split. Rows are the true classes and columns the
# predicted ones. The right-hand panel is the test matrix and is titled as such.
panels = (
    (ax1, matrix_train, 'Matrix Train'),
    (ax2, matrix_test, 'Matrix Test'),
)
for ax, matrix, title in panels:
    sns.heatmap(matrix, annot=True, ax=ax, fmt='d', annot_kws={'size': 14})
    ax.set_xlabel('Prediction')
    ax.set_ylabel('Reality')
    ax.set_title(title)

plt.show()

In [None]:
# Second logistic model: explicit per-class weights instead of the 'balanced' mode.
# The keys are the string labels stored in the 'classes' column.
manual_class_weights = {'0': 0.6, '1': 0.4}

logistic_model2 = LogisticRegression(class_weight=manual_class_weights)
logistic_model2.fit(X_train_scaled, y_train)

In [None]:
# Mean accuracy of the manually weighted logistic model on each split.
train_accuracy = logistic_model2.score(X_train_scaled, y_train)
test_accuracy = logistic_model2.score(X_test_scaled, y_test)

print('score sur le train: ', train_accuracy)
print('score sur le test: ', test_accuracy)

## Decision Tree


In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree, DecisionTreeRegressor

In [None]:
# Fit a decision tree with default settings (the depth can be capped by
# passing max_depth).
# random_state is fixed so that re-running the cell builds the same tree and
# gives the same scores; without it, ties between equally good splits are
# broken randomly.
model_tree = DecisionTreeClassifier(random_state=16)
model_tree.fit(X_train_scaled, y_train)

In [None]:
# Mean accuracy of the decision tree on each split.
train_accuracy = model_tree.score(X_train_scaled, y_train)
test_accuracy = model_tree.score(X_test_scaled, y_test)

print('score sur le train: ', train_accuracy)
print('score sur le test: ', test_accuracy)

score sur le train:  1.0
score sur le test:  0.745


# Here we can see overfitting (the train score is 1.0 while the test score is lower)

We train several trees, varying the maximum depth.


In [None]:
# Fit 9 class-balanced trees with maximum depths 2, 4, ..., 18 and print the
# train and test accuracy of each.
# random_state is fixed so the scores are identical from one run to the next.
# NOTE: model_tree is rebound on every iteration, so after the loop it refers
# to the last (depth-18) tree, not necessarily the best-scoring one.
for p in range(2, 20, 2):
    model_tree = DecisionTreeClassifier(class_weight='balanced', max_depth=p, random_state=16)
    model_tree.fit(X_train_scaled, y_train)
    print("profondeur de l'abre: " ,p)
    print( " Score train: ", model_tree.score(X_train_scaled, y_train))
    print( " Score test: ", model_tree.score(X_test_scaled, y_test))
    print('------')

In [None]:
# Draw the top levels (down to depth 2) of the tree currently bound to
# model_tree, labelling the split nodes with the feature column names.
fig, ax = plt.subplots(figsize=(20, 5))

feature_labels = X.columns.tolist()
plot_tree(model_tree, max_depth=2, feature_names=feature_labels, ax=ax)

In [None]:
# Display the importance of each feature for the tree currently bound to
# model_tree, as a plain Python list. The values follow the column order of
# the training features.

model_tree.feature_importances_.tolist()

In [None]:
# Pair each feature name with its importance in the current tree and show the
# table ordered from most to least important.
df_features = pd.DataFrame({
    'variables': X.columns.tolist(),
    'score': model_tree.feature_importances_.tolist(),
})

df_features.sort_values('score', ascending=False)