The following code shows the classification of the wine quality (low, medium, high) and wine type (red or white) based on the various predictors. 

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 


In [3]:
# load the data
df = pd.read_csv("all_wines.csv")
# print(df.head(2))
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         6497 non-null   float64
 1   volatile acidity      6497 non-null   float64
 2   citric acid           6497 non-null   float64
 3   residual sugar        6497 non-null   float64
 4   chlorides             6497 non-null   float64
 5   free sulfur dioxide   6497 non-null   float64
 6   total sulfur dioxide  6497 non-null   float64
 7   density               6497 non-null   float64
 8   pH                    6497 non-null   float64
 9   sulphates             6497 non-null   float64
 10  alcohol               6497 non-null   float64
 11  quality               6497 non-null   int64  
 12  winetype              6497 non-null   object 
 13  wine-quality          6497 non-null   object 
dtypes: float64(11), int64(1), object(2)
memory usage: 710.7+ KB
None


In [4]:
# change wine quality to categorical variable
df['wine-quality'] = pd.Categorical(df['wine-quality'], categories=['low','medium', 'high'])
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   fixed acidity         6497 non-null   float64 
 1   volatile acidity      6497 non-null   float64 
 2   citric acid           6497 non-null   float64 
 3   residual sugar        6497 non-null   float64 
 4   chlorides             6497 non-null   float64 
 5   free sulfur dioxide   6497 non-null   float64 
 6   total sulfur dioxide  6497 non-null   float64 
 7   density               6497 non-null   float64 
 8   pH                    6497 non-null   float64 
 9   sulphates             6497 non-null   float64 
 10  alcohol               6497 non-null   float64 
 11  quality               6497 non-null   int64   
 12  winetype              6497 non-null   object  
 13  wine-quality          6497 non-null   category
dtypes: category(1), float64(11), int64(1), object(1)
memory 

In [9]:
# select only the features
allwine_features = df.iloc[:,:-3]
print(allwine_features.head(3))

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            6.1              0.22         0.38             2.8      0.144   
1            5.7              0.29         0.16             7.9      0.044   
2            6.9              0.26         0.27             4.2      0.031   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 12.0                  65.0  0.99080  2.95       0.64   
1                 48.0                 197.0  0.99512  3.21       0.36   
2                 20.0                  80.0  0.99089  3.12       0.39   

   alcohol  
0     11.4  
1      9.4  
2     11.5  


In [10]:
type_of_wine = df['winetype']
quality_of_wine = df['wine-quality']

Next, we attempt to predict the tpe of wine and the quality of wine using the various classifier models. The best model will be selected using the GridSearch.

In [12]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline

In [13]:
X_train, X_test, y_train, y_test = train_test_split(allwine_features, type_of_wine, test_size=0.2, random_state=42)

In [15]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [16]:
# Create a pipeline with scaler and classifier
pipeline = Pipeline([
    ('classifier', None)
])
# Define parameter grid for each classifier
param_grid = [
    {
        'classifier': [LogisticRegression()],
        'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
        'classifier__max_iter':[150,300,500,1000]
    },
    {
        'classifier': [SVC()],
        'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
        'classifier__gamma': [0.001, 0.01, 0.1, 1, 10, 100]
    },
    {
        'classifier': [RandomForestClassifier()],
        'classifier__n_estimators': [10, 50, 100, 200]
    }
]


In [17]:
# Grid search for the best classifier
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_scaled, y_train)


In [18]:
# Get the best classifier
best_classifier = grid_search.best_estimator_
print("Best Classifier:", best_classifier.named_steps['classifier'])

Best Classifier: SVC(C=10, gamma=0.01)


In [21]:
# Evaluate the best classifier
y_pred = best_classifier.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", round(accuracy,3))

Accuracy: 0.994


In [25]:
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

    red-wine       0.99      0.98      0.99       313
  white-wine       0.99      1.00      1.00       987

    accuracy                           0.99      1300
   macro avg       0.99      0.99      0.99      1300
weighted avg       0.99      0.99      0.99      1300



In [26]:
#Next, we print the accuracy, confusion matrix, and cross validation score for all the classifiers.
from sklearn.model_selection import cross_val_score
classifiers = [
    ('Logistic Regression', LogisticRegression()),
    ('SVM', SVC()),
    ('Random Forest', RandomForestClassifier())
]

for name, clf in classifiers:
    print("\n Classifier: ",name)
    clf.fit(X_train_scaled,y_train)
    preds = clf.predict(X_test_scaled)

    cvscores = cross_val_score(clf, X_train_scaled,y_train,cv=5,scoring='accuracy')

    print("**********")
    print(f"Accuracy of model {name}: {accuracy_score(y_test,preds)}")
    print("Classification Report for: ", name)
    print(classification_report(y_test, preds))

    print("********")
    print("Cross validation report for: ", name)
    print(f"scores: {cvscores}, Average accuracy: {np.mean(cvscores)}")


 Classifier:  Logistic Regression
**********
Accuracy of model Logistic Regression: 0.9930769230769231
Classification Report for:  Logistic Regression
              precision    recall  f1-score   support

    red-wine       0.99      0.98      0.99       313
  white-wine       0.99      1.00      1.00       987

    accuracy                           0.99      1300
   macro avg       0.99      0.99      0.99      1300
weighted avg       0.99      0.99      0.99      1300

********
Cross validation report for:  Logistic Regression
scores: [0.99038462 0.99038462 0.99422522 0.99518768 0.99518768], Average accuracy: 0.9930739616495151

 Classifier:  SVM
**********
Accuracy of model SVM: 0.9946153846153846
Classification Report for:  SVM
              precision    recall  f1-score   support

    red-wine       0.99      0.98      0.99       313
  white-wine       0.99      1.00      1.00       987

    accuracy                           0.99      1300
   macro avg       0.99      0.99    

Next, we do the same for wine quality.

In [27]:
X_train, X_test, y_train, y_test = train_test_split(allwine_features, quality_of_wine, test_size=0.2, random_state=42)


In [28]:
grid_search.fit(X_train_scaled, y_train)


In [29]:
best_classifier = grid_search.best_estimator_
print("Best Classifier:", best_classifier.named_steps['classifier'])

Best Classifier: RandomForestClassifier()


In [30]:
# Evaluate the best classifier
y_pred = best_classifier.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", round(accuracy,3))

Accuracy: 0.808


In [31]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        high       0.94      0.35      0.51        43
         low       0.79      0.72      0.75       464
      medium       0.82      0.89      0.85       793

    accuracy                           0.81      1300
   macro avg       0.85      0.65      0.70      1300
weighted avg       0.81      0.81      0.80      1300



In [32]:
for name, clf in classifiers:
    print("\n Classifier: ",name)
    clf.fit(X_train_scaled,y_train)
    preds = clf.predict(X_test_scaled)

    cvscores = cross_val_score(clf, X_train_scaled,y_train,cv=5,scoring='accuracy')

    print("**********")
    print(f"Accuracy of model {name}: {accuracy_score(y_test,preds)}")
    print("Classification Report for: ", name)
    print(classification_report(y_test, preds))

    print("********")
    print("Cross validation report for: ", name)
    print(f"scores: {cvscores}, Average accuracy: {np.mean(cvscores)}")b


 Classifier:  Logistic Regression
**********
Accuracy of model Logistic Regression: 0.7238461538461538
Classification Report for:  Logistic Regression
              precision    recall  f1-score   support

        high       0.00      0.00      0.00        43
         low       0.68      0.58      0.63       464
      medium       0.74      0.84      0.79       793

    accuracy                           0.72      1300
   macro avg       0.48      0.48      0.47      1300
weighted avg       0.70      0.72      0.71      1300

********
Cross validation report for:  Logistic Regression
scores: [0.71153846 0.69903846 0.70837344 0.71799808 0.70644851], Average accuracy: 0.7086793884652403

 Classifier:  SVM


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**********
Accuracy of model SVM: 0.75
Classification Report for:  SVM
              precision    recall  f1-score   support

        high       0.00      0.00      0.00        43
         low       0.72      0.64      0.68       464
      medium       0.76      0.85      0.81       793

    accuracy                           0.75      1300
   macro avg       0.49      0.50      0.50      1300
weighted avg       0.72      0.75      0.73      1300

********
Cross validation report for:  SVM
scores: [0.7375     0.73942308 0.74013474 0.74494706 0.73917228], Average accuracy: 0.7402354334789368

 Classifier:  Random Forest


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**********
Accuracy of model Random Forest: 0.806923076923077
Classification Report for:  Random Forest
              precision    recall  f1-score   support

        high       0.93      0.33      0.48        43
         low       0.78      0.73      0.75       464
      medium       0.82      0.88      0.85       793

    accuracy                           0.81      1300
   macro avg       0.84      0.64      0.69      1300
weighted avg       0.81      0.81      0.80      1300

********
Cross validation report for:  Random Forest
scores: [0.80192308 0.81442308 0.79307026 0.80076997 0.79307026], Average accuracy: 0.8006513289405494
