In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


data = pd.read_csv('winequality-red.csv', delimiter=';')

X = data.drop('quality', axis=1)
y = data['quality']

#checking for null instances
data = data.fillna(0)
print("\nNull Instances:\n")
print(data.isna().sum())

#Getting the top features from the dataset
top_features = SelectKBest(score_func=chi2, k=11)

dfscores = pd.DataFrame(top_features.fit(X,y).scores_)  #Store predictor scores in a column 
dfcolumns = pd.DataFrame(X.columns)  #Store predictor variable names in a column

#List of features with heaviest weight/importance
predScores = pd.concat([dfcolumns,dfscores],axis=1)
predScores.columns = ['Predictor','Score']   #naming the dataframe columns
print(predScores.nlargest(11,'Score'))       #print top (by score) 10 features

#Drop the bottom two features (smallest score)
data = data.drop('density', axis=1)
data = data.drop('pH', axis=1)

X= data.loc[:, 'fixed acidity':'alcohol']
y= data['quality']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=417)

wine_dataset_file = "winequality-red.csv"

full_df = pd.read_csv(wine_dataset_file, header = 0, delimiter=";")



Null Instances:

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64
               Predictor        Score
6   total sulfur dioxide  2755.557984
5    free sulfur dioxide   161.936036
10               alcohol    46.429892
1       volatile acidity    15.580289
2            citric acid    13.025665
0          fixed acidity    11.260652
9              sulphates     4.558488
3         residual sugar     4.123295
4              chlorides     0.752426
8                     pH     0.154655
7                density     0.000230


In [18]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf = RandomForestClassifier()
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_rf = grid_search.best_estimator_
print("Best Random Forest Parameters: ", best_rf)

y_pred = best_rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("=====================RANDOM FOREST CLASSIFIER=====================\n")
print("Accuracy: ", accuracy)
print('\nClassification Report: \n', report)



Best Random Forest Parameters:  RandomForestClassifier(min_samples_split=5)

Accuracy:  0.6116071428571429

Classification Report: 
               precision    recall  f1-score   support

           3       0.00      0.00      0.00         6
           4       0.00      0.00      0.00        36
           5       0.65      0.76      0.70       461
           6       0.57      0.63      0.60       455
           7       0.64      0.32      0.43       151
           8       0.50      0.09      0.15        11

    accuracy                           0.61      1120
   macro avg       0.39      0.30      0.31      1120
weighted avg       0.59      0.61      0.59      1120



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
# Define the parameter grid for the ANN
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'adaptive'],
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(MLPClassifier(max_iter=1000, random_state=42), param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)
mlp = MLPClassifier(**best_params)
mlp.fit(X_train, y_train)
# Predict the test set
y_pred = mlp.predict(X_test)

# Calculate the evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("=====================ANN CLASSIFIER=====================\n")

# Print the evaluation metrics
print("Accuracy: ", accuracy)
print("Classification Report: \n", report)

# X = full_df.iloc[:, :-1]
# Y = full_df.iloc[:, -1]


# train_score = []
# test_score = []

# # more layers does not improve test data
# layers = list(range(10,50,5))
# for i in layers:

#     mlp = MLPClassifier(**best_params)
#     # scaling / normalizing data helps increase test
#     pipe = make_pipeline(StandardScaler(), mlp)

#     pipe.fit(X_train, y_train)
#     train_score.append(pipe.score(X_train,y_train))
#     test_score.append(pipe.score(X_test, y_test))

# plt.plot(layers,train_score,'.',label = 'train set')
# plt.plot(layers,test_score,'-',label = 'test set')
# plt.xlabel('layers')
# plt.ylabel('score')
# plt.legend()

# y_pred = mlp.predict(X_test)

# # Calculate the evaluation metrics
# accuracy = accuracy_score(y_test, y_pred)
# conf_matrix = confusion_matrix(y_test, y_pred)
# report = classification_report(y_test, y_pred)

# # Print the evaluation metrics
# print("Classification Report: \n", report)

# optimal_index = test_score.index(max(test_score))
# print(f'Best number of hidden nodes: {layers[optimal_index]}, with a test accuracy of {test_score[optimal_index]}')

Fitting 3 folds for each of 96 candidates, totalling 288 fits
Best parameters found:  {'activation': 'tanh', 'alpha': 0.01, 'hidden_layer_sizes': (50,), 'learning_rate': 'constant', 'solver': 'adam'}
Accuracy:  0.5196428571428572
Confusion Matrix: 
 [[  0   0   4   2   0   0]
 [  0   0  28   8   0   0]
 [  0   0 314 147   0   0]
 [  0   0 187 267   1   0]
 [  0   0  34 116   1   0]
 [  0   0   5   6   0   0]]
Classification Report: 
               precision    recall  f1-score   support

           3       0.00      0.00      0.00         6
           4       0.00      0.00      0.00        36
           5       0.55      0.68      0.61       461
           6       0.49      0.59      0.53       455
           7       0.50      0.01      0.01       151
           8       0.00      0.00      0.00        11

    accuracy                           0.52      1120
   macro avg       0.26      0.21      0.19      1120
weighted avg       0.49      0.52      0.47      1120




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
