# Wine Quality

## Importing Libraries

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
import pandas as pd
import numpy as np

## Importing Dataset

In [6]:
# Load the semicolon-separated white-wine table. Every column except the last
# one forms the feature matrix X; the last column is the target vector y.
dataset = pd.read_csv("dataset/winequality-white.csv", sep = ';')
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

## Choose implementation

In [8]:
# Selects how the target is modelled:
#   0               -> binary labels: 1 ('good') when the quality score is above 6.5, else 0 ('bad').
#   any other value -> the original quality scores are used as the class labels.
implementation = 0

# Always start from the raw quality column (the last column of `dataset`) so that
# re-running this cell gives the same labels. Thresholding labels that were
# already binarized would turn every one of them into 0.
quality_scores = dataset.iloc[:, -1].values
if implementation == 0:
    # Vectorized comparison instead of a per-element Python loop.
    y = (quality_scores > 6.5).astype(int)
else:
    y = quality_scores

In [9]:
# Preview the feature matrix (every dataset column except the last one).
print(X)

[[ 7.    0.27  0.36 ...  3.    0.45  8.8 ]
 [ 6.3   0.3   0.34 ...  3.3   0.49  9.5 ]
 [ 8.1   0.28  0.4  ...  3.26  0.44 10.1 ]
 ...
 [ 6.5   0.24  0.19 ...  2.99  0.46  9.4 ]
 [ 5.5   0.29  0.3  ...  3.34  0.38 12.8 ]
 [ 6.    0.21  0.38 ...  3.26  0.32 11.8 ]]


In [10]:
# Preview the target labels that are split into train/test sets below.
print(y)

[0 0 0 ... 0 1 0]


## Split data into training and testing datasets

In [12]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.25,
    random_state = 0,
)

## Feature Scaling

In [14]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply that same fitted
# scaler to both the training and the testing features.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

## Define the Machine Learning models

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

## Hyperparameter tuning function for RandomForestClassifier.
def rfc(random_state = 0):
    """Grid-search a random forest and return its predictions for the test set.

    Reads the notebook-level ``X_train``, ``y_train`` and ``X_test`` arrays.

    Parameters
    ----------
    random_state : int or None, default 0
        Seed handed to ``RandomForestClassifier`` so that repeated runs build
        the same forests and therefore report the same scores. Pass ``None``
        to get unseeded behaviour.

    Returns
    -------
    The predictions made for ``X_test`` by the best estimator found by the
    grid search. Note that this is an array of predictions, not a model.
    """
    from sklearn.model_selection import GridSearchCV

    # 4 * 5 * 4 = 80 parameter combinations, each evaluated with 3-fold cross-validation.
    param_grid = {
        'n_estimators': [50, 100, 200, 300],
        'max_depth': [None, 10, 20, 30, 40],
        'min_samples_split': [2, 5, 10, 20]
    }
    grid_search = GridSearchCV(
        estimator = RandomForestClassifier(random_state = random_state),
        param_grid = param_grid,
        cv = 3,
        n_jobs = -1,
    )
    grid_search.fit(X_train, y_train)

    # Predict with the best candidate found by the search.
    best_rf_model = grid_search.best_estimator_
    y_pred_rf = best_rf_model.predict(X_test)

    return y_pred_rf

# Maps a display name to the object evaluated under that name.
# Every value is an unfitted estimator except "Random Forest Classifier": its value is
# whatever rfc() returns (test-set predictions), and the grid search runs right here
# while the dictionary is being built.
models = {
    "Logistic Regression": LogisticRegression(),
    # NOTE(review): the neighbour count is tied to the number of distinct labels in y
    # (2 in binary mode). An even k can produce tied votes -- confirm this is intended.
    "K-Neighbours Classifier": KNeighborsClassifier(n_neighbors = len(np.unique(y)), metric = 'minkowski', p = 2, n_jobs = -1),
    "Support Vector Classifier": SVC(C = 1, gamma = 0.1),
    "Gaussian Naive Bayes": GaussianNB(),
    # Seeded so that the fitted tree, and therefore its scores, are repeatable across runs.
    "Decision Tree Classifier": DecisionTreeClassifier(criterion = 'entropy', random_state = 0),
    "Random Forest Classifier": rfc()
}

## Define metrics to measure the efficiency of each model

In [18]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Collects one summary row (a dict) per entry of `models`.
results = []
for name, model in models.items():
    if hasattr(model, "predict"):
        # Estimator entry: train on the training split, then predict the test rows.
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
    else:
        # The entry already holds test-set predictions (e.g. the output of rfc()),
        # so it is scored directly. Checking for a `predict` method rather than
        # comparing against a hard-coded display name keeps this working if the
        # dictionary key is renamed or another precomputed entry is added.
        y_pred = model

    # f1, precision and recall are averaged over the classes with average = 'weighted'.
    scores = {
        "accuracy": accuracy_score(y_test, y_pred),
        "f1-Score": f1_score(y_test, y_pred, average = 'weighted'),
        "precision": precision_score(y_test, y_pred, average = 'weighted'),
        "recall": recall_score(y_test, y_pred, average = 'weighted'),
    }

    # Every metric is rounded to four decimals for the summary table.
    results.append({
        "model": name,
        **{metric: round(value, 4) for metric, value in scores.items()},
    })

df_results = pd.DataFrame(results)

In [19]:
# Show the per-model metric table collected in df_results.
print(df_results)

                       model  accuracy  f1-Score  precision  recall
0        Logistic Regression    0.8000    0.7687     0.7702  0.8000
1    K-Neighbours Classifier    0.8433    0.8259     0.8320  0.8433
2  Support Vector Classifier    0.8212    0.7950     0.8028  0.8212
3       Gaussian Naive Bayes    0.7363    0.7557     0.8008  0.7363
4   Decision Tree Classifier    0.8098    0.8136     0.8185  0.8098
5   Random Forest Classifier    0.8710    0.8617     0.8646  0.8710


In [20]:
# Initializes dictionary in order to reveal the best model implementation.
best_model = {
    'model': '',
    'accuracy': 0.0
}

for index, row in df_results.iterrows():
    accuracy = row['accuracy']
    model = row['model']
    
    if best_model['accuracy'] < accuracy:
        best_model['model'] = model
        best_model['accuracy'] = accuracy
        
print(f"Best model is {best_model['model']} with accuracy {round(best_model['accuracy'] * 100, 2)}%")

Best model is Random Forest Classifier with accuracy 87.1%
