In [None]:
from ucimlrepo import fetch_ucirepo, list_available_datasets

# check which datasets can be imported
#list_available_datasets()

# import dataset
wine_dataset = fetch_ucirepo(id=186)

In [None]:
wine_dataset.data.features

In [None]:
wine_dataset.data.targets

In [None]:
y = wine_dataset.data.targets
X = wine_dataset.data.features

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from pprint import pprint
pprint(wine_dataset.metadata.abstract)

In [None]:
pprint(wine_dataset.metadata)

Note:
This dataset has an additional column where you can see if the wine was red or white, could be a great binary classification problem.  Right now we will focus on predicting the quality of the wine from 0-10 which makes this a multi-classification problem.

In [None]:
# No missing values
X.isnull().sum()

In [None]:
print(f'Size of full dataset:{X.shape}')
print(f'Size of training dataset:{X_train.shape}')

In [None]:
import pandas as pd
X_full = pd.concat([X_train,y_train],axis=1)
X_full.groupby('quality').describe().transpose()

In [None]:
print(y.value_counts())

In [None]:
y_train.value_counts()

### Notes:
- The dataset is imbalanced, with the majority of the data being of quality 5 and 6.
- The dataset has no missing values.
- The dataset has 11 features and 1 target variable.
- Fitting a model to this dataset would not predict the quality of the wine well, as the dataset is imbalanced.The model would predict the majority class most of the time.


In [None]:
## Lets look at the distribution of features per class
import seaborn as sns
import matplotlib.pyplot as plt
plt.figure(figsize=(10,5))
sns.boxplot(data=X_full, x='quality', y='fixed_acidity',
            showmeans=True, meanprops={"marker":"o",
                                        "markerfacecolor":"white", 
                                        "markeredgecolor":"black"},
                                        showfliers=False, palette='viridis')
plt.title('Fixed Acidity vs Quality')


No outliershow means are inflated because plot excludes outliers but uses them to calculate the means in the data.
Wine acicity decreases with quality of the wine,

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

fig, axes = plt.subplots(nrows=6, ncols=2, figsize=(10,20))
i = 0;j = 0
for feature in X_full.columns[:-1]:
    sns.boxplot(data=X_full,x='quality', y=feature,
                showmeans=True, meanprops={"marker":"o",
                                            "markerfacecolor":"white", 
                                            "markeredgecolor":"black"},
                                            showfliers=False, palette='viridis',
                                            ax=axes[i,j])
    axes[i,j].set_title(f'{feature} vs Quality')
    
    if i < 5:
        i += 1
    else:
        j += 1
        i = 0

plt.tight_layout()
plt.show()

In [None]:
import numpy as np

sns.heatmap(X_train.corr(), annot = True, mask = np.triu(X.corr()), 
            cmap='viridis', fmt='.2f')            
plt.title('Pearson Correlation Heatmap')

### Notes:

- PH, residual_sugar, sulphates and total sulfur dioxide seems unaffected by quality based on mean, although there are variable variance mostly due to variation in sample sizes per category.
- Correlation between free_sulfur dioxide and total_sulfur which may indicate that total sulfur is a measurement that might involve in itself the addition of free_sulfur (redundant variable).
- Negative correlation between density and alcohol "strong" after quality 5 and above. No strong relationship found in other variables. 
- These findings would be similar to what would be found under an ANOVA test.

In [None]:
X_train.drop('free_sulfur_dioxide', axis=1, inplace=True)
X_test.drop('free_sulfur_dioxide', axis=1, inplace=True)

## Distribution of features
fig, axes = plt.subplots(nrows=5, ncols=2, figsize=(10,20))
i = 0;j = 0
for feature in X_train.columns:
    sns.histplot(data=X_train,x=feature, kde=True, ax=axes[i,j], color='blue')
    axes[i,j].set_title(f'{feature} Distribution')
    axes[i,j].set_xlabel('') #remove x label

    #Update axes
    if i < 4:
        i += 1
    else:
        j += 1
        i = 0

Some features have a right skew so lets apply a log transformation to shift its distribution to a more normal one.  
Then I will apply a standarization,  before doing the downstream analysis I would like to do feature selection and move forward with the best features for the model, this to prevent overfitting.

In [None]:
import numpy as np

#Before transformation of features I will create a copy of the original dataset
X_train_original = X_train.copy()
X_test_original = X_test.copy()


features_to_scale = ['volatile_acidity','chlorides','residual_sugar','density','pH','sulphates','alcohol']
for f in features_to_scale:
    X_train['log_'+f] = np.log(X_train[f])
    X_test['log_'+f] = np.log(X_test[f])

X_train.drop(features_to_scale, axis=1, inplace=True)
X_test.drop(features_to_scale, axis=1, inplace=True)    

## Distribution of features
fig, axes = plt.subplots(nrows=5, ncols=2, figsize=(10,20))
i = 0;j = 0
for feature in X_train.columns:
    sns.histplot(data=X_train, x=feature, kde=True, ax=axes[i,j], color='blue')
    axes[i,j].set_title(f'{feature} Distribution')
    axes[i,j].set_xlabel('') #remove x label

    #Update axes
    if i < 4:
        i += 1
    else:
        j += 1
        i = 0


In [None]:
import numpy as np

sns.heatmap(X_train.corr(), annot = True, mask = np.triu(X_train.corr()), 
            cmap='viridis', fmt='.2f')            
plt.title('Vars Pearson Correlation Heatmap')

In [None]:
## Feature Selection
# LETS standarize the data prior to feature selection
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
y_train

In [None]:
## Save the dataset
import os
os.makedirs('Data', exist_ok=True)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_train_scaled['quality'] = y_train
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)
X_test_scaled['quality'] = y_test
X_train_scaled.to_csv('Data/X_train_scaled.csv', index=False)
X_test_scaled.to_csv('Data/X_test_scaled.csv', index=False)

In [None]:
## Feature Selection using Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1, class_weight='balanced')
clf.fit(X_train, y_train)

# Feature importance
importances = clf.feature_importances_
std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)
indices = np.argsort(importances)[::-1]



In [None]:
#gini_importance
importances

In [None]:
top4 = X_train.columns[indices][:4]
top4

In [None]:
import pandas as pd
#Impurity based importance
forest_importances = pd.Series(importances, index=X_train.columns)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()

before training... I think that re-classifying the target variable is a good idea, due to the imbalance of the classes. I will classify the wines as good or bad based on the quality. 
- Wines with a quality of 3-4 will be classified as quality low
- Wines with a quality of 5-6 will be classified as quality medium
- Wines with a quality of 7-9 will be high

In [None]:
y_train.value_counts()

In [None]:
type(y_train)
y_train = y_train.astype('str')
y_test = y_test.astype('str')

In [None]:
from collections import Counter
Counter(np.array(y_train_mapped).flatten())

In [None]:
## Lets now deal with the imbalanced dataset using SMOTE -- Synthetic Mibirotic Over Sampling Technique

from imblearn.over_sampling import SMOTE
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

oversample = SMOTE(k_neighbors=2)

X_resampled, y_train_resampled = oversample.fit_resample(X_train[top4], y_train)

print(f'Original dataset shape {y_train.value_counts()}')
print(f'Resampled dataset shape {y_train_resampled.value_counts()}')

## Multi-class classification
Try:
1. Multi-class AdaBoost
2. CatBoost
3. XGBoost
4. LightGBM
5. Random Forest
6. SVM
7. Neural Network, etc.

In [None]:
## Fit models on the resampled dataset
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

remap_dict = {'3':'low',
            '4':'low', 
            '5':'medium',
            '6':'medium',
            '7':'medium',
            '8':'high',
            '9':'high'}

y_train_mapped = y_train_resampled.replace(remap_dict)

# Initialize the encoder
encoder = LabelEncoder()

# Fit and transform the categorical columns
y_train_encoded = encoder.fit_transform(y_train_mapped.values)
y_test_encoded = encoder.fit_transform(y_test.replace(remap_dict))

for n in [50,100,200]:
    clf = RandomForestClassifier(n_estimators=n, random_state=0, n_jobs=-1, class_weight='balanced')
    clf.fit(X_resampled, y_train_encoded)
    y_pred = clf.predict(X_test[top4])

    print(f'Classification report for {n} estimators')
    print(classification_report(y_test_encoded, y_pred))
    print((f'Accuracy score for {n} estimators: %s') % (accuracy_score(y_test_encoded, y_pred)))


In [None]:
### Multi-Layer Perceptron
from sklearn.neural_network import MLPClassifier

for activation in ['tanh', 'relu','logistic']:
    clf = MLPClassifier(hidden_layer_sizes=(50,), max_iter=1000, random_state=0,
                        solver='adam', activation=activation)
    clf.fit(X_resampled, y_train_encoded)
    y_predNN = clf.predict(X_test[top4])

    print(f"Accuracy with activation: {activation}  is {round(accuracy_score(y_test_encoded, y_predNN))}")


## CatBoost
- CatBoost is a gradient boosting library that is designed for handling categorical data. It is based on decision trees and is designed to work with categorical data. It is a powerful library that can handle categorical data without the need for one-hot encoding.
- Reduces time in parameter tunning 
- Reduction of overfitting with gradient boosting on decision trees.
- Incorporation of various types of data, doesnt need to be from a homogeneous source
- Training data limited whereas deep learning requires a lot of data.
 - **boosting methods sequentially learn from weak classifiers  with the goal of minimizing errors from preceding classifiers.**

In [1]:
## CatBoost
from catboost import CatBoostClassifier
CatModel = CatBoostClassifier()

ModuleNotFoundError: No module named 'catboost'