# Model selection for a classification Machine Learning project. <a href="https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data">UCI ML Wine Data Set</a></h1>

## Imports

In [54]:
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from plotly import express as px

## Data analysis

### Database load and analysis

In [67]:
RANDOM_STATE = 1
data = load_wine(as_frame = True)

df = pd.DataFrame(data['data'])
target = data['target']
feature_names = data['feature_names']
target_classes = data['target_names']
num_features = len(df.columns)
num_observations = len(df)

print(f'DataFrame info ({num_observations} observations)')
print()
print(df.info())
print()
print('Null feature values:', df.isna().any().any())
print('Possible target values:', target_classes)


DataFrame info (178 observations)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alcohol                       178 non-null    float64
 1   malic_acid                    178 non-null    float64
 2   ash                           178 non-null    float64
 3   alcalinity_of_ash             178 non-null    float64
 4   magnesium                     178 non-null    float64
 5   total_phenols                 178 non-null    float64
 6   flavanoids                    178 non-null    float64
 7   nonflavanoid_phenols          178 non-null    float64
 8   proanthocyanins               178 non-null    float64
 9   color_intensity               178 non-null    float64
 10  hue                           178 non-null    float64
 11  od280/od315_of_diluted_wines  178 non-null    float64
 12  proline                      

### Feature analysis

In [56]:
print('Feature analysis:')
df.describe()

Feature analysis:


Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258
std,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474
min,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0
25%,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5
50%,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5
75%,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0
max,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0


## Data preparation

### PCA dimension reduction

In [57]:
pca_scaler = StandardScaler()
X_scaled = pca_scaler.fit_transform(df.values)

pca = PCA()
pca.fit(X_scaled)
variance_ratio = pca.explained_variance_ratio_

pca_features_cum_dict = {}

for i in range(len(feature_names)):
    pca_features_cum_dict[i] = sum(variance_ratio[:i +1])

threshold = 0.9
var_cum_sum = 0
pca_id = 0

for i in range(len(pca_features_cum_dict)):
    if var_cum_sum <= threshold:
        var_cum_sum = list(pca_features_cum_dict.values())[i]
        pca_id = i -1

key_features = feature_names[:pca_id]

### Variance ratio

In [58]:
fig = px.bar(x = key_features, y = variance_ratio[:pca_id])
fig.add_bar(x = feature_names[pca_id:], y = variance_ratio[pca_id:], name ='excluded')

fig.update_layout(
    title = 'Variance ratio per feature (scaled)',
    plot_bgcolor = 'rgba(0,0,0,0)',
    font = dict(family = 'sans-serif'))

fig.update_xaxes(title = '')
fig.update_yaxes(title = '')
fig.update_traces(hovertemplate = '<b>Feature:</b> %{x}<br><b>Variance ratio:</b> %{y:.3f}')
fig.show()

### Data split

In [59]:
X = df.iloc[:, :pca_id]
test_size = 0.3

X_train, X_test, y_train, y_test = train_test_split(
    X, target,
    test_size = test_size,
    random_state = RANDOM_STATE,
    stratify = target)

print(f'Array lengths (test size: {test_size}, random state: {RANDOM_STATE}):')
print('--------------------------------------------------------------------------------')
print('Training features:\t', len(X_train))
print('Training target:\t', len(y_train))
print('Testing features:\t', len(X_test))
print('Testing target:\t\t', len(y_test))

Array lengths (test size: 0.3, random state: 1):
--------------------------------------------------
Training features:	 124
Training target:	 124
Testing features:	 54
Testing target:		 54


### Standard scaling

In [60]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Models

### Hyperparameter tuning function

In [61]:
def hyperparameter_tune(model_name:str, hyperparameters:dict) -> dict:
    
    scores = {
        f'{hyperparameter}': {
            'values': [],
            'scores': [],
            'fig' : []
        } for hyperparameter in hyperparameters
    }
    
    for hyperparameter, values in hyperparameters.items():
        
        for value in values:
            if model_name == 'knn':
                model = KNeighborsClassifier(**{hyperparameter:value})
                
            elif model_name == 'ridge':
                model = RidgeClassifier(**{hyperparameter:value})
            
            elif model_name == 'forest':
                model = RandomForestClassifier(**{hyperparameter:value}, random_state = RANDOM_STATE)
        
            model.fit(X_train_scaled, y_train)
            scores[hyperparameter]['scores'].append(model.score(X_test_scaled, y_test))
            
        fig = px.line(x = values, y = scores[hyperparameter]['scores'])
        
        fig.update_layout(
            title = f'<b>{model_name.title()}</b>. Score value by {hyperparameter}',
            font = dict(family = 'sans-serif'),
            plot_bgcolor = 'rgba(0,0,0,0)')

        fig.update_traces(hovertemplate='<b>Hyperparameter:</b> %{x}<br><b>Score:</b> %{y:.2f}')
        fig.update_xaxes(visible = False)
        fig.update_yaxes(visible = False)
        
        scores[hyperparameter]['fig'].append(fig)
        scores[hyperparameter]['values'].extend(values)
        
    return scores

### K Nearest Neighbors

In [62]:
n_neighbors = [1, 2, 3, 5, 7, 9, 10, 11, 13]

knn = hyperparameter_tune(
    model_name = 'knn',
    hyperparameters = {'n_neighbors': n_neighbors}
)

knn_fig = knn['n_neighbors']['fig'][0]
knn_scores = knn['n_neighbors']['scores']
knn_max_score = max(knn_scores)

knn_fig.show()
print('Max score:', round(knn_max_score, 3))

Max score: 0.889


### Ridge Classifier

In [63]:
alpha = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

ridge = hyperparameter_tune(
    model_name = 'ridge',
    hyperparameters = {'alpha': alpha}
)

ridge_fig = ridge['alpha']['fig'][0]
ridge_scores = ridge['alpha']['scores']
ridge_max_score = max(ridge_scores)

ridge_fig.show()
print('Max score:', round(ridge_max_score, 3))

Max score: 0.889


### Random Forest Classifier

In [64]:
max_depth = [5, 10, 20, 40, 80, 100]
n_estimators = [10, 100, 500, 800]

rndf = hyperparameter_tune(
    model_name = 'forest',
    hyperparameters = {
        'max_depth': max_depth,
        'n_estimators': n_estimators
    }
)

rndf_max_depth_fig = rndf['max_depth']['fig'][0]
rndf_max_depth_scores = rndf['max_depth']['scores']
rndf_max_depth_max_score = max(rndf_max_depth_scores)

rndf_n_estimators_fig = rndf['n_estimators']['fig'][0]
rndf_n_estimators_scores = rndf['n_estimators']['scores']
rndf_n_estimators_max_score = max(rndf_n_estimators_scores)

rndf_n_estimators_fig.show()
rndf_max_depth_fig.show()
print('Max score (max_depth):', round(rndf_max_depth_max_score, 3))
print('Max score (n_estimators):', round(rndf_n_estimators_max_score, 3))

Max score (max_depth): 0.907
Max score (n_estimators): 0.907


### Final model

In [70]:
n_estimators = 500
model = RandomForestClassifier(n_estimators = n_estimators)
model.fit(X_train_scaled, y_train)
score = round(model.score(X_test_scaled, y_test), 3)

print('Chosen model: Random Forest Classifier')
print(f'Key hyperparameter: n_estimators = {n_estimators}')
print('Model mean accuracy:', score)

Chosen model: Random Forest Classifier
Key hyperparameter: n_estimators = 500
Model mean accuracy: 0.907
