In [1]:
# Move up one level so the relative paths used later ('src', 'data/cleaned_dataset.csv') resolve.
# NOTE(review): not idempotent - re-running this cell moves up one more directory level.
%cd ..

D:\SoftUni\Machine Learning\mushrooms project


In [2]:
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import CategoricalNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import make_scorer, accuracy_score, recall_score, roc_auc_score 
from sklearn.model_selection import cross_validate

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

from src import functions

In [3]:
# Load the cleaned mushroom dataset; the path is relative to the current working directory.
mushroom_data = pd.read_csv('data/cleaned_dataset.csv')

In [4]:
# Show the loaded frame as the cell output.
mushroom_data

Unnamed: 0,label,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,stalk_shape,stalk_root,stalk_surface_above_ring,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,e,e,s,s,w,w,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,e,c,s,s,w,w,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,e,c,s,s,w,w,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,e,e,s,s,w,w,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,t,e,s,s,w,w,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,e,?,s,s,o,o,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,e,?,s,s,o,o,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,e,?,s,s,o,o,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,t,?,s,k,w,w,w,o,e,w,v,l


### Splitting the dataset
Scikit-learn's "train_test_split" function is used to split the data into train and test subsets. The data is shuffled before the split and stratified by the target variable to preserve its class proportions. Given the size of the dataset and the presence of some rare categories, a 75 / 25 split is used to ensure a test set of decent size and quality.

In [5]:
# Single seed passed to train_test_split, functions.hyperopt_objective and hyperopt's rstate.
RANDOM_SEED = 11

In [6]:
# Feature matrix: every column except the target.
mushroom_attributes = mushroom_data.drop(labels = 'label', axis = 'columns')

In [7]:
# Target vector: the edible ('e') / poisonous ('p') label column.
mushroom_target = mushroom_data.loc[:, 'label']

In [8]:
# 75 / 25 split, stratified on the target so both subsets keep the class proportions.
(
    mushroom_attr_train,
    mushroom_attr_test,
    mushroom_target_train,
    mushroom_target_test,
) = train_test_split(
    mushroom_attributes,
    mushroom_target,
    test_size = 0.25,
    random_state = RANDOM_SEED,
    stratify = mushroom_target,
)

In [9]:
# Shapes of the four subsets produced by the split.
mushroom_attr_train.shape, mushroom_attr_test.shape, mushroom_target_train.shape, mushroom_target_test.shape

((6093, 21), (2031, 21), (6093,), (2031,))

The proportions of the target variable classes have remained the same after the split:

In [10]:
# Class proportions in the full dataset.
mushroom_data['label'].value_counts(normalize = True)

label
e    0.517971
p    0.482029
Name: proportion, dtype: float64

In [11]:
# Class proportions in the training target.
mushroom_target_train.value_counts(normalize = True)

label
e    0.517971
p    0.482029
Name: proportion, dtype: float64

In [12]:
# Class proportions in the test target.
mushroom_target_test.value_counts(normalize = True)

label
e    0.517971
p    0.482029
Name: proportion, dtype: float64

### Encoding
All feature values have to be encoded from "str" to a numerical data type, as required by the ML algorithms. I will use one-hot encoding since the categories within the features are not ranked or hierarchical, thus label encoding would not be appropriate. Because of the presence of some rare categories, handle_unknown = "ignore" and a list containing all possible categories are provided as arguments to the encoder.

In [13]:
# Names of all attribute columns, as a NumPy array.
mushroom_columns = mushroom_attr_train.columns.to_numpy()

In [14]:
# Category vocabulary per feature, in the same order as mushroom_columns.
# It is read from the full attribute frame (train + test) so that rare categories which happen
# to be missing from the training split still get their own one-hot column, i.e. the list really
# contains all possible categories. Only category names are read; the target is not used.
mushroom_categories = [mushroom_attributes[col].unique() for col in mushroom_columns]

In [15]:
# One-hot encoder with a fixed category list per column; values outside that list are
# ignored at transform time instead of raising an error.
one_hot_encoder = OneHotEncoder(categories = mushroom_categories, handle_unknown = 'ignore')

# Apply the encoder to every attribute column.
attr_encoder = ColumnTransformer(
    transformers = [('one-hot', one_hot_encoder, mushroom_columns)]
)

### Algorithms

Algorithm selection is based on the EDA, the characteristics of the dataset and the results of previous research. High values of the evaluation metrics have previously been achieved even with simple algorithms. Indeed, the results of the data analysis showed that some of the features can be of great importance in determining the class of the target variable. On the other hand, the dataset may not be big enough for complex algorithms like XGBoost, which are prone to overfitting. A simple classification algorithm like Logistic Regression can be used initially as a baseline. Applying Random Forest and SVC could raise the performance (w.r.t. the evaluation metric) afterwards if needed. The coefficients of Logistic Regression and the feature importances of Random Forest can be used to interpret which features matter.

### Evaluation and hyperparameter tuning
Stratified k-fold cross-validation is used to assess the performance of the models. This approach is suitable because the dataset is relatively small and does not have enough observations for a separate validation set. For this reason the training set is randomly divided into 5 folds, ensuring that every data point, including those with rare categories, is used in both training and validation.

Distinguishing the toxic mushrooms is of great importance: labeling a poisonous mushroom as edible can be fatal in the real world. For this reason I want to ensure that when the model predicts "edible", the mushroom is actually edible, i.e. that as few poisonous mushrooms as possible are missed (false negatives, with "poisonous" as the positive class). The appropriate evaluation metric in this case is **recall**:

$$ Recall = \frac{TP}{TP + FN} $$

It shows what fraction of all mushrooms that are actually poisonous was retrieved by the model. I will use it as the main evaluation metric to assess the performance of the classification models in the project.

The Hyperopt library is used for hyperparameter tuning. Negative recall is set as the loss for optimization, since the algorithm minimizes the loss. In addition, accuracy and area under the ROC curve (ROC AUC) are observed. An appropriate search space and objective function are defined for each of the algorithms used.

In [16]:
# Scorers used during cross-validation. Recall treats 'p' (poisonous) as the positive class;
# ROC AUC is requested by its built-in scorer name.
eval_metrics = dict(
    recall = make_scorer(recall_score, pos_label = 'p'),
    accuracy = make_scorer(accuracy_score),
    roc_auc = 'roc_auc',
)

#### Logistic Regression

In [17]:
# NOTE(review): dead code - this inline objective is commented out; the tuning cell calls
# functions.hyperopt_objective with classifier = 'LogisticRegression' instead.
# Presumably superseded by that helper - confirm before deleting.
# def logr_objective(params):
#     pipeline = Pipeline(steps = [
#         ('encode', attr_encoder),
#         ('classifier', LogisticRegression(solver = 'liblinear', random_state = RANDOM_SEED))
#     ])

#     pipeline.set_params(
#         classifier__C = params['C'],
#         classifier__penalty = params['penalty']
#     )

#     skf = StratifiedKFold(n_splits = 5)
    
#     scores = cross_validate(pipeline, mushroom_attr_train, mushroom_target_train, cv = skf, scoring = eval_metrics)

#     return {
#         'loss': -np.mean(scores['test_recall']),
#         'status': STATUS_OK,
#         'other_metrics': {
#             'mean_recall': np.mean(scores['test_recall']),
#             'mean_accuracy': np.mean(scores['test_accuracy']),
#             'mean_roc_auc': np.mean(scores['test_roc_auc'])
#         }
#     }

In [20]:
# Search space: penalty type is a categorical choice, C is sampled log-uniformly
# between 1e-3 and 1e2.
logr_search_space = dict(
    penalty = hp.choice('penalty', ['l1', 'l2']),
    C = hp.loguniform('C', np.log(1e-3), np.log(1e2)),
)

In [21]:
# Trial history is collected here so every evaluation can be inspected afterwards.
logr_trials = Trials()

# Arguments of the shared objective that stay the same for every trial.
logr_fixed_args = dict(
    classifier = 'LogisticRegression',
    attr_train = mushroom_attr_train,
    target_train = mushroom_target_train,
    eval_metrics = eval_metrics,
    encoder = attr_encoder,
    random_seed = RANDOM_SEED
)

# 80 TPE-guided evaluations; only `params` changes between trials.
logr_fmin = fmin(
    fn = lambda params: functions.hyperopt_objective(params = params, **logr_fixed_args),
    space = logr_search_space,
    algo = tpe.suggest,
    max_evals = 80,
    trials = logr_trials,
    rstate = np.random.default_rng(RANDOM_SEED)
)

100%|██████████████████████████████████████████████████████████████| 80/80 [00:49<00:00,  1.61trial/s, best loss: -1.0]


In [22]:
# Best point found by fmin. For hp.choice parameters hyperopt reports the index of the
# chosen option (0 = 'l1', 1 = 'l2' in logr_search_space), not the option itself.
logr_fmin

{'C': 68.29736138744246, 'penalty': 1}

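The minimization function returns "C" = 68.29736138744246 and 'penalty' = 1 as the best values of the hyperparameters. For the 'penalty' choice the returned value is the index of the option in the search space, so 1 corresponds to L2.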

The relevant information from each trial is extracted and visualized as pandas dataframe.

In [23]:
# One record per hyperopt trial: the loss, the averaged CV metrics and the sampled
# hyperparameter values as hyperopt stores them.
logr_metrics = [
    {
        'loss': trial['result']['loss'],
        'mean_recall': trial['result']['other_metrics']['mean_recall'],
        'mean_accuracy': trial['result']['other_metrics']['mean_accuracy'],
        'mean_roc_auc': trial['result']['other_metrics']['mean_roc_auc'],
        'params': trial['misc']['vals']
    }
    for trial in logr_trials.trials
]

logr_trials_df = pd.DataFrame(logr_metrics)

In [24]:
# Show the per-trial results table.
logr_trials_df

Unnamed: 0,loss,mean_recall,mean_accuracy,mean_roc_auc,params
0,-0.995574,0.995574,0.997866,0.999953,"{'C': [0.09375928061459672], 'penalty': [1]}"
1,-1.000000,1.000000,1.000000,1.000000,"{'C': [68.29736138744246], 'penalty': [1]}"
2,-1.000000,1.000000,1.000000,1.000000,"{'C': [7.951969983018893], 'penalty': [0]}"
3,-1.000000,1.000000,1.000000,1.000000,"{'C': [4.220650205931255], 'penalty': [0]}"
4,-0.965613,0.965613,0.974396,0.996803,"{'C': [0.004064644237754485], 'penalty': [1]}"
...,...,...,...,...,...
75,-0.997957,0.997957,0.999015,0.999989,"{'C': [0.15577941981187796], 'penalty': [1]}"
76,-1.000000,1.000000,1.000000,1.000000,"{'C': [16.785735046250956], 'penalty': [1]}"
77,-1.000000,1.000000,1.000000,1.000000,"{'C': [23.29286159258232], 'penalty': [1]}"
78,-1.000000,1.000000,1.000000,1.000000,"{'C': [8.806770443443561], 'penalty': [1]}"


There are many combinations of hyperparameters which result in the maximum value of the evaluation metrics. This may be a sign of overfitting. To confirm whether it is occurring, the model will be evaluated on the test set. I suggest that the high values of the evaluation metrics are due to the relatively small size of the dataset, the balanced distribution of the target variable with easily separable classes, and the presence of clear patterns in the data.

From the trials with a perfect score, the combination of hyperparameters with the highest regularization strength (i.e. the smallest C) among those using the L1 penalty is chosen. Since the results of the EDA suggest that some features have low significance w.r.t. the task of predicting the target variable label, Lasso (L1) regularization is applied to keep only the significant ones. Pushing the coefficients of the less important features to zero allows the importance of the features to be interpreted. Since the modelling produces high scores of the evaluation metrics, the regularization strength is chosen to be the maximum available (among trials with the L1 penalty) to reduce possible overfitting.

In [25]:
# Pick the trial to keep with ONE explicit multi-key sort:
#   1. best metrics first (recall, then accuracy, then ROC AUC, all descending);
#   2. ties broken by penalty index ascending (0 = 'l1' comes before 1 = 'l2');
#   3. then by C ascending, i.e. the strongest regularization.
# A single sort does not rely on sort stability across chained sort_values calls,
# which pandas does not guarantee for multi-column sorts.
logr_best_trial = (
    logr_trials_df
    .assign(
        # hyperopt stores every sampled value in a one-element list
        _penalty_idx = logr_trials_df['params'].map(lambda vals: vals['penalty'][0]),
        _c_value = logr_trials_df['params'].map(lambda vals: vals['C'][0])
    )
    .sort_values(
        by = ['mean_recall', 'mean_accuracy', 'mean_roc_auc', '_penalty_idx', '_c_value'],
        ascending = [False, False, False, True, True]
    )
    # helper columns are only needed for ranking; keep the original column layout
    .drop(columns = ['_penalty_idx', '_c_value'])
    .head(1)
)

In [26]:
# Show the single selected trial.
logr_best_trial

Unnamed: 0,loss,mean_recall,mean_accuracy,mean_roc_auc,params
59,-1.0,1.0,1.0,1.0,"{'C': [0.8859574659241594], 'penalty': [0]}"


In [27]:
# C of the selected trial; hyperopt stores each sampled value in a one-element list.
logr_c = logr_best_trial['params'].iloc[0]['C'][0]
logr_c

0.8859574659241594

In [28]:
# Map the selected trial's hp.choice index back to the penalty name instead of hard-coding it,
# so the penalty always belongs to the same trial that logr_c was taken from.
# The list mirrors the options given to hp.choice('penalty', ...) in logr_search_space.
logr_penalty = ['l1', 'l2'][int(logr_best_trial['params'].values[0]['penalty'][0])]

In [16]:
# NOTE(review): dead code - commented-out Random Forest objective. If it is revived, note that as written:
#   - `encoder` is not defined in the visible notebook (the transformer is named `attr_encoder`);
#   - 'loss' is the positive mean recall, so a minimizer would push recall down
#     (the logistic-regression variant negates it);
#   - 'test_precision' / 'test_f1' are not keys of `eval_metrics` (recall, accuracy, roc_auc);
#   - RandomForestClassifier() is created without a random_state.
# def rf_objective(params):
#     pipeline = Pipeline(steps = [
#         ('encode', encoder),
#         ('classifier', RandomForestClassifier())
#     ])

#     pipeline.set_params(
#         classifier__n_estimators = int(params['n_estimators']),
#         classifier__max_depth = int(params['max_depth'])
#     )

#     skf = StratifiedKFold(n_splits = 5)
    
#     scores = cross_validate(pipeline, mushroom_attr_train, mushroom_target_train, cv = skf, scoring = eval_metrics)

#     return {
#         'loss': np.mean(scores['test_recall']),
#         'status': STATUS_OK,
#         'other_metrics': {
#             'mean_accuracy': np.mean(scores['test_accuracy']),
#             'mean_precision': np.mean(scores['test_precision']),
#             'mean_f1': np.mean(scores['test_f1']),
#         }
#     }

In [17]:
# rf_search_space = {
#     'n_estimators': hp.quniform('n_estimators', 20, 100, 20),
#     'max_depth': hp.quniform('max_depth', 5, 8, 1)
# }

In [18]:
# rf_trials = Trials()

# rf_best = fmin(
#     fn = lambda params: rf_objective(params),
#     space = rf_search_space,
#     algo = tpe.suggest,
#     max_evals = 40,
#     trials = rf_trials,
# )

100%|████████████████████████████████████████████████| 40/40 [01:28<00:00,  2.21s/trial, best loss: 0.9899886983731854]


In [19]:
# rf_best

{'max_depth': 5.0, 'n_estimators': 100.0}

In [20]:
# results_df = pd.DataFrame([
#     {
#         'iteration': i,
#         'loss': trial['result']['loss'],
#         'accuracy': trial['result']['other_metrics']['mean_accuracy'],
#         'precision': trial['result']['other_metrics']['mean_precision'],
#         'f1': trial['result']['other_metrics']['mean_f1'],
#         **trial['misc']['vals']
#     }
#     for i, trial in enumerate(rf_trials.trials)
# ])

In [42]:
# results_df

Unnamed: 0,iteration,loss,accuracy,precision,f1,max_depth,n_estimators
0,0,0.992286,0.992286,0.992406,0.992283,[5.0],[60.0]
1,1,1.0,1.0,1.0,1.0,[7.0],[60.0]
2,2,0.992286,0.992286,0.992414,0.992283,[5.0],[20.0]
3,3,0.993435,0.993435,0.993542,0.993432,[6.0],[60.0]
4,4,1.0,1.0,1.0,1.0,[8.0],[40.0]
5,5,0.998687,0.998687,0.998692,0.998687,[6.0],[20.0]
6,6,1.0,1.0,1.0,1.0,[8.0],[20.0]
7,7,0.991301,0.991301,0.991454,0.991298,[5.0],[80.0]
8,8,0.997867,0.997867,0.997886,0.997867,[6.0],[80.0]
9,9,1.0,1.0,1.0,1.0,[8.0],[100.0]


In [32]:
# best_params

{'LogisticRegression': {'C': 1.4333552786419543},
 'RandomForest': {'max_depth': 8.0, 'n_estimators': 70.0},
 'SVC': {'C': 2.890974943886087, 'kernel': 0}}