# Supervised Machine Learning competition
## Multi-class classification

In [None]:
!pip install flaml

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score

from flaml import AutoML
from catboost import CatBoostClassifier, Pool

from hyperopt import hp, fmin, tpe, STATUS_OK, Trials, space_eval

In [None]:
import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

Import data and store in DataFrames

In [None]:
# Load the three competition files: training features, training target, test features.
df_feat_train = pd.read_csv('/kaggle/input/data-challenge-supervised-ml202208/features_train.csv')
df_targ_train = pd.read_csv('/kaggle/input/data-challenge-supervised-ml202208/target_train.csv')
df_feat_test = pd.read_csv('/kaggle/input/data-challenge-supervised-ml202208/features_test.csv')

In [None]:
# Work on copies so the frames loaded from disk stay untouched.
X_train_raw = df_feat_train.copy()
X_test_raw = df_feat_test.copy()

# The 'Id' column is removed from the model inputs; the *_raw frames keep it.
X_train = X_train_raw.drop(columns='Id')
X_test = X_test_raw.drop(columns='Id')

# Target labels.
# NOTE(review): assumes the rows of target_train.csv are in the same order
# as the rows of features_train.csv — confirm.
y_train = df_targ_train['Expected']

Data inspection

In [None]:
# Shapes of the test features, the training features and the training target.
X_test.shape, X_train.shape, y_train.shape

In [None]:
# Print a concise summary (columns, dtypes, non-null counts) of the training features.
X_train.info()

In [None]:
# Descriptive statistics for each training feature.
X_train.describe()

In [None]:
# Distinct per-column missing-value counts; a single value of 0 means no column has missing data.
X_train.isnull().sum().unique()

In [None]:
# Clustered heatmap of the pairwise feature correlations.
# clustermap creates its own figure with several axes (dendrograms, heatmap,
# colour bar), so plt.title() would label whichever axes is current instead
# of the figure as a whole. Put the title on the figure of the returned grid.
corr_grid = sns.clustermap(X_train.corr(), cmap="seismic", figsize=(7, 7), vmin=-1, vmax=1)
corr_grid.fig.suptitle('Cross-correlation of all features', y=1.02)
plt.show()

In [None]:
# Flatten the correlation matrix into one sorted series and drop repeated
# values before plotting the distribution of coefficients.
corr_values = (
    X_train.corr()
    .unstack()
    .sort_values(ascending=False)
    .drop_duplicates()
)

sns.histplot(corr_values)
plt.axvline(0, color='C3', linestyle='--')  # reference line at zero correlation
plt.xlim(-0.05, 0.05)                       # zoom in on the region around zero
plt.title('Distribution of correlation coefficients')
plt.show()

The majority of the features have close to 0 correlation with each other.

## First use a Logistic Regression model as baseline

In [None]:
# Baseline: logistic regression scored with 5-fold cross-validated macro-F1.
log_reg = LogisticRegression(solver='liblinear', random_state=23)

cv_log_reg = cross_val_score(
    log_reg, X_train, y_train, scoring='f1_macro', cv=5
)
print(cv_log_reg)          # score per fold
print(cv_log_reg.mean())   # average over the folds

In [None]:
# Fit on the full training set and evaluate on the same data
# (in-sample metrics, not a held-out estimate).
log_reg.fit(X_train, y_train)
pred_log_reg = log_reg.predict(X_train)

cm_log_reg = confusion_matrix(y_train, pred_log_reg)
print(cm_log_reg)
print(classification_report(y_train, pred_log_reg))

fig, ax = plt.subplots(figsize=(3, 3))
sns.heatmap(cm_log_reg, annot=True, vmin=0, ax=ax)
plt.show()

Logistic regression performs very poorly on this dataset. Let's try to find a better model!

## Use AutoML from FLAML to search for most promising ML model class

In [None]:
# Settings for the first FLAML search over all features.
automl_settings = {
    "time_budget": 300,            # seconds, i.e. 5 min. to find best model
    "metric": 'macro_f1',
    "task": 'classification',
    "log_file_name": 'mylog.log',
    "eval_method": 'cv',
    "n_splits": 5,
}

automl = AutoML()

In [None]:
# Run the search on the full feature set; FLAML receives the target as a NumPy array.
automl.fit(
    X_train=X_train,
    y_train=y_train.values,
    verbose=1,
    **automl_settings,
)

In [None]:
# The search minimises a loss; this cell converts it back to a score as 1 - loss.
best_f1_macro = 1 - automl.best_loss

print(automl.best_estimator)
print(automl.best_config)
print('Best F1_macro score:', best_f1_macro)

CatBoost seems to be the most promising model and performs much better than logistic regression!

### Use a CatBoostClassifier for feature selection

In [None]:
# CatBoost model used to rank the features.
# NOTE(review): early_stopping_rounds is set, but the fit() call in this
# notebook passes no eval_set — presumably early stopping never triggers; confirm.
cat_params = {
    'n_estimators': 180,
    'learning_rate': 0.1,
    'early_stopping_rounds': 10,
    'random_seed': 23,
    'thread_count': -1,
    'verbose': False,
}
model_Cat = CatBoostClassifier(**cat_params)

In [None]:
# Fit on the complete training set with all features (no validation set is passed).
model_Cat.fit(X_train, y_train)

In [None]:
# In-sample evaluation of the CatBoost model trained on all features.
pred_model_Cat = model_Cat.predict(X_train)
print(classification_report(y_train, pred_model_Cat))

cm_model_Cat = confusion_matrix(y_train, pred_model_Cat)
fig, ax = plt.subplots(figsize=(3, 3))
sns.heatmap(cm_model_Cat, annot=True, vmin=0, ax=ax)
plt.show()

In [None]:
# Feature importances of the fitted model, computed on the training data,
# shown as a histogram of the importance scores.
train_pool = Pool(X_train, label=y_train)
list_feat_imp = model_Cat.get_feature_importance(data=train_pool)
sns.histplot(list_feat_imp);

Most features have a feature importance score of close to zero! Let's select only those features that have a score > 1!

In [None]:
# Number of features whose importance score is strictly below 1.
len(list_feat_imp[list_feat_imp<1])

In [None]:
# Number of features whose importance score is strictly above 1.
len(list_feat_imp[list_feat_imp>1])

We are left with 16 relevant features; we eliminate all the other ones.

In [None]:
# Let CatBoost pick 16 of the candidate features; no final model is trained here.
# NOTE(review): the candidate list hard-codes column positions 0..119 —
# presumably the number of columns in X_train; confirm.
candidate_features = list(range(120))

feats = model_Cat.select_features(
    X_train,
    y_train,
    features_for_select=candidate_features,
    num_features_to_select=16,
    train_final_model=False,
    verbose=0,
)

In [None]:
# Keep only the selected columns of the training set (selected by position).
selected_idx = feats['selected_features']
X_train_red = X_train.iloc[:, selected_idx]
X_train_red.shape

In [None]:
# Apply the same positional column selection to the test set.
selected_idx = feats['selected_features']
X_test_red = X_test.iloc[:, selected_idx]
X_test_red.shape

## Use another round of AutoML to find the best model for the reduced dataset

In [None]:
# Second FLAML search, this time on the reduced feature set and with a longer budget.
automl_settings_red = {
    "time_budget": 2400,           # seconds, i.e. 40 min. to find best model
    "metric": 'macro_f1',
    "task": 'classification',
    "log_file_name": 'mylog.log',  # same file name as the first AutoML run
    "eval_method": 'cv',
    "n_splits": 5,
}

automl_red = AutoML()
automl_red.fit(
    X_train=X_train_red,
    y_train=y_train.values,
    verbose=0,
    **automl_settings_red,
)

In [None]:
# Convert the best loss of the reduced-feature search back to a score (1 - loss).
best_f1_macro = 1 - automl_red.best_loss

print(automl_red.best_estimator)
print(automl_red.best_config)
print('Best F1_macro score:', best_f1_macro)

In [None]:
# Show the best configuration of the reduced-feature search as cell output.
automl_red.best_config

In [None]:
# In-sample evaluation of the best AutoML model on the reduced feature set.
pred_automl_red = automl_red.predict(X_train_red)
cf = confusion_matrix(y_train, pred_automl_red)
print(classification_report(y_train, pred_automl_red))

# Plot the confusion matrix of this model's own predictions
# (pred_automl_red), not those of the earlier model_Cat.
fig, ax = plt.subplots(figsize=(3, 3))
sns.heatmap(cf, annot=True, vmin=0, ax=ax)
plt.show()

## Train another CatBoostClassifier with reduced features

As the AutoML approach suggests that a high number of estimators is useful, we increase the `n_estimators` parameter to 5000.

In [None]:
# CatBoost model for the reduced feature set: many trees with a small learning rate.
# NOTE(review): early_stopping_rounds is set, but no eval_set is passed when
# this model is fitted in this notebook — presumably it has no effect; confirm.
model_Cat_red = CatBoostClassifier(
    n_estimators=5000,
    learning_rate=0.025,
    early_stopping_rounds=20,
    random_seed=23,
    thread_count=-1,
    verbose=False,
)

In [None]:
# 5-fold cross-validated macro-F1 of the reduced-feature CatBoost model.
cv_model_Cat_red = cross_val_score(
    model_Cat_red, X_train_red, y_train, scoring='f1_macro', cv=5
)
print(cv_model_Cat_red)          # score per fold
print(cv_model_Cat_red.mean())   # average over the folds

In [None]:
# Fit on the complete reduced training set (no validation set is passed).
model_Cat_red.fit(X_train_red, y_train)

In [None]:
# In-sample evaluation of the reduced-feature CatBoost model.
pred_model_Cat_red = model_Cat_red.predict(X_train_red)
print(classification_report(y_train, pred_model_Cat_red))

cf = confusion_matrix(y_train, pred_model_Cat_red)
fig, ax = plt.subplots(figsize=(3, 3))
sns.heatmap(cf, annot=True, vmin=0, ax=ax)
plt.show()

In [None]:
# Predict the class labels for the reduced test set.
y_pred_Cat_red = model_Cat_red.predict(X_test_red)

In [None]:
# Assemble the prediction table: one predicted label per test-set Id.
# Uses y_pred_Cat_red (the predictions of model_Cat_red); the name
# y_pred_Cat_red2 is never defined in this notebook and raised a NameError.
df_pred_Cat_red = pd.DataFrame(
    {'Predicted': y_pred_Cat_red.ravel()},
    index=pd.Index(X_test_raw['Id'], name='Id'),
)

# Save file to csv
#df_pred_Cat_red.to_csv('../data/raw/df_pred_Cat_red.csv')

In [None]:
# Display the prediction table as cell output.
df_pred_Cat_red

## Check whether a FLAML AutoML ensemble approach yields even better results

In [None]:
# Third FLAML search on the reduced feature set, this time building an ensemble.
automl_ens = AutoML()

automl_settings_ens = {
    "time_budget": 3600,           # seconds, i.e. 60 min. to find best model
    "metric": 'macro_f1',
    "task": 'classification',
    "log_file_name": 'mylog.log',
    "eval_method": 'cv',
    "n_splits": 5,
}

# Pass the settings defined for this run (automl_settings_ens). Passing
# automl_settings_red here would silently apply the 2400 s budget instead.
automl_ens.fit(
    X_train=X_train_red,
    y_train=y_train.values,
    ensemble=True,
    **automl_settings_ens,
    verbose=0,
)

In [None]:
# Show which estimator the ensemble search reports as best.
automl_ens.best_estimator

In [None]:
# Convert the best loss of the ensemble search back to a score (1 - loss).
best_f1_macro = 1 - automl_ens.best_loss
print('Best F1_macro score:', best_f1_macro )

In [None]:
# Display the model object held by the ensemble AutoML instance.
automl_ens.model

In [None]:
# In-sample evaluation of the ensemble model on the reduced feature set.
pred_automl_ens = automl_ens.model.predict(X_train_red)
print(classification_report(y_train, pred_automl_ens))

cf = confusion_matrix(y_train, pred_automl_ens)
fig, ax = plt.subplots(figsize=(3, 3))
sns.heatmap(cf, annot=True, vmin=0, ax=ax)
plt.show()

The ensemble model is much worse than the previous CatBoost model.

## Finetune CatBoost using GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

# Base estimator; the grid below supplies learning_rate and n_estimators.
catboost_tune = CatBoostClassifier(
    thread_count=-1,
    verbose=False,
    random_seed=23,
)

# Candidate values: 2 x 2 = 4 combinations.
grid_search = {
    #'early_stopping_rounds': [10, 20],
    #'min_data_in_leaf': [50, 100],
    'learning_rate': [0.01, 0.02],
    'n_estimators': [5000, 7500],
}

# Exhaustive search scored by macro-F1 with 3-fold cross-validation.
grid_search_obj = GridSearchCV(
    estimator=catboost_tune,
    param_grid=grid_search,
    scoring='f1_macro',
    cv=3,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the grid search on the reduced training set.
grid_search_obj.fit(X_train_red, y_train)

In [None]:
# In-sample evaluation of the best estimator found by the grid search.
best_grid_model = grid_search_obj.best_estimator_
pred_cat_grid = best_grid_model.predict(X_train_red)
print(classification_report(y_train, pred_cat_grid))

cm_cat_grid = confusion_matrix(y_train, pred_cat_grid)
fig, ax = plt.subplots(figsize=(3, 3))
sns.heatmap(cm_cat_grid, annot=True, vmin=0, ax=ax)
plt.show()

In [None]:
# Do not truncate the hyperparameter dictionaries when the table is displayed.
pd.set_option('display.max_colwidth', None)

# One row per parameter combination, with mean, spread and per-fold scores
# (three folds, matching cv=3 in the grid search).
cv_results = grid_search_obj.cv_results_
cv_result_df = pd.DataFrame({
    'Model Rank': cv_results['rank_test_score'],
    'Model Hyperparams': cv_results['params'],
    'Avg CV F1-macro': cv_results['mean_test_score'],
    'Std Dev CV F1-macro': cv_results['std_test_score'],
    'CV Fold 1 F1-macro': cv_results['split0_test_score'],
    'CV Fold 2 F1-macro': cv_results['split1_test_score'],
    'CV Fold 3 F1-macro': cv_results['split2_test_score'],
})

cv_result_df.sort_values(by='Model Rank')

No performance improvement over model_Cat_red.

## Finetune CatBoost via Bayesian hyperparameter tuning using hyperopt

In [None]:
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

In [None]:
# Discrete search space: each hyperparameter is drawn from a fixed list of options.
choice_options = {
    'learning_rate': [0.0075, 0.01, 0.015],
    'n_estimators': [5000, 7500, 10000],
}
space = {
    param: hp.choice(param, options)
    for param, options in choice_options.items()
}

def objective(space):
    """Hyperopt objective: negated mean 3-fold CV macro-F1 of a CatBoost model.

    Parameters
    ----------
    space : dict
        Sampled hyperparameters; must provide 'learning_rate' and
        'n_estimators'.

    Returns
    -------
    dict
        ``{'loss': -f1, 'status': STATUS_OK}`` where ``f1`` is the mean
        macro-F1 over the three folds on ``X_train_red`` / ``y_train``.
    """
    params = dict(
        learning_rate=space['learning_rate'],
        n_estimators=space['n_estimators'],
        min_data_in_leaf=50,
        early_stopping_rounds=20,
        random_seed=23,
        verbose=False,
        thread_count=-1,
    )
    cat_model_opt = CatBoostClassifier(**params)

    fold_scores = cross_val_score(
        cat_model_opt, X_train_red, y_train, cv=3, scoring='f1_macro'
    )
    f1 = fold_scores.mean()

    # The optimiser minimises the loss, so the macro-F1 (higher is better)
    # is returned with a negative sign.
    return {'loss': -f1, 'status': STATUS_OK}

# Keeps a record of every evaluation made during the search.
trials = Trials()

# Tree-structured Parzen Estimator search with 5 evaluations of the objective.
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=5,
    trials=trials,
)
best

In [None]:
# fmin() reports the *index* of the chosen option for every hp.choice entry,
# not the option itself. space_eval() maps those indices back to the values
# declared in `space`, so the option lists need not be duplicated here.
best_params = space_eval(space, best)

# Final model trained on the complete reduced training set with the tuned
# learning rate and number of trees. min_data_in_leaf and
# early_stopping_rounds are not set for this fit.
optimal_cat = CatBoostClassifier(
    learning_rate=best_params['learning_rate'],
    n_estimators=best_params['n_estimators'],
    random_seed=23,
    verbose=False,
    thread_count=-1,
).fit(X_train_red, y_train)

In [None]:
# In-sample evaluation of the tuned CatBoost model.
pred_best_cat = optimal_cat.predict(X_train_red)
print(classification_report(y_train, pred_best_cat))

cm_best_cat = confusion_matrix(y_train, pred_best_cat)
fig, ax = plt.subplots(figsize=(3, 3))
sns.heatmap(cm_best_cat, annot=True, vmin=0, ax=ax)
plt.show()

In [None]:
# Predict the class labels for the reduced test set with the tuned model.
y_pred_Cat_opt = optimal_cat.predict(X_test_red)

In [None]:
# Assemble the submission table (index 'Id', column 'Predicted') and write it to disk.
submission_index = pd.Index(X_test_raw['Id'], name='Id')
df_pred_Cat_opt = pd.DataFrame(
    {'Predicted': y_pred_Cat_opt.ravel()},
    index=submission_index,
)

df_pred_Cat_opt.to_csv('submission.csv')

Similar performance to model_Cat_red

### The model 'optimal_cat' achieved the highest score in the kaggle competition among the models in this notebook.