In [1]:
import math
import pandas as pd

from time import perf_counter 
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

## Read dataset

In [2]:
%%time
df_full = pd.read_csv('../data/processed/newsgroup_vectorized.gz', compression='gzip')

Wall time: 3min 22s


## Configure models to test

In [3]:
# Registry of candidate classifiers: display name -> {'model': estimator instance}.
# Filled in by the cells below and iterated during cross-validation.
dict_models = {}

### Gaussian Naive Bayes

In [4]:
# Pin the smoothing value explicitly so the label and the estimator cannot drift
# apart if the library default ever changes. 1e-09 renders as "1e-09", which keeps
# the registry key identical to the one used in previously saved results.
var_smoothing = 1e-09
dict_models[f'Gaussian Naive Bayes (var_smoothing={var_smoothing})'] = {
    'model' : GaussianNB(var_smoothing=var_smoothing)
}

### Multi-layer Perceptron

In [5]:
# Grid over activation function, initial learning rate and hidden-layer width.
# Every candidate has a single hidden layer; only its number of units varies.
# The widest candidate follows the sqrt(n_features) rule of thumb. 'target' is the
# label column (it is dropped from X before training), so it is not counted.
n_features = df_full.shape[1] - 1
for activation in ['relu', 'tanh']:
    for lr in [0.001, 0.01, 0.1]:
        for n_hidden_units in [10, 100, round(math.sqrt(n_features))]:
            # The label keeps the "layers=" wording so keys stay comparable with
            # previously saved metrics, even though the value is a unit count.
            dict_models[f'MLP (layers={n_hidden_units}, lr={lr}, {activation})'] = {
                'model' : MLPClassifier(
                    hidden_layer_sizes=(n_hidden_units, ),
                    activation=activation,
                    learning_rate_init=lr,
                    # Fixed seed: weight initialisation and batch shuffling are
                    # random, so without it two runs give different accuracies.
                    random_state=42,
                )
            }

### Random Forest

In [6]:
# Grid over the per-split feature subsampling rule and the number of trees.
for max_features in ["sqrt", "log2"]:
    for n_estimators in [1, 10, 100, 500, 1000]:
        dict_models[f'Random Forest (n_tree={n_estimators}, max_feat={max_features})'] = {
            'model' : RandomForestClassifier(
                n_estimators=n_estimators,
                max_features=max_features,
                # Fixed seed: bootstrap sampling and feature subsampling are random,
                # so without it the reported accuracies are not reproducible.
                random_state=42,
            )
        }

### KNN

In [7]:
# Candidate neighbourhood sizes: three fixed values plus sqrt(number of rows).
knn_k_values = [5, 27, 81, round(math.sqrt(df_full.shape[0]))]
dict_models.update({
    f'KNN (n_neighbors={k})': {'model' : KNeighborsClassifier(n_neighbors=k)}
    for k in knn_k_values
})

## Stratified K-fold Cross Validation

In [8]:
%%time

skf = StratifiedKFold(n_splits=5, random_state=42)

X = df_full.drop(columns={'target'})
y = df_full[['target']]

fold_count = 1
list_result = []
for train_index, test_index in tqdm(skf.split(X, y)):
    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    y_train = y.iloc[train_index]['target']
    y_test = y.iloc[test_index]['target']
    
    for model_name in dict_models:
        
        # training
        start_time = perf_counter()
        model = dict_models[model_name]['model']
        model.fit(X_train, y_train)
        end_time = perf_counter()
        
        # application
        start_time_app = perf_counter()
        y_hat = model.predict(X_test)
        end_time_app = perf_counter()        

        # append result
        list_result.append({
            'model_name' : model_name,
            'fold' : fold_count,
            'time_train' : round(end_time - start_time, 3),
            'time_application' : round(end_time_app - start_time_app, 3),
            'y_hat' : y_hat,
            'accuracy' : accuracy_score(y_test, y_hat),
        })
        
    fold_count = fold_count + 1

5it [23:48:33, 17142.62s/it]


Wall time: 23h 48min 34s


In [14]:
# Tabulate the collected evaluation records: one row per (model, fold) pair.
df_metrics = pd.DataFrame(data=list_result)

## Save result 

In [11]:
# Persist the per-fold metrics; the prediction arrays are left out of the CSV.
metrics_without_predictions = df_metrics.drop(columns=['y_hat'])
metrics_without_predictions.to_csv('../metrics/models_evaluation.csv', sep=';', index=False)

## Evaluate

### Top 10 models

Average accuracy across all folds.

In [24]:
# Mean accuracy per model across folds, best first, limited to the ten leaders.
(
    df_metrics
    .groupby('model_name')
    .agg({'accuracy' : 'mean'})
    .sort_values('accuracy', ascending=False)
    .head(10)
)

Unnamed: 0_level_0,accuracy
model_name,Unnamed: 1_level_1
"MLP (layers=100, lr=0.001, tanh)",0.843374
"MLP (layers=132, lr=0.001, tanh)",0.843
"MLP (layers=132, lr=0.001, relu)",0.842948
"MLP (layers=100, lr=0.001, relu)",0.84284
"MLP (layers=10, lr=0.001, tanh)",0.823194
"Random Forest (n_tree=1000, max_feat=log2)",0.817243
"Random Forest (n_tree=500, max_feat=log2)",0.816711
"MLP (layers=10, lr=0.001, relu)",0.815386
"MLP (layers=10, lr=0.01, tanh)",0.807417
"MLP (layers=132, lr=0.01, relu)",0.804397


### Best model 

We select the best model overall (highest mean accuracy across all folds), and then the fold on which that model scored highest.

In [35]:
# Rank models by their mean accuracy over the folds; the top index label wins.
mean_accuracy_by_model = df_metrics.groupby('model_name').agg({'accuracy' : 'mean'})
ranked_models = mean_accuracy_by_model.sort_values('accuracy', ascending=False)
best_model = ranked_models.head(1).index[0]

# Among the winner's runs, keep the fold number where it scored highest.
best_model_runs = df_metrics[df_metrics['model_name'] == best_model]
best_fold = best_model_runs.sort_values('accuracy', ascending=False)['fold'].values[0]

In [39]:
# Show the evaluation record of the best model on its best fold.
df_metrics.loc[
    (df_metrics['model_name'] == best_model)
    & (df_metrics['fold'] == best_fold)
]

Unnamed: 0,model_name,fold,time_train,time_application,y_hat,accuracy
44,"MLP (layers=100, lr=0.001, tanh)",2,781.306,0.354,"[alt.atheism, soc.religion.christian, sci.spac...",0.862298


### Error analysis

In [None]:
# Re-run the splitter to recover the true labels of the best fold's test split.
# This only lines up with the stored predictions if skf.split yields the same
# folds, in the same order, as it did during evaluation.
fold_count = 1
for train_index, test_index in skf.split(X, y):
    if fold_count != best_fold:
        fold_count += 1
        continue
    y_test = y.iloc[test_index]['target']
    break

# Pair the true labels with the predictions stored for the best model on that fold.
is_best_run = (df_metrics['model_name'] == best_model) & (df_metrics['fold'] == best_fold)
df_validation = pd.DataFrame({'y': y_test})
df_validation['y_hat'] = df_metrics[is_best_run]['y_hat'].values[0]

In [68]:
# Per-class counts taken from the (y, y_hat) pairs of the best fold:
#   TP - predicted class equals the true class, counted per class
#   FN - misclassified rows, counted by their true class
#   FP - misclassified rows, counted by the class that was wrongly predicted
is_correct = df_validation['y'] == df_validation['y_hat']
df_tp = df_validation[is_correct].groupby('y_hat').size()
df_fn = df_validation[~is_correct].groupby('y').size()
df_fp = df_validation[~is_correct].groupby('y_hat').size()

list_metrics = []
for target_class in df_validation['y'].unique():
    # A class with no hits, no misses or no false alarms has no entry in the
    # corresponding count series, so default to 0 instead of raising KeyError.
    tp = df_tp.get(target_class, 0)
    fp = df_fp.get(target_class, 0)
    fn = df_fn.get(target_class, 0)

    # Precision is undefined for a class that was never predicted (tp + fp == 0);
    # report 0.0 in that case rather than dividing by zero.
    predicted_total = tp + fp
    actual_total = tp + fn
    list_metrics.append({
        'class' : target_class,
        'precision' : round(tp / predicted_total, 2) if predicted_total else 0.0,
        'recall' : round(tp / actual_total, 2) if actual_total else 0.0,
    })

In [71]:
# One row per class, ordered by class name, then written to disk.
df_error = pd.DataFrame(list_metrics)
df_error = df_error.sort_values('class')
df_error.to_csv('../metrics/best_model_error_by_class.csv', sep=';', index=False)

In [72]:
# Display the per-class precision/recall table for the best model's best fold.
df_error

Unnamed: 0,class,precision,recall
0,alt.atheism,0.85,0.89
1,comp.graphics,0.73,0.81
2,comp.os.ms-windows.misc,0.75,0.62
3,comp.sys.ibm.pc.hardware,0.7,0.79
4,comp.sys.mac.hardware,0.78,0.83
5,comp.windows.x,0.87,0.87
6,misc.forsale,0.75,0.81
7,rec.autos,0.91,0.87
8,rec.motorcycles,0.97,0.93
9,rec.sport.baseball,0.97,0.94
