In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

In [2]:
%%time
df_full = pd.read_csv('../data/processed/newsgroup_vectorized.gz', compression='gzip')

Wall time: 3min 16s


## Split dataset in train, test and validation

In [3]:
%%time
X = df_full.drop(columns={'target'})
y = df_full[['target']]

X, X_test, y, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)

Wall time: 5.15 s


## Stratified K Fold Cross Validation

In [9]:
%%time

skf = StratifiedKFold(n_splits=5, random_state=42)

list_metrics = []
for train_index, val_index in skf.split(X, y):
    X_train = X.iloc[train_index]
    X_val = X.iloc[val_index]
    y_train = y.iloc[train_index]['target']
    y_val = y.iloc[val_index]['target']
    
    model = MLPClassifier()
    model.fit(X_train, y_train)
    
    list_metrics.append(round(accuracy_score(y_val, model.predict(X_val)), 3))   

Wall time: 1h 22min 23s


## Evaluate Metrics 

In [14]:
list_metrics

[0.89, 0.897, 0.893, 0.891, 0.899]

In [18]:
round(np.mean(list_metrics), 2)

0.89

## Validate Model with test set

In [17]:
accuracy_score(y_test, model.predict(X_test))

0.8900690387679235