In [19]:
# Standard-library, plotting, scikit-learn and imbalanced-learn imports
import os
import csv
import matplotlib.pyplot as plt
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline as make_imb_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import wandb  # Optional for experiment tracking
# Other Python library imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from custom_functions import MY_SEED
from sklearn.model_selection import StratifiedShuffleSplit

from custom_functions import save_submission_file, save_model_config, load_model_config, get_model_names, get_pipeline, extract_pipeline_config, best_model, CONFIG_FILE, MY_SEED, get_training_splits, STEP_CLASS_MAP

In [1]:
# Scratch arithmetic: difference between two values
# (presumably two evaluation scores being compared — TODO confirm their source).
0.80256 - 0.79644

0.006120000000000014

In [5]:
# Locations of the cleaned feature tables (relative paths).
train_data_file = '../data/processed/train_feature_clean.csv'
test_data_file = '../data/processed/test_feature_clean.csv'

# Read both tables in one pass; order matches (train, test).
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_data_file, test_data_file))

# Model inputs are every column from the fourth onward; the first three
# columns are skipped (presumably identifiers / the label — TODO confirm).
features = df_train.columns[3:]

# The same feature columns are selected from both tables as plain arrays.
X_train, X_test = df_train[features].values, df_test[features].values

# Encode the 'activity' labels as integers; `le` is kept so predictions can be
# mapped back to the original labels with `le.inverse_transform`.
le = LabelEncoder()
y_train = le.fit_transform(df_train['activity'])

In [4]:
# Four untuned pipeline templates ("shells"), all seeded with MY_SEED:
#   1. scaler -> MLP
#   2. scaler -> PCA -> MLP
#   3. scaler -> SMOTE -> MLP
#   4. scaler -> SMOTE -> PCA -> MLP
#
# Shells 1 and 2 contain only transformers, so scikit-learn's `make_pipeline`
# is sufficient.
pipeline_shell_1 = make_pipeline(
    StandardScaler(),
    MLPClassifier(random_state=MY_SEED),
)

pipeline_shell_2 = make_pipeline(
    StandardScaler(),
    PCA(random_state=MY_SEED),
    MLPClassifier(random_state=MY_SEED),
)

# SMOTE is a sampler (it implements `fit_resample`, not `transform`), so it is
# rejected as an intermediate step of a scikit-learn Pipeline. Shells 3 and 4
# therefore have to be built with imbalanced-learn's pipeline, which resamples
# during `fit` only and leaves prediction-time data untouched.
pipeline_shell_3 = make_imb_pipeline(
    StandardScaler(),
    SMOTE(random_state=MY_SEED),
    MLPClassifier(random_state=MY_SEED),
)

pipeline_shell_4 = make_imb_pipeline(
    StandardScaler(),
    SMOTE(random_state=MY_SEED),
    PCA(random_state=MY_SEED),
    MLPClassifier(random_state=MY_SEED),
)

In [28]:
# --- Search spaces from earlier tuning rounds (kept for reference) ----------
# Each grid targets the `mlpclassifier` step. The later two were previously
# both assigned to `param_grid` and silently overwritten by the active grid
# below, so each now has its own name.
baseline_params = {
    'mlpclassifier__hidden_layer_sizes': [(64,), (128,)],
    'mlpclassifier__alpha': [0.0001, 0.001],
    'mlpclassifier__learning_rate_init': [0.001, 0.01],
    'mlpclassifier__max_iter': [500],
    'mlpclassifier__activation': ['relu']
}  # mlpclassifier1

param_grid_mlp2 = {
    'mlpclassifier__hidden_layer_sizes': [(128,), (64, 64), (128, 64)],
    'mlpclassifier__alpha': [0.01, 0.1],
    'mlpclassifier__learning_rate': ['constant', 'invscaling', 'adaptive'],
}  # mlpclassifier2

param_grid_mlp3 = {
    'mlpclassifier__hidden_layer_sizes': [(128, 128), (64, 64), (64, 64, 64)],
    'mlpclassifier__activation': ['logistic', 'tanh', 'relu', 'identity']
}  # mlpclassifier3

# MLPClassifier keyword arguments grouped by the round labels above
# (presumably the values selected in each round — TODO confirm).
# Commented-out entries are settings that are currently left at their defaults.
config = {
    'mlpclassifier1': {
        'max_iter': 500,
        'learning_rate_init': 0.01,
        # 'hidden_layer_sizes': (128,),
        # 'alpha': 0.001,
        'activation': 'relu'
    },
    'mlpclassifier2': {
        'learning_rate': 'adaptive',
        # 'hidden_layer_sizes': (64, 64),
        'alpha': 0.01
    },
    'mlpclassifier3': {
        'hidden_layer_sizes': (64, 64),
        # 'activation': 'tanh'
    }
}

# --- Active search ----------------------------------------------------------
# The grid below tunes `smote__k_neighbors`, so the pipeline must contain a
# SMOTE step; without it GridSearchCV.fit raises an "invalid parameter" error.
# imbalanced-learn's pipeline is required because SMOTE is a sampler.
# SMOTE and PCA are seeded so repeated runs give the same resampling/projection.
pipeline = make_imb_pipeline(
    StandardScaler(),
    SMOTE(k_neighbors=5, random_state=MY_SEED),
    PCA(n_components=40, random_state=MY_SEED),
    # The MLP currently uses default settings; to apply tuned values, unpack
    # e.g. **config['mlpclassifier1'] into the constructor.
    MLPClassifier(random_state=MY_SEED),
)

# Candidate neighbour counts for SMOTE: 2, 3, 4, 5.
param_grid = {
    'smote__k_neighbors': np.arange(2, 6)
}

# Two stratified splitters; only `stratified_sss` is used by the active search.
stratified_cv = StratifiedKFold(shuffle=True, random_state=MY_SEED)
stratified_sss = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=MY_SEED)

# Alternative (randomized) searches that were tried with the MLP grids:
# param_search = RandomizedSearchCV(pipeline, param_grid, n_iter=5, scoring='f1_macro', cv=stratified_cv, random_state=MY_SEED)
# param_search = RandomizedSearchCV(pipeline, param_grid, n_iter=10, scoring='f1_macro', cv=stratified_sss, random_state=MY_SEED)

# Exhaustive search over `param_grid`, scored by macro-averaged F1.
param_search = GridSearchCV(pipeline, param_grid, scoring='f1_macro', cv=stratified_sss)

In [21]:
# Run the hyper-parameter search: the pipeline is fitted on every CV split for
# each candidate in the grid (the recorded run shows roughly 30 s mean fit time
# per candidate), so this is the slow cell of the notebook.
param_search.fit(X_train, y_train)

In [23]:
# Show, for every candidate, the SMOTE neighbour count that was tried, the mean
# cross-validated score and the mean fit time (one printed array per key).
for results_key in ('param_smote__k_neighbors', 'mean_test_score', 'mean_fit_time'):
    print(param_search.cv_results_[results_key])

[2 3 4 5]
[0.97540698 0.97556525 0.976488   0.97926128]
[28.99611457 28.90452234 33.39288354 31.23563488]


In [24]:
# Display the parameter combination with the best mean cross-validated score.
param_search.best_params_

{'smote__k_neighbors': 5}

In [None]:
# Rebind `pipeline` to the search's best estimator (GridSearchCV refits it on
# the full training data by default, since `refit` is not overridden).
# NOTE(review): this cell shows no execution count, so in the recorded session
# `pipeline` may still have been the unfitted pipeline when it was saved — confirm.
pipeline = param_search.best_estimator_

In [29]:
# Record the current `pipeline` under the name 'mlp_default_pca' via the project
# helper (presumably writes its configuration to CONFIG_FILE — TODO confirm in
# custom_functions).
# NOTE(review): the notes text mentions only default MLP settings and PCA(40);
# confirm it still describes every step present in `pipeline` at save time.
save_model_config(pipeline, model_name='mlp_default_pca', notes = "default mlp settings + PCA(40)")

In [30]:
# Produce the submission file for the model saved as 'mlp_default_pca' using the
# project helper (presumably it rebuilds the pipeline from the saved config,
# trains on the train file and predicts the test file — TODO confirm in
# custom_functions). The model name must match the one used when saving.
save_submission_file(train_data_file=train_data_file,
                     test_data_file=test_data_file,
                     model_name='mlp_default_pca')