In [1]:
# Import the required packages
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import StandardScaler


In [2]:
def transform_data(train_data_fname, test_data_fname):
    """Build one feature row per store visit from the raw train/test CSV files.

    Both files are read and stacked so that one-hot encoding yields exactly the
    same columns for each of them; the rows are then aggregated to one row per
    (VisitNumber, Weekday) and split back into their train and test parts.

    Parameters
    ----------
    train_data_fname : str or path-like
        CSV containing at least TripType, VisitNumber, Weekday, Upc,
        FinelineNumber and DepartmentDescription. Every remaining column is
        summed per visit.
    test_data_fname : str or path-like
        CSV with the same columns except TripType.

    Returns
    -------
    X : pandas.DataFrame
        Train features, one row per (VisitNumber, Weekday). The VisitNumber
        column is kept in the frame.
    y : pandas.Series
        Maximum TripType of each train (VisitNumber, Weekday) group, in the
        sorted group order produced by ``groupby`` (the same order X is in).
    XX : pandas.DataFrame
        Test features with the same columns as X.
    yy : None
        Placeholder; the test set has no labels.
    """
    visit_keys = ["VisitNumber", "Weekday"]

    df_train = pd.read_csv(train_data_fname)
    df_train['is_train_set'] = 1
    df_test = pd.read_csv(test_data_fname)
    df_test['is_train_set'] = 0

    # Target: one TripType per visit. Only the TripType column is aggregated;
    # running max() over the whole frame would also try to aggregate the
    # string column DepartmentDescription (which contains NaNs) for nothing.
    y = df_train.groupby(visit_keys)["TripType"].max().reset_index(drop=True)

    # Remove the target and stack train + test so that both end up with the
    # same dummy columns after one-hot encoding.
    df_train = df_train.drop("TripType", axis=1)
    df = pd.concat([df_train, df_test])

    # Drop the columns we won't use (it may be good to use them somehow).
    df = df.drop(["Upc", "FinelineNumber"], axis=1)

    # One-hot encoding for the DepartmentDescription (NaN gets its own column).
    df = pd.get_dummies(df, columns=["DepartmentDescription"], dummy_na=True)

    # Collapse the line items into one row per visit by summing every column.
    # Note that is_train_set is summed too, so for train visits it becomes the
    # number of line items rather than 1.
    df = df.groupby(visit_keys, as_index=False).sum()

    # Finally, one-hot encoding for the Weekday.
    df = pd.get_dummies(df, columns=["Weekday"], dummy_na=True)

    # Get train and test back. Train rows are "!= 0" (not "== 1") because the
    # flag was summed by the groupby above.
    df_train = df[df.is_train_set != 0]
    df_test = df[df.is_train_set == 0]

    X = df_train.drop(["is_train_set"], axis=1)
    yy = None
    XX = df_test.drop(["is_train_set"], axis=1)

    return X, y, XX, yy

In [3]:
# Build the per-visit feature frames from the raw CSV files
X, y, XX, yy = transform_data("../data/train.csv", "../data/test.csv")

# Hold out 30% of the training visits for validation (fixed seed)
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Normalization: the scaler is fitted on the training split only and the
# fitted statistics are then applied to both splits
SS = StandardScaler().fit(X_train)
X_train = SS.transform(X_train)
X_valid = SS.transform(X_valid)


In [4]:
# Quick sanity check: (rows, columns) of the training feature frame
X.shape

(67029, 79)

In [5]:
# Base estimator handed to the grid search; `clf` is an alias of the same object
clf = MLPClassifier(random_state=42)
mlp_model = clf

# Search space: only the activation function varies, everything else is pinned.
# NOTE(review): scikit-learn documents `learning_rate` as only used when
# solver='sgd', so with 'adam' this entry appears to have no effect - confirm.
# NOTE(review): the saved results table reports hidden_layer_sizes as (79, 2)
# while the grid here uses 79 - the stored output may be stale; confirm.
param_grid_mlp = [
    dict(
        solver=['adam'],
        alpha=[1e-5],
        activation=['relu', 'logistic'],
        learning_rate=['adaptive'],
        hidden_layer_sizes=[79],
    )
]

# 3-fold cross-validation scored on accuracy, using all available cores.
# refit=False: the search only collects scores and does not refit a best model.
gscv_mlp = GridSearchCV(
    estimator=mlp_model,
    param_grid=param_grid_mlp,
    scoring=['accuracy'],
    cv=3,
    refit=False,
    n_jobs=-1,
    verbose=4,
)
gscv_mlp.fit(X_train, y_train);

Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:  1.7min remaining:  1.7min
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  2.7min finished


In [6]:
# Tabulate the cross-validation results and show them best-ranked first
# (the sorted view can be exported with .to_csv('./prueba3.csv') if needed)
gscv_mlp_res = pd.DataFrame(data=gscv_mlp.cv_results_)
gscv_mlp_res.sort_values('rank_test_accuracy', ascending=True)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_activation,param_alpha,param_hidden_layer_sizes,param_learning_rate,param_solver,params,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy
0,98.689252,0.492558,0.097417,0.006723,relu,1e-05,"(79, 2)",adaptive,adam,"{'activation': 'relu', 'alpha': 1e-05, 'hidden...",0.522059,0.525384,0.525959,0.524467,0.001719,1
1,73.774154,16.672646,0.071928,0.02439,logistic,1e-05,"(79, 2)",adaptive,adam,"{'activation': 'logistic', 'alpha': 1e-05, 'hi...",0.472442,0.447826,0.463683,0.461317,0.010188,2


In [10]:
# Keep the candidate(s) ranked first on accuracy and persist them to disk
best = gscv_mlp_res.loc[gscv_mlp_res["rank_test_accuracy"] == 1]
best.to_csv('./pg_submission_best_config_mlp.csv')

In [11]:
# Retrain an MLP on the training split using the first top-ranked parameter set
mlp_model = MLPClassifier(random_state=42, **best["params"].iloc[0]).fit(X_train, y_train)

# Predictions on the data the model was fitted on and on the held-out split
y_pred_train = mlp_model.predict(X_train)
y_pred_valid = mlp_model.predict(X_valid)

# Displayed as (train accuracy, validation accuracy)
(
    accuracy_score(y_train, y_pred_train),
    accuracy_score(y_valid, y_pred_valid),
)



(0.7516624040920716, 0.6879506688547417)

In [12]:
# Scale the test features with the already-fitted scaler `SS`,
# then predict a TripType for every test row with `mlp_model`
XXn = SS.transform(XX)
y2send = mlp_model.predict(XXn)

In [13]:
# Pair every test visit id with its predicted trip type and write the CSV
# (.values strips the index so the two columns are matched by position)
submission = pd.DataFrame(
    {
        "VisitNumber": XX["VisitNumber"].values,
        "TripType": y2send,
    }
)
submission.to_csv("./pg_submission_mlp.csv", index=False, header=True)

https://scikit-learn.org/stable/modules/neural_networks_supervised.html

https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html
