# Caso de mejor desempeño utilizando una Red Neuronal de Una capa
## Importando librerías y módulos

In [1]:
# Import the required packages
import os
import ast

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import StandardScaler


## Curación del dataset y estructuración de las features

In [2]:
def transform_data(train_data_fname, test_data_fname):
    """Build visit-level feature matrices from the raw train and test CSVs.

    Both inputs are read with ``pd.read_csv``. Item-level rows are collapsed
    to one row per (VisitNumber, Weekday) pair: ``DepartmentDescription`` is
    one-hot encoded (with an extra NaN indicator column) and every remaining
    column is summed within the group. ``Weekday`` is one-hot encoded
    afterwards. ``Upc`` and ``FinelineNumber`` are discarded.

    Train and test are concatenated before encoding so both outputs end up
    with exactly the same columns.

    Parameters
    ----------
    train_data_fname : path or file-like accepted by ``pd.read_csv``
        Training data; must contain ``TripType``, ``VisitNumber``,
        ``Weekday``, ``Upc``, ``FinelineNumber`` and
        ``DepartmentDescription``.
    test_data_fname : path or file-like accepted by ``pd.read_csv``
        Test data with the same columns except ``TripType``.

    Returns
    -------
    X : pandas.DataFrame
        Training features, one row per group. ``VisitNumber`` is kept as a
        column.
    y : pandas.Series
        Maximum ``TripType`` of each training group, in sorted group-key
        order. It lines up positionally with ``X`` as long as no
        (VisitNumber, Weekday) key occurs in both files.
    XX : pandas.DataFrame
        Test features with the same columns as ``X``.
    yy : None
        Placeholder; the test labels are unknown.
    """
    group_keys = ["VisitNumber", "Weekday"]

    df_train = pd.read_csv(train_data_fname)
    df_train['is_train_set'] = 1
    df_test = pd.read_csv(test_data_fname)
    df_test['is_train_set'] = 0

    # One label per group. Only TripType is aggregated: taking max() over the
    # whole frame would also aggregate the string column DepartmentDescription,
    # which is wasted work and can raise on a str/NaN mix in recent pandas.
    y = df_train.groupby(group_keys, as_index=False)[["TripType"]].max().TripType

    # Remove the label and stack train on top of test so that the one-hot
    # encoding below yields identical columns for both sets.
    df_train = df_train.drop("TripType", axis=1)
    df = pd.concat([df_train, df_test])

    # drop the columns we won't use (it may be good to use them somehow)
    df = df.drop(["Upc", "FinelineNumber"], axis=1)

    # one-hot encoding for the DepartmentDescription
    df = pd.get_dummies(df, columns=["DepartmentDescription"], dummy_na=True)

    # Collapse to one row per group by summing. is_train_set is summed too, so
    # it becomes a positive row count for train groups and stays 0 for test.
    df = df.groupby(group_keys, as_index=False).sum()

    # finally, we do one-hot encoding for the Weekday
    df = pd.get_dummies(df, columns=["Weekday"], dummy_na=True)

    # get train and test back (see the note on the summed flag above)
    df_train = df[df.is_train_set != 0]
    df_test = df[df.is_train_set == 0]

    X = df_train.drop(["is_train_set"], axis=1)
    yy = None
    XX = df_test.drop(["is_train_set"], axis=1)

    return X, y, XX, yy

### Ejecutando la transformación del dataset

In [3]:
# Build the visit-level feature matrices from the raw CSV files.
X, y, XX, yy = transform_data("../data/train.csv", "../data/test.csv")

# Hold out 30% of the training visits for validation (fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, random_state=42, test_size=0.3
)

# Normalization: the scaler is fitted on the training split only and then
# applied to both splits.
SS = StandardScaler().fit(X_train)
X_train, X_valid = SS.transform(X_train), SS.transform(X_valid)

#### Se entrena el modelo con los parámetros con mejor desempeño

In [4]:
# Load the stored configuration; the first entry of its `params` column is a
# Python-literal string that is parsed and unpacked as keyword arguments.
best = pd.read_csv('./20200816_submits_00/submission_best_config_mlp.csv')
best_params = ast.literal_eval(best.params.values[0])

mlp_model = MLPClassifier(random_state=42, **best_params)
mlp_model.fit(X_train, y_train)

y_pred_train = mlp_model.predict(X_train)
y_pred_valid = mlp_model.predict(X_valid)

# Displayed as the cell output: (train accuracy, validation accuracy).
train_accuracy = accuracy_score(y_train, y_pred_train)
valid_accuracy = accuracy_score(y_valid, y_pred_valid)
(train_accuracy, valid_accuracy)



(0.7516624040920716, 0.6879506688547417)

#### Predicción y guardado de los resultados

In [5]:
# Scale the test features with the scaler that was fitted on the training
# split, so they go through the same transformation the model was trained on.
XXn = SS.transform(XX)
# Predicted TripType for every row of the test feature matrix.
y2send = mlp_model.predict(XXn)

In [6]:
# Pair each test VisitNumber with its predicted TripType and write the
# result as a two-column CSV (header row, no index column).
submission = pd.DataFrame(
    {
        "VisitNumber": XX.VisitNumber.to_numpy(),
        "TripType": y2send,
    }
)
submission.to_csv("./submission_mlp.csv", index=False, header=True)