In [1]:
# Import the required packages
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import StandardScaler


In [2]:
def transform_data(train_data_fname, test_data_fname):
    """Turn the raw line-item CSV files into one feature row per visit.

    Both files are read, stacked so that one-hot encoding yields the same
    columns for each, aggregated per visit, and split apart again.

    Parameters
    ----------
    train_data_fname : str or path-like
        CSV with the training line items. Must contain ``TripType``,
        ``VisitNumber``, ``Weekday``, ``Upc``, ``FinelineNumber`` and
        ``DepartmentDescription``; remaining columns must be numeric because
        they are summed per visit.
    test_data_fname : str or path-like
        CSV with the same columns except ``TripType``.

    Returns
    -------
    X : pandas.DataFrame
        One row per (VisitNumber, Weekday) group of the training file, with
        summed numeric columns, per-department counts and one-hot weekday
        columns. ``VisitNumber`` is kept as a column.
    y : pandas.Series
        Maximum ``TripType`` of each training group, in the same row order
        and with the same 0..n-1 index as ``X``.
    XX : pandas.DataFrame
        Same layout as ``X`` for the test file, indexed 0..m-1.
    yy : None
        Placeholder; the test file carries no labels.
    """
    group_keys = ["VisitNumber", "Weekday"]

    df_train = pd.read_csv(train_data_fname)
    df_train["is_train_set"] = 1
    df_test = pd.read_csv(test_data_fname)
    df_test["is_train_set"] = 0

    # One label per visit. Only TripType is aggregated, so unrelated columns
    # (strings, ids) are never pushed through max(). groupby sorts by the
    # keys, which is the same order the feature rows end up in below.
    y = df_train.groupby(group_keys)["TripType"].max().reset_index(drop=True)

    # Stack train and test (without the label) so that one-hot encoding
    # produces identical columns for both sets.
    df = pd.concat([df_train.drop("TripType", axis=1), df_test])

    # drop the columns we won't use (it may be good to use them somehow)
    df = df.drop(["Upc", "FinelineNumber"], axis=1)

    # one-hot encoding for the DepartmentDescription (NaN gets its own column)
    df = pd.get_dummies(df, columns=["DepartmentDescription"], dummy_na=True)

    # Aggregate to one row per visit. is_train_set is part of the key so that
    # it stays a clean 0/1 flag instead of being summed, and so that a visit
    # number occurring in both files is never merged into a single row.
    df = df.groupby(group_keys + ["is_train_set"], as_index=False).sum()

    # finally, one-hot encoding for the Weekday
    df = pd.get_dummies(df, columns=["Weekday"], dummy_na=True)

    # Split back into train and test; fresh 0..n-1 indexes keep X aligned
    # with y by label as well as by position.
    is_train = df["is_train_set"] == 1
    X = df[is_train].drop(columns="is_train_set").reset_index(drop=True)
    XX = df[~is_train].drop(columns="is_train_set").reset_index(drop=True)
    yy = None

    return X, y, XX, yy

In [3]:
# Build the per-visit feature matrices from the raw CSV files
X, y, XX, yy = transform_data("../data/train.csv", "../data/test.csv")

# Hold out 30% of the training rows for validation (fixed seed for repeatability)
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Normalisation: the scaler is fitted on the training split only and then
# applied unchanged to the validation split
SS = StandardScaler()
X_train = SS.fit_transform(X_train)
X_valid = SS.transform(X_valid)


In [4]:
# Base estimator whose hyper-parameters are searched below
dtc_model = DecisionTreeClassifier(random_state=42)

# Single narrow grid: only the leaf-size and split-size thresholds vary,
# every other setting is pinned to one value (5 x 4 = 20 candidates)
param_grid_dtc = [
    {
        'splitter': ['best'],
        'criterion': ['gini'],
        'max_depth': [None],
        'min_samples_leaf': np.arange(1, 10, 2),
        'min_samples_split': np.arange(100, 200, 25),
        'max_features': [None],
    }
]

gscv_dtc = GridSearchCV(
    estimator=dtc_model,
    param_grid=param_grid_dtc,
    cv=3,
    # list form -> result columns are suffixed with the metric name
    # (e.g. rank_test_accuracy)
    scoring=['accuracy'],
    refit=False,
    n_jobs=-1,  # n_jobs parallelises the fits
    verbose=4,
)
gscv_dtc.fit(X_train, y_train);

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   20.1s finished


In [5]:
# Tabulate the cross-validation results and show them best-ranked first
gscv_dtc_res = pd.DataFrame(data=gscv_dtc.cv_results_)
gscv_dtc_res.sort_values('rank_test_accuracy', ascending=True)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_splitter,params,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy
8,1.080115,0.009358,0.016124,0.001031,gini,,,5,100,best,"{'criterion': 'gini', 'max_depth': None, 'max_...",0.626854,0.62954,0.634719,0.630371,0.003264,1
12,1.119245,0.010971,0.016809,0.000481,gini,,,7,100,best,"{'criterion': 'gini', 'max_depth': None, 'max_...",0.627749,0.629476,0.63376,0.630328,0.002527,2
9,1.095671,0.021999,0.018432,0.002316,gini,,,5,125,best,"{'criterion': 'gini', 'max_depth': None, 'max_...",0.627558,0.628772,0.633312,0.629881,0.002477,3
4,1.175571,0.007287,0.017599,0.001862,gini,,,3,100,best,"{'criterion': 'gini', 'max_depth': None, 'max_...",0.62711,0.629795,0.632225,0.62971,0.002089,4
5,1.220766,0.049982,0.017202,0.000605,gini,,,3,125,best,"{'criterion': 'gini', 'max_depth': None, 'max_...",0.627238,0.630179,0.630946,0.629454,0.001598,5
13,1.120036,0.018395,0.016796,0.000891,gini,,,7,125,best,"{'criterion': 'gini', 'max_depth': None, 'max_...",0.626982,0.628772,0.632417,0.62939,0.002261,6
16,1.086185,0.007787,0.017177,0.001409,gini,,,9,100,best,"{'criterion': 'gini', 'max_depth': None, 'max_...",0.62532,0.630179,0.631202,0.6289,0.002566,7
1,1.262296,0.030976,0.020691,0.002095,gini,,,1,125,best,"{'criterion': 'gini', 'max_depth': None, 'max_...",0.627366,0.628708,0.630115,0.62873,0.001123,8
0,1.253388,0.022667,0.020715,0.002663,gini,,,1,100,best,"{'criterion': 'gini', 'max_depth': None, 'max_...",0.626918,0.627813,0.630627,0.628453,0.00158,9
10,1.101925,0.029035,0.01824,0.001564,gini,,,5,150,best,"{'criterion': 'gini', 'max_depth': None, 'max_...",0.624808,0.6289,0.631202,0.628303,0.002644,10


In [14]:
# Keep the candidate(s) ranked first by accuracy and persist them to disk
best = gscv_dtc_res.loc[gscv_dtc_res['rank_test_accuracy'].eq(1)]
best.to_csv('./pg_submission_best_config.csv')

In [7]:
# Rebuild the tree with the first top-ranked parameter set and train it on
# the whole training split (the grid search ran with refit=False)
dtc_model = DecisionTreeClassifier(random_state=42, **best['params'].iloc[0])
dtc_model.fit(X_train, y_train)

y_pred_train = dtc_model.predict(X_train)
y_pred_valid = dtc_model.predict(X_valid)

# (train accuracy, validation accuracy)
(accuracy_score(y_train, y_pred_train), accuracy_score(y_valid, y_pred_valid))

(0.6757033248081842, 0.6344422895221046)

In [8]:
# Scale the test features with the scaler fitted on the training split,
# then predict a TripType for every test row
XXn = SS.transform(X=XX)
y2send = dtc_model.predict(X=XXn)

In [11]:
# Pair every test VisitNumber with its predicted TripType and write the file
submission = pd.DataFrame(
    {
        "VisitNumber": XX.VisitNumber.to_numpy(),
        "TripType": y2send,
    }
)
submission.to_csv("./pg_submission_dtc.csv", index=False, header=True)