# Diplodatos Kaggle Competition

We present this peace of code to create the baseline for the competition, and as an example of how to deal with these kind of problems. The main goals are that you:

1. Learn
1. Try different models and see which one fits the best the given data
1. Get a higher score than the given one in the current baseline example
1. Try to get the highest score in the class :)

In [18]:
# Import the required packages
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [33]:
def transform_data(train_data_fname, test_data_fname):
    df_train = pd.read_csv(train_data_fname)
    df_train['is_train_set'] = 1
    df_test = pd.read_csv(test_data_fname)
    df_test['is_train_set'] = 0

    # we  get the TripType for the train set. To do that, we group by VisitNumber and
    # then we get the max (or min or avg)
    y = df_train.groupby(["VisitNumber", "Weekday"], as_index=False).max().TripType

    # we remove the TripType now, and concat training and testing data
    # the concat is done so that we have the same columns for both datasets
    # after one-hot encoding
    df_train = df_train.drop("TripType", axis=1)
    df = pd.concat([df_train, df_test])
    
    # the next three operations are the ones we have just presented in the previous lines
    
    ### Creamos la variable cantidad de productos por departamento que contiene cada tipo de viaje
    
    #Department_products=dict(df.groupby('DepartmentDescription')['Upc'].count())
    #df['products_department']=df['DepartmentDescription'].apply(lambda x:Department_products.get(x))
    
    # drop the columns we won't use (it may be good to use them somehow)
    df = df.drop(["Upc", "FinelineNumber"], axis=1)
    
    # Creamos la variable cantidad de departamentos visitados
    Total_Dep = df.groupby('VisitNumber').DepartmentDescription.nunique().reset_index(name='Total_Dep')

    # one-hot encoding for the DepartmentDescription
    df = pd.get_dummies(df, columns=["DepartmentDescription"], dummy_na=True)
    
    # Unimos la variable Total_Dept al DF
    df = pd.merge(df, Total_Dep, on='VisitNumber')
    
    # now we add the groupby values
    df = df.groupby(["VisitNumber", "Weekday"], as_index=False).sum()
    
    # Creamos la variable "fin de semana"
    df['Weekend'] = df['Weekday'].apply(lambda x : x in ('Saturday', 'Sunday'))
    df['Weekend'] = df['Weekend'].astype(int)
    
    # We do one-hot encoding for the Weekday
    df = pd.get_dummies(df, columns=["Weekday"], dummy_na=True)
    
    ### Hacemos one-hot encoding para cantidad de departamentos visitados - no es relevante, bajo el puntaje.
    # df = pd.get_dummies(df, columns=["Total_Dep"], dummy_na=True)
    
    ### Creamos variable para "compras negativas"
    df['Neg_purch'] = df['ScanCount'].apply(lambda x : x <=0)
    df["Neg_purch"] = df["Neg_purch"].astype(int)
    
    ### Creamos variable para mas de 8 unidades  compradas (promedio del DF)
    #df['more_8'] = df['ScanCount'].apply(lambda x : x >=8)
    #df["more_8"] = df["more_8"].astype(int)
    
    ### Hacemos one-hot encoding para cantidad de productos comprados - No mejora el desempeño.
    #df = pd.get_dummies(df, columns=["ScanCount"], dummy_na=True)
    
    
    # get train and test back
    df_train = df[df.is_train_set != 0]
    df_test = df[df.is_train_set == 0]
    
    X = df_train.drop(["is_train_set"], axis=1)
    yy = None
    XX = df_test.drop(["is_train_set"], axis=1)

    return X, y, XX, yy

Load the data...

In [34]:
X, y, XX, yy = transform_data("https://raw.githubusercontent.com/DiploDatos/AprendizajeSupervisado/master/practico/data/train.csv", "https://raw.githubusercontent.com/DiploDatos/AprendizajeSupervisado/master/practico/data/test.csv")

Create the model and evaluate it

In [35]:
# split training dataset into train and "validation" 
# (we won't be using validation set in this example, because of the cross-validation;
# but it could be useful for you depending on your approach)
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)

In [36]:
# results dataframe is used to store the computed results
results = pd.DataFrame(columns=('clf', 'best_acc'))

In [37]:
# we will use a DesicionTree to classify and GridSearch to determine the parameters
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

tree_param = {'criterion':('gini', 'entropy'), 'min_samples_leaf':(4,5,8,15),
              'min_samples_split':(25,100,150)}
tree = DT(random_state=42)
tree_clf = GridSearchCV(tree, tree_param, cv=3, scoring='accuracy') #scoring='balanced_accuracy')
tree_clf.fit(X_train, y_train)
best_tree_clf = tree_clf.best_estimator_

In [38]:
print('Best Decision Tree accuracy: ', tree_clf.best_score_)
print(best_tree_clf)
results = results.append({'clf': best_tree_clf, 'best_acc': tree_clf.best_score_}, ignore_index=True)

print('The best classifier so far is: ')
print(results.loc[results['best_acc'].idxmax()]['clf'])

Best Decision Tree accuracy:  0.638299232736573
DecisionTreeClassifier(min_samples_leaf=8, min_samples_split=100,
                       random_state=42)
The best classifier so far is: 
DecisionTreeClassifier(min_samples_leaf=8, min_samples_split=100,
                       random_state=42)


In [26]:
tree_clf.best_params_

{'criterion': 'gini', 'min_samples_leaf': 4, 'min_samples_split': 100}

In [27]:
yy_tree = results.clf.iloc[0].predict(XX)

In [12]:
submission_tree = pd.DataFrame(list(zip(XX.VisitNumber, yy_tree)), columns=["VisitNumber", "TripType"])
submission_tree.to_csv("../competencia-kaggle/submission_tree.csv", header=True, index=False)

In [39]:
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [40]:
forest_param = {'criterion':('gini', 'entropy'), 'min_samples_leaf':(3,4,5),
              'min_samples_split':[100]}
forest = RF(random_state=42)
forest_clf = GridSearchCV(forest, forest_param, cv=3, scoring='accuracy') #scoring='balanced_accuracy')
forest_clf.fit(X_train, y_train)
best_forest_clf = forest_clf.best_estimator_
print('Best Random Forest accuracy: ', tree_clf.best_score_)
print(best_forest_clf)
results_rf = results.append({'clf': best_forest_clf, 'best_acc': forest_clf.best_score_}, ignore_index=True)

print('The best classifier so far is: ')
print(results_rf.loc[results_rf['best_acc'].idxmax()]['clf'])

Best Random Forest accuracy:  0.638299232736573
RandomForestClassifier(min_samples_leaf=3, min_samples_split=100,
                       random_state=42)
The best classifier so far is: 
RandomForestClassifier(min_samples_leaf=3, min_samples_split=100,
                       random_state=42)


In [45]:
forest=RF(random_state=42, criterion='gini', min_samples_leaf=3, min_samples_split= 100)
forest.fit(X=X_train, y = y_train)

RandomForestClassifier(min_samples_leaf=3, min_samples_split=100,
                       random_state=42)

In [46]:
y_rf=forest.predict(X_valid)

In [47]:
accuracy_score(y_valid, y_rf)

0.6656720871251678

In [17]:
yy_rf=forest.predict(XX)
submission_rf = pd.DataFrame(list(zip(XX.VisitNumber, yy_rf)), columns=["VisitNumber", "TripType"])
submission_rf.to_csv("../competencia-kaggle/submission_rf.csv", header=True, index=False)

In [30]:
from sklearn.svm import SVC

In [31]:
suport=SVC(random_state=42, kernel='poly')
suport.fit(X_train, y_train)

SVC(kernel='poly', random_state=42)