# Diplodatos Kaggle Competition

We present this piece of code to create the baseline for the competition, and as an example of how to deal with this kind of problem. The main goals are that you:

1. Learn
1. Try different models and see which one best fits the given data
1. Get a higher score than the given one in the current baseline example
1. Try to get the highest score in the class :)

In [1]:
# Import the required packages
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_selection import SelectKBest, chi2, f_classif

In [2]:
# from google.colab import drive

In [3]:
# DATA_DIRECTORY = '/content/drive/My Drive/Colab/AAKaggle/'

In [4]:
# drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
def _first_mode(values):
    """Return the first mode of *values*, or NaN when there is none.

    ``Series.mode`` ignores NaN and returns the modes in sorted order, so the
    first entry is the smallest most-frequent value and an all-NaN group
    produces an empty result.
    """
    modes = values.mode()
    return modes.iloc[0] if len(modes) else np.nan


def transform_data(train_data_fname, test_data_fname):
    """Build per-visit feature matrices from the raw train and test CSV files.

    Both files are read with ``pd.read_csv`` and are expected to provide the
    columns ``VisitNumber``, ``Weekday``, ``Upc``, ``FinelineNumber`` and
    ``DepartmentDescription``; the training file must also provide
    ``TripType``. Rows are collapsed to one row per
    ``(VisitNumber, Weekday)`` pair.

    Args:
        train_data_fname: Path of the training CSV.
        test_data_fname: Path of the test CSV.

    Returns:
        A tuple ``(X, y, XX, yy)`` where ``X`` holds the training features,
        ``y`` the training labels (largest ``TripType`` of each visit),
        ``XX`` the test features and ``yy`` is always ``None`` because the
        test set has no labels.

    Side effects:
        Prints the shapes of ``y``, the combined feature frame, ``X`` and
        ``XX``.
    """
    group_keys = ["VisitNumber", "Weekday"]

    df_train = pd.read_csv(train_data_fname)
    df_train['is_train_set'] = 1
    df_test = pd.read_csv(test_data_fname)
    df_test['is_train_set'] = 0

    # Impute missing labels with the largest TripType. Only the TripType
    # column is written: assigning through a bare boolean row mask would
    # overwrite every column of those rows, including the grouping keys.
    missing_label = df_train['TripType'].isna()
    if missing_label.any():
        df_train.loc[missing_label, 'TripType'] = df_train['TripType'].max()

    # One label per visit. Selecting the column before aggregating keeps the
    # reduction away from the string columns.
    y = df_train.groupby(group_keys)['TripType'].max().reset_index(drop=True)
    print(y.shape)

    # Train and test are concatenated so that one-hot encoding yields the
    # same columns for both; is_train_set lets us split them again below.
    df_train = df_train.drop("TripType", axis=1)
    df = pd.concat([df_train, df_test], ignore_index=True)

    # Untouched copy of the row-level data for the Upc / FinelineNumber
    # statistics, since those columns are dropped from df before grouping.
    raw = df.copy()
    department_sizes = df.groupby('DepartmentDescription')['Upc'].count()
    raw['num_of_products_for_department'] = (
        raw['DepartmentDescription'].map(department_sizes)
    )

    df = df.drop(["Upc", "FinelineNumber"], axis=1)
    df = pd.get_dummies(df, columns=["DepartmentDescription"], dummy_na=True)

    # Summing per visit turns the department dummies into counts and leaves
    # is_train_set at 0 for test visits and > 0 for training visits.
    df = df.groupby(group_keys, as_index=False).sum()

    # Weekend flag: 1 for Saturday/Sunday, 0 for every other value.
    df['Weekend'] = df['Weekday'].isin(['Saturday', 'Sunday']).astype(int)
    df = pd.get_dummies(df, columns=["Weekday"], dummy_na=True)

    # Number of non-null Upc entries recorded for each visit.
    products_per_visit = raw.groupby('VisitNumber')['Upc'].count()
    df['quantity_products_per_VisitNumber'] = (
        df['VisitNumber'].map(products_per_visit).fillna(0).astype(int)
    )

    # Per-visit max / min / mode. These frames are grouped by the same keys
    # over the same rows as df, so their rows are in the same order and the
    # values can be attached positionally.
    item_columns = ['Upc', 'FinelineNumber']
    mode_columns = item_columns + ['num_of_products_for_department']
    per_visit = raw.groupby(group_keys, as_index=False)
    visit_max = per_visit[item_columns].max()
    visit_min = per_visit[item_columns].min()
    visit_mode = per_visit.agg({name: _first_mode for name in mode_columns})

    df['UpcMax'] = visit_max['Upc'].to_numpy()
    df['FinelineNumberMax'] = visit_max['FinelineNumber'].to_numpy()
    df['UpcMin'] = visit_min['Upc'].to_numpy()
    df['FinelineNumberMin'] = visit_min['FinelineNumber'].to_numpy()
    df['UpcMode'] = visit_mode['Upc'].to_numpy()
    df['FinelineNumberMode'] = visit_mode['FinelineNumber'].to_numpy()
    df['num_of_products_for_departmentMode'] = (
        visit_mode['num_of_products_for_department'].to_numpy()
    )

    # Replace any remaining NaN with the mean of its column.
    df = df.fillna(df.mean())
    print(df.shape)

    # Split back into train and test using the summed marker column.
    # NOTE(review): assumes no (VisitNumber, Weekday) pair occurs in both
    # files; such a pair would be merged and counted as training data.
    df_train = df[df.is_train_set != 0]
    df_test = df[df.is_train_set == 0]

    X = df_train.drop(["is_train_set"], axis=1)
    XX = df_test.drop(["is_train_set"], axis=1)
    yy = None

    print(X.shape)
    print(XX.shape)

    return X, y, XX, yy

Load the data...

In [3]:
# Build the training features/labels and the test features from the raw CSVs.
train_csv_path = '../data/train.csv'
test_csv_path = '../data/test.csv'
X, y, XX, yy = transform_data(train_csv_path, test_csv_path)

(67029,)
(95674, 89)
(67029, 88)
(28645, 88)


Create the model and evaluate it

In [4]:
# Hold out 5% of the labelled visits as a validation set; the fixed
# random_state keeps the split the same from run to run.
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=42,
)

In [5]:
from sklearn import ensemble

# Random forest of 300 gini trees, each grown on the full training sample
# (bootstrap disabled) and considering 18 features per split.
clfe = ensemble.RandomForestClassifier(
    n_estimators=300,
    criterion='gini',
    max_features=18,
    min_samples_split=10,
    min_samples_leaf=1,
    bootstrap=False,
    random_state=2,
    n_jobs=-1,
)

# Trailing semicolon kept from the notebook; presumably it suppresses the
# echo of the fitted estimator in the cell output.
clfe.fit(X_train, y_train);

In [6]:
# Accuracy on the rows the model was fitted on, as a percentage.
predictions = clfe.predict(X_train)
train_hits = np.sum(y_train == predictions)
print ('Accuracy: %f ' % (train_hits / float(y_train.size) * 100))

Accuracy: 99.109569 


In [7]:
# Accuracy on the held-out validation rows, as a percentage.
predictions = clfe.predict(X_valid)
valid_hits = np.sum(y_valid == predictions)
print ('Accuracy: %f ' % (valid_hits / float(y_valid.size) * 100))

Accuracy: 72.523866 


In [71]:
# Predict a TripType for every test visit with the fitted classifier.
yy = clfe.predict(XX)

In [72]:
# Pair each test VisitNumber with its predicted TripType, in row order.
submission = pd.DataFrame(
    {
        "VisitNumber": XX.VisitNumber.values,
        "TripType": yy,
    },
    columns=["VisitNumber", "TripType"],
)

In [73]:
# Write the predictions as CSV with a header row and without the index column.
submission.to_csv(
    '../data/submission.csv',
    index=False,
    header=True,
)
