In [1]:
import xgboost as xgb
import sys
import IPython
import numpy as np
import pandas as pd
import sklearn as sk
import feather
from sklearn.svm import SVC, LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold, cross_val_score, train_test_split 
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

def add_category_counts(data):
    alist = []
    for array in np.asarray(data.iloc[:,5:]):
        count = 0
        for item in array:
            if item > 0:
                count += 1
        alist.append(count)
    cat_counts = pd.DataFrame(alist)
    cat_counts = cat_counts.rename(columns={0:"CategoryCount"})
    cat_counts = cat_counts.set_index(data.index)
    data.insert(5, 'CategoryCounts', cat_counts)
    return data


#data = pd.read_csv("../data/train.csv")
data_original = feather.read_dataframe('../data/transformed_data.feather')
data = data_original.copy()
data_original



Unnamed: 0,VisitNumber,TripType,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber,DepartmentGroup,numItems,num_purchased
0,10,8,friday,6414410235,11,dsd grocery,2008,food,3,33
1,10,8,friday,2800053970,11,"candy, tobacco, cookies",115,food,3,33
2,10,8,friday,7794800902,11,dsd grocery,7950,food,3,33
3,100,37,friday,4383,11,produce,3102,food,1,11
4,1000,9,friday,32878550911,11,infant consumable hardlines,2009,infant,1,11
5,100002,9,sunday,66572108583,11,bath and shower,1505,health & beauty,1,11
6,100003,999,sunday,87458603436,11,impulse merchandise,8023,other departments,2,21
7,100003,999,sunday,31254778135,10,personal care,5055,health & beauty,2,21
8,100004,25,sunday,76125378024,11,ladies wear,308,cloth,4,44
9,100004,25,sunday,79764251854,11,mens wear,4141,cloth,4,44


In [2]:

label_encoder2 = LabelEncoder()
data['FinelineNumber_l'] = label_encoder2.fit_transform(data_original['FinelineNumber'].astype(str))

data['Weekday'] = data_original['Weekday'].map({"monday": 1, "tuesday": 2, "wednesday": 3, "thursday": 4, "friday": 5, "saturday": 6, "sunday": 7})

for each in ['TripType']:
    data[each] = data[each].astype(int)


topFine = np.asarray(data.FinelineNumber_l.value_counts(sort=True, ascending=False)[:200].index)

dummies = pd.get_dummies(topFine)
columnas_fine = ['FinelineNumber_'+str(each) for each in topFine]
data[columnas_fine] = dummies.astype(int)

for each in columnas_fine:
    data[each] = data[each].fillna(0)

dummies = pd.get_dummies(data.DepartmentDescription)
data[dummies.columns] = dummies
data_dummies = data[dummies.columns] 
data_dummies = data_dummies.apply(lambda x: x*data["ScanCount"])

data.loc[data.ScanCount < 0, 'Return'] = 1
data.loc[data.Return != 1, 'Return'] = 0
grouped = data.groupby("VisitNumber")
elset = set(list(data.DepartmentDescription) + columnas_fine)
aggregate = {'Weekday': np.max, "TripType": np.max, 'numItems': np.max, 'Return': np.max}
for each in list(elset):
    if each != np.nan:
        aggregate[each] = np.sum
data = grouped.agg(aggregate).reset_index()

add_category_counts(data)

label_encoder = LabelEncoder()  
data['TripType_l'] = label_encoder.fit_transform(data['TripType'])


mytrain, mytest = train_test_split(data, test_size = .7)
features = ['VisitNumber', 'Weekday', 'numItems', 'Return', 'CategoryCounts'] + list(elset)

train = data.copy()
train.head()

Unnamed: 0,VisitNumber,Weekday,TripType,numItems,Return,CategoryCounts,FinelineNumber_2782,FinelineNumber_575,FinelineNumber_4345,FinelineNumber_1452,FinelineNumber_2120,FinelineNumber_719,FinelineNumber_31,FinelineNumber_0,FinelineNumber_2522,comm bread,ladies socks,FinelineNumber_5195,FinelineNumber_1491,FinelineNumber_4416,FinelineNumber_92,produce,FinelineNumber_2135,FinelineNumber_5057,beauty,FinelineNumber_602,FinelineNumber_1260,FinelineNumber_3061,lawn and garden,FinelineNumber_1505,FinelineNumber_4835,FinelineNumber_853,FinelineNumber_3317,FinelineNumber_1392,FinelineNumber_4176,infant consumable hardlines,FinelineNumber_4337,FinelineNumber_4385,FinelineNumber_2015,FinelineNumber_2123,FinelineNumber_216,celebration,FinelineNumber_4984,FinelineNumber_1418,FinelineNumber_1558,accessories,FinelineNumber_726,household paper goods,FinelineNumber_1236,FinelineNumber_4994,FinelineNumber_3383,electronics,FinelineNumber_1946,bath and shower,FinelineNumber_131,pets and supplies,1-hr photo,FinelineNumber_1829,FinelineNumber_1401,FinelineNumber_3956,FinelineNumber_1483,"candy, tobacco, cookies",FinelineNumber_1389,personal care,FinelineNumber_1043,horticulture and access,FinelineNumber_409,dsd grocery,office supplies,home decor,FinelineNumber_628,FinelineNumber_4200,FinelineNumber_3316,FinelineNumber_169,FinelineNumber_918,FinelineNumber_624,FinelineNumber_1938,FinelineNumber_405,FinelineNumber_3920,FinelineNumber_647,FinelineNumber_303,FinelineNumber_1846,FinelineNumber_765,FinelineNumber_4932,FinelineNumber_2134,FinelineNumber_333,FinelineNumber_1697,FinelineNumber_3175,FinelineNumber_1246,home management,FinelineNumber_1850,FinelineNumber_35,pharmacy,infant apparel,FinelineNumber_4931,ladies wear,FinelineNumber_3053,FinelineNumber_2552,FinelineNumber_1253,jewelry and sunglasses,FinelineNumber_2557,FinelineNumber_1642,FinelineNumber_4359,FinelineNumber_4324,FinelineNumber_306,FinelineNumber_3094,FinelineNumber_294,impulse merchandise,FinelineNumber_12,FinelineNumber_746,pre packed deli,FinelineNumber_5058,FinelineNumber_2460,FinelineNumber_1931,FinelineNumber_4444,FinelineNumber_1205,FinelineNumber_4299,FinelineNumber_4427,FinelineNumber_4978,FinelineNumber_4326,FinelineNumber_401,FinelineNumber_1760,FinelineNumber_301,concept stores,FinelineNumber_1825,FinelineNumber_919,FinelineNumber_1921,grocery dry goods,FinelineNumber_1950,boys wear,large household goods,FinelineNumber_1397,FinelineNumber_5074,"liquor,wine,beer",FinelineNumber_4446,FinelineNumber_2296,FinelineNumber_2140,FinelineNumber_373,FinelineNumber_3462,FinelineNumber_2762,FinelineNumber_368,financial services,FinelineNumber_4330,FinelineNumber_2228,FinelineNumber_956,FinelineNumber_3142,FinelineNumber_2526,FinelineNumber_703,FinelineNumber_2521,service deli,FinelineNumber_2126,FinelineNumber_4177,mens wear,FinelineNumber_676,FinelineNumber_2538,FinelineNumber_2372,FinelineNumber_1926,FinelineNumber_331,FinelineNumber_2740,FinelineNumber_1042,FinelineNumber_94,hardware,FinelineNumber_398,books and magazines,FinelineNumber_4369,FinelineNumber_801,FinelineNumber_4357,dairy,seasonal,FinelineNumber_3911,FinelineNumber_44,FinelineNumber_1826,FinelineNumber_329,FinelineNumber_3652,FinelineNumber_1896,FinelineNumber_10,FinelineNumber_2898,FinelineNumber_1251,frozen foods,FinelineNumber_4811,FinelineNumber_1772,health & beauty,optical,FinelineNumber_4,FinelineNumber_4610,FinelineNumber_1927,FinelineNumber_332,FinelineNumber_3328,furniture,FinelineNumber_1132,FinelineNumber_2025,FinelineNumber_5075,FinelineNumber_1876,FinelineNumber_1244,FinelineNumber_1175,FinelineNumber_4939,FinelineNumber_1824,FinelineNumber_1682,meat - fresh & frozen,FinelineNumber_990,FinelineNumber_3339,FinelineNumber_4332,toys,wireless,household chemicals/supp,FinelineNumber_4276,FinelineNumber_1024,FinelineNumber_4354,FinelineNumber_1023,swimwear/outerwear,FinelineNumber_1409,FinelineNumber_5112,bras & shapewear,FinelineNumber_1264,FinelineNumber_2937,FinelineNumber_2549,FinelineNumber_1488,FinelineNumber_2434,"girls wear, 4-6x and 7-14",seafood,bakery,FinelineNumber_577,cameras and supplies,FinelineNumber_1922,bedding,FinelineNumber_4447,other departments,sheer hosiery,FinelineNumber_443,FinelineNumber_626,FinelineNumber_3394,plus and maternity,fabrics and crafts,FinelineNumber_1845,FinelineNumber_987,FinelineNumber_2755,FinelineNumber_1539,sporting goods,FinelineNumber_1391,FinelineNumber_5,FinelineNumber_360,FinelineNumber_3050,FinelineNumber_988,shoes,FinelineNumber_330,cook and dine,players and electronics,FinelineNumber_2747,FinelineNumber_1556,FinelineNumber_3,FinelineNumber_372,FinelineNumber_4329,paint and accessories,FinelineNumber_1403,automotive,FinelineNumber_2405,FinelineNumber_299,media and gaming,FinelineNumber_3965,sleepwear/foundations,FinelineNumber_254,FinelineNumber_5083,FinelineNumber_4982,FinelineNumber_85,FinelineNumber_2067,FinelineNumber_598,FinelineNumber_2116,FinelineNumber_3304,FinelineNumber_4328,FinelineNumber_1683,FinelineNumber_1332,TripType_l
0,5,5,999,1,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0,0.0,0,0.0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37
1,7,5,30,2,0.0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,1,0.0,0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0,0.0,0,0.0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,1,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22
2,8,5,26,23,0.0,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,2,0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,1,0.0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0,0,1,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0,0.0,0,0.0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,16,0.0,0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18
3,9,5,8,3,0.0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,2,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0,0.0,0,0.0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
4,10,5,8,3,0.0,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,1.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,1,0.0,0,0.0,0,0.0,2,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0,0.0,0,0.0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0,0.0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5


In [5]:
mytrain, mytest = train_test_split(data, test_size = .2)

dtrain = xgb.DMatrix(np.asarray(mytrain[features]), label = np.asarray(mytrain.TripType_l))
dtest = xgb.DMatrix(np.asarray(mytest[features]), label = np.asarray(mytest.TripType_l))
num_round = 50
param = {'objective': 'multi:softmax', 'num_class':len(set(mytrain.TripType_l)), 
     'eval_metric': 'mlogloss', "max_delta_step": 1}
watchlist = [(dtrain,'train'), (dtest, 'eval')]

%time bst = xgb.train(param, dtrain, num_round, watchlist,early_stopping_rounds=3)


[0]	train-mlogloss:3.26791	eval-mlogloss:3.27589
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 3 rounds.
[1]	train-mlogloss:2.92201	eval-mlogloss:2.93843
[2]	train-mlogloss:2.60312	eval-mlogloss:2.63003
[3]	train-mlogloss:2.3166	eval-mlogloss:2.35314
[4]	train-mlogloss:2.06835	eval-mlogloss:2.11505
[5]	train-mlogloss:1.86251	eval-mlogloss:1.91959
[6]	train-mlogloss:1.69098	eval-mlogloss:1.75873
[7]	train-mlogloss:1.55395	eval-mlogloss:1.63088
[8]	train-mlogloss:1.44756	eval-mlogloss:1.53325
[9]	train-mlogloss:1.36555	eval-mlogloss:1.45851
[10]	train-mlogloss:1.301	eval-mlogloss:1.40149
[11]	train-mlogloss:1.24938	eval-mlogloss:1.35623
[12]	train-mlogloss:1.20646	eval-mlogloss:1.31953
[13]	train-mlogloss:1.1697	eval-mlogloss:1.28876
[14]	train-mlogloss:1.13943	eval-mlogloss:1.2643
[15]	train-mlogloss:1.11399	eval-mlogloss:1.24481
[16]	train-mlogloss:1.09135	eval-mlogloss:1.22715
[17]	train-mlog

In [11]:
predictions = bst.predict(dtest)

accuracy_score(mytest.TripType_l, predictions)

0.61071335249542724

In [12]:
data_original = feather.read_dataframe('../data/test_transformed_data.feather')
data = data_original.copy()

data['FinelineNumber_l'] = label_encoder2.fit_transform(data_original['FinelineNumber'].astype(str))
data['Weekday'] = data_original['Weekday'].map({"monday": 1, "tuesday": 2, "wednesday": 3, "thursday": 4, "friday": 5, "saturday": 6, "sunday": 7})

dummies = pd.get_dummies(topFine)
data[columnas_fine] = dummies.astype(int)

for each in columnas_fine:
    data[each] = data[each].fillna(0)

dummies = pd.get_dummies(data.DepartmentDescription)
data[dummies.columns] = dummies
data_dummies = data[dummies.columns] 
data_dummies = data_dummies.apply(lambda x: x*data["ScanCount"])

data.loc[data.ScanCount < 0, 'Return'] = 1
data.loc[data.Return != 1, 'Return'] = 0
grouped = data.groupby("VisitNumber")
elset = set(list(data.DepartmentDescription) + columnas_fine)
aggregate = {'Weekday': np.max, 'numItems': np.max, 'Return': np.max}
for each in list(elset):
    if each != np.nan:
        aggregate[each] = np.sum
data = grouped.agg(aggregate).reset_index()

add_category_counts(data)

data_test = data.copy()
data_test['health & beauty'] = np.zeros(len(data))
data_test.head()

Unnamed: 0,VisitNumber,Weekday,numItems,Return,FinelineNumber_2782,CategoryCounts,FinelineNumber_575,FinelineNumber_4345,FinelineNumber_1452,FinelineNumber_2120,FinelineNumber_719,FinelineNumber_31,FinelineNumber_0,FinelineNumber_2522,comm bread,ladies socks,FinelineNumber_5195,FinelineNumber_1491,FinelineNumber_4416,FinelineNumber_92,produce,FinelineNumber_2135,FinelineNumber_5057,beauty,FinelineNumber_602,FinelineNumber_1260,FinelineNumber_3061,lawn and garden,FinelineNumber_1505,FinelineNumber_4835,FinelineNumber_853,FinelineNumber_3317,FinelineNumber_1392,FinelineNumber_4176,infant consumable hardlines,FinelineNumber_4337,FinelineNumber_4385,FinelineNumber_2015,FinelineNumber_2123,FinelineNumber_216,celebration,FinelineNumber_4984,FinelineNumber_1418,FinelineNumber_1558,accessories,FinelineNumber_726,household paper goods,FinelineNumber_1236,FinelineNumber_4994,FinelineNumber_3383,electronics,FinelineNumber_1946,bath and shower,FinelineNumber_131,pets and supplies,1-hr photo,FinelineNumber_1829,FinelineNumber_1401,FinelineNumber_3956,FinelineNumber_1483,"candy, tobacco, cookies",FinelineNumber_1389,personal care,FinelineNumber_1043,horticulture and access,FinelineNumber_409,dsd grocery,office supplies,home decor,FinelineNumber_628,FinelineNumber_4200,FinelineNumber_3316,FinelineNumber_169,FinelineNumber_918,FinelineNumber_624,FinelineNumber_1938,FinelineNumber_405,FinelineNumber_3920,FinelineNumber_647,FinelineNumber_303,FinelineNumber_1846,FinelineNumber_765,FinelineNumber_4932,FinelineNumber_2134,FinelineNumber_333,FinelineNumber_1697,FinelineNumber_3175,FinelineNumber_1246,home management,FinelineNumber_1850,FinelineNumber_35,pharmacy,infant apparel,FinelineNumber_4931,ladies wear,FinelineNumber_3053,FinelineNumber_2552,FinelineNumber_1253,jewelry and sunglasses,null,FinelineNumber_2557,FinelineNumber_4359,FinelineNumber_4324,FinelineNumber_1642,FinelineNumber_306,FinelineNumber_3094,FinelineNumber_294,impulse merchandise,FinelineNumber_12,FinelineNumber_746,pre packed deli,FinelineNumber_5058,FinelineNumber_2460,FinelineNumber_1931,FinelineNumber_4444,FinelineNumber_1205,FinelineNumber_4299,FinelineNumber_4427,FinelineNumber_4978,FinelineNumber_4326,FinelineNumber_401,FinelineNumber_1760,FinelineNumber_301,concept stores,FinelineNumber_1825,FinelineNumber_919,FinelineNumber_1921,grocery dry goods,FinelineNumber_1950,boys wear,large household goods,FinelineNumber_1397,FinelineNumber_5074,"liquor,wine,beer",FinelineNumber_4446,FinelineNumber_2296,FinelineNumber_2140,FinelineNumber_373,FinelineNumber_3462,FinelineNumber_2762,FinelineNumber_368,financial services,FinelineNumber_4330,FinelineNumber_2228,FinelineNumber_956,FinelineNumber_3142,FinelineNumber_2526,FinelineNumber_703,FinelineNumber_2521,service deli,FinelineNumber_2126,FinelineNumber_4177,mens wear,FinelineNumber_676,FinelineNumber_2538,FinelineNumber_2372,FinelineNumber_1926,FinelineNumber_331,FinelineNumber_2740,FinelineNumber_1042,FinelineNumber_94,hardware,FinelineNumber_398,books and magazines,FinelineNumber_4369,FinelineNumber_801,FinelineNumber_4357,dairy,seasonal,FinelineNumber_3911,FinelineNumber_44,FinelineNumber_1826,FinelineNumber_329,FinelineNumber_3652,FinelineNumber_1896,FinelineNumber_10,FinelineNumber_2898,FinelineNumber_1251,frozen foods,FinelineNumber_4811,FinelineNumber_1772,optical,FinelineNumber_4,FinelineNumber_4610,FinelineNumber_1927,FinelineNumber_332,FinelineNumber_3328,furniture,FinelineNumber_1132,FinelineNumber_2025,FinelineNumber_5075,FinelineNumber_1876,FinelineNumber_1244,FinelineNumber_1175,FinelineNumber_4939,FinelineNumber_1824,FinelineNumber_1682,meat - fresh & frozen,FinelineNumber_990,FinelineNumber_3339,FinelineNumber_4332,toys,wireless,household chemicals/supp,FinelineNumber_4276,FinelineNumber_1024,FinelineNumber_4354,FinelineNumber_1023,swimwear/outerwear,FinelineNumber_1409,FinelineNumber_5112,bras & shapewear,FinelineNumber_1264,FinelineNumber_2937,FinelineNumber_2549,FinelineNumber_1488,FinelineNumber_2434,"girls wear, 4-6x and 7-14",seafood,bakery,FinelineNumber_577,cameras and supplies,FinelineNumber_1922,bedding,FinelineNumber_4447,other departments,sheer hosiery,FinelineNumber_443,FinelineNumber_626,FinelineNumber_3394,plus and maternity,fabrics and crafts,FinelineNumber_1845,FinelineNumber_987,FinelineNumber_2755,FinelineNumber_1539,sporting goods,FinelineNumber_1391,FinelineNumber_5,FinelineNumber_360,FinelineNumber_3050,FinelineNumber_988,shoes,FinelineNumber_330,cook and dine,players and electronics,FinelineNumber_2747,FinelineNumber_1556,FinelineNumber_3,FinelineNumber_372,FinelineNumber_4329,paint and accessories,FinelineNumber_1403,automotive,FinelineNumber_2405,FinelineNumber_299,media and gaming,FinelineNumber_3965,sleepwear/foundations,FinelineNumber_254,FinelineNumber_5083,FinelineNumber_4982,FinelineNumber_85,FinelineNumber_2067,FinelineNumber_598,FinelineNumber_2116,FinelineNumber_3304,FinelineNumber_4328,FinelineNumber_1683,FinelineNumber_1332,health & beauty
0,1,5,4,0.0,0.0,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,1.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,1.0,0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,1,0.0,0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0,0.0,0,0.0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,1,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0,0.0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,5,4,0.0,0.0,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0,1,0.0,0,0.0,0,0.0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,5,2,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,2,0.0,0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0,0.0,0,0.0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,5,1,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0,0.0,0,0.0,0,0,0.0,0.0,0.0,0,1,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6,5,2,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,2,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0,0.0,0,0.0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:

%time predictions = bst.predict(xgb.DMatrix(np.asarray(data_test[features])))
print(predictions)

CPU times: user 19.4 s, sys: 610 ms, total: 20 s
Wall time: 7.91 s
[ 22.  27.   5. ...,   5.  24.  17.]


In [19]:
data_test['TripType'] = label_encoder.inverse_transform([int(i) for i in predictions])
answers = data_test[['TripType', 'VisitNumber']]
answers = pd.get_dummies(answers, columns =['TripType'])
answers['TripType_14'] = np.zeros(len(answers))
#answers['TripType_36'] = np.zeros(len(answers))
#answers['TripType_44'] = np.zeros(len(answers))
answers.to_csv('../data/final_submission.csv', index=False)
len(answers)


95674