In [237]:
reset

In [107]:
import pandas as pd
import numpy as np
import functions as f
import matplotlib.pyplot as plt

import xgboost as xgb
from sklearn.model_selection import train_test_split

import time

In [108]:
n_rows = 1000
train_set = pd.read_csv('train.csv',nrows=n_rows)

##### Get Sessions which end in clickout

In [109]:
sessions_id_with_clickout = train_set[train_set.action_type == 'clickout item'].session_id
df_clean = train_set[train_set.session_id.isin(sessions_id_with_clickout)]
df_clean = df_clean.sort_values(by=['session_id','step'])

In [110]:
df_clean.head(10)

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
278,0L2TX0JNYVQ6,06e7c29170946,1541041830,1,search for poi,Seoul Station,HK,"Seoul, South Korea",desktop,,,
279,0L2TX0JNYVQ6,06e7c29170946,1541041870,2,clickout item,10091602,HK,"Seoul, South Korea",desktop,,2802232|2733571|5477718|155374|155465|3549258|...,124|176|99|220|191|127|85|54|83|268|78|144|96|...
280,0L2TX0JNYVQ6,06e7c29170946,1541041882,3,interaction item deals,10091602,HK,"Seoul, South Korea",desktop,,,
281,0L2TX0JNYVQ6,06e7c29170946,1541044143,4,search for poi,Myeongdong,HK,"Seoul, South Korea",desktop,,,
282,0L2TX0JNYVQ6,06e7c29170946,1541044151,5,clickout item,10091602,HK,"Seoul, South Korea",desktop,,3549258|155465|155374|363046|3954788|4773608|3...,135|189|219|78|74|135|95|85|176|99|108|83|87|3...
283,0L2TX0JNYVQ6,06e7c29170946,1541044255,6,search for poi,Myeongdong,HK,"Seoul, South Korea",desktop,,,
284,0L2TX0JNYVQ6,06e7c29170946,1541044256,7,search for poi,Dongdaemun Market,HK,"Seoul, South Korea",desktop,,,
285,0L2TX0JNYVQ6,06e7c29170946,1541044294,8,clickout item,1394008,HK,"Seoul, South Korea",desktop,,2736998|9386776|7016734|2867638|797516|5677278...,249|122|110|81|81|42|74|109|62|91|83|104|78|87...
286,0L2TX0JNYVQ6,06e7c29170946,1541044324,9,clickout item,1394008,HK,"Seoul, South Korea",desktop,,2736998|9386776|7016734|2867638|797516|5677278...,249|122|110|81|81|42|74|109|62|91|83|104|78|87...
287,0L2TX0JNYVQ6,06e7c29170946,1541044344,10,clickout item,5109850,HK,"Seoul, South Korea",desktop,,5109850|3505382|377811|155374|4849224|3501452|...,59|52|66|219|104|95|85|69|125|176|99|83|30|268...


## Feature Extraction

##### Range Price

In [111]:
def range_price(df):
    
    if df.prices < 100:
        df['Price less than 100€'] = 1
    elif df.prices < 300:
        df['Price btw 100€ and 300€'] = 1
    else:
        df['Price more than 300€'] = 1
    
    return df

def price_feature(df):
    
    df_ref_clickout = df[df.action_type == 'clickout item'][['session_id','timestamp','reference','impressions']]
    df_ref_clickout = f.explode(df_ref_clickout,'impressions')
    
    df_impr_price = df[df.action_type=='clickout item'][['session_id','timestamp','impressions','prices']]
    df_impr_price = f.explode(df_impr_price,'prices')
    df_impr_price = df_impr_price.prices.apply(int)
    
    df_price_range = pd.concat([df_ref_clickout,df_impr_price],axis=1)
    
    df_price_range['Price less than 100€'] = 0
    df_price_range['Price btw 100€ and 300€'] = 0
    df_price_range['Price more than 300€'] = 0
    df_price_range = df_price_range.apply(range_price,axis=1)
    
    df_price_range = df_price_range[['session_id','timestamp','reference','impressions','Price less than 100€','Price btw 100€ and 300€','Price more than 300€']]
    
    return df_price_range

In [112]:
df_train_one_hot = price_feature(df_clean)

In [113]:
df_train_one_hot.head()

Unnamed: 0,session_id,timestamp,reference,impressions,Price less than 100€,Price btw 100€ and 300€,Price more than 300€
0,06e7c29170946,1541041870,10091602,2802232,0,1,0
1,06e7c29170946,1541041870,10091602,2733571,0,1,0
2,06e7c29170946,1541041870,10091602,5477718,1,0,0
3,06e7c29170946,1541041870,10091602,155374,0,1,0
4,06e7c29170946,1541041870,10091602,155465,0,1,0


- Previous Interaction with the clickout item
- Has filtered
- Has searched for item

In [171]:
def search(df):

    clickout_ref = df[df.action_type == 'clickout item']['reference'].unique()
    
    search_ref = df[df.action_type == 'search for item']['reference'].unique()
    search_ref = pd.Series(search_ref)
       
    if any(search_ref.isin(clickout_ref)):
        df['Has searched for clickout item'] = 1
        
    return df

def search_for_clickout_item(df):
    
    df_search_by_clickout_ref = df[['session_id','timestamp','action_type','reference']]
    df_search_by_clickout_ref = df_search_by_clickout_ref.set_index('session_id',drop=False)
    df_search_by_clickout_ref['Has searched for clickout item'] = 0

    df_search_by_clickout_ref = df_search_by_clickout_ref.groupby('session_id').apply(search)
        

    df_search_by_clickout_ref = df_search_by_clickout_ref[df_search_by_clickout_ref.action_type == 'clickout item'][['session_id','Has searched for clickout item']]
    df_search_by_clickout_ref = df_search_by_clickout_ref.drop_duplicates()
    
    return df_search_by_clickout_ref

In [168]:
del df_search_by_clickout_ref

In [175]:
df_search_clickout_item = search_for_clickout_item(df_clean)
df_search_clickout_item
#df_train_one_hot = df_train_one_hot.merge(df_search_clickout_item, left_on=['session_id'], right_index=True).drop(['session_id_x','session_id_y'],axis=1)

Defaulting to column, but this will raise an ambiguity error in a future version


Unnamed: 0_level_0,session_id,Has searched for clickout item
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1
06e7c29170946,06e7c29170946,1
0c086494b8b9b,0c086494b8b9b,0
0c5572e8d19a3,0c5572e8d19a3,0
1b4ae517f7ebd,1b4ae517f7ebd,0
22b9deb2da8f7,22b9deb2da8f7,0
2423aea8cde50,2423aea8cde50,0
325fafb5fa450,325fafb5fa450,0
336d9a46b826f,336d9a46b826f,0
3599a6f709eab,3599a6f709eab,0
3737c2c0740b5,3737c2c0740b5,0


In [174]:
df_train_one_hot

Unnamed: 0,session_id,timestamp,reference,impressions,Price less than 100€,Price btw 100€ and 300€,Price more than 300€,Has searched for clickout item_x,has_clickout,Has searched for clickout item_y,Has searched for clickout item
0,06e7c29170946,1541041870,10091602,2802232,0,1,0,0,0,1,1
1,06e7c29170946,1541041870,10091602,2733571,0,1,0,0,0,1,1
2,06e7c29170946,1541041870,10091602,5477718,1,0,0,0,0,1,1
3,06e7c29170946,1541041870,10091602,155374,0,1,0,0,0,1,1
4,06e7c29170946,1541041870,10091602,155465,0,1,0,0,0,1,1
5,06e7c29170946,1541041870,10091602,3549258,0,1,0,0,0,1,1
6,06e7c29170946,1541041870,10091602,10091602,1,0,0,0,1,1,1
7,06e7c29170946,1541041870,10091602,6508748,1,0,0,0,0,1,1
8,06e7c29170946,1541041870,10091602,844336,1,0,0,0,0,1,1
9,06e7c29170946,1541041870,10091602,4638538,0,1,0,0,0,1,1


### Create the has_clickout label

#### IMPORTANT!!!!

#### Last step in the Feature Extraction Process

In [116]:
df_train_one_hot['has_clickout'] = 0
df_train_one_hot.reference = df_train_one_hot.reference.apply(int)

In [117]:
def has_clickout(df):
    
    if (df.reference == df.impressions):
        df.has_clickout = 1
    return df

In [118]:
df_train_one_hot = df_train_one_hot.apply(has_clickout,axis=1)

In [119]:
#drop the unnecessary columns for training
df_train = df_train_one_hot.drop(['session_id','timestamp','reference','impressions'], axis=1)

## Train the model

In [120]:
x = df_train.drop('has_clickout', axis=1)
y = df_train.has_clickout

In [121]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2)
dtrain = xgb.DMatrix(x_train, label=y_train)
dval = xgb.DMatrix(x_val, label=y_val)

  if getattr(data, 'base', None) is not None and \


In [122]:
param = {'objective': 'binary:logistic'}
param['nthread'] = 4
param['eval_metric'] = 'logloss'
evallist = [(dtrain, 'train'),(dval, 'eval')]

In [130]:
num_round = 1000
t1 = time.time()
bst = xgb.train(param, dtrain, num_round, evallist, verbose_eval=100)
print("trained in:", np.round(time.time()-t1, 2), "seconds")

[0]	train-logloss:0.47673	eval-logloss:0.4857
[100]	train-logloss:0.165297	eval-logloss:0.216826
[200]	train-logloss:0.165297	eval-logloss:0.216823
[300]	train-logloss:0.165297	eval-logloss:0.216823
[400]	train-logloss:0.165297	eval-logloss:0.216823
[500]	train-logloss:0.165297	eval-logloss:0.216823
[600]	train-logloss:0.165297	eval-logloss:0.216823
[700]	train-logloss:0.165297	eval-logloss:0.216823
[800]	train-logloss:0.165297	eval-logloss:0.216823
[900]	train-logloss:0.165297	eval-logloss:0.216823
[999]	train-logloss:0.165297	eval-logloss:0.216823
trained in: 0.86 seconds


## Test

In [231]:
test_df = pd.read_csv('test.csv')
sub_target = f.get_submission_target(test_df)
len(sub_target)

253573

In [229]:
batch_size = 100

initial_t = time.time()

#Submission rows
sub_target = f.get_submission_target(test_df)

df_impr_by_ord = pd.DataFrame(columns=['session_id','impressions'])

for i in range(0, len(sub_target), batch_size):

    batch = sub_target[i:min(i+batch_size, len(sub_target))]
    
    t1 = time.time()
    
    x_test = price_feature(batch)
    
    df_test_search_clickout_item = search_for_clickout_item(sub_target)
    
    x_test = x_test.merge(df_test_search_clickout_item, left_on=['session_id'], right_index=True).drop(['session_id_x','session_id_y'],axis=1)
    x_test = x_test.drop(['session_id','timestamp','reference'], axis=1).set_index('impressions')
    # Create the DataFrame that will be used to order 
    # the impressions by probability

    df_test_prob_impr = f.explode(batch, 'impressions')[['impressions', 'session_id']]
    
    # Predict the probabilities for each
    # impression to be clicked

    dtest = xgb.DMatrix(x_test)
    pred = bst.predict(dtest)
    
    df_test_prob_impr['prob'] = pd.Series(pred)
    
    df_impr_sort = df_prob_impr.groupby(['session_id']).apply(sort_by_prob)
    df_impr_by_ord = pd.concat([df_impr_by_ord,df_impr_sort])
    df_impr_by_ord = df_impr_by_ord.drop_duplicates(subset=['session_id','impressions'],keep='first')
    df_impr_by_ord = df_impr_by_ord[['session_id','impressions']]
    
time.time()-initial_t

Defaulting to column, but this will raise an ambiguity error in a future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




Unnamed: 0,session_id,impressions
0,1d688ec168932,2059240 2033381 1724779 127131 399441 103357 1...
25,f05ab0de907e2,10884872 7065316
27,26b6d294d66e7,2714480 4476010 3843244 3833012 9017890 198100...
51,07628a0f5be0b,3565720 2947584 4115018 2039671 3836538 801409...
76,4a01c3afbc224,1451247 559056 1045096 1963879 693596 1967173 ...
101,89171d441a304,13361 5647680 116764 898719 8276346 9168 19325...
126,e09591d07cdef,1193320 5488246 3858774 4552034 10620372 22696...
134,7663406cf586c,241961 906477 991561 353701 1149665 77258 4943...
158,725e8adf70e86,109938 164193 632366 1362450 1070666 164220 11...
181,73f4c417ff730,42692 5116230 42876 4342578 42864 3148690 2123...


In [226]:
df_impr_sort

Unnamed: 0,impressions,session_id,prob
0,2059240 2033381 1724779 127131 399441 103357 1...,1d688ec168932,
1,2059240 2033381 1724779 127131 399441 103357 1...,1d688ec168932,
2,2059240 2033381 1724779 127131 399441 103357 1...,1d688ec168932,
3,2059240 2033381 1724779 127131 399441 103357 1...,1d688ec168932,
4,2059240 2033381 1724779 127131 399441 103357 1...,1d688ec168932,
5,2059240 2033381 1724779 127131 399441 103357 1...,1d688ec168932,
6,2059240 2033381 1724779 127131 399441 103357 1...,1d688ec168932,
7,2059240 2033381 1724779 127131 399441 103357 1...,1d688ec168932,
8,2059240 2033381 1724779 127131 399441 103357 1...,1d688ec168932,
9,2059240 2033381 1724779 127131 399441 103357 1...,1d688ec168932,


In [132]:
#Submission rows
sub_target = f.get_submission_target(test_df)

#Feature Extraction Test Set
x_test = price_feature(sub_target)

df_search_clickout_item = search_for_clickout_item(df_clean)
x_test = x_test.merge(df_search_clickout_item, left_on=['session_id'], right_index=True).drop(['session_id_x','session_id_y'],axis=1)

x_test = x_test.drop(['session_id','timestamp','reference'], axis=1).set_index('impressions')

  del sys.path[0]


In [143]:
# Create the DataFrame that will be used to order 
# the impressions by probability

df_prob_impr = f.explode(sub_target, 'impressions')[['impressions', 'session_id']]

In [144]:
# Predict the probabilities for each
# impression to be clicked

dtest = xgb.DMatrix(x_test)
pred = bst.predict(dtest)

In [145]:
df_prob_impr['prob'] = pd.Series(pred)

In [183]:
def sort_by_prob(df):
    
    try:
        df['impressions']= ' '.join([str(int(x)) for x in np.array(sorted(np.array(([df.impressions,df.prob])).T.tolist(),key=lambda x: x[1], reverse=True)).T[0]])
    
    except:
        pass
    
    return df

In [147]:
df_impr_by_ord = df_prob_impr.groupby(['session_id']).apply(sort_by_prob)
df_impr_by_ord = df_impr_by_ord.drop_duplicates(subset=['session_id','impressions'],keep='first')
df_impr_by_ord = df_impr_by_ord[['session_id','impressions']]
df_impr_by_ord

Unnamed: 0,session_id,impressions
0,1d688ec168932,2059240 2033381 1724779 127131 399441 103357 1...
25,f05ab0de907e2,10884872 7065316
27,26b6d294d66e7,2714480 4476010 3843244 3833012 9017890 198100...
51,07628a0f5be0b,3565720 2947584 4115018 2039671 3836538 801409...
76,4a01c3afbc224,1451247 559056 1045096 1963879 693596 1967173 ...
101,89171d441a304,13361 5647680 116764 898719 8276346 9168 19325...
126,e09591d07cdef,1193320 5488246 3858774 4552034 10620372 22696...
134,7663406cf586c,241961 906477 991561 353701 1149665 77258 4943...
158,725e8adf70e86,109938 164193 632366 1362450 1070666 164220 11...
181,73f4c417ff730,42692 5116230 42876 4342578 42864 3148690 2123...


### Add the impressions ordered to the submission file

In [148]:
submission_file = sub_target[f.GR_COLS].merge(df_impr_by_ord)
submission_file
#submission_file.to_csv('submission_file', sep=',', encoding='utf-8')

Unnamed: 0,user_id,session_id,timestamp,step,impressions
0,004A07DM0IDW,1d688ec168932,1541555799,7,2059240 2033381 1724779 127131 399441 103357 1...
1,009RGHI3G9A3,f05ab0de907e2,1541570940,2,10884872 7065316
2,00Y1Z24X8084,26b6d294d66e7,1541651823,2,2714480 4476010 3843244 3833012 9017890 198100...
3,01V3WDTDM5CU,07628a0f5be0b,1541575643,5,3565720 2947584 4115018 2039671 3836538 801409...
4,02AOAVF9PVYH,4a01c3afbc224,1541681278,46,1451247 559056 1045096 1963879 693596 1967173 ...
5,0339C84S24ET,89171d441a304,1541615683,36,13361 5647680 116764 898719 8276346 9168 19325...
6,0386OH8JDE1Q,e09591d07cdef,1541620536,2,1193320 5488246 3858774 4552034 10620372 22696...
7,03LTH89QY623,7663406cf586c,1541554183,4,241961 906477 991561 353701 1149665 77258 4943...
8,03VT0ODUTZB0,725e8adf70e86,1541632490,23,109938 164193 632366 1362450 1070666 164220 11...
9,03XH0JWCWHAM,73f4c417ff730,1541566143,176,42692 5116230 42876 4342578 42864 3148690 2123...
