In [193]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import gc

plt.style.use('ggplot')

dtypes = {
        'ip'            : 'uint32',
        'app'           : 'uint16',
        'device'        : 'uint16',
        'os'            : 'uint16',
        'channel'       : 'uint16',
        'is_attributed' : 'uint8',
        }
    
random = pd.read_csv('train_random_10_percent.csv', dtype=dtypes)
df = random.sample(3000000)
# prepare test data
test = pd.read_csv("test.csv", dtype=dtypes)



In [194]:
df = df.sort_values(['ip','click_time'])
test = test.sort_values(['ip','click_time'])

In [195]:
df.shape

(3000000, 9)

In [196]:
gc.collect()

584

In [197]:
df['click_time'] = pd.to_datetime(df.click_time)
df['attributed_time'] = pd.to_datetime(df.attributed_time)
test['click_time'] = pd.to_datetime(test.click_time)


In [198]:
did_download = df[df.is_attributed==1].ip.values
did_download

array([    32,     88,    222, ..., 364698, 364707, 364717], dtype=uint32)

In [199]:
df[df.is_attributed==1]

Unnamed: 0.1,Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
17637867,49482802,32,230,0,29,213,2017-11-07 12:40:33,2017-11-07 13:07:17,1
9958995,147656990,88,12,1,3,328,2017-11-09 04:44:13,2017-11-09 13:13:49,1
16806920,65707244,222,29,1,19,343,2017-11-07 22:19:15,2017-11-08 09:03:09,1
13723395,10400370,254,35,1,19,274,2017-11-07 00:18:43,2017-11-07 00:33:07,1
6652629,164892511,255,9,1,19,244,2017-11-09 09:58:45,2017-11-09 11:02:40,1
16777703,122215573,268,1,1,19,134,2017-11-08 16:02:31,2017-11-09 05:21:54,1
6610989,136472913,279,120,0,21,213,2017-11-09 01:24:16,2017-11-09 07:57:29,1
5399065,170961097,315,35,1,13,21,2017-11-09 11:48:16,2017-11-09 12:13:31,1
14403182,55148116,488,19,0,24,213,2017-11-07 14:26:52,2017-11-07 14:28:31,1
3679175,163532390,641,19,0,0,333,2017-11-09 09:31:45,2017-11-09 15:53:52,1


In [200]:
#ip of people that downloaded an application at some point
did_download = df[df.ip.apply(lambda x: x in did_download)]
did_download
did_download.shape

(674008, 9)

In [201]:
ip_ad_exposure =  did_download.ip.value_counts()
ip_ad_exposure

5348      20105
5314      19049
73516     12768
73487     12307
53454      8076
26995      6675
95766      6190
17149      5060
105475     4951
100275     4462
86767      4213
43793      4176
105560     4126
137052     3661
5178       3536
201182     3507
5147       3251
49602      3212
48282      3061
48212      3041
48170      2925
123994     2887
93587      2820
209663     2818
45745      2697
114220     2627
119369     2535
53964      2413
44744      2410
77048      2360
          ...  
138497        1
146693        1
193230        1
180507        1
281546        1
209142        1
297984        1
179398        1
321442        1
234723        1
193229        1
297390        1
360483        1
233187        1
238822        1
287667        1
228093        1
242404        1
247623        1
305067        1
213288        1
348728        1
250682        1
262162        1
351211        1
163550        1
298449        1
184089        1
316939        1
154803        1
Name: ip, Length: 6982, 

In [202]:
app_or_channel = did_download[did_download.is_attributed == 1]
app_or_channel.shape

(7578, 9)

In [203]:
downloaded = did_download.dropna() 

In [204]:
#lets explore more just the adds that led to download

time_of_exposure = did_download.attributed_time.dropna().groupby(did_download.attributed_time.dt.hour).count()
time_of_exposure

attributed_time
0.0     213
1.0     334
2.0     424
3.0     405
4.0     442
5.0     423
6.0     459
7.0     466
8.0     449
9.0     450
10.0    459
11.0    507
12.0    524
13.0    559
14.0    509
15.0    390
16.0     98
17.0     72
18.0     48
19.0     43
20.0     41
21.0     34
22.0     79
23.0    150
Name: attributed_time, dtype: int64

In [205]:
t = downloaded.attributed_time - downloaded.click_time

channel_success = did_download.groupby(['channel']).is_attributed.mean()

In [206]:
channel_success.head(10)

channel
0     0.666667
3     0.000578
4     0.285714
5     0.786408
13    0.003361
15    0.000000
17    0.000000
18    0.000000
19    0.000974
21    0.532075
Name: is_attributed, dtype: float64

In [207]:
app_success = did_download.groupby(['app']).is_attributed.mean()
channel_success = channel_success.to_dict()
app_success = app_success.to_dict()

In [208]:
df['channel_success'] = df.channel.map(channel_success)
df['app_success'] = df.channel.map(app_success)

df.channel_success.fillna(0,inplace=True)
df.app_success.fillna(0,inplace=True)


In [209]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,channel_success,app_success
16126773,71606227,6,25,1,13,259,2017-11-08 00:45:31,NaT,0,0.001854,0.0
4644422,75087751,6,2,1,20,236,2017-11-08 01:51:28,NaT,0,0.00131,0.0
6384962,76715966,6,12,1,23,245,2017-11-08 02:21:52,NaT,0,0.000453,0.0
9230170,79598422,6,12,1,23,245,2017-11-08 03:08:52,NaT,0,0.000453,0.0
8358224,79923142,6,15,1,23,245,2017-11-08 03:15:14,NaT,0,0.000453,0.0
9045740,79933067,6,12,1,23,245,2017-11-08 03:15:26,NaT,0,0.000453,0.0
968901,80092803,6,12,1,9,245,2017-11-08 03:18:31,NaT,0,0.000453,0.0
10812216,93752832,6,12,1,19,409,2017-11-08 07:34:32,NaT,0,0.001619,0.0
14971038,97766652,6,3,1,16,409,2017-11-08 08:53:34,NaT,0,0.001619,0.0
1541864,108924647,6,3,1,35,280,2017-11-08 12:16:32,NaT,0,0.001348,0.0


In [210]:
s = df.groupby(['ip']).os.value_counts().to_frame().rename(columns={'os':'ip_os_count'}).reset_index()
u = test.groupby(['ip']).os.value_counts().to_frame().rename(columns={'os':'ip_os_count'}).reset_index()



In [211]:
s.head(10)

Unnamed: 0,ip,os,ip_os_count
0,6,19,4
1,6,23,4
2,6,17,3
3,6,22,2
4,6,1,1
5,6,8,1
6,6,9,1
7,6,12,1
8,6,13,1
9,6,14,1


In [212]:
gc.collect()

49

In [213]:
df = pd.merge(df,s,on=['ip','os'])
df['ip_os_count'] = df.ip_os_count.astype('float')
test = pd.merge(test,u,on=['ip','os'])
test['ip_os_count'] = test.ip_os_count.astype('float')

In [214]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,channel_success,app_success,ip_os_count
0,71606227,6,25,1,13,259,2017-11-08 00:45:31,NaT,0,0.001854,0.0,1.0
1,75087751,6,2,1,20,236,2017-11-08 01:51:28,NaT,0,0.00131,0.0,1.0
2,76715966,6,12,1,23,245,2017-11-08 02:21:52,NaT,0,0.000453,0.0,4.0
3,79598422,6,12,1,23,245,2017-11-08 03:08:52,NaT,0,0.000453,0.0,4.0
4,79923142,6,15,1,23,245,2017-11-08 03:15:14,NaT,0,0.000453,0.0,4.0
5,79933067,6,12,1,23,245,2017-11-08 03:15:26,NaT,0,0.000453,0.0,4.0
6,80092803,6,12,1,9,245,2017-11-08 03:18:31,NaT,0,0.000453,0.0,1.0
7,93752832,6,12,1,19,409,2017-11-08 07:34:32,NaT,0,0.001619,0.0,4.0
8,117604339,6,3,1,19,452,2017-11-08 14:41:15,NaT,0,0.00136,0.0,4.0
9,149039243,6,21,1,19,128,2017-11-09 05:04:36,NaT,0,0.0013,0.428571,4.0


In [215]:
n_chans = df.groupby(['ip','app']).channel.count().reset_index().rename(columns={'channel':'ip_app_count'})
df = df.merge(n_chans,on=['ip','app'],how='left')
x_chans = test.groupby(['ip','app']).channel.count().reset_index().rename(columns={'channel':'ip_app_count'})
test = test.merge(x_chans,on=['ip','app'],how='left')

In [216]:
test.head(10)

Unnamed: 0,click_id,ip,app,device,os,channel,click_time,ip_os_count,ip_app_count
0,2890381,0,0,0,0,101,2017-11-10 04:51:27,2.0,2
1,3036280,0,0,0,0,101,2017-11-10 04:54:10,2.0,2
2,3055961,1,10,1,7,113,2017-11-10 04:54:32,1.0,1
3,11125123,2,15,1,13,245,2017-11-10 10:37:15,5.0,1
4,12490244,2,6,1,13,459,2017-11-10 13:03:15,5.0,1
5,12491143,2,3,1,13,371,2017-11-10 13:03:16,5.0,1
6,12491273,2,1,1,13,371,2017-11-10 13:03:16,5.0,1
7,12491492,2,12,1,13,265,2017-11-10 13:03:17,5.0,1
8,661408,3,3,1,13,452,2017-11-10 04:11:40,15.0,11
9,1439778,3,3,1,13,137,2017-11-10 04:25:21,15.0,11


In [217]:
df['clicked'] = np.ones(df.shape[0],dtype= np.float64)
df['app_exposure'] = df.groupby(['ip','app',]).clicked.cumsum()
df['channel_exposure'] = df.groupby(['ip','channel',]).clicked.cumsum()
test['clicked'] = np.ones(test.shape[0],dtype= np.float64)
test['app_exposure'] = test.groupby(['ip','app',]).clicked.cumsum()
test['channel_exposure'] = test.groupby(['ip','channel',]).clicked.cumsum()

In [218]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,channel_success,app_success,ip_os_count,ip_app_count,clicked,app_exposure,channel_exposure
0,71606227,6,25,1,13,259,2017-11-08 00:45:31,NaT,0,0.001854,0.0,1.0,1,1.0,1.0,1.0
1,75087751,6,2,1,20,236,2017-11-08 01:51:28,NaT,0,0.00131,0.0,1.0,3,1.0,1.0,1.0
2,76715966,6,12,1,23,245,2017-11-08 02:21:52,NaT,0,0.000453,0.0,4.0,6,1.0,1.0,1.0
3,79598422,6,12,1,23,245,2017-11-08 03:08:52,NaT,0,0.000453,0.0,4.0,6,1.0,2.0,2.0
4,79923142,6,15,1,23,245,2017-11-08 03:15:14,NaT,0,0.000453,0.0,4.0,1,1.0,1.0,3.0
5,79933067,6,12,1,23,245,2017-11-08 03:15:26,NaT,0,0.000453,0.0,4.0,6,1.0,3.0,4.0
6,80092803,6,12,1,9,245,2017-11-08 03:18:31,NaT,0,0.000453,0.0,1.0,6,1.0,4.0,5.0
7,93752832,6,12,1,19,409,2017-11-08 07:34:32,NaT,0,0.001619,0.0,4.0,6,1.0,5.0,1.0
8,117604339,6,3,1,19,452,2017-11-08 14:41:15,NaT,0,0.00136,0.0,4.0,6,1.0,1.0,1.0
9,149039243,6,21,1,19,128,2017-11-09 05:04:36,NaT,0,0.0013,0.428571,4.0,1,1.0,1.0,1.0


In [219]:

df['daily_usage'] = df.groupby(['ip',df.click_time.dt.day]).clicked.cumsum()

In [220]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,channel_success,app_success,ip_os_count,ip_app_count,clicked,app_exposure,channel_exposure,daily_usage
0,71606227,6,25,1,13,259,2017-11-08 00:45:31,NaT,0,0.001854,0.0,1.0,1,1.0,1.0,1.0,1.0
1,75087751,6,2,1,20,236,2017-11-08 01:51:28,NaT,0,0.00131,0.0,1.0,3,1.0,1.0,1.0,2.0
2,76715966,6,12,1,23,245,2017-11-08 02:21:52,NaT,0,0.000453,0.0,4.0,6,1.0,1.0,1.0,3.0
3,79598422,6,12,1,23,245,2017-11-08 03:08:52,NaT,0,0.000453,0.0,4.0,6,1.0,2.0,2.0,4.0
4,79923142,6,15,1,23,245,2017-11-08 03:15:14,NaT,0,0.000453,0.0,4.0,1,1.0,1.0,3.0,5.0
5,79933067,6,12,1,23,245,2017-11-08 03:15:26,NaT,0,0.000453,0.0,4.0,6,1.0,3.0,4.0,6.0
6,80092803,6,12,1,9,245,2017-11-08 03:18:31,NaT,0,0.000453,0.0,1.0,6,1.0,4.0,5.0,7.0
7,93752832,6,12,1,19,409,2017-11-08 07:34:32,NaT,0,0.001619,0.0,4.0,6,1.0,5.0,1.0,8.0
8,117604339,6,3,1,19,452,2017-11-08 14:41:15,NaT,0,0.00136,0.0,4.0,6,1.0,1.0,1.0,9.0
9,149039243,6,21,1,19,128,2017-11-09 05:04:36,NaT,0,0.0013,0.428571,4.0,1,1.0,1.0,1.0,1.0


In [221]:
df['hour'] = df.click_time.dt.hour
df['hour_cumative_clicks'] = df.groupby(['ip',df.click_time.dt.hour]).clicked.cumsum()

In [222]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,channel_success,app_success,ip_os_count,ip_app_count,clicked,app_exposure,channel_exposure,daily_usage,hour,hour_cumative_clicks
0,71606227,6,25,1,13,259,2017-11-08 00:45:31,NaT,0,0.001854,0.0,1.0,1,1.0,1.0,1.0,1.0,0,1.0
1,75087751,6,2,1,20,236,2017-11-08 01:51:28,NaT,0,0.00131,0.0,1.0,3,1.0,1.0,1.0,2.0,1,1.0
2,76715966,6,12,1,23,245,2017-11-08 02:21:52,NaT,0,0.000453,0.0,4.0,6,1.0,1.0,1.0,3.0,2,1.0
3,79598422,6,12,1,23,245,2017-11-08 03:08:52,NaT,0,0.000453,0.0,4.0,6,1.0,2.0,2.0,4.0,3,1.0
4,79923142,6,15,1,23,245,2017-11-08 03:15:14,NaT,0,0.000453,0.0,4.0,1,1.0,1.0,3.0,5.0,3,2.0
5,79933067,6,12,1,23,245,2017-11-08 03:15:26,NaT,0,0.000453,0.0,4.0,6,1.0,3.0,4.0,6.0,3,3.0
6,80092803,6,12,1,9,245,2017-11-08 03:18:31,NaT,0,0.000453,0.0,1.0,6,1.0,4.0,5.0,7.0,3,4.0
7,93752832,6,12,1,19,409,2017-11-08 07:34:32,NaT,0,0.001619,0.0,4.0,6,1.0,5.0,1.0,8.0,7,1.0
8,117604339,6,3,1,19,452,2017-11-08 14:41:15,NaT,0,0.00136,0.0,4.0,6,1.0,1.0,1.0,9.0,14,1.0
9,149039243,6,21,1,19,128,2017-11-09 05:04:36,NaT,0,0.0013,0.428571,4.0,1,1.0,1.0,1.0,1.0,5,1.0


In [223]:
gc.collect()

270

In [224]:
test['daily_usage'] = test.groupby(['ip', test.click_time.dt.day]).clicked.cumsum()
test['hour'] = test.click_time.dt.hour
test['hour_cumative_clicks'] = test.groupby(['ip', test.click_time.dt.hour]).clicked.cumsum()

In [225]:
gc.collect()

55

In [226]:

from sklearn.model_selection import train_test_split
X = df[['app','device','os','channel','app_exposure','daily_usage','hour','hour_cumative_clicks','ip_os_count']]
y = df.is_attributed
X_test = test[['app','device','os','channel','app_exposure','daily_usage','hour','hour_cumative_clicks','ip_os_count']]


In [227]:
gc.collect()

0

In [228]:

from catboost import CatBoostClassifier
categorical_features_indices = np.where(X.dtypes != np.float)[0]
categorical_features_indices = np.where(X_test.dtypes != np.float)[0]
cat = CatBoostClassifier()

model = cat.fit(X, y,cat_features=categorical_features_indices,plot=False,verbose=True)

0:	learn: 0.5936537	total: 3.12s	remaining: 51m 56s
1:	learn: 0.5078537	total: 6.03s	remaining: 50m 6s
2:	learn: 0.4337026	total: 9.44s	remaining: 52m 18s
3:	learn: 0.3707005	total: 12.7s	remaining: 52m 34s
4:	learn: 0.3173991	total: 15s	remaining: 49m 47s
5:	learn: 0.2724359	total: 18s	remaining: 49m 49s
6:	learn: 0.2345700	total: 21.1s	remaining: 49m 50s
7:	learn: 0.2027005	total: 23.3s	remaining: 48m 11s
8:	learn: 0.1758700	total: 25.6s	remaining: 47m 2s
9:	learn: 0.1532583	total: 28.7s	remaining: 47m 21s
10:	learn: 0.1341709	total: 31s	remaining: 46m 26s
11:	learn: 0.1180247	total: 33.3s	remaining: 45m 40s
12:	learn: 0.1043332	total: 36.2s	remaining: 45m 48s
13:	learn: 0.0926916	total: 39.1s	remaining: 45m 53s
14:	learn: 0.0827645	total: 42.1s	remaining: 46m 2s
15:	learn: 0.0730997	total: 44.4s	remaining: 45m 32s
16:	learn: 0.0649297	total: 47.1s	remaining: 45m 25s
17:	learn: 0.0575609	total: 50.7s	remaining: 46m 4s
18:	learn: 0.0511883	total: 53.5s	remaining: 46m 2s
19:	learn: 0.0

In [230]:
y_pred_prob = model.predict_proba(X_test)

In [231]:
gc.collect()

output = pd.DataFrame(test['click_id'])
output['is_attributed'] = y_pred_prob[:,1]
output = output.set_index('click_id')

output.to_csv("submission_stackF.csv")