In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import KFold

%matplotlib inline 

### Load the data

In [2]:
data = pd.read_csv('./data/train_sample.csv', low_memory=False, parse_dates=['click_time', 'attributed_time'])
test = pd.read_csv('./data/test.csv', low_memory=False, parse_dates=['click_time'])

In [3]:
data['is_train'] = 1
test['is_train'] = 0

In [4]:
dups = data[data[['ip','click_time']].duplicated()]

In [5]:
dups.shape

(23, 9)

In [6]:
data.loc[(data['ip'].isin(dups['ip'])) & (data['click_time'].isin(dups['click_time']))]\
    .sort_values(by=['ip','click_time'])  

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,is_train
64362,871,12,1,13,178,2017-11-08 10:00:05,NaT,0,1
84125,871,12,1,13,178,2017-11-08 10:00:05,NaT,0,1
40703,5314,18,1,37,379,2017-11-09 15:13:53,NaT,0,1
75203,5314,26,2,13,121,2017-11-09 15:13:53,NaT,0,1
23543,9801,9,1,11,134,2017-11-06 22:17:34,NaT,0,1
36094,9801,8,1,11,145,2017-11-06 22:17:34,NaT,0,1
8423,24008,1,2,20,125,2017-11-08 13:01:16,NaT,0,1
22844,24008,24,2,20,178,2017-11-08 13:01:16,NaT,0,1
42077,26995,1,1,49,124,2017-11-08 11:31:59,NaT,0,1
73851,26995,12,2,13,265,2017-11-08 11:31:59,NaT,0,1


In [7]:
data.loc[(data['ip']==871)]

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,is_train
27486,871,15,1,10,430,2017-11-07 00:44:01,NaT,0,1
33669,871,9,1,37,244,2017-11-08 09:57:37,NaT,0,1
40335,871,15,2,15,3,2017-11-07 16:12:12,NaT,0,1
56614,871,18,1,607,107,2017-11-06 21:16:26,NaT,0,1
64362,871,12,1,13,178,2017-11-08 10:00:05,NaT,0,1
74907,871,3,1,16,280,2017-11-07 00:57:22,NaT,0,1
83643,871,11,1,18,319,2017-11-07 08:06:07,NaT,0,1
84125,871,12,1,13,178,2017-11-08 10:00:05,NaT,0,1


In [8]:
data[['ip', 'click_time']].drop_duplicates().shape

(99977, 2)

In [9]:
full_data = pd.concat([data, test], axis=0)

In [10]:
full_data.head()

Unnamed: 0,app,attributed_time,channel,click_id,click_time,device,ip,is_attributed,is_train,os
0,12,NaT,497,,2017-11-07 09:30:38,1,87540,0.0,1,13
1,25,NaT,259,,2017-11-07 13:40:27,1,105560,0.0,1,17
2,12,NaT,212,,2017-11-07 18:05:24,1,101424,0.0,1,19
3,13,NaT,477,,2017-11-07 04:58:08,1,94584,0.0,1,13
4,12,NaT,178,,2017-11-09 09:00:09,1,68413,0.0,1,1


In [11]:
sub_data = pd.read_csv('./data/train_sample.csv', low_memory=False, parse_dates=['click_time'], 
                      nrows = 10000, usecols=['ip', 'click_time'])

In [12]:
sub_data.to_csv('sub_data.csv', index=False)

### Feature Engineering

There are many ways to generate useful features. We introduce following technique in generating features.

#### Extract Useful Time Information

In [13]:
def date_part(data,fld_name,f_mat):
    prefix='dt'
    date_time = data[fld_name]
    fld = pd.to_datetime(date_time, format=f_mat)
    for n in ('hour', 'minute'):
        data[prefix+'_'+n] = getattr(fld.dt,n.lower())
    

In [14]:
date_part(data, 'click_time', f_mat='%d%b%Y:%H:%M:%S.%f')

In [15]:
date_part(test, 'click_time', f_mat='%d%b%Y:%H:%M:%S.%f')

In [16]:
data.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,is_train,dt_hour,dt_minute
0,87540,12,1,13,497,2017-11-07 09:30:38,NaT,0,1,9,30
1,105560,25,1,17,259,2017-11-07 13:40:27,NaT,0,1,13,40
2,101424,12,1,19,212,2017-11-07 18:05:24,NaT,0,1,18,5
3,94584,13,1,13,477,2017-11-07 04:58:08,NaT,0,1,4,58
4,68413,12,1,1,178,2017-11-09 09:00:09,NaT,0,1,9,0


### Mean Encoding

In [17]:
def reg_target_encoding(train, col ,target , splits=5):
    """ Computes regularize mean encoding.
    Inputs:
       train: training dataframe
       
    """
    kf = KFold(n_splits=splits,shuffle=False)
    train[col+'_mean_enc']=np.nan
    for train_index, val_index in kf.split(train[col].values):
        x_train,x_val = train.loc[train_index],train.loc[val_index]
        mean_device_type = x_train.groupby(col)[target].mean()
        x_val[col+'_mean_enc']=x_val[col].map(mean_device_type)
        train.loc[val_index]=x_val
    global_mean = train[col+'_mean_enc'].mean()
    train.fillna(global_mean,inplace=True)
    
def mean_encoding_test(test, train, col, target):
    """ Computes target enconding for test data.
    
    This is similar to how we do validation
    """
    # YOUR CODE HERE
    mean_encoded_value = train.groupby(col)[target].mean()
    
    global_mean = train[target].mean()
    
    test[col+'_mean_enc'] = test[col].map(mean_encoded_value)
    test[col+'_mean_enc'].fillna(global_mean, inplace=True)

In [18]:
def mean_encoding(train, test, cols, target):
    """ Computes target enconding for test data.
    
    This is similar to how we do validation
    """
    global_mean = train[target].mean()
    for col in cols:
        mean_encoded_value = train.groupby(col)[target].mean()
        train[col+'_mean_enc'] = train[col].map(mean_encoded_value)
        test[col+'_mean_enc'] = test[col].map(mean_encoded_value)
        test[col+'_mean_enc'].fillna(global_mean, inplace=True)

In [19]:
cols =['app', 'device', 'os', 'channel', 'dt_hour']
mean_encoding(data, test, cols, 'is_attributed')

In [20]:
data.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,is_train,dt_hour,dt_minute,app_mean_enc,device_mean_enc,os_mean_enc,channel_mean_enc,dt_hour_mean_enc
0,87540,12,1,13,497,2017-11-07 09:30:38,NaT,0,1,9,30,7.6e-05,0.001548,0.001178,0.0,0.002055
1,105560,25,1,17,259,2017-11-07 13:40:27,NaT,0,1,13,40,0.0,0.001548,0.001147,0.0,0.00178
2,101424,12,1,19,212,2017-11-07 18:05:24,NaT,0,1,18,5,7.6e-05,0.001548,0.001718,0.0,0.001691
3,94584,13,1,13,477,2017-11-07 04:58:08,NaT,0,1,4,58,0.0,0.001548,0.001178,0.0,0.001656
4,68413,12,1,1,178,2017-11-09 09:00:09,NaT,0,1,9,0,7.6e-05,0.001548,0.000844,0.0,0.002055


In [21]:
test.head()

Unnamed: 0,click_id,ip,app,device,os,channel,click_time,is_train,dt_hour,dt_minute,app_mean_enc,device_mean_enc,os_mean_enc,channel_mean_enc,dt_hour_mean_enc
0,0,5744,9,1,3,107,2017-11-10 04:00:00,0,4,0,0.00089,0.001548,0.000635,0.00022,0.001656
1,1,119901,9,1,3,466,2017-11-10 04:00:00,0,4,0,0.00089,0.001548,0.000635,0.001349,0.001656
2,2,72287,21,1,19,128,2017-11-10 04:00:00,0,4,0,0.0,0.001548,0.001718,0.0,0.001656
3,3,78477,15,1,13,111,2017-11-10 04:00:00,0,4,0,0.000233,0.001548,0.001178,0.0,0.001656
4,4,123080,12,1,13,328,2017-11-10 04:00:00,0,4,0,7.6e-05,0.001548,0.001178,0.0,0.001656


### Count Click

The idea is to calculate how many times a ip showed up before its given click time. The time difference between a ip of given click time since this ip appeared the vert firt time and this ip appeared right before the givin click time. 


In [23]:
merge_ip = train[['ip','click_time']].merge(train[['ip', 'click_time']], 
                                                  on = 'ip', how = 'left', suffixes=('_l', '_r'))

NameError: name 'train' is not defined

In [24]:
merge_ip.head()

NameError: name 'merge_ip' is not defined

In [25]:
data.shape

(100000, 16)

In [26]:
merge_ip.shape

NameError: name 'merge_ip' is not defined

In [252]:
agg = merge_ip.loc[(merge_ip['click_time_l'] > merge_ip['click_time_r'])]\
              .groupby(['ip','click_time_l'])\
              .agg({'click_time_r':['count','min','max']})\
              .reset_index(col_level=1)
        

In [253]:
agg.columns = agg.columns.droplevel()

In [254]:
agg.head()

Unnamed: 0,ip,click_time_l,count,min,max
0,10,2017-11-07 07:33:04,1,2017-11-07 01:37:19,2017-11-07 01:37:19
1,10,2017-11-08 11:19:15,2,2017-11-07 01:37:19,2017-11-07 07:33:04
2,20,2017-11-08 03:06:02,1,2017-11-08 00:27:27,2017-11-08 00:27:27
3,20,2017-11-08 08:13:41,2,2017-11-08 00:27:27,2017-11-08 03:06:02
4,20,2017-11-09 03:46:51,3,2017-11-08 00:27:27,2017-11-08 08:13:41


In [255]:
agg['time_diff_ip_first'] = (agg['click_time_l'] - agg['min']).astype('timedelta64[s]') / 60.0
agg['time_diff_ip_last'] = (agg['click_time_l'] - agg['max']).astype('timedelta64[s]') / 60.0

In [256]:
agg.rename(columns = {'count':'cnt_ip_click'}, inplace=True)

In [257]:
data = data.merge(agg[['ip', 'click_time_l', 'cnt_ip_click', 'time_diff_ip_first', 'time_diff_ip_last']], 
                    left_on = ['ip', 'click_time'], right_on = ['ip', 'click_time_l'], how = 'left')

In [194]:
data.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,dt_hour,dt_minute,app_mean_enc,device_mean_enc,os_mean_enc,channel_mean_enc,dt_hour_mean_enc,click_time_l,cnt_ip_click,time_diff_ip_first,time_diff_ip_last
0,87540,12,1,13,497,2017-11-07 09:30:38,NaT,0,9,30,7.6e-05,0.001548,0.001178,0.0,0.002055,2017-11-07 09:30:38,3.0,458.05,245.083333
1,105560,25,1,17,259,2017-11-07 13:40:27,NaT,0,13,40,0.0,0.001548,0.001147,0.0,0.00178,2017-11-07 13:40:27,41.0,1298.466667,24.066667
2,101424,12,1,19,212,2017-11-07 18:05:24,NaT,0,18,5,7.6e-05,0.001548,0.001718,0.0,0.001691,NaT,,,
3,94584,13,1,13,477,2017-11-07 04:58:08,NaT,0,4,58,0.0,0.001548,0.001178,0.0,0.001656,2017-11-07 04:58:08,1.0,316.733333,316.733333
4,68413,12,1,1,178,2017-11-09 09:00:09,NaT,0,9,0,7.6e-05,0.001548,0.000844,0.0,0.002055,2017-11-09 09:00:09,3.0,2761.35,550.583333


### possible new features

In [185]:
data.groupby(['ip', 'click_time'])['app'].count().reset_index().sort_values('app',ascending=False)

Unnamed: 0,ip,click_time,app
14953,24008,2017-11-08 13:01:16,2
70637,111025,2017-11-08 15:19:34,2
48384,77048,2017-11-08 00:39:14,2
23860,38866,2017-11-09 01:01:59,2
65447,103463,2017-11-09 14:20:15,2
16719,26995,2017-11-08 11:31:59,2
89218,184859,2017-11-08 01:53:27,2
38583,61667,2017-11-07 05:00:39,2
98478,313289,2017-11-09 12:24:26,2
82531,141432,2017-11-08 17:09:52,2


## Random Forest Model

In [195]:
features = ['app','device', 'os', 'channel', 'dt_hour', 'cnt_ip_click', 'time_diff_ip_first', 'time_diff_ip_last']
label = 'is_attributed'

In [56]:
X = data[features]
y = data[label]
test_X = test[features]

In [57]:
test_X.isnull().sum()

app                 0
device              0
os                  0
channel             0
dt_hour             0
app_mean_enc        0
device_mean_enc     0
os_mean_enc         0
channel_mean_enc    0
dt_hour_mean_enc    0
dtype: int64

In [58]:
test_X.shape[0]

18790469

In [59]:
rf = RandomForestClassifier(n_estimators = 100, min_samples_leaf=50) # define Random Forest with 100 trees and 50 or more samples on each leaf node 
rf.fit(X,y)                                                          # train rf 

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=50, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [60]:
y_pred_rf = rf.predict_proba(test_X)[:,1]

In [62]:
sample = pd.read_csv('./data/sample_submission.csv')  

In [63]:
sample['is_attributed'] = y_pred_rf
sample.to_csv('sub_5_rf.csv', index=False)