In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

%matplotlib inline 

In [2]:
! ls ./data

sample_submission.csv  test.csv  train_sample.csv


In [3]:
data = pd.read_csv('./sub_1_rf.csv', low_memory=False, parse_dates=True)
#test = pd.read_csv('./data/test.csv', low_memory=False, parse_dates=True)

In [4]:
data.shape

(18790469, 2)

In [5]:
data.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,87540,12,1,13,497,2017-11-07 09:30:38,,0
1,105560,25,1,17,259,2017-11-07 13:40:27,,0
2,101424,12,1,19,212,2017-11-07 18:05:24,,0
3,94584,13,1,13,477,2017-11-07 04:58:08,,0
4,68413,12,1,1,178,2017-11-09 09:00:09,,0


In [6]:
test.head()

Unnamed: 0,click_id,ip,app,device,os,channel,click_time
0,0,5744,9,1,3,107,2017-11-10 04:00:00
1,1,119901,9,1,3,466,2017-11-10 04:00:00
2,2,72287,21,1,19,128,2017-11-10 04:00:00
3,3,78477,15,1,13,111,2017-11-10 04:00:00
4,4,123080,12,1,13,328,2017-11-10 04:00:00


#### Simple EDA

In [7]:
# is the data balanced?
pos_ratio = np.sum(data['is_attributed']) * 1.0 / data.shape[0]
pos_ratio

0.00227

It might seem data is highly unbalanced since there are only 0.23 % of samples appeared to download the APP. However, low success rate is highly possible in click rate and fraud detection case in the real world.

In [8]:
def null_ratio(df): return round(np.sum(df.isna() * 1.0 / df.shape[0]), 3)

null_ratio(data)

ip                 0.000
app                0.000
device             0.000
os                 0.000
channel            0.000
click_time         0.000
attributed_time    0.998
is_attributed      0.000
dtype: float64

It seems attributed_time is the only one contains null.

In [9]:
data.columns

Index(['ip', 'app', 'device', 'os', 'channel', 'click_time', 'attributed_time',
       'is_attributed'],
      dtype='object')

In [10]:
def unique_cat(df): return [(x, df[x].nunique(dropna=False)) for x in df.columns]

unique_cat(data)

[('ip', 34857),
 ('app', 161),
 ('device', 100),
 ('os', 130),
 ('channel', 161),
 ('click_time', 80350),
 ('attributed_time', 228),
 ('is_attributed', 2)]

It is natural to drop ip since it states that every three click is from the same ip. And a different ip address will be assigned to a device once the device is logged in.

### Baseline Model - Logistic Regression

In [11]:
X = data[['app','device', 'os', 'channel']]
y = data['is_attributed']
test_X = test[['app','device', 'os', 'channel']]

In [14]:
lr = LogisticRegression() # define logstic regression 
lr.fit(X,y)               # train lr

y_pred = lr.predict_proba(test_X)[:,1] # get soft predicitons

In [15]:
sample = pd.read_csv('./data/sample_submission.csv')  # load sample submission 
sample['is_attributed'] = y_pred                      # insert SOFT prediction as target variable
sample.to_csv('sub_0.csv', index=False)               # save prediction to csv to submit 

After submitting to kaggle, the area under the ROC curve is 0.6084. We can improve this using feature engineering and other techniques. But first, lets see if rf using the same train data will lead to a better score.

### Baseline Model - Random Forest

In [16]:
rf = RandomForestClassifier(n_estimators = 100, min_samples_leaf=50) # define Random Forest with 100 trees and 50 or more samples on each leaf node 
rf.fit(X,y)                                                          # train rf 

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=50, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [17]:
y_pred_rf = rf.predict_proba(test_X)[:,1]

In [18]:
sample['is_attributed'] = y_pred_rf
sample.to_csv('sub_1_rf.csv', index=False)