In [1]:
%matplotlib inline
import numpy as np
import pandas as pd

In [2]:
# Explicit unsigned-integer dtypes per CSV column; passed to pd.read_csv
# for the train, test and sample-submission files.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
    click_id='uint32',
)

In [3]:
# Columns to load from train.csv: five id-like features, the click
# timestamp, and the target.
train_usecols = [
    'ip',
    'app',
    'device',
    'os',
    'channel',
    'click_time',
    'is_attributed',
]

In [4]:
# Load only the selected columns, with the explicit integer dtypes.
train = pd.read_csv(
    '../data/train.csv',
    usecols=train_usecols,
    dtype=dtypes,
)

In [5]:
# Summarise the column dtypes and memory footprint of the loaded frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 184903890 entries, 0 to 184903889
Data columns (total 7 columns):
ip               uint32
app              uint16
device           uint16
os               uint16
channel          uint16
click_time       object
is_attributed    uint8
dtypes: object(1), uint16(4), uint32(1), uint8(1)
memory usage: 3.6+ GB


In [6]:
# Peek at the first rows to sanity-check the parsed columns.
train.head()

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed
0,83230,3,1,13,379,2017-11-06 14:32:21,0
1,17357,3,1,19,379,2017-11-06 14:33:34,0
2,35810,3,1,13,379,2017-11-06 14:34:12,0
3,45745,14,1,13,478,2017-11-06 14:34:52,0
4,161007,3,1,13,379,2017-11-06 14:35:08,0


In [7]:
# Parse the click timestamps into datetime64 values.
# Plain column assignment replaces the column, so its dtype really becomes
# datetime64. On pandas >= 2.0, `train.loc[:, 'click_time'] = ...` writes
# into the existing object-dtype column in place, which leaves the column
# as `object` and breaks the later comparisons against the split_time string.
train['click_time'] = pd.to_datetime(train['click_time'], format='%Y-%m-%d %H:%M:%S')

In [8]:
# Earliest click in the training data.
train['click_time'].min()

Timestamp('2017-11-06 14:32:21')

In [9]:
# Latest click in the training data.
train['click_time'].max()

Timestamp('2017-11-09 16:00:00')

# Random forest

In [3]:
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [5]:
# Positive-class weights to sweep (10, 100, 1000, 10000); each one is used as
# class_weight={0: 1, 1: weight} in the training loop.
weights = [10 ** exponent for exponent in range(1, 5)]

In [12]:
# Cut-off compared against train.click_time in the training loop: rows before
# it are used for fitting, rows at or after it for scoring.
split_time = '2017-11-09 00:00:00'

In [13]:
# NOTE(review): `end` is not referenced by any other visible cell; presumably a
# leftover row limit from quick experiments. Confirm before removing.
end = 10000

In [None]:
# Sweep the positive-class weight using a time-based hold-out: rows before
# split_time are used for fitting, rows at or after it for scoring.
#
# The row masks and the four slices do not depend on `weight`, so they are
# built once here instead of re-filtering the full frame four times per
# iteration.
feature_cols = ['ip', 'app', 'device', 'os', 'channel']  # the columns .iloc[:, :-2] selected
target_col = 'is_attributed'                              # the column .iloc[:, -1] selected

fit_rows = train['click_time'] < split_time
holdout_rows = train['click_time'] >= split_time

X_fit = train.loc[fit_rows, feature_cols]
y_fit = train.loc[fit_rows, target_col]
X_holdout = train.loc[holdout_rows, feature_cols]
y_holdout = train.loc[holdout_rows, target_col]

for weight in weights:
    print(weight)
    # NOTE(review): no random_state is passed, so repeated runs can produce
    # different forests and different AUC values.
    rfc = RandomForestClassifier(n_estimators=25, class_weight={0: 1, 1: weight}, n_jobs=4)
    rfc.fit(X_fit, y_fit)

    # Probability of the positive class (second column of predict_proba).
    pred_prob = rfc.predict_proba(X_holdout)[:, 1]
    auc = roc_auc_score(y_holdout, pred_prob)

    # Persist (weight, hold-out AUC, fitted model) to disk. The score is
    # computed before the file is opened so a scoring error cannot leave an
    # empty result file behind.
    with open('../results/random_forest_no_feat_eng_{}.pk'.format(weight), 'wb') as f:
        pickle.dump((weight, auc, rfc), f)

10


In [6]:
# Reload the pickled (weight, auc, model) tuples, one per weight, in the
# order given by `weights`.
# These files are written by this notebook; pickle.load must not be pointed
# at files from an untrusted source.
rfcs = []
for weight in weights:
    result_path = '../results/random_forest_no_feat_eng_{}.pk'.format(weight)
    with open(result_path, 'rb') as f:
        loaded = pickle.load(f)
    rfcs.append(loaded)

In [7]:
# Each entry is the tuple pickled by the training loop:
# (weight, ROC AUC on the hold-out rows, fitted RandomForestClassifier).
rfcs

[(10,
  0.8544131901378994,
  RandomForestClassifier(bootstrap=True, class_weight={0: 1, 1: 10},
              criterion='gini', max_depth=None, max_features='auto',
              max_leaf_nodes=None, min_impurity_split=1e-07,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=4,
              oob_score=False, random_state=None, verbose=0,
              warm_start=False)),
 (100,
  0.85198970230448512,
  RandomForestClassifier(bootstrap=True, class_weight={0: 1, 1: 100},
              criterion='gini', max_depth=None, max_features='auto',
              max_leaf_nodes=None, min_impurity_split=1e-07,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=4,
              oob_score=False, random_state=None, verbose=0,
              warm_start=False)),
 (1000,
  0.72076903807615222,
  RandomForestClassifier(bootstrap=True, class_weight={0: 1, 1: 1000},

In [9]:
# Read the test clicks, reusing the shared column dtype mapping.
test = pd.read_csv(
    '../data/test.csv',
    dtype=dtypes,
)

In [11]:
# Read the sample submission, reusing the shared column dtype mapping.
submit = pd.read_csv(
    '../data/sample_submission.csv',
    dtype=dtypes,
)

In [12]:
# Peek at the first test rows; the column order matters for the positional
# feature slice used when predicting.
test.head()

Unnamed: 0,click_id,ip,app,device,os,channel,click_time
0,0,5744,9,1,3,107,2017-11-10 04:00:00
1,1,119901,9,1,3,466,2017-11-10 04:00:00
2,2,72287,21,1,19,128,2017-11-10 04:00:00
3,3,78477,15,1,13,111,2017-11-10 04:00:00
4,4,123080,12,1,13,328,2017-11-10 04:00:00


In [13]:
# rfcs[0] is the (weight, auc, model) tuple for the first weight; element 2
# is the fitted model. The label slice 'ip'..'channel' (inclusive) selects the
# same five feature columns as positions 1..5.
pred_prob = rfcs[0][2].predict_proba(test.loc[:, 'ip':'channel'])[:, 1]

In [14]:
# Store the positive-class probabilities next to the test rows.
test['prob'] = pred_prob

In [15]:
# Largest predicted probability, as a quick sanity check.
test['prob'].max()

1.0

In [16]:
# Order the test rows by click_id (index labels are kept as they were).
test = test.sort_values(by='click_id')

In [17]:
# Order the submission rows by click_id (index labels are kept as they were).
submit = submit.sort_values(by='click_id')

In [18]:
# Attach each prediction to the submission row with the same click_id.
# Assigning a Series aligns on index labels, and sort_values does not change
# those labels, so `= test.prob` paired rows by their original position in the
# two CSV files no matter how either frame was sorted. Mapping on click_id
# makes the pairing explicit.
# Plain column assignment also replaces the integer placeholder column instead
# of writing floats into it in place.
# Assumes click_id is unique in `test` (Series.map needs a unique lookup
# index); verify against the data.
submit['is_attributed'] = submit['click_id'].map(test.set_index('click_id')['prob'])

In [19]:
# Write the submission frame to disk without the index column.
submit.to_csv('../results/no_eng_weight10_partial.csv', index=False)