In [1]:
%matplotlib inline
import numpy as np
import pandas as pd

In [2]:
dtypes = {
    'ip'            : 'uint32',
    'app'           : 'uint16',
    'device'        : 'uint16',
    'os'            : 'uint16',
    'channel'       : 'uint16',
    'is_attributed' : 'uint8',
    'click_id'      : 'uint32'
}

In [3]:
train_usecols = ['ip', 'app', 'device', 'os', 'channel', 'is_attributed']

In [4]:
train = pd.read_csv('../data/train.csv', dtype=dtypes, usecols=train_usecols)

In [5]:
train.head()

Unnamed: 0,ip,app,device,os,channel,is_attributed
0,83230,3,1,13,379,0
1,17357,3,1,19,379,0
2,35810,3,1,13,379,0
3,45745,14,1,13,478,0
4,161007,3,1,13,379,0


In [6]:
train_sample = pd.read_csv('../data/train_sample.csv', dtype=dtypes, usecols=train_usecols)

In [25]:
test = pd.read_csv('../data/test.csv', dtype=dtypes, usecols=['click_id'] + train_usecols[:-1])

In [8]:
test_sample = pd.read_csv('../data/test_supplement.csv', dtype=dtypes, usecols=train_usecols[:-1])

In [9]:
test = pd.concat([test, test_sample])

In [10]:
test.head()

Unnamed: 0,ip,app,device,os,channel
0,5744,9,1,3,107
1,119901,9,1,3,466
2,72287,21,1,19,128
3,78477,15,1,13,111
4,123080,12,1,13,328


In [11]:
test.isnull().sum()

ip         0
app        0
device     0
os         0
channel    0
dtype: int64

In [4]:
import pickle

In [6]:
with open('../results/random_forest_no_feat_eng_{}.pk'.format(10), 'rb') as f:
    rfcs = pickle.load(f)

In [7]:
rfcs

(10,
 0.8544131901378994,
 RandomForestClassifier(bootstrap=True, class_weight={0: 1, 1: 10},
             criterion='gini', max_depth=None, max_features='auto',
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=4,
             oob_score=False, random_state=None, verbose=0,
             warm_start=False))

In [9]:
pred = rfcs[2].predict(test)

In [10]:
test.loc[:, 'is_attributed_old'] = pred

In [11]:
with open('../results/final_leverage.pk', 'rb') as f:
    rfc = pickle.load(f)

In [13]:
pred = rfc.predict(test.iloc[:, :-1])

In [14]:
test.loc[:, 'is_attributed_new'] = pred

In [17]:
pred_prob = rfc.predict_proba(test.iloc[:, :-2])

In [26]:
test.loc[:, 'prob_new'] = pred_prob[:, 1]

In [21]:
(test.is_attributed_old == test.is_attributed_new).sum() / 18790469

0.99953194356138741

In [20]:
test.shape

(18790469, 8)

In [22]:
submit = pd.read_csv('../data/sample_submission.csv')

In [23]:
submit.sort_values('click_id', inplace=True)

In [27]:
test.sort_values('click_id', inplace=True)

In [28]:
submit.loc[:, 'is_attributed'] = test.prob_new

In [29]:
submit.isnull().sum()

click_id         0
is_attributed    0
dtype: int64

In [30]:
submit.head()

Unnamed: 0,click_id,is_attributed
0,0,0.0
1,1,0.0
2,2,0.0
3,3,0.0
4,4,0.0


In [31]:
submit.is_attributed.max()

1.0

In [32]:
submit.is_attributed.median()

0.0

In [33]:
submit.is_attributed.describe()

count    1.879047e+07
mean     3.004174e-03
std      5.098561e-02
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.000000e+00
Name: is_attributed, dtype: float64

In [34]:
submit.is_attributed.value_counts()

0.00    18687974
1.00       32221
0.04       21994
0.08        7889
0.96        5303
0.12        4388
0.92        3193
0.16        3102
0.88        2495
0.20        2200
0.84        1873
0.24        1665
0.76        1532
0.68        1473
0.64        1439
0.28        1418
0.80        1413
0.72        1290
0.60        1242
0.32        1169
0.56        1122
0.36        1058
0.40         834
0.52         821
0.44         683
0.48         678
Name: is_attributed, dtype: int64

In [35]:
submit.to_csv('../results/no_eng_weight10_partial_final_leverage.csv', index=False)

In [21]:
test.is_attributed.value_counts(normalize=True)

0    0.997094
1    0.002906
Name: is_attributed, dtype: float64

In [18]:
test.head()

Unnamed: 0,ip,app,device,os,channel,is_attributed
0,5744,9,1,3,107,0
1,119901,9,1,3,466,0
2,72287,21,1,19,128,0
3,78477,15,1,13,111,0
4,123080,12,1,13,328,0


In [19]:
train.head()

Unnamed: 0,ip,app,device,os,channel,is_attributed
0,83230,3,1,13,379,0
1,17357,3,1,19,379,0
2,35810,3,1,13,379,0
3,45745,14,1,13,478,0
4,161007,3,1,13,379,0


In [22]:
train = pd.concat([train, test])

In [23]:
%who

dtypes	 f	 np	 pd	 pickle	 pred	 rfc	 test	 test_sample	 
train	 train_sample	 train_usecols	 


In [24]:
del pred
del test
del test_sample
del train_sample

In [25]:
import gc

In [26]:
gc.collect()

2220

In [29]:
rfc = rfc[2]

In [None]:
rfc.fit(train.iloc[:, :-1], train.iloc[:, -1])