# RANDOM FOREST by gaurav

### Load libraries and data

In [1]:
import pandas as pd
import numpy as np
# import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
# check missing values per column
train.isnull().sum(axis=0)/train.shape[0]

ID             0.000000
datetime       0.000000
siteid         0.099896
offerid        0.000000
category       0.000000
merchant       0.000000
countrycode    0.000000
browserid      0.050118
devid          0.149969
click          0.000000
dtype: float64

In [4]:
train.click.value_counts()/train.shape[0]

0    0.963979
1    0.036021
Name: click, dtype: float64

In [5]:
train[train.click==1].shape

(437214, 10)

### Clean Data and Create Features

In [6]:
train['siteid'].fillna(-999, inplace=True)
test['siteid'].fillna(-999, inplace=True)

train['browserid'].fillna("None", inplace=True)
test['browserid'].fillna("None", inplace=True)

train['devid'].fillna("None", inplace=True)
test['devid'].fillna("None", inplace=True)

In [7]:
# set datatime
train['datetime'] = pd.to_datetime(train['datetime'])
test['datetime'] = pd.to_datetime(test['datetime'])

In [8]:
# create datetime variable
train['tweekday'] = train['datetime'].dt.weekday
train['thour'] = train['datetime'].dt.hour
train['tminute'] = train['datetime'].dt.minute

test['tweekday'] = test['datetime'].dt.weekday
test['thour'] = test['datetime'].dt.hour
test['tminute'] = test['datetime'].dt.minute

In [9]:
cols = ['siteid','offerid','category','merchant']

for x in cols:
    train[x] = train[x].astype('object')
    test[x] = test[x].astype('object')

In [10]:
cat_cols = cols + ['countrycode','browserid','devid']

In [11]:
for col in cat_cols:
    lbl = LabelEncoder()
    lbl.fit(list(train[col].values) + list(test[col].values))
    train[col] = lbl.transform(list(train[col].values))
    test[col] = lbl.transform(list(test[col].values))

In [12]:
train.head()

Unnamed: 0,ID,datetime,siteid,offerid,category,merchant,countrycode,browserid,devid,click,tweekday,thour,tminute
0,IDsrk7SoW,2017-01-14 09:42:09,128865,784773,48,127,4,2,2,0,5,9,42
1,IDmMSxHur,2017-01-18 17:50:53,142053,157563,59,65,1,8,0,0,2,17,50
2,IDVLNN0Ut,2017-01-11 12:46:49,2618,458279,69,15,0,1,2,0,2,12,46
3,ID32T6wwQ,2017-01-17 10:18:43,243406,345067,117,507,2,2,1,0,1,10,18
4,IDqUShzMg,2017-01-14 16:02:33,154278,417948,36,276,3,8,0,0,5,16,2


In [23]:
df_ones = train[train.click==1]
df_zero = train[train.click==0]

In [24]:
rows = np.random.choice(df_zero.index.values, 430000)
print(df_ones.shape[0])
new_df = df_ones.append(df_zero.loc[rows])
print(new_df.shape[0])
rows = np.random.choice(new_df.index.values, new_df.shape[0])
new_df = new_df.loc[rows]

437214
867214


### Model Training

In [13]:
cols_to_use = list(set(train.columns) - set(['ID','datetime','click']))

In [14]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=20, random_state=42)
a = 0
for i, j in skf.split(train[cols_to_use], train['click']):
    a = j
len(a)

606889

In [30]:
X_train, X_test, y_train, y_test = train_test_split(train.loc[a][cols_to_use], train.loc[a]['click'], test_size = 0.5, random_state=42)

In [25]:
# handling imbalance for once
X_train, X_test, y_train, y_test = train_test_split(new_df[cols_to_use], new_df['click'], test_size = 0.5, random_state=42)

In [31]:
y_test.value_counts()

0    292472
1     10973
Name: click, dtype: int64

# For trial

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import time
t1 = time.time()
dict3 = {}
dict4 = {}
for i in [20, 40, 50, 60, 70, 80, 90]:
    
    print("Model with ", i, " max_features ")
    model_mod = RandomForestClassifier(n_estimators=470, min_samples_split=i, verbose=1, oob_score=True, n_jobs=-1, random_state=42)#lets use the fucing cores
    start = time.time()
    model_mod.fit(X_train[cols_to_use], y_train)
    print("time to fit: ", time.time()-start)
    start = time.time()
    pred = model_mod.predict(X_test)
    print("time to predict: ", time.time()-start ,"\n")
    print("training score: ", roc_auc_score(y_train, model_mod.predict(X_train)))
    print("crossval score: ", roc_auc_score(y_test, pred))
    print("oob score: ", model_mod.oob_score_)
    dict3[i]=roc_auc_score(y_test, pred)
    dict4[i]=model_mod.oob_score_
    print("Done :: ", i)
print("total time ", time.time()-t1)

Model with  20  max_features 


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   42.2s
[Parallel(n_jobs=-1)]: Done 470 out of 470 | elapsed:   46.0s finished


time to fit:  61.34615731239319


[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.9s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    2.4s
[Parallel(n_jobs=12)]: Done 470 out of 470 | elapsed:    2.6s finished


time to predict:  3.2589190006256104 



[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    1.0s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    2.4s
[Parallel(n_jobs=12)]: Done 470 out of 470 | elapsed:    2.6s finished


training score:  0.842283864365
crossval score:  0.810741862294
oob score:  0.978437537074
Done ::  20
Model with  40  max_features 


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   40.9s
[Parallel(n_jobs=-1)]: Done 470 out of 470 | elapsed:   44.7s finished


time to fit:  59.90058708190918


[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.9s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    2.4s
[Parallel(n_jobs=12)]: Done 470 out of 470 | elapsed:    2.6s finished


time to predict:  3.2427890300750732 



[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    1.0s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    2.4s
[Parallel(n_jobs=12)]: Done 470 out of 470 | elapsed:    2.6s finished


training score:  0.829496552369
crossval score:  0.813617607516
oob score:  0.978401286564
Done ::  40
Model with  50  max_features 


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   41.8s
[Parallel(n_jobs=-1)]: Done 470 out of 470 | elapsed:   45.6s finished


time to fit:  60.84084892272949


[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    1.0s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    2.4s
[Parallel(n_jobs=12)]: Done 470 out of 470 | elapsed:    2.7s finished


time to predict:  3.2547965049743652 



[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    1.0s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    2.5s
[Parallel(n_jobs=12)]: Done 470 out of 470 | elapsed:    2.7s finished


training score:  0.826901148645
crossval score:  0.814501582275
oob score:  0.978397991063
Done ::  50
Model with  60  max_features 


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   18.5s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   44.3s
[Parallel(n_jobs=-1)]: Done 470 out of 470 | elapsed:   48.1s finished


time to fit:  63.193058013916016


[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.9s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    2.3s
[Parallel(n_jobs=12)]: Done 470 out of 470 | elapsed:    2.5s finished


time to predict:  3.1826536655426025 



[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.9s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    2.3s
[Parallel(n_jobs=12)]: Done 470 out of 470 | elapsed:    2.5s finished


training score:  0.82566539075
crossval score:  0.814110289982
oob score:  0.978427650571
Done ::  60
Model with  70  max_features 


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   17.9s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   42.1s
[Parallel(n_jobs=-1)]: Done 470 out of 470 | elapsed:   46.1s finished


time to fit:  61.55319142341614


[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.9s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    2.3s
[Parallel(n_jobs=12)]: Done 470 out of 470 | elapsed:    2.5s finished


time to predict:  3.1748204231262207 



[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.9s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    2.3s
[Parallel(n_jobs=12)]: Done 470 out of 470 | elapsed:    2.5s finished


training score:  0.824167965919
crossval score:  0.813916057464
oob score:  0.978404582065
Done ::  70
Model with  80  max_features 


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   17.1s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   41.1s
[Parallel(n_jobs=-1)]: Done 470 out of 470 | elapsed:   45.0s finished


time to fit:  59.93282699584961


[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.9s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    2.2s
[Parallel(n_jobs=12)]: Done 470 out of 470 | elapsed:    2.4s finished


time to predict:  3.0674798488616943 



[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.9s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    2.2s
[Parallel(n_jobs=12)]: Done 470 out of 470 | elapsed:    2.4s finished


training score:  0.823311800698
crossval score:  0.814450887189
oob score:  0.978332081043
Done ::  80
Model with  90  max_features 


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   41.2s
[Parallel(n_jobs=-1)]: Done 470 out of 470 | elapsed:   45.0s finished


time to fit:  59.88937854766846


[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.9s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    2.2s
[Parallel(n_jobs=12)]: Done 470 out of 470 | elapsed:    2.4s finished


time to predict:  3.0707204341888428 



[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.9s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    2.2s
[Parallel(n_jobs=12)]: Done 470 out of 470 | elapsed:    2.4s finished


training score:  0.822631013989
crossval score:  0.814421824578
oob score:  0.978328785542
Done ::  90
total time  472.07791090011597


400, njob=-1
training score:  1.0
crossval score:  0.799803514688
oob score:  0.978233216013

0    294387
1      9058
Name: 0, dtype: int64
0    292421
1     11024
Name: click, dtype: int64

600, njob=-1
worse performance

470, 510 looks good to me

d1,2 : nestimators
d 5,6 :
d 3, 4 ; min sample split

# MODEL 1
    oob score:  0.922385828199
    public score : 0.67484
    dont change this one now, add more cells
    imbalanced data treated : yes.
    n_estimators=500, min_samples_leaf=20, min_samples_split=50,  verbose=1, oob_score=True, n_jobs=-1, random_state=42

In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import time
t1 = time.time()
dict5 = {}
dict6 = {}
for i in [20]:#, 4, 5, 10, 20, 50, 60, 70, 80, 100, 300, 500]:
    
    print("Model with ", i, " n estimators")
    model_mod = RandomForestClassifier(n_estimators=470, min_samples_split=50,  verbose=1, oob_score=True, n_jobs=-1, random_state=42)#lets use the fucing cores
    start = time.time()
    model_mod.fit(new_df[cols_to_use], new_df['click'])
    print("time to fit: ", time.time()-start)
    start = time.time()
#     pred = model_mod.predict(X_test)
    print("time to predict: ", time.time()-start ,"\n")
    print("training score: ", roc_auc_score(y_train, model_mod.predict(X_train)))
#     print("crossval score: ", roc_auc_score(y_test, pred))
    print("oob score: ", model_mod.oob_score_)
#     dict3[i]=roc_auc_score(y_test, pred)
#     dict4[i]=model_mod.oob_score_
    print("Done :: ", i)
print("total time ", time.time()-t1)

Model with  20  n estimators


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   17.6s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 470 out of 470 | elapsed:  4.0min finished


time to fit:  299.62747049331665
time to predict:  0.0 



[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.2s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    1.7s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    4.0s
[Parallel(n_jobs=12)]: Done 470 out of 470 | elapsed:    4.4s finished


training score:  0.918312706297
oob score:  0.929364126495
Done ::  20
total time  304.6990611553192


In [40]:
import operator
max(dict2.items(), key=operator.itemgetter(1))[0]

500

In [39]:
max(dict1.items(), key=operator.itemgetter(1))[0]

0.79949729157596672

In [42]:
dict1[500]

0.79893991850948654

In [41]:
dict2[500]

0.97786411990350774

In [47]:
max(dict3.items(), key=operator.itemgetter(1))[0]

80

In [48]:
max(dict4.items(), key=operator.itemgetter(1))[0]

4

In [None]:
max(dict5.items(), key=operator.itemgetter(1))[0]

In [None]:
max(dict6.items(), key=operator.itemgetter(1))[0]

In [18]:
pred = pd.DataFrame(pred)
print(pred[0].value_counts())
y_test = pd.Series(y_test)
print(y_test.value_counts())

0    294463
1      8982
Name: 0, dtype: int64
0    292444
1     11001
Name: click, dtype: int64


In [23]:
model_mod.random_state

42

In [26]:
model_mod.estimators_[0].random_state

1608637542

In [19]:
a = test.columns

In [38]:
pred = model_mod.predict_proba(test[cols_to_use])

[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    3.8s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:   19.4s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:   45.9s
[Parallel(n_jobs=12)]: Done 470 out of 470 | elapsed:   49.9s finished


In [39]:
pred = pd.DataFrame(pred)
pred.shape

(3706907, 2)

In [22]:
model.feature_importances_

NameError: name 'model' is not defined

In [40]:
sub = pd.DataFrame({'ID':test['ID'], 'click':pred[1]})
sub.to_csv('smallData_rf_666.csv', index=False)

In [26]:
train.loc[a].to_csv("small_train.csv")