# Decision Tree by gaurav

### Load libraries and data

In [1]:
import pandas as pd
import numpy as np
# import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
# check missing values per column
train.isnull().sum(axis=0)/train.shape[0]

ID             0.000000
datetime       0.000000
siteid         0.099896
offerid        0.000000
category       0.000000
merchant       0.000000
countrycode    0.000000
browserid      0.050118
devid          0.149969
click          0.000000
dtype: float64

In [4]:
train.click.value_counts()/train.shape[0]

0    0.963979
1    0.036021
Name: click, dtype: float64

In [27]:
df_ones = train[train.click==1]
df_zero = train[train.click==0]

In [110]:
rows = np.random.choice(df_zero.index.values, 430000)
print(df_ones.shape[0])
new_df = df_ones.append(df_zero.loc[rows])
print(new_df.shape[0])
rows = np.random.choice(new_df.index.values, new_df.shape[0])
new_df = new_df.loc[rows]

437214
867214


### Clean Data and Create Features

In [5]:
train['siteid'].fillna(-999, inplace=True)
test['siteid'].fillna(-999, inplace=True)

train['browserid'].fillna("None", inplace=True)
test['browserid'].fillna("None", inplace=True)

train['devid'].fillna("None", inplace=True)
test['devid'].fillna("None", inplace=True)

In [7]:
# set datatime
train['datetime'] = pd.to_datetime(train['datetime'])
test['datetime'] = pd.to_datetime(test['datetime'])

In [8]:
# create datetime variable
train['tweekday'] = train['datetime'].dt.weekday
train['thour'] = train['datetime'].dt.hour
train['tminute'] = train['datetime'].dt.minute

test['tweekday'] = test['datetime'].dt.weekday
test['thour'] = test['datetime'].dt.hour
test['tminute'] = test['datetime'].dt.minute

In [9]:
cols = ['siteid','offerid','category','merchant']

for x in cols:
    train[x] = train[x].astype('object')
    test[x] = test[x].astype('object')

In [10]:
cat_cols = cols + ['countrycode','browserid','devid']

In [11]:
for col in cat_cols:
    lbl = LabelEncoder()
    lbl.fit(list(train[col].values) + list(test[col].values))
    train[col] = lbl.transform(list(train[col].values))
    test[col] = lbl.transform(list(test[col].values))

In [12]:
train.head()

Unnamed: 0,ID,datetime,siteid,offerid,category,merchant,countrycode,browserid,devid,click,tweekday,thour,tminute
0,IDsrk7SoW,2017-01-14 09:42:09,128865,784773,48,127,4,2,2,0,5,9,42
1,IDmMSxHur,2017-01-18 17:50:53,142053,157563,59,65,1,8,0,0,2,17,50
2,IDVLNN0Ut,2017-01-11 12:46:49,2618,458279,69,15,0,1,2,0,2,12,46
3,ID32T6wwQ,2017-01-17 10:18:43,243406,345067,117,507,2,2,1,0,1,10,18
4,IDqUShzMg,2017-01-14 16:02:33,154278,417948,36,276,3,8,0,0,5,16,2


### Model Training

In [13]:
cols_to_use = list(set(train.columns) - set(['ID','datetime','click']))

In [14]:
X_train, X_test, y_train, y_test = train_test_split(train[cols_to_use], train['click'], test_size = 0.5)
# print(y_train)

In [36]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
import time

model = DecisionTreeClassifier(class_weight={0:0.02,1:0.98})
start = time.time()
model.fit(X_train, y_train)
print("time to fit: ", time.time()-start)
start = time.time()
pred = model.predict(X_test)
print("time to predict: ", time.time()-start)
print("training score: ", roc_auc_score(y_train, model.predict(X_train)))
print("crossval score: ", roc_auc_score(y_test, pred))

time to fit:  178.91224336624146
time to predict:  1.8202931880950928
training score:  0.999999914537
crossval score:  0.766036743511


In [38]:
from sklearn.metrics import accuracy_score
print("acc. training score: ", accuracy_score(y_train, model.predict(X_train)))
print("acc. crossval score: ", accuracy_score(y_test, pred))

acc. training score:  0.999999835226
acc. crossval score:  0.969501087923


In [39]:
pred = pd.DataFrame(pred)
print(pred[0].value_counts())
y_test = pd.Series(y_test)
print(y_test.value_counts())

0    5863348
1     205557
Name: 0, dtype: int64
0    5850119
1     218786
Name: click, dtype: int64


In [41]:
pred = pd.DataFrame(model.predict_proba(test[cols_to_use]))

In [43]:
sub = pd.DataFrame({'ID':test['ID'], 'click':pred[1]})
sub.to_csv('dt_2.csv', index=False)