In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import datetime
from sklearn.externals import joblib
#from bokeh.io import output_notebook, show
#from bokeh.plotting import figure
#output_notebook() # call only once

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

## Analyze the data

In [3]:
print("In train set, No of rows are {} and columns are {}".format(train.shape[0], train.shape[1]))
print("In test set, No of rows are {} and columns are {}".format(test.shape[0], test.shape[1]))
print("Data types :", train.dtypes)

In train set, No of rows are 12137810 and columns are 10
In test set, No of rows are 3706907 and columns are 9
Data types : ID              object
datetime        object
siteid         float64
offerid          int64
category         int64
merchant         int64
countrycode     object
browserid       object
devid           object
click            int64
dtype: object


In [4]:
# Parse datetime
# Rounding to hour to group time into large intervals
train['datetime'] = pd.to_datetime(train['datetime'], format="%Y-%m-%d %H:%M:%S").dt.round('60min') # check how this freqency works
# Since data is only for 10 days year month and day doesn't matter really 
# And min and sec are already rounded off
train['weekday'], train['hour'] = train['datetime'].dt.weekday, train['datetime'].dt.hour
train = train.drop('datetime', 1)

In [5]:
# Parse datetime
# Rounding to hour to group time into large intervals
test['datetime'] = pd.to_datetime(test['datetime'], format="%Y-%m-%d %H:%M:%S").dt.round('60min') # check how this freqency works
# Since data is only for 10 days year month and day doesn't matter really 
# And min and sec are already rounded off
test['weekday'], test['hour'] = test['datetime'].dt.weekday, test['datetime'].dt.hour
test = test.drop('datetime', 1)

In [6]:
train.head()

Unnamed: 0,ID,siteid,offerid,category,merchant,countrycode,browserid,devid,click,weekday,hour
0,IDsrk7SoW,4709696.0,887235,17714,20301556,e,Firefox,,0,5,10
1,IDmMSxHur,5189467.0,178235,21407,9434818,b,Mozilla Firefox,Desktop,0,2,18
2,IDVLNN0Ut,98480.0,518539,25085,2050923,a,Edge,,0,2,13
3,ID32T6wwQ,8896401.0,390352,40339,72089744,c,Firefox,Mobile,0,1,10
4,IDqUShzMg,5635120.0,472937,12052,39507200,d,Mozilla Firefox,Desktop,0,5,16


In [7]:
mod_siteid = -1 #mode(train, 'siteid')
mod_browserid = 'unknown' #mode(train, 'browserid')
mod_devid = 'unknown'#mode(train, 'devid')

In [8]:
## impute categoricsl values with mode and encode cat date
train.loc[train['siteid'].isnull(), 'siteid'] = mod_siteid
train.loc[train['browserid'].isnull(), 'browserid'] = mod_browserid
train.loc[train['devid'].isnull(), 'devid'] = mod_devid

In [9]:
## impute categoricsl values with mode and encode cat date
test.loc[test['siteid'].isnull(), 'siteid'] = mod_siteid
test.loc[test['browserid'].isnull(), 'browserid'] = mod_browserid
test.loc[test['devid'].isnull(), 'devid'] = mod_devid

### Convert to right datatype

In [10]:
train = train.drop('ID', 1)
#train['siteid'] = train['siteid'].astype('category')
#train['offerid'] = train['offerid'].astype('category')
#train['category'] = train['category'].astype('category')
#train['merchant'] = train['merchant'].astype('category')
#train['countrycode'] = train['countrycode'].astype('category')
#train['browserid'] = train['browserid'].astype('category')
#train['devid'] = train['devid'].astype('category')
#train['click'] = train['click'].astype('category')
#train['weekday'] = train['weekday'].astype('category')

In [11]:
test_id = test['ID']
test = test.drop('ID', 1)
#test['siteid'] = test['siteid'].astype('category')
#test['offerid'] = test['offerid'].astype('category')
#test['category'] = test['category'].astype('category')
#test['merchant'] = test['merchant'].astype('category')
#test['countrycode'] = test['countrycode'].astype('category')
#test['browserid'] = test['browserid'].astype('category')
#test['devid'] = test['devid'].astype('category')
#test['weekday'] = test['weekday'].astype('category')

In [12]:
#print(train.columns[train.isnull().any()])
#print(train.isnull().sum())
#print(test.columns[test.isnull().any()])
#print(test.isnull().sum())

In [13]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [14]:
print("Shape of train set is {} and test set is {}".format(train.shape, test.shape))
test['click'] = -1
train = train.append(test)
del test
print("Shape of all data is {}".format(train.shape))

Shape of train set is (12137810, 10) and test set is (3706907, 9)
Shape of all data is (15844717, 10)


In [15]:
le.fit(train['siteid'])
train['siteid'] = le.transform(train['siteid'])
le.fit(train['offerid'])
train['offerid'] = le.transform(train['offerid'])
le.fit(train['category'])
train['category'] = le.transform(train['category'])
le.fit(train['merchant'])
train['merchant'] = le.transform(train['merchant'])
le.fit(train['countrycode'])
train['countrycode'] = le.transform(train['countrycode'])
le.fit(train['browserid'])
train['browserid'] = le.transform(train['browserid'])
le.fit(train['devid'])
train['devid'] = le.transform(train['devid'])
le.fit(train['weekday'])
train['weekday'] = le.transform(train['weekday'])

In [16]:
print("No of unique siteid :{}".format(len(train['siteid'].unique())))
print("No of unique offerid :{}".format(len(train['offerid'].unique())))
print("No of unique category :{}".format(len(train['category'].unique())))
print("No of unique merchant :{}".format(len(train['merchant'].unique())))
print("No of unique countrycode :{}".format(len(train['countrycode'].unique())))
print("No of unique browserid :{}".format(len(train['browserid'].unique())))
print("No of unique devid :{}".format(len(train['devid'].unique())))
print("No of unique weekday :{}".format(len(train['weekday'].unique())))

No of unique siteid :273596
No of unique offerid :884353
No of unique category :271
No of unique merchant :703
No of unique countrycode :6
No of unique browserid :12
No of unique devid :4
No of unique weekday :7


In [None]:
# bin / group category


In [17]:
# creating dummis for categorical data

#siteid = pd.get_dummies(train['siteid'])
#offerid = pd.get_dummies(train['offerid'])
#merchant = pd.get_dummies(train['merchant'])
#train.drop('merchant', 1)
#category = pd.get_dummies(train['category'])
#train = train.drop('category', 1)
devid = pd.get_dummies(train['devid'])
train = train.drop('devid', 1)
weekday = pd.get_dummies(train['weekday'])
train = train.drop('weekday', 1)
countrycode = pd.get_dummies(train['countrycode'])
train = train.drop('countrycode', 1)
browserid = pd.get_dummies(train['browserid'])
train = train.drop('browserid', 1)

In [18]:
Y = train['click']
data = pd.concat([browserid, countrycode,
                  devid, weekday, train[['hour']]], axis=1)
del train               
del browserid
del countrycode
del devid
del weekday
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2.1,3.1,0.1,1.1,2.2,3.2,4.1,5.1,6.1,hour
0,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,10
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,18
2,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,13
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,10
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,16


In [19]:
X_train = data.iloc[0:12137810]
X_test = data.iloc[12137810:]
del data

In [20]:
print(X_train.shape, X_test.shape)
Y_train, Y_test = Y[0:12137810], Y[12137810:]
del Y
print(Y_train.shape, Y_test.shape)

(12137810, 30) (3706907, 30)
(12137810,) (3706907,)


In [21]:
## Split into train and val set
#from sklearn.model_selection import train_test_split
#X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.20, random_state=42)
t_len = len(Y_train)*0.80
X_train, X_val = X_train.loc[:t_len,:], X_train.loc[t_len:,:]

In [22]:
Y_train, Y_val = Y_train.loc[:t_len], Y_train.loc[t_len:]

In [23]:
#X_train.to_pickle('x_train_02')
#X_test.to_pickle('x_test_02')
#Y_train.to_pickle('Y_train_02')
#Y_test.to_pickle('Y_test_02')
X_train.columns = [i for i in range(X_train.shape[1])]
X_val.columns = [i for i in range(X_val.shape[1])]
X_test.columns = [i for i in range(X_test.shape[1])]

In [24]:
#import xgboost as xgb
#clf = xgb.XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
#       gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=10,
#       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
#       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
#       scale_pos_weight=1, seed=2017, silent=False, subsample=1)

#from sklearn.ensemble import RandomForestClassifier
#clf = RandomForestClassifier(n_jobs=2)
import lightgbm as clf
X_train = clf.Dataset(X_train, Y_train)
X_val = clf.Dataset(X_val, Y_val)
params = {
    
    'num_leaves' : 256,
    'learning_rate':0.03,
    'metric':'auc',
    'objective':'binary',
    'early_stopping_round': 40,
    'max_depth':10,
    'bagging_fraction':0.5,
    'feature_fraction':0.6,
    'bagging_seed':2017,
    'feature_fraction_seed':2017,
    'verbose' : 1   
}

In [25]:
#import pandas as pd
#X_train = pd.read_pickle('x_train')
#X_test = read_pickle('x_test')
#Y_train = pd.read_pickle('y_train')
#y_test = read_pickle('y_test')

In [28]:
clf2 = clf.train(params, X_train,num_boost_round=500,valid_sets=X_val,verbose_eval=20)

  booster = Booster(params=params, train_set=train_set)
  booster.add_valid(valid_set, name_valid_set)


[20]	valid_0's auc: 0.967518
[40]	valid_0's auc: 0.968629
[60]	valid_0's auc: 0.96849
[80]	valid_0's auc: 0.968648
[100]	valid_0's auc: 0.968788
[120]	valid_0's auc: 0.968905
[140]	valid_0's auc: 0.969087
[160]	valid_0's auc: 0.9693
[180]	valid_0's auc: 0.969355
[200]	valid_0's auc: 0.969446
[220]	valid_0's auc: 0.96951
[240]	valid_0's auc: 0.96953
[260]	valid_0's auc: 0.969578
[280]	valid_0's auc: 0.969598
[300]	valid_0's auc: 0.96963
[320]	valid_0's auc: 0.969651
[340]	valid_0's auc: 0.969663
[360]	valid_0's auc: 0.969672
[380]	valid_0's auc: 0.969674
[400]	valid_0's auc: 0.969676
[420]	valid_0's auc: 0.969677
[440]	valid_0's auc: 0.969677
[460]	valid_0's auc: 0.969675
[480]	valid_0's auc: 0.96967
[500]	valid_0's auc: 0.969666


In [29]:
time = "".join(str(datetime.datetime.now()))
joblib.dump(clf2, 'model_'+time+'.pkl')


['model_2017-08-06 10:45:37.280043.pkl']

In [33]:
validated = clf2.predict(X_val)
from sklearn.metrics import roc_auc_score
roc_auc_score(Y_val, validated)

TypeError: Cannot use Dataset instance for prediction, please use raw data instead

In [34]:
predicted = clf2.predict(X_test)
#predicted = pd.DataFrame(data=predicted)
#predicted[0] = test_id
#predicted.columns = [ 'ID', 'click']
predicted = pd.DataFrame({'ID':test_id, 'click':predicted})
predicted.to_csv("predicted"+time+".csv", index=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [35]:
predicted.head()

Unnamed: 0,ID,click
0,IDFDJVI,0.000831
1,IDNWkTQ,0.000166
2,ID9pRmM,0.004195
3,IDHaQaj,0.000164
4,IDT2CrF,0.001133
