In [38]:
import pandas as pd
import numpy as np
from scipy.sparse import hstack

from sklearn.feature_extraction import FeatureHasher
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [13]:
dfs = []

for df in pd.read_csv('train', chunksize=100000):
    df_ = pd.concat([df.loc[df['click'] == 0],
                     df.loc[df['click'] == 1].sample(frac=2, replace=True)])
    
    dfs.append(df_.sample(frac=0.1))
    
df = pd.concat(dfs)
del dfs

In [18]:
df.to_csv('train-subset.csv', index=False)

In [19]:
df.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
32399,1.449227e+19,0,14102100,1005,0,6256f5b4,28f93029,f028772b,ecad2386,7801e8d9,...,1,0,16615,320,50,1863,3,39,-1,23
92316,6.264801e+18,0,14102100,1005,1,b554a32a,7e091613,f028772b,ecad2386,7801e8d9,...,1,0,9478,320,50,906,3,1451,100156,61
66072,2.581248e+18,0,14102100,1005,0,85f751fd,c4e18dd6,50e219e0,0acbeaa3,45a51db4,...,1,0,20596,320,50,2161,0,35,-1,157
73923,3.677267e+17,0,14102100,1005,0,6256f5b4,28f93029,f028772b,ecad2386,7801e8d9,...,1,0,18093,320,50,2060,3,39,100106,23
66017,2.574219e+18,0,14102100,1005,1,93de26ae,7d05db75,335d28a8,ecad2386,7801e8d9,...,1,0,16071,320,50,1784,2,175,-1,95


In [22]:
df.nunique()

id                  4601032
click                     2
hour                    240
C1                        7
banner_pos                7
site_id                3523
site_domain            4432
site_category            24
app_id                 5099
app_domain              309
app_category             29
device_id            559917
device_ip           1813283
device_model           6492
device_type               5
device_conn_type          4
C14                    2462
C15                       8
C16                       9
C17                     427
C18                       4
C19                      67
C20                     167
C21                      60
dtype: int64

In [79]:
C_hasher = FeatureHasher(n_features=20, input_type='string')
id_hasher = FeatureHasher(n_features=8000, input_type='string')

In [81]:
X_train = hstack([hstack([C_hasher.transform(df[c].astype(str))
                          for c in ['C1', 'C15', 'C16', 'C18',
                                    'device_type', 'device_conn_type']]),
                  hstack([id_hasher.transform(df[c].astype(str))
                          for c in ['site_id', 'site_domain', 'site_category',
                                    'app_id', 'app_domain', 'device_model']]),
                  df['banner_pos'].values.reshape(-1,1)])

print X_train.shape

(4729421, 48121)


In [87]:
clf1 = SGDClassifier(loss='log',
                     alpha=0.00005,
                     n_jobs=4,
                     n_iter=20,
                     penalty='elasticnet',
                     verbose=1,
                     class_weight={0: 1, 1: 1.2})

In [88]:
clf1.fit(X=X_train,
         y=df['click'])

-- Epoch 1
Norm: 30.81, NNZs: 97, Bias: -0.085246, T: 4729421, Avg. loss: 0.878405
Total training time: 5.40 seconds.
-- Epoch 2
Norm: 20.75, NNZs: 99, Bias: -0.085769, T: 9458842, Avg. loss: 0.721681
Total training time: 10.45 seconds.
-- Epoch 3
Norm: 16.44, NNZs: 98, Bias: -0.086008, T: 14188263, Avg. loss: 0.668254
Total training time: 15.69 seconds.
-- Epoch 4
Norm: 13.99, NNZs: 101, Bias: -0.086099, T: 18917684, Avg. loss: 0.641154
Total training time: 20.70 seconds.
-- Epoch 5
Norm: 12.37, NNZs: 101, Bias: -0.086328, T: 23647105, Avg. loss: 0.624726
Total training time: 25.42 seconds.
-- Epoch 6
Norm: 11.21, NNZs: 104, Bias: -0.086436, T: 28376526, Avg. loss: 0.613682
Total training time: 30.75 seconds.
-- Epoch 7
Norm: 10.33, NNZs: 105, Bias: -0.086443, T: 33105947, Avg. loss: 0.605742
Total training time: 36.80 seconds.
-- Epoch 8
Norm: 9.65, NNZs: 106, Bias: -0.086573, T: 37835368, Avg. loss: 0.599752
Total training time: 41.69 seconds.
-- Epoch 9
Norm: 9.08, NNZs: 108, Bias:

SGDClassifier(alpha=5e-05, average=False, class_weight={0: 1, 1: 1.2},
       epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', n_iter=20, n_jobs=4,
       penalty='elasticnet', power_t=0.5, random_state=None, shuffle=True,
       verbose=1, warm_start=False)

In [None]:
y_pred = clf1.predict(X_train)

In [85]:
accuracy_score(y_pred,
               df['click'])

0.72086625402982729

In [86]:
precision_recall_fscore_support(y_pred,
                                df['click'],
                                average='weighted')

(0.85538662734342852, 0.72086625402982729, 0.7696518860382362, None)