# Import package

In [98]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score, roc_curve, auc, log_loss
# from tqdm import tqdm
# tqdm.pandas(desc="my bar!")

# Load Data

In [100]:
df_train = pd.read_csv("./train.gz", nrows = 50000, dtype = 'str')
df_train

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1000009418151094273,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,2,15706,320,50,1722,0,35,-1,79
1,10000169349117863715,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
2,10000371904215119486,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
3,10000640724480838376,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15706,320,50,1722,0,35,100084,79
4,10000679056417042096,0,14102100,1005,1,fe8cc448,9166c161,0569f928,ecad2386,7801e8d9,...,1,0,18993,320,50,2161,0,35,-1,157
5,10000720757801103869,0,14102100,1005,0,d6137915,bb1ef334,f028772b,ecad2386,7801e8d9,...,1,0,16920,320,50,1899,0,431,100077,117
6,10000724729988544911,0,14102100,1005,0,8fda644b,25d4cfcd,f028772b,ecad2386,7801e8d9,...,1,0,20362,320,50,2333,0,39,-1,157
7,10000918755742328737,0,14102100,1005,1,e151e245,7e091613,f028772b,ecad2386,7801e8d9,...,1,0,20632,320,50,2374,3,39,-1,23
8,10000949271186029916,1,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,2,15707,320,50,1722,0,35,-1,79
9,10001264480619467364,0,14102100,1002,0,84c7ba46,c4e18dd6,50e219e0,ecad2386,7801e8d9,...,0,0,21689,320,50,2496,3,167,100191,23


In [101]:
for name in df_train.columns:
    print(name, len(df_train[name].unique()), len(df_train[name].unique())/10000.0)

id 50000 5.0
click 2 0.0002
hour 1 0.0001
C1 6 0.0006
banner_pos 5 0.0005
site_id 693 0.0693
site_domain 593 0.0593
site_category 16 0.0016
app_id 571 0.0571
app_domain 42 0.0042
app_category 18 0.0018
device_id 4255 0.4255
device_ip 25344 2.5344
device_model 2062 0.2062
device_type 4 0.0004
device_conn_type 4 0.0004
C14 374 0.0374
C15 5 0.0005
C16 6 0.0006
C17 126 0.0126
C18 4 0.0004
C19 37 0.0037
C20 132 0.0132
C21 29 0.0029


# Feature Transform -onehot encoding

In [121]:
# one hot encoding
def binary_variance(p):
    return p * (1 - p)

def dum_sign(df, col, threshold=0.01):
    dummy_col = df[col].astype(str).fillna('')
    p = dummy_col.value_counts() / dummy_col.shape[0]
    mask = dummy_col.isin(p[binary_variance(p) >= threshold].index)
    dummy_col[~mask] = np.nan
    res = pd.get_dummies(dummy_col, prefix=col, dummy_na=False)
    return res

def one_hot_encoding(X, threshold):
    dfs = []
    for col in X.columns:
        df = dum_sign(X, col, threshold)
        dfs.append(df)
    res = pd.concat(dfs, axis=1)
    return res

In [25]:
def sample_split(X, y, test_size=0.3):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=0)
    print('X_train = {}, X_test = {}'.format(X_train.shape[0], X_test.shape[0]))
    return X_train, X_test, y_train, y_test

In [26]:
feature_name = ['hour', 'C1', 'banner_pos', 'site_id', 'site_domain', 'site_category', 'app_id', 'app_domain',\
                'app_category', 'device_id', 'device_type', 'device_conn_type', 'C14', 'C15', \
                'C16', 'C17', 'C18', 'C19', 'C20', 'C21']

X = one_hot_encoding(df_train[feature_name], binary_variance(0.002)) 
y = df_train.click.astype(np.int8)

In [29]:
X_train, X_test, y_train, y_test = sample_split(X, y) 

X_train = 350000, X_test = 150000


In [133]:
help(LogisticRegression)

Help on class LogisticRegression in module sklearn.linear_model.logistic:

class LogisticRegression(sklearn.base.BaseEstimator, sklearn.linear_model.base.LinearClassifierMixin, sklearn.feature_selection.from_model._LearntSelectorMixin, sklearn.linear_model.base.SparseCoefMixin)
 |  Logistic Regression (aka logit, MaxEnt) classifier.
 |  
 |  In the multiclass case, the training algorithm uses the one-vs-rest (OvR)
 |  scheme if the 'multi_class' option is set to 'ovr', and uses the cross-
 |  entropy loss if the 'multi_class' option is set to 'multinomial'.
 |  (Currently the 'multinomial' option is supported only by the 'lbfgs',
 |  'sag' and 'newton-cg' solvers.)
 |  
 |  This class implements regularized logistic regression using the
 |  'liblinear' library, 'newton-cg', 'sag' and 'lbfgs' solvers. It can handle
 |  both dense and sparse input. Use C-ordered arrays or CSR matrices
 |  containing 64-bit floats for optimal performance; any other input format
 |  will be converted (and 

# Train and Evaluation 

In [91]:
def run(X_train, X_test, y_train, y_test, C=1.0, tol=0.001, maxIter = 150):
    model = LogisticRegression(C=C, penalty='l2', solver='liblinear', tol=tol, max_iter = maxIter)
    model.fit(X_train, y_train)
    
    train_auc = roc_auc_score(y_train, model.predict_proba(X_train)[:, 1])
    test_auc  = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    print('model train auc {:.4}'.format(train_auc))
    print('model test  auc {:.4}'.format(test_auc))
    
    train_logloss = log_loss(y_train, model.predict_proba(X_train)[:, 1])
    test_logloss  = log_loss(y_test, model.predict_proba(X_test)[:, 1])
    print('model train logloss {:.4}'.format(train_logloss))
    print('model test  logloss {:.4}'.format(train_logloss))
    return model

In [92]:
%%time 
clf = run(X_train, X_test, y_train, y_test, 1.0, 0.005, 150)

model train auc 0.7375
model test  auc 0.7356
model train logloss 0.3933
model test  logloss 0.3933
CPU times: user 7.37 s, sys: 2.89 s, total: 10.3 s
Wall time: 11 s


# Predict and output

In [67]:
df_test = pd.read_csv("./test.gz", dtype = {'id':'str'})

In [35]:
c = X_train.columns.tolist()
feature_index = {}
for i in range(len(c)):
    feature_index[c[i]] = i

v = np.zeros(len(X_train.columns))
def feature(rows): 
    v.fill(0)
    for f in rows.keys():
        index = feature_index.get(f+'_'+str(rows[f]), -1)
        if(index != -1):
            v[index] = 1
    return clf.predict_proba(v.reshape(1, -1))[0,1]

In [49]:
res1 = df_test[feature_name].progress_apply(lambda rows: feature(rows), axis=1)

my bar!: 100%|██████████| 4577464/4577464 [32:06<00:00, 2376.01it/s] 


In [76]:
df_test['click'] = res1
df_test[['id', 'click']].to_csv('./res', index = False)

In [132]:
help(pd.read_csv)

Help on function read_csv in module pandas.io.parsers:

read_csv(filepath_or_buffer, sep=',', delimiter=None, header='infer', names=None, index_col=None, usecols=None, squeeze=False, prefix=None, mangle_dupe_cols=True, dtype=None, engine=None, converters=None, true_values=None, false_values=None, skipinitialspace=False, skiprows=None, nrows=None, na_values=None, keep_default_na=True, na_filter=True, verbose=False, skip_blank_lines=True, parse_dates=False, infer_datetime_format=False, keep_date_col=False, date_parser=None, dayfirst=False, iterator=False, chunksize=None, compression='infer', thousands=None, decimal=b'.', lineterminator=None, quotechar='"', quoting=0, escapechar=None, comment=None, encoding=None, dialect=None, tupleize_cols=False, error_bad_lines=True, warn_bad_lines=True, skipfooter=0, skip_footer=0, doublequote=True, delim_whitespace=False, as_recarray=False, compact_ints=False, use_unsigned=False, low_memory=True, buffer_lines=None, memory_map=False, float_precision=No