In [1]:
import os
import pickle
import numpy as np
import pandas as pd
from scipy.sparse import hstack
# !pip install eli5
import eli5
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
import seaborn as sns
from IPython.display import display_html

In [2]:
# Directory that holds the input files; with an empty string os.path.join
# resolves them relative to the current working directory.
PATH_TO_DATA = ''
# Passed as random_state when the LogisticRegression baseline is created.
SEED = 17

In [3]:
def prepare_sparse_features(path_to_train, path_to_test, path_to_site_dict,
                           vectorizer_params):
    """Read the session files and vectorize each session's visited sites.

    Parameters
    ----------
    path_to_train, path_to_test : str
        CSV files indexed by ``session_id`` containing ``site1..site10`` and
        ``time1..time10`` columns; the training file also has ``target``.
    path_to_site_dict : str
        Pickle file with a ``{site_name: site_id}`` dictionary.
    vectorizer_params : dict
        Keyword arguments forwarded unchanged to ``TfidfVectorizer``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, vectorizer, train_times, test_times)``.
        The training outputs follow the ``time1``-sorted row order; the test
        outputs keep the order of the test file.
    """
    time_cols = ['time%s' % idx for idx in range(1, 11)]
    site_cols = ['site%s' % idx for idx in range(1, 11)]

    train_df = pd.read_csv(path_to_train, index_col='session_id',
                           parse_dates=time_cols)
    test_df = pd.read_csv(path_to_test, index_col='session_id',
                          parse_dates=time_cols)

    # Only the training frame is put into chronological order
    train_df = train_df.sort_values(by='time1')

    # Site-name -> id mapping shipped with the data.
    # pickle.load can execute arbitrary code: only point this at a trusted file.
    with open(path_to_site_dict, 'rb') as fh:
        site2id = pickle.load(fh)

    # Invert to id -> site name; id 0 stands in for a missing site slot
    id2site = dict((site_id, site) for site, site_id in site2id.items())
    id2site[0] = 'unknown'

    def sessions_as_text(frame):
        # One whitespace-joined string of site *names* per session. Names are
        # slower to process than ids but make the model weights readable.
        site_ids = frame[site_cols].fillna(0).astype('int')
        return site_ids.apply(
            lambda visited: ' '.join([id2site[site_id] for site_id in visited]),
            axis=1).tolist()

    train_sessions = sessions_as_text(train_df)
    test_sessions = sessions_as_text(test_df)

    # Tokenization is fully controlled by the caller through vectorizer_params
    # (e.g. a whitespace-only tokenizer keeps 'mail.google.com' as one token).
    vectorizer = TfidfVectorizer(**vectorizer_params)
    X_train = vectorizer.fit_transform(train_sessions)
    X_test = vectorizer.transform(test_sessions)
    y_train = train_df['target'].astype('int').values

    # Visit timestamps are handed back for later feature engineering
    train_times = train_df[time_cols]
    test_times = test_df[time_cols]

    return X_train, X_test, y_train, vectorizer, train_times, test_times


In [4]:
%%time
# Build TF-IDF features over site tokens and their n-grams (up to 5 sites long),
# keeping at most 50000 vocabulary entries. The tokenizer splits on whitespace
# only, so dotted site names are kept as single tokens.
X_train_sites, X_test_sites, y_train, vectorizer, train_times, test_times = prepare_sparse_features(
    path_to_train=os.path.join(PATH_TO_DATA, 'train_sessions.csv'),
    path_to_test=os.path.join(PATH_TO_DATA, 'test_sessions.csv'),
    path_to_site_dict=os.path.join(PATH_TO_DATA, 'site_dic.pkl'),
    vectorizer_params={'ngram_range': (1, 5), 
                       'max_features': 50000,
                       'tokenizer': lambda s: s.split()}
)

Wall time: 29.1 s


In [5]:
# Sanity check: both matrices must have the same number of columns.
print(X_train_sites.shape, X_test_sites.shape)

(253561, 50000) (82797, 50000)


In [6]:
# Peek at the first ten vocabulary entries.
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and removed
# in 1.2 in favour of get_feature_names_out() — confirm the pinned version.
vectorizer.get_feature_names()[:10]

['0.academia-assets.com',
 '0.docs.google.com',
 '0.docs.google.com 0.docs.google.com',
 '0.docs.google.com 0.docs.google.com 0.docs.google.com',
 '0.docs.google.com 0.docs.google.com 0.docs.google.com 0.docs.google.com',
 '0.docs.google.com 0.docs.google.com 0.drive.google.com',
 '0.docs.google.com 0.docs.google.com apis.google.com',
 '0.docs.google.com 0.docs.google.com docs.google.com',
 '0.docs.google.com 0.drive.google.com',
 '0.docs.google.com 0.drive.google.com 0.docs.google.com']

In [7]:
# Ten ordered folds; the training rows were sorted by time1, so every fold
# validates on rows that come after the rows it trains on.
time_split = TimeSeriesSplit(n_splits=10)

In [8]:
# Baseline classifier; the same instance is re-fitted by each experiment that follows.
logit = LogisticRegression(C=1, random_state=SEED, solver='liblinear')

In [9]:
%%time

# Time-ordered cross-validated ROC AUC of the baseline on the site TF-IDF features.
# NOTE(review): the original remark here claimed the call hangs with n_jobs > 1,
# yet n_jobs=4 is what is actually used — confirm the intended setting.
cv_scores1 = cross_val_score(logit, X_train_sites, y_train, cv=time_split, 
                            scoring='roc_auc', n_jobs=4)

Wall time: 7.13 s


In [10]:
# Per-fold scores followed by their mean.
cv_scores1, cv_scores1.mean()

(array([0.83124023, 0.65993466, 0.85673565, 0.92824237, 0.84777206,
        0.88954524, 0.88829289, 0.8771044 , 0.92023038, 0.92624125]),
 0.8625339141735202)

In [11]:
# Fit the baseline on the whole training matrix before inspecting weights / predicting.
logit.fit(X_train_sites, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=17, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [12]:
# Render the fitted model's weights with eli5, limited to 30 features and
# labelled with the vectorizer's vocabulary entries.
eli5.show_weights(estimator=logit, 
                  feature_names=vectorizer.get_feature_names(), top=30)

Weight?,Feature
+5.880,youwatch.org
+5.380,cid-ed6c3e6a5c6608a4.users.storage.live.com
+5.222,fr.glee.wikia.com
+5.114,vk.com
+4.875,www.info-jeunes.net
+4.499,www.banque-chalus.fr
+4.220,www.express.co.uk
+4.147,www.audienceinsights.net
+4.089,www.melty.fr
+4.003,glee.hypnoweb.net


In [13]:
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    """Write predictions to a CSV keyed by a 1-based row id.

    Parameters
    ----------
    predicted_labels : array-like
        One prediction per test row (NumPy array, list, pandas Series, ...).
    out_file : str or file-like
        Destination passed to ``DataFrame.to_csv``.
    target : str
        Header of the prediction column.
    index_label : str
        Header of the id column.
    """
    # Coerce to a plain array first. This accepts lists (which have no .shape)
    # and discards any pandas index, so values are written positionally
    # instead of being re-aligned against the new 1..n index.
    predictions = np.asarray(predicted_labels)
    predicted_df = pd.DataFrame(predictions,
                                index=np.arange(1, predictions.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [14]:
# Positive-class probabilities for the test sessions, saved as the first submission.
logit_test_pred = logit.predict_proba(X_test_sites)[:, 1]
write_to_submission_file(logit_test_pred, 'subm1.csv') # 0.91807 — presumably the leaderboard score; confirm

In [15]:
def train_and_predict(model, X_train, y_train, X_test, site_feature_names=None, 
                      new_feature_names=None, cv=time_split, scoring='roc_auc',
                      top_n_features_to_show=30, submission_file_name='submission.csv',
                      n_jobs=4):
    """Cross-validate a model, fit it, show its weights and write a submission.

    Parameters
    ----------
    model : estimator
        Must provide ``fit``, ``predict_proba`` and ``coef_``.
    X_train, y_train, X_test
        Training matrix, training labels and test matrix.
    site_feature_names : sequence of str, optional
        Names of the leading (site) columns. When omitted they are read from
        the notebook-level ``vectorizer`` at call time.
    new_feature_names : sequence of str, optional
        Names of the extra columns appended after the site columns, in order.
    cv, scoring
        Passed to ``cross_val_score``.
    top_n_features_to_show : int
        Number of features shown by ``eli5.show_weights``.
    submission_file_name : str
        Output CSV path for the test predictions.
    n_jobs : int
        Parallelism passed to ``cross_val_score``.

    Returns
    -------
    numpy.ndarray
        The per-fold cross-validation scores.
    """
    if site_feature_names is None:
        # Looked up on each call rather than once at definition time, so the
        # default can never go stale if the vectorizer is re-created.
        get_names = (getattr(vectorizer, 'get_feature_names_out', None)
                     or vectorizer.get_feature_names)
        site_feature_names = get_names()

    # Normalise to lists so that `+` concatenates even when arrays are passed
    site_feature_names = list(site_feature_names)
    new_feature_names = list(new_feature_names) if new_feature_names is not None else []

    cv_scores = cross_val_score(model, X_train, y_train, cv=cv, 
                                scoring=scoring, n_jobs=n_jobs)
    print('CV scores', cv_scores)
    print('CV mean: {}, CV std: {}'.format(cv_scores.mean(), cv_scores.std()))
    model.fit(X_train, y_train)

    all_feature_names = site_feature_names + new_feature_names

    display_html(eli5.show_weights(estimator=model, 
                  feature_names=all_feature_names, top=top_n_features_to_show))

    if new_feature_names:
        print('New feature weights:')

        # The extra features are the trailing columns, hence the trailing coefficients
        print(pd.DataFrame({'feature': new_feature_names, 
                        'coef': model.coef_.flatten()[-len(new_feature_names):]}))

    test_pred = model.predict_proba(X_test)[:, 1]
    write_to_submission_file(test_pred, submission_file_name) 

    return cv_scores

In [16]:
# Re-run the baseline through the helper; this rebinds cv_scores1 and rewrites 'subm1.csv'.
cv_scores1 = train_and_predict(model=logit, X_train=X_train_sites, y_train=y_train, 
                  X_test=X_test_sites, site_feature_names=vectorizer.get_feature_names(),              
                  cv=time_split, submission_file_name='subm1.csv')

CV scores [0.83124023 0.65993466 0.85673565 0.92824237 0.84777206 0.88954524
 0.88829289 0.8771044  0.92023038 0.92624125]
CV mean: 0.8625339141735202, CV std: 0.07455724503584943


Weight?,Feature
+5.880,youwatch.org
+5.380,cid-ed6c3e6a5c6608a4.users.storage.live.com
+5.222,fr.glee.wikia.com
+5.114,vk.com
+4.875,www.info-jeunes.net
+4.499,www.banque-chalus.fr
+4.220,www.express.co.uk
+4.147,www.audienceinsights.net
+4.089,www.melty.fr
+4.003,glee.hypnoweb.net


In [17]:
def add_time_features(times, X_sparse, add_hour=True, active_hours=(12, 13, 16, 17, 18)):
    """Append session start-hour features to a feature matrix.

    Parameters
    ----------
    times : pandas.DataFrame
        Must contain a ``time1`` column holding the session start timestamps.
    X_sparse : sparse matrix
        Existing features, one row per row of ``times``.
    add_hour : bool
        When true, also append the start hour divided by 24.
    active_hours : iterable of int
        Hours of the day for which the binary ``active`` flag is set to 1.

    Returns
    -------
    tuple
        ``(X, feature_names)``: the horizontally stacked matrix and the names
        of the appended columns, in column order.
    """
    hour = pd.to_datetime(times['time1']).dt.hour
    active = hour.isin(list(active_hours)).astype('int').values.reshape(-1, 1)

    objects_to_hstack = [X_sparse, active]
    feature_names = ['active']

    if add_hour:
        # Divide by 24 to keep the feature within [0, 1)
        objects_to_hstack.append(hour.values.reshape(-1, 1) / 24)
        feature_names.append('hour')

    X = hstack(objects_to_hstack)
    return X, feature_names

In [18]:
%%time
# Append both hour-based columns ('active' and 'hour') to the site features.
X_train_with_times1, new_feat_names = add_time_features(train_times, X_train_sites)
X_test_with_times1, _ = add_time_features(test_times, X_test_sites)

Wall time: 1.5 s


In [19]:
# Two columns more than the site-only matrices are expected here.
X_train_with_times1.shape, X_test_with_times1.shape

((253561, 50002), (82797, 50002))

In [20]:


# Experiment 2: site features plus 'active' and 'hour'.
cv_scores2 = train_and_predict(model=logit, X_train=X_train_with_times1, y_train=y_train, 
                               X_test=X_test_with_times1, 
                               site_feature_names=vectorizer.get_feature_names(),
                               new_feature_names=new_feat_names,
                               cv=time_split, submission_file_name='subm2.csv')



CV scores [0.8895664  0.87000646 0.95729638 0.9488776  0.94218305 0.9718683
 0.87805076 0.95697915 0.95567323 0.96854874]
CV mean: 0.9339050068062502, CV std: 0.03695424946467724


Weight?,Feature
+5.218,www.melty.fr
+5.120,hour
+5.085,cid-ed6c3e6a5c6608a4.users.storage.live.com
+4.985,www.express.co.uk
+4.572,youwatch.org
+4.418,www.info-jeunes.net
+4.379,www.audienceinsights.net
+4.213,vk.com
+3.960,fr.glee.wikia.com
+3.943,www.banque-chalus.fr


New feature weights:
  feature      coef
0  active  3.296515
1    hour  5.119930


In [21]:
# Fold-by-fold check: did experiment 2 beat the baseline?
cv_scores2 > cv_scores1

array([ True,  True,  True,  True,  True,  True, False,  True,  True,
        True])

In [22]:
# Experiment 3: keep only the binary 'active' flag (no raw hour column).
# new_feat_names is rebound to ['active'] here and reused by later cells.
X_train_with_times2, new_feat_names = add_time_features(train_times, X_train_sites, add_hour=False)
X_test_with_times2, _ = add_time_features(test_times, X_test_sites, add_hour=False)


cv_scores3 = train_and_predict(model=logit, X_train=X_train_with_times2, y_train=y_train, 
                               X_test=X_test_with_times2, 
                               site_feature_names=vectorizer.get_feature_names(),
                               new_feature_names=new_feat_names,
                               cv=time_split, submission_file_name='subm3.csv')

CV scores [0.90555613 0.87570584 0.94493893 0.9434751  0.95276136 0.96275509
 0.85607852 0.95601371 0.9438186  0.95980355]
CV mean: 0.9300906825770386, CV std: 0.035722968594857726


Weight?,Feature
+5.045,www.melty.fr
+5.037,cid-ed6c3e6a5c6608a4.users.storage.live.com
+4.923,www.express.co.uk
+4.882,youwatch.org
+4.446,www.audienceinsights.net
+4.379,vk.com
+4.189,www.info-jeunes.net
+4.152,fr.glee.wikia.com
+4.055,www.banque-chalus.fr
+3.824,active


New feature weights:
  feature      coef
0  active  3.824459


In [23]:
# Fold-by-fold check: did experiment 3 beat the baseline?
cv_scores3 > cv_scores1

array([ True,  True,  True,  True,  True,  True, False,  True,  True,
        True])

In [24]:


# Session span per row: latest recorded timestamp minus the earliest one
train_span = train_times.max(axis=1) - train_times.min(axis=1)
test_span = test_times.max(axis=1) - test_times.min(axis=1)

# Express the spans as integer milliseconds
train_durations = train_span.astype('timedelta64[ms]').astype(int)
test_durations = test_span.astype('timedelta64[ms]').astype(int)

# Standardise with statistics learned on the training durations only
scaler = StandardScaler()
train_dur_column = train_durations.values.reshape(-1, 1)
test_dur_column = test_durations.values.reshape(-1, 1)
train_dur_scaled = scaler.fit_transform(train_dur_column)
test_dur_scaled = scaler.transform(test_dur_column)



In [25]:
# Append the scaled session duration as the last column of the 'active'-only matrices.
X_train_with_time_correct = hstack([X_train_with_times2, train_dur_scaled])
X_test_with_time_correct = hstack([X_test_with_times2, test_dur_scaled])

In [26]:


# Experiment 5: site features + 'active' + scaled session duration.
# The names list must follow the column order of the stacked matrix.
cv_scores5 = train_and_predict(model=logit, X_train=X_train_with_time_correct, y_train=y_train, 
                               X_test=X_test_with_time_correct, 
                               site_feature_names=vectorizer.get_feature_names(),
                               new_feature_names=new_feat_names + ['sess_duration'],
                               cv=time_split, submission_file_name='subm5.csv')



CV scores [0.89948139 0.87781018 0.94524005 0.94351526 0.95309811 0.9635286
 0.85953385 0.95671228 0.9447175  0.96010883]
CV mean: 0.9303746055027144, CV std: 0.035395678852020274


Weight?,Feature
+5.021,www.melty.fr
+5.003,cid-ed6c3e6a5c6608a4.users.storage.live.com
+4.920,www.express.co.uk
+4.878,youwatch.org
+4.443,vk.com
+4.411,www.audienceinsights.net
+4.181,www.info-jeunes.net
+4.131,fr.glee.wikia.com
+4.088,www.banque-chalus.fr
+3.825,active


New feature weights:
         feature      coef
0         active  3.824772
1  sess_duration -0.236708


In [27]:
# Fold-by-fold check: did adding the duration improve on experiment 3?
cv_scores5 > cv_scores3

array([False,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

In [28]:


def add_day_month(times, X_sparse, active_days=(0, 1, 3, 4)):
    """Append day-of-week features to a feature matrix.

    Despite its name, the function adds no month-based column.

    Parameters
    ----------
    times : pandas.DataFrame
        Must contain a ``time1`` column holding the session start timestamps.
    X_sparse : sparse matrix
        Existing features, one row per row of ``times``.
    active_days : iterable of int
        Weekday numbers (Monday=0 .. Sunday=6) for which the binary
        ``active_days`` flag is set to 1.

    Returns
    -------
    tuple
        ``(X, feature_names)`` where two columns are appended, in order:
        ``active_days`` (0/1 flag) and ``day_of_week`` (weekday mapped
        linearly from 0..6 onto -1..1).
    """
    weekday = pd.to_datetime(times['time1']).dt.weekday

    # (weekday - 3) / 3 centres the value on Thursday and bounds it to [-1, 1]
    day_of_week = (weekday.values.reshape(-1, 1) - 3) / 3
    active_flag = weekday.isin(list(active_days)).astype('int').values.reshape(-1, 1)

    objects_to_hstack = [X_sparse, active_flag, day_of_week]
    feature_names = ['active_days', 'day_of_week']

    X = hstack(objects_to_hstack)
    return X, feature_names



In [29]:
# Append the two weekday columns after the duration column.
X_train_final, more_feat_names = add_day_month(train_times, X_train_with_time_correct)
X_test_final, _ = add_day_month(test_times, X_test_with_time_correct)

In [30]:
# Experiment 6: previous features plus the weekday columns.
# The names are concatenated in the same order the columns were stacked.
cv_scores6 = train_and_predict(model=logit, X_train=X_train_final, y_train=y_train, 
                               X_test=X_test_final, 
                               site_feature_names=vectorizer.get_feature_names(),
                               new_feature_names=new_feat_names + ['sess_duration'] + more_feat_names,
                               cv=time_split, submission_file_name='subm6.csv')

CV scores [0.92447354 0.92024536 0.88067116 0.95734662 0.95912079 0.97553006
 0.87117951 0.96576439 0.87993128 0.97243411]
CV mean: 0.9306696816552824, CV std: 0.03907666904787848


Weight?,Feature
+4.950,www.melty.fr
+4.940,www.express.co.uk
+4.859,cid-ed6c3e6a5c6608a4.users.storage.live.com
+4.318,youwatch.org
+4.225,www.audienceinsights.net
+4.206,vk.com
+4.147,www.info-jeunes.net
+4.000,www.banque-chalus.fr
+3.743,r4---sn-gxo5uxg-jqbe.googlevideo.com
+3.738,active


New feature weights:
         feature      coef
0         active  3.737618
1  sess_duration -0.269579
2    active_days  2.090268
3    day_of_week -0.749254


In [31]:
def add_year(times, X_sparse, center_year=2013.5):
    """Append the centred session start year to a feature matrix.

    Parameters
    ----------
    times : pandas.DataFrame
        Must contain a ``time1`` column holding the session start timestamps.
    X_sparse : sparse matrix
        Existing features, one row per row of ``times``.
    center_year : float
        Value subtracted from the calendar year.

    Returns
    -------
    tuple
        ``(X, feature_names)`` with a single appended column named ``year``.
    """
    # Calendar year shifted by center_year; with the default, 2013 -> -0.5 and 2014 -> +0.5
    year = pd.to_datetime(times['time1']).dt.year.values.reshape(-1, 1) - center_year

    objects_to_hstack = [X_sparse, year]
    feature_names = ['year']

    X = hstack(objects_to_hstack)
    return X, feature_names

In [32]:
# Append the centred year as the last column.
X_train_year, more_feat_names1 = add_year(train_times, X_train_final)
X_test_year, _ = add_year(test_times, X_test_final)

In [None]:
# Experiment 7: previous features plus the centred year.
cv_scores7 = train_and_predict(model=logit, X_train=X_train_year, y_train=y_train, 
                               X_test=X_test_year, 
                               site_feature_names=vectorizer.get_feature_names(),
                               new_feature_names=new_feat_names + ['sess_duration'] + more_feat_names + more_feat_names1,
                               cv=time_split, submission_file_name='subm7.csv')

CV scores [0.92524929 0.92099405 0.91069862 0.95761799 0.9604526  0.97681127
 0.87087065 0.96871117 0.88124701 0.97443834]
CV mean: 0.9347091001766149, CV std: 0.03671828942676616


In [None]:
# Indicator of sessions with exactly one non-missing timestamp, shifted to -0.5 / +0.5
train_len1 = (train_times.count(axis = 1) == 1) * 1 - 0.5
test_len1 = (test_times.count(axis = 1) == 1) * 1 - 0.5

# Same indicator for sessions with exactly two non-missing timestamps
train_len2 = (train_times.count(axis = 1) == 2) * 1 - 0.5
test_len2 = (test_times.count(axis = 1) == 2) * 1 - 0.5

# A single scaler instance is refit per feature: fit on train, then applied to test
scaler = StandardScaler()
train_len1 = scaler.fit_transform(train_len1.values.reshape(-1, 1))
test_len1 = scaler.transform(test_len1.values.reshape(-1, 1))

train_len2 = scaler.fit_transform(train_len2.values.reshape(-1, 1))
test_len2 = scaler.transform(test_len2.values.reshape(-1, 1))
# NOTE(review): none of these four arrays is stacked into a feature matrix by the
# cells that follow — confirm whether they were dropped on purpose.

In [None]:
# NOTE(review): hstack over a single matrix adds no columns, so these hold the
# same features as X_train_year / X_test_year — confirm this is intentional.
X_train_len1 = hstack([X_train_year])
X_test_len1 = hstack([X_test_year])

# No extra names, matching the fact that no extra columns were stacked above
len_features = []

In [None]:
# Experiment 8: run on the X_*_len1 matrices; len_features is empty, so the
# feature-name list is the same as in experiment 7.
cv_scores8 = train_and_predict(model=logit, X_train=X_train_len1, y_train=y_train, 
                               X_test=X_test_len1, 
                               site_feature_names=vectorizer.get_feature_names(),
                               new_feature_names=new_feat_names + ['sess_duration'] + more_feat_names + more_feat_names1 + len_features,
                               cv=time_split, submission_file_name='subm8.csv')

In [None]:
# Fold-by-fold comparison of experiment 8 against experiment 7.
cv_scores8 > cv_scores7

In [None]:
# Try weaker regularisation (C=3 instead of the baseline's C=1). random_state uses
# the notebook-wide SEED, as the baseline definition does, instead of a repeated literal.
# Note that this rebinds `logit`, so earlier cells re-run afterwards would use C=3.
logit = LogisticRegression(C=3, random_state=SEED, solver='liblinear')
cv_scores = cross_val_score(logit, X_train_len1, y_train, cv=time_split, scoring='roc_auc', n_jobs=-1)
print(cv_scores.mean(), cv_scores.std())

In [None]:
# Fit the current `logit` on all training rows and save its test probabilities.
# Written to its own file: 'subm5.csv' is already produced by the session-duration
# experiment, and reusing that name would silently replace it with predictions
# from a different model.
logit.fit(X_train_len1, y_train)
logit_test_pred2 = logit.predict_proba(X_test_len1)[:, 1]
write_to_submission_file(logit_test_pred2, 'subm9.csv')