In [1]:
# Import libraries and set desired options
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

import pickle
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV

In [2]:
# Read the training and test data sets
train_df = pd.read_csv('../input/catch-me-if-you-can-alice/train_sessions.csv',
                       index_col='session_id', parse_dates=['time1'])
test_df = pd.read_csv('../input/catch-me-if-you-can-alice/test_sessions.csv',
                      index_col='session_id', parse_dates=['time1'])

# Sort the data by time
train_df = train_df.sort_values(by='time1')

# Look at the first rows of the training set
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,,,,,,...,,,,,,,,,,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,,...,,,,,,,,,,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [3]:
sites = ['site%s' % i for i in range(1,11)]
train_df[sites] = train_df[sites].fillna(0).astype(int).to_csv('train_sessions_text.txt',sep=' ',index=None,header=None)
test_df[sites] = test_df[sites].fillna(0).astype(int).to_csv('test_sessions_text.txt',sep=' ',index=None,header=None)

In [4]:
# explaining count vectorizer

In [5]:
cv = CountVectorizer()

In [6]:
cv.fit_transform(['site_1 site_17 site_2',
                 'site_2 site_2 site_1']).todense()

matrix([[1, 1, 1],
        [1, 0, 2]])

In [7]:
X_sparse = cv.fit_transform(['this movie is awful',
 'enjoyed this movie, this movie is'])

In [8]:
X_sparse.todense()

matrix([[1, 0, 1, 1, 1],
        [0, 1, 1, 2, 2]])

In [9]:
X_sparse.data  #non zero elements

array([1, 1, 1, 1, 2, 2, 1, 1])

In [10]:
cv.vocabulary_

{'this': 4, 'movie': 3, 'is': 2, 'awful': 0, 'enjoyed': 1}

In [11]:
X_sparse.indices #where this non zero elements occur

array([4, 3, 2, 0, 4, 3, 2, 1], dtype=int32)

In [12]:
X_sparse.nonzero() , X_sparse.data

((array([0, 0, 0, 0, 1, 1, 1, 1], dtype=int32),
  array([4, 3, 2, 0, 4, 3, 2, 1], dtype=int32)),
 array([1, 1, 1, 1, 2, 2, 1, 1]))

In [13]:
#count vectorizer explained
# it gets all unique words in article and then make table with them as unique words are columns
# then it goes for each sentence and count the words in it if repeated word so take more than 1 if not so its 1 
# and put the count in a row for each sentence in the column specified for each word 

In [14]:
!head -5 train_sessions_text.txt

56 55 0 0 0 0 0 0 0 0
56 55 56 55 0 0 0 0 0 0
946 946 951 946 946 945 948 784 949 946
945 948 949 948 945 946 947 945 946 946
947 950 948 947 950 952 946 951 946 947


In [15]:
%%time 
cv = CountVectorizer()
with open('train_sessions_text.txt') as inp_train_file:
    X_train = cv.fit_transform(inp_train_file)
with open('test_sessions_text.txt') as inp_test_file:
    X_test = cv.transform(inp_test_file)
print(X_train.shape, X_test.shape)    

(253561, 41592) (82797, 41592)
CPU times: user 4.58 s, sys: 66 ms, total: 4.65 s
Wall time: 4.64 s


In [16]:
y_train = train_df['target'].astype('int')

In [17]:
# train logistic regression

In [18]:
logit = LogisticRegression(C=1,random_state=17)

In [19]:
%%time
cv_scores = cross_val_score(logit,X_train,y_train,cv=5,scoring='roc_auc')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

CPU times: user 55.3 s, sys: 1min 10s, total: 2min 5s
Wall time: 32.5 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [20]:
cv_scores

array([0.91412172, 0.83254228, 0.87702207, 0.89228333, 0.91380815])

In [21]:
cv_scores.mean()

0.8859555086320896

In [22]:
%%time
logit.fit(X_train,y_train)

CPU times: user 11.8 s, sys: 13.4 s, total: 25.2 s
Wall time: 6.48 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1, random_state=17)

In [23]:
test_pred_logit1 = logit.predict_proba(X_test)[:,1]

In [24]:
test_pred_logit1.shape

(82797,)

In [25]:
# Function for writing predictions to a file
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [26]:
#cv = 0.885
write_to_submission_file(test_pred_logit1,'logit_subm1.txt') # 0.908 ROC AUC public leaderboard

In [27]:
# time features
# - hour when the session started
# - morning
# - day
# - eve
# - night

In [28]:
def add_time_features(df, X_sparse):
    hour = df['time1'].apply(lambda ts: ts.hour)
    morning = ((hour >= 7) & (hour <= 11)).astype('int')
    day = ((hour >= 12) & (hour <= 18)).astype('int')
    evening = ((hour >= 19) & (hour <= 23)).astype('int')
    night = ((hour >= 0) & (hour <= 6)).astype('int')
    X = hstack([X_sparse, morning.values.reshape(-1, 1), 
                day.values.reshape(-1, 1), evening.values.reshape(-1, 1), 
                night.values.reshape(-1, 1)])
    return X

In [29]:
%%time
X_train_with_time = add_time_features(train_df.fillna(0),X_train)
X_test_with_time = add_time_features(test_df.fillna(0),X_test)

CPU times: user 6.99 s, sys: 225 ms, total: 7.22 s
Wall time: 7.21 s


In [30]:
X_train_with_time.shape,X_test_with_time.shape

((253561, 41596), (82797, 41596))

In [31]:
%%time
cv_scores = cross_val_score(logit,X_train_with_time,y_train,cv=5,scoring='roc_auc')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

CPU times: user 53.4 s, sys: 1min 5s, total: 1min 59s
Wall time: 30.8 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [32]:
cv_scores

array([0.9248067 , 0.90742607, 0.93214609, 0.94361786, 0.94776394])

In [33]:
cv_scores.mean()

0.9311521321207058

In [34]:
%%time
logit.fit(X_train_with_time,y_train)

CPU times: user 13.8 s, sys: 18.5 s, total: 32.3 s
Wall time: 8.7 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1, random_state=17)

In [35]:
test_pred_logit2 = logit.predict_proba(X_test_with_time)[:,1]

In [36]:
test_pred_logit2.shape

(82797,)

In [37]:
test_pred_logit2

array([6.07569229e-05, 7.22690707e-08, 3.19616968e-08, ...,
       2.77215701e-04, 1.28374554e-05, 6.19594732e-07])

In [38]:
# cv 0.93
write_to_submission_file(test_pred_logit2,'logit_subm2.txt') #ROC AUC 0.93565 public lb

In [39]:
time_split = TimeSeriesSplit(n_splits=10)

In [40]:
c_values = np.logspace(-2, 2, 10)

logit_grid_searcher = GridSearchCV(estimator=logit, param_grid={'C': c_values},
                                  scoring='roc_auc', n_jobs=1, cv=time_split, verbose=1)

In [41]:
%%time
logit_grid_searcher.fit(X_train_with_time, y_train) # WTF? Locally, it's 3min 30s

Fitting 10 folds for each of 10 candidates, totalling 100 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

CPU times: user 11min 30s, sys: 14min 11s, total: 25min 42s
Wall time: 6min 40s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


GridSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=10, test_size=None),
             estimator=LogisticRegression(C=1, random_state=17), n_jobs=1,
             param_grid={'C': array([1.00000000e-02, 2.78255940e-02, 7.74263683e-02, 2.15443469e-01,
       5.99484250e-01, 1.66810054e+00, 4.64158883e+00, 1.29154967e+01,
       3.59381366e+01, 1.00000000e+02])},
             scoring='roc_auc', verbose=1)

In [42]:
logit_grid_searcher.best_score_, logit_grid_searcher.best_params_

(0.9157981261694369, {'C': 0.5994842503189409})

In [43]:
test_pred_logit3 = logit_grid_searcher.predict_proba(X_test_with_time)[:,1]

In [44]:
# cv 0.93
write_to_submission_file(test_pred_logit3,'logit_subm3.txt') #ROC AUC 0.93565 public lb