In [43]:
import pickle
import warnings

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier


In [58]:
def write_to_submission_file(predicted_labels, out_file, target='target', index_label='session_id'):
    """Write predictions to a CSV submission file.

    The file has two columns: ``index_label`` holding the 1-based row
    number and ``target`` holding the corresponding prediction.

    Parameters
    ----------
    predicted_labels : array-like
        Predictions in row order. Any array-like is accepted (NumPy
        array, list, tuple, pandas Series).
    out_file : str or path-like or file-like
        Destination passed straight to ``DataFrame.to_csv``.
    target : str, default 'target'
        Header of the prediction column.
    index_label : str, default 'session_id'
        Header of the 1-based index column.
    """
    # Coerce to a plain ndarray first: lists/tuples have no ``.shape``, and a
    # pandas Series would be re-aligned by the DataFrame constructor onto the
    # new 1..n index / ``target`` column, silently shifting values or
    # replacing them with NaN.
    predictions = np.asarray(predicted_labels)
    predicted_df = pd.DataFrame(
        predictions,
        index=np.arange(1, predictions.shape[0] + 1),
        columns=[target],
    )
    predicted_df.to_csv(out_file, index_label=index_label)

# Task exploration

In [4]:
# Read the training and test session tables from the working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('train_sessions.csv', 'test_sessions.csv')
)
# Preview the first rows of the training table.
train.head()

Unnamed: 0,session_id,site1,time1,site2,time2,site3,time3,site4,time4,site5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
0,1,718,2014-02-20 10:02:45,,,,,,,,...,,,,,,,,,,0
1,2,890,2014-02-22 11:19:50,941.0,2014-02-22 11:19:50,3847.0,2014-02-22 11:19:51,941.0,2014-02-22 11:19:51,942.0,...,2014-02-22 11:19:51,3847.0,2014-02-22 11:19:52,3846.0,2014-02-22 11:19:52,1516.0,2014-02-22 11:20:15,1518.0,2014-02-22 11:20:16,0
2,3,14769,2013-12-16 16:40:17,39.0,2013-12-16 16:40:18,14768.0,2013-12-16 16:40:19,14769.0,2013-12-16 16:40:19,37.0,...,2013-12-16 16:40:19,14768.0,2013-12-16 16:40:20,14768.0,2013-12-16 16:40:21,14768.0,2013-12-16 16:40:22,14768.0,2013-12-16 16:40:24,0
3,4,782,2014-03-28 10:52:12,782.0,2014-03-28 10:52:42,782.0,2014-03-28 10:53:12,782.0,2014-03-28 10:53:42,782.0,...,2014-03-28 10:54:42,782.0,2014-03-28 10:55:12,782.0,2014-03-28 10:55:42,782.0,2014-03-28 10:56:12,782.0,2014-03-28 10:56:42,0
4,5,22,2014-02-28 10:53:05,177.0,2014-02-28 10:55:22,175.0,2014-02-28 10:55:22,178.0,2014-02-28 10:55:23,177.0,...,2014-02-28 10:55:59,175.0,2014-02-28 10:55:59,177.0,2014-02-28 10:55:59,177.0,2014-02-28 10:57:06,178.0,2014-02-28 10:57:11,0


In [7]:
# Load the pickled `site_dic` object from disk.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only open site_dic.pkl if it comes from a trusted source.
with open('site_dic.pkl', 'rb') as site_dic_file:
    site_dic = pickle.load(site_dic_file)

## Convert data to sparse matrix

In [12]:
# Names of the ten site columns: site1 ... site10.
sites = [f'site{position}' for position in range(1, 11)]

# Dump every session as one space-separated line of integer site values
# (missing entries are written as 0), with neither index nor header row.
for frame, text_path in (
    (train, 'train_session_text.text'),
    (test, 'test_session_text.text'),
):
    site_ids = frame[sites].fillna(0).astype('int')
    site_ids.to_csv(text_path, sep=' ', index=None, header=None)

In [51]:
%%time 
y_train = train.target.astype('int')
cv = CountVectorizer()
with open('train_session_text.text') as f:
    X_train = cv.fit_transform(f)
with open('test_session_text.text') as f:
    X_test = cv.transform(f)
print(X_train.shape, X_test.shape)

(253561, 41592) (82797, 41592)
CPU times: total: 2.94 s
Wall time: 2.95 s


In [52]:
# Logistic-regression baseline with a fixed random_state.
logit = LogisticRegression(
    C=1,
    max_iter=1000,
    random_state=17,
)

In [53]:
%%time
cv_scores = cross_val_score(logit, X_train, y_train, cv=5, scoring='roc_auc')

CPU times: total: 1min 45s
Wall time: 30.1 s


In [56]:
# Average ROC AUC over the cross-validation folds.
np.mean(cv_scores)

0.962679927207865

In [54]:
%%time
logit.fit(X_train, y_train)

CPU times: total: 25 s
Wall time: 8.11 s


LogisticRegression(C=1, max_iter=1000, random_state=17)

In [55]:
# Class probabilities for the test sessions; keep only the second column,
# i.e. the probability of logit.classes_[1].
test_proba = logit.predict_proba(X_test)
test_pred_logit1 = test_proba[:, 1]

In [59]:
# Mean 5-fold CV ROC AUC: 0.962679927207865
# ROC AUC 0.90745 (presumably the score this submission received; confirm)
write_to_submission_file(
    predicted_labels=test_pred_logit1,
    out_file='logit_subm1.txt',
)