In [1]:
import pickle
import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

In [2]:
# Column names of the ten timestamp / site slots stored for every session.
times = ['time%s' % slot for slot in range(1, 11)]
sites = ['site%s' % slot for slot in range(1, 11)]

# Both files are read the same way: session_id becomes the index and the
# time1..time10 columns are parsed as datetimes.
train = pd.read_csv(
    'train_sessions.csv',
    index_col='session_id',
    parse_dates=times,
)
test = pd.read_csv(
    'test_sessions.csv',
    index_col='session_id',
    parse_dates=times,
)

# Put the training sessions in chronological order of their first timestamp;
# the time-based CV splitter used later in the notebook depends on row order.
train = train.sort_values(by='time1')

# Number of training rows: rows [0, idx) of `data` come from train, the rest from test.
idx = len(train)
# `train` itself (including train.target) is left as-is for EDA.
data = pd.concat([train, test], sort=False)

train.shape, test.shape, data.shape

((253561, 21), (82797, 20), (336358, 21))

In [3]:
# Empty site slots are NaN; pad them with 0 and cast to an integer dtype so the
# ids are rendered as "55" rather than "55.0" when converted to text.
data[sites] = data[sites].fillna(0).astype(np.uint16)

# One text "document" per session: its ten site ids joined by single spaces.
# The built-in `str` is used because the `np.str` alias was deprecated in
# NumPy 1.20 and removed in 1.24 (it raises AttributeError there).
# Kept as a standalone Series, so `data` never gains a temporary column that
# has to be dropped again.
session_words = data[sites].astype(str).apply(' '.join, axis=1)

# Alternative that was tried with the same settings:
#   CountVectorizer(max_features=50000, ngram_range=(1, 3))
# NOTE(review): scikit-learn's default token_pattern keeps only tokens of two or
# more characters, so single-digit ids (including the 0 padding) are presumably
# ignored by the vectorizer - confirm this is intended.
words = TfidfVectorizer(max_features=50000, ngram_range=(1, 3)).fit_transform(session_words)

words

<336358x50000 sparse matrix of type '<class 'numpy.float64'>'
	with 4433718 stored elements in Compressed Sparse Row format>

In [4]:
# Forward-chaining splitter with 10 splits, shared by the CV runs below.
time_split = TimeSeriesSplit(n_splits=10)

# Logistic regression with a fixed seed so repeated runs give the same scores.
model = LogisticRegression(solver='liblinear', random_state=17)

# Date range covered by the first timestamp of the train and test sessions.
(
    train['time1'].min(),
    train['time1'].max(),
    test['time1'].min(),
    test['time1'].max(),
)

(Timestamp('2013-01-12 08:05:57'),
 Timestamp('2014-04-30 23:39:53'),
 Timestamp('2014-05-01 17:14:03'),
 Timestamp('2014-12-05 23:26:53'))

In [5]:
# `words` was built from train followed by test, so its first `idx` rows are
# the (time-sorted) training sessions.
y_train = train['target']
X_train = words[:idx]

# ROC AUC on each fold of the time-ordered split.
cv_scores = cross_val_score(
    model,
    X_train,
    y_train,
    scoring='roc_auc',
    cv=time_split,
)

# Mean scores noted from earlier runs:
# 0.8670500571969433 CountVectorizer
# 0.8664051910501502 TfidfVectorizer
cv_scores, cv_scores.mean()

(array([0.81423928, 0.65396238, 0.87473037, 0.93492792, 0.84785245,
        0.88841551, 0.92458996, 0.87527804, 0.92858727, 0.92143771]),
 0.866402090529809)

In [6]:
# --- Session duration -------------------------------------------------------
# Earliest and latest timestamp among the ten slots of each session (the
# row-wise min/max skip missing slots). Held in local Series rather than in
# temporary `min`/`max` columns that would have to be dropped again.
session_start = data[times].min(axis=1)
session_end = data[times].max(axis=1)
session_length = session_end - session_start

data['seconds'] = session_length / np.timedelta64(1, 's')
data['minutes'] = (session_length / np.timedelta64(1, 'm')).round(2)

# --- Calendar features of the first timestamp --------------------------------
# The vectorised `.dt` accessor replaces the row-by-row `.apply(lambda ts: ...)`.
first_visit = data['time1']

# Months elapsed since the start of 2013 (Jan 2013 -> 1, Jan 2014 -> 13, ...).
data['month'] = (first_visit.dt.month + 12 * (first_visit.dt.year - 2013)).astype(np.int8)
# Year and month packed into one integer, e.g. 201404. Like `month`, this grows
# monotonically with calendar time, so it acts as a coarse time-trend feature.
data['yyyymm'] = (100 * first_visit.dt.year + first_visit.dt.month).astype(np.int32)
data['mm'] = first_visit.dt.month.astype(np.int8)
# int16, not int8: years such as 2013 do not fit into int8 (-128..127) and
# would overflow (wrapping to negative values, or raising, depending on the
# NumPy/pandas version).
data['yyyy'] = first_visit.dt.year.astype(np.int16)

# pandas numbers weekdays Monday=0 ... Sunday=6, so the weekend is {5, 6}.
# The previous `> 5` comparison flagged Sundays only.
data['dayofweek'] = first_visit.dt.dayofweek.astype(np.int8)
data['weekend'] = (first_visit.dt.dayofweek >= 5).astype(np.int8)

data['hour'] = first_visit.dt.hour.astype(np.int8)

In [7]:
# Sanity check: earliest timestamp of every session, taken across the ten time columns.
data[times].min(axis='columns')

session_id
21669    2013-01-12 08:05:57
54843    2013-01-12 08:37:23
77292    2013-01-12 08:50:13
114021   2013-01-12 08:50:17
146670   2013-01-12 08:50:20
242171   2013-01-12 08:50:22
57157    2013-01-12 08:50:25
240201   2013-01-12 08:50:28
210686   2013-01-12 08:50:31
98804    2013-01-12 08:50:37
113494   2013-01-12 08:51:05
223837   2013-01-12 08:51:36
145475   2013-01-12 08:52:27
186633   2013-01-12 08:53:09
45193    2013-01-12 08:53:25
102930   2013-01-12 08:55:10
15789    2013-01-12 08:59:34
205868   2013-01-12 09:10:04
21102    2013-01-12 09:14:59
193454   2013-01-12 09:15:38
252924   2013-01-12 09:20:03
12713    2013-01-12 09:23:33
182016   2013-01-12 09:25:03
55750    2013-01-12 09:37:32
237109   2013-01-12 09:56:03
120126   2013-01-12 10:02:11
140105   2013-01-12 10:02:46
129357   2013-01-12 10:05:08
136699   2013-01-12 10:05:32
113189   2013-01-12 10:06:19
                 ...        
82768    2014-05-16 15:46:49
82769    2014-10-02 13:29:11
82770    2014-05-28 17:01:59
827