In [6]:
from collections import Counter
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack, coo_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

In [7]:
# Load the train and test session tables, both indexed by session_id.
train_df, test_df = (
    pd.read_csv(csv_path, index_col='session_id')
    for csv_path in ('data/train_sessions.csv', 'data/test_sessions.csv')
)

In [9]:
# Names of the timestamp columns: time1 ... time10.
times = ['time%s' % position for position in range(1, 11)]

In [10]:
# Parse every timestamp column into datetime values, in both tables.
for sessions in (train_df, test_df):
    sessions[times] = sessions[times].apply(pd.to_datetime)

In [11]:
# Names of the site-id columns: site1 ... site10.
sites = ['site%s' % position for position in range(1, 11)]

# Missing site ids are replaced by 0 and the columns are stored as integers.
for site_table in (train_df, test_df):
    site_table[sites] = site_table[sites].fillna(0).astype(np.int64)

In [12]:
# Number of training rows: rows [0, train_test_separator) of the union below
# come from train_df, the remaining rows from test_df.
train_test_separator = len(train_df)

# Move the target column out of the training table.
y_train = train_df.pop('target')

# Stack train on top of test so both share one feature space.
union_df = pd.concat(objs=[train_df, test_df], axis=0)

In [14]:
# Row-major flattening: each consecutive run of 10 values is one session.
sites_flatten = union_df[sites].values.flatten()

# Build a CSR matrix from the (data, indices, indptr) triple: every visited
# site id contributes a 1 in the column with that id, and a new row starts
# every 10 entries. Column 0 holds the filler id used for missing sites, so it
# is dropped by the final slice.
full_sites_sparse = csr_matrix(
    (
        [1] * sites_flatten.size,
        sites_flatten,
        range(0, sites_flatten.size + 10, 10),
    )
)[:, 1:]

In [17]:
def get_auc_lr_valid(X, y, C=1.0, ratio=0.9, seed=17):
    '''
    Fit a logistic regression on the head of the sample and score its tail.

    The first int(X.shape[0] * ratio) rows are used for fitting and the
    remaining rows for validation; the rows are not shuffled.

    X, y  - the sample (features and target)
    ratio - fraction of the rows that goes to the training part
    C     - regularization coefficient of the logistic regression
    seed  - random_state of the logistic regression

    Returns the ROC AUC computed on the validation part from the second
    column of predict_proba.
    '''
    split_at = int(X.shape[0] * ratio)

    model = LogisticRegression(C=C, random_state=seed, n_jobs=-1)
    model.fit(X[:split_at, :], y[:split_at])

    holdout_scores = model.predict_proba(X[split_at:, :])[:, 1]
    return roc_auc_score(y[split_at:], holdout_scores)

In [18]:
%%time
get_auc_lr_valid(full_sites_sparse[:train_test_separator], y_train)

CPU times: user 178 ms, sys: 181 ms, total: 360 ms
Wall time: 3.27 s


0.964210879276034

In [None]:
# Fit on every training row of the site-visit matrix.
logit = LogisticRegression(random_state=17, n_jobs=-1)
logit.fit(X=full_sites_sparse[:train_test_separator], y=y_train)

In [15]:
# Distinct hours of day at which training sessions start (taken from time1).
coun_ = {start_hour for start_hour in train_df['time1'].dt.hour}

In [None]:
# Add a new feature: the hour at which the session starts, one-hot encoded
# into columns named hour<H>.
train_df, test_df = (
    sessions.join(pd.get_dummies(sessions['time1'].dt.hour).add_prefix('hour'))
    for sessions in (train_df, test_df)
)

In [16]:
# Names of the one-hot hour columns, one per start hour collected in coun_.
hour = ['hour' + str(h) for h in coun_]

In [None]:
# Rebuild the train+test union so that it includes the new hour columns;
# sort=True orders the columns alphabetically when the two frames differ.
union_df = pd.concat(objs=[train_df, test_df], axis=0, sort=True)

In [None]:
# Append the one-hot start-hour columns to the site-visit matrix.
#
# The stacking is done with scipy.sparse.hstack so the site matrix is never
# converted to a dense array (one column per site id would not fit in memory
# for a large vocabulary).
#
# An hour column that exists in only one of train/test is NaN for the other
# frame's rows after the concat; those entries mean "session did not start at
# this hour", so they are mapped to 0 before building the sparse block.
#
# CSR is requested explicitly so the row slicing applied to the result later
# keeps working whatever format hstack would otherwise return.
full_sites_hour_sparse = hstack(
    [
        full_sites_sparse,
        csr_matrix(np.nan_to_num(union_df[hour].values.astype(np.float64))),
    ],
    format='csr',
)

In [None]:
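# Sparse variant of the stacking step, kept for reference.
# scipy.sparse.hstack takes a single list of blocks (not separate positional
# arguments), and format='csr' keeps row slicing available on the result:
# del full_sites_hour_sparse
# full_sites_hour_sparse = hstack([full_sites_sparse, csr_matrix(union_df[hour].values)], format='csr')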

In [None]:
%%time
get_auc_lr_valid(full_sites_hour_sparse[:train_test_separator], y_train)

In [None]:
# Final model: refit on every training row using the combined site + start-hour
# features that were validated in the previous cell. Fitting on
# full_sites_sparse here would silently drop the hour features and reproduce
# the earlier sites-only model.
logit = LogisticRegression(n_jobs=-1, random_state=17)
logit.fit(full_sites_hour_sparse[:train_test_separator], y_train)