In [131]:
import numpy as np
import pandas as pd
import matplotlib 
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack, csr_matrix
from sklearn.metrics import roc_auc_score
import datetime

%matplotlib inline

In [3]:
def write_to_submission_file(predicted_labels, out_file, target='target', index_label='session_id'):
    predicted_df = pd.DataFrame(predicted_labels, index=np.arange(1, predicted_labels.shape[0] + 1), columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

# Task exploration

Bla bla

In [22]:
train = pd.read_csv('train_sessions.csv')
test = pd.read_csv('test_sessions.csv')

times = ["time%s" % i for i in range(1, 11)]
train[times] = train[times].apply(pd.to_datetime)
test[times] = test[times].apply(pd.to_datetime)

# отсортируем данные по времени
train = train.sort_values(by="time1")

train.head()

Unnamed: 0,session_id,site1,time1,site2,time2,site3,time3,site4,time4,site5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
21668,21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54842,54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77291,77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114020,114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146669,146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [26]:
import pickle


with open('site_dic.pkl', 'rb') as f:
    site_dict = pickle.load(f)
    
sites_dict_df = pd.DataFrame(
    list(site_dict.keys()), index=list(site_dict.values()), columns=["site"]
)

## Convert data to sparse matrix

In [24]:
sites = ['site%s' % i for i in range(1, 11)]
train[sites].fillna(0).astype('int').to_csv('train_session_text.text', sep=' ', index=None, header=None)
test[sites].fillna(0).astype('int').to_csv('test_session_text.text', sep=' ', index=None, header=None)

In [25]:
%%time 
y_train = train.target.astype('int')
cv = CountVectorizer()
with open('train_session_text.text') as f:
    X_train = cv.fit_transform(f)
with open('test_session_text.text') as f:
    X_test = cv.transform(f)
print(X_train.shape, X_test.shape)

(253561, 41592) (82797, 41592)
CPU times: total: 2.94 s
Wall time: 3.01 s


## Baseline Logit

In [None]:
logit = LogisticRegression(random_state=17, C=1, max_iter=1000)

In [None]:
%%time
cv_scores = cross_val_score(logit, X_train, y_train, cv=5, scoring='roc_auc')

In [10]:
cv_scores.mean()

0.962679927207865

In [11]:
%%time
logit.fit(X_train, y_train)

CPU times: total: 28.2 s
Wall time: 7.54 s


LogisticRegression(C=1, max_iter=1000, random_state=17)

In [12]:
test_pred_logit1 = logit.predict_proba(X_test)[:, 1]

In [13]:
# CV = 0.962679927207865
# ROC AUC 0.90745
write_to_submission_file(test_pred_logit1, 'logit_subm1.txt')

## Function to check ROC AUC of models 

In [53]:
def get_auc_lr_valid(X, y, C=1.0, ratio=0.9, seed=17):
    train_len = int(ratio * X.shape[0])
    X_train = X[:train_len, :]
    X_valid = X[train_len:, :]
    y_train = y[:train_len]
    y_valid = y[train_len:]
    
    logit = LogisticRegression(C=C, random_state=seed, max_iter=1000)
    logit.fit(X_train, y_train)
    
    valid_pred = logit.predict_proba(X_valid)[:, 1]
    
    return roc_auc_score(y_valid, valid_pred)

In [178]:
def get_lr_prediction(X_train, y_train, X_test, C=1.0, seed=17):
    logit = LogisticRegression(random_state=17, C=1, max_iter=1000)
    logit.fit(X_train, y_train)
    return(logit.predict_proba(X_test)[:, 1])

# Generate some new features

### Add year and month of session

In [161]:
def add_year_month_feat(df, X_sparse, scaled=True):
    year_month_feat = pd.DataFrame(index=df.index)
    year_month_feat['year_month'] = df['time1'].apply(lambda ts: 100 * ts.year + ts.month)
    if scaled:
        year_month_scaler = StandardScaler()
        year_month_feat['year_month_scaled'] = scaler_year.fit_transform(year_month_feat)
        return csr_matrix(hstack([X_sparse, year_month_feat['year_month_scaled'].values.reshape(-1, 1)]))
    else:
        return csr_matrix(hstack([X_sparse, year_month_feat['year_month'].values.reshape(-1, 1)]))

In [162]:
X_year_month_train = add_year_month_feat(train, X_train)
X_year_month_test = add_year_month_feat(test, X_test)

In [163]:
%%time 

get_auc_lr_valid(X_year_month_train, y_train)

CPU times: total: 26 s
Wall time: 7.01 s


0.9160670789511502

### Add Time of Day

In [164]:
def add_day_time_feature(df, X_sparse):
    session_hour = df['time1'].apply(lambda tt: tt.hour)
    night = ((session_hour >= 0) & (session_hour <= 6)).astype('int').values
    morning = ((session_hour >= 7) & (session_hour <= 11)).astype('int').values
    day = ((session_hour >= 12) & (session_hour <= 18)).astype('int').values
    evening = ((session_hour >= 19) & (session_hour <= 23)).astype('int').values
    
    return csr_matrix(hstack([X_sparse, night.reshape(-1, 1), morning.reshape(-1, 1), \
                              day.reshape(-1, 1), evening.reshape(-1, 1)]))

In [165]:
X_year_day_time_train = add_day_time_feature(train, X_year_month_train)
X_year_day_time_test = add_day_time_feature(test, X_year_month_test)

In [166]:
%%time 

get_auc_lr_valid(X_year_day_time_train, y_train)

CPU times: total: 28.7 s
Wall time: 7.67 s


0.9480375427651562

### Add time of session

In [174]:
def add_delta_time_feat(df, X_sparse, scaled=True):
    delta_time = pd.DataFrame(index=df.index)
    delta_time['delta_time'] = df['time10'] - df['time1']
    delta_time['delta_time'] = delta_time['delta_time'].apply(lambda dt: dt.total_seconds())
    delta_time = delta_time.fillna(0)
    #print(delta_time.shape)
    if scaled:
        time_scaler = StandardScaler()
        delta_time['delta_time_scaled'] = time_scaler.fit_transform(delta_time) 
        return csr_matrix(hstack([X_sparse, delta_time['delta_time_scaled'].values.reshape(-1, 1)]))
    else:
        return csr_matrix(hstack([X_sparse, delta_time.values.reshape(-1, 1)]))

In [175]:
X_year_day_delta_time_train = add_delta_time_feat(train, X_year_day_time_train)
X_year_day_delta_time_test = add_delta_time_feat(test, X_year_day_time_test)

In [176]:
%%time 

get_auc_lr_valid(X_year_day_delta_time_train, y_train)

CPU times: total: 29 s
Wall time: 7.6 s


0.9478370763789008

In [180]:
three_new_feat_prediction = get_lr_prediction(X_year_day_delta_time_train, y_train, X_year_day_delta_time_test)

In [182]:
write_to_submission_file(three_new_feat_prediction, 'day_time_session_time_year_logit_pred.txt')