In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

import pickle
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

In [2]:
# Names of the time1..time10 columns; they are parsed as datetimes on load
times = ['time{}'.format(i) for i in range(1, 11)]

# Training sessions, indexed by session_id and ordered by the 'time1' column
train_df = pd.read_csv(
    'train_sessions.csv', parse_dates=times, index_col='session_id'
).sort_values(by='time1')

# Test sessions are read the same way but left in file order
test_df = pd.read_csv(
    'test_sessions.csv', parse_dates=times, index_col='session_id'
)


In [3]:
# Names of the site1..site10 columns
sites = ['site{}'.format(i) for i in range(1, 11)]

# Missing site ids become 0 so the columns can be stored as integers
train_df[sites] = train_df[sites].fillna(0).astype(int)
test_df[sites] = test_df[sites].fillna(0).astype(int)

# Load websites dictionary.
# NOTE(review): pickle.load can execute arbitrary code; only open a
# site_dic.pkl that comes from a trusted source.
with open('site_dic.pkl', mode='rb') as input_file:
    site_dict = pickle.load(input_file)

# Table view of the dictionary: its values become the index and its keys
# the single 'site' column
sites_dict = pd.DataFrame(
    {'site': list(site_dict.keys())},
    index=list(site_dict.values()),
)
print(u'Websites total:', sites_dict.shape[0])

Websites total: 48371


In [4]:
# Labels for the training sessions, as a plain array
y_train = train_df.loc[:, 'target'].values

# Number of training rows: rows before this position in full_df come from
# train_df, the remaining rows from test_df
idx_split = len(train_df)

# Training sessions (without the label column) stacked on top of the test sessions
full_df = pd.concat([train_df.drop(columns='target'), test_df])

In [39]:
# Positional train / validation split of the training sessions.
# The validation part must be taken with a slice: `train_df[split_id]` is a
# column lookup with an integer key and raises KeyError.
# NOTE(review): split_id is the full row count, so X_valid is empty as written;
# lower split_id (e.g. to a fraction of the rows) if a real hold-out is wanted.
# X_train is reassigned by the CountVectorizer cell further down.
split_id = train_df.shape[0]
X_train = train_df.iloc[:split_id]
X_valid = train_df.iloc[split_id:]

In [5]:
# Write the site-id columns as plain text: one session per line, ids separated
# by a single space, without the index and without a header row
train_df[sites].fillna(0).to_csv(
    'train_sessions_text.txt', header=None, index=None, sep=' '
)
test_df[sites].fillna(0).to_csv(
    'test_sessions_text.txt', header=None, index=None, sep=' '
)

In [6]:
%%time
# Count 1- to 3-grams of site ids per session, keeping at most 50000 features
cv = CountVectorizer(max_features=50000, ngram_range=(1, 3))

# The vocabulary is learned from the training text only and then applied
# unchanged to the test text
with open('train_sessions_text.txt') as inp_train_file, \
        open('test_sessions_text.txt') as inp_test_file:
    X_train = cv.fit_transform(inp_train_file)
    X_test = cv.transform(inp_test_file)

print(X_train.shape, X_test.shape)

(253561, 50000) (82797, 50000)
CPU times: user 9.5 s, sys: 185 ms, total: 9.69 s
Wall time: 7.43 s


In [7]:
def get_auc_lr_valid(X, y, C=1.0, seed=17, ratio=0.9):
    """Fit logistic regression on the leading rows and score it on the rest.

    The first ``round(n_rows * ratio)`` rows of ``X`` / ``y`` are used for
    fitting and the remaining rows for validation; the rows are not shuffled.

    Parameters
    ----------
    X : row-sliceable 2-D feature matrix (indexed as ``X[:k, :]``).
    y : labels aligned with the rows of ``X``.
    C : value passed to ``LogisticRegression(C=...)``.
    seed : value passed as ``random_state``.
    ratio : fraction of rows, counted from the top, used for fitting.

    Returns
    -------
    ROC AUC of the predicted class-1 probabilities on the validation rows.
    """
    n_fit = int(round(X.shape[0] * ratio))
    X_fit, X_holdout = X[:n_fit, :], X[n_fit:, :]
    y_fit, y_holdout = y[:n_fit], y[n_fit:]

    model = LogisticRegression(C=C, random_state=seed, solver='lbfgs', max_iter=2000)
    model.fit(X_fit, y_fit)

    # Column 1 of predict_proba holds the probability of the positive class
    holdout_proba = model.predict_proba(X_holdout)[:, 1]
    return roc_auc_score(y_holdout, holdout_proba)

In [8]:
%%time
# Validation ROC AUC for the model trained on the CountVectorizer features only
print(get_auc_lr_valid(X=X_train, y=y_train))

0.9132519600597074
CPU times: user 1min 28s, sys: 1.49 s, total: 1min 30s
Wall time: 7.61 s


In [9]:
# Empty frames that share the session index of train / test; the engineered
# features are added to them column by column
train_df_newfeatures, test_df_newfeatures = (
    pd.DataFrame(index=source.index) for source in (train_df, test_df)
)

In [10]:
# Year and month of the 'time1' timestamp packed into one integer YYYYMM
# (year * 100 + month). The column keeps the name 'year' because later cells
# refer to it. The vectorised .dt accessor replaces the per-row lambda.
train_df_newfeatures['year'] = train_df['time1'].dt.year * 100 + train_df['time1'].dt.month
test_df_newfeatures['year'] = test_df['time1'].dt.year * 100 + test_df['time1'].dt.month

In [11]:
# Hour of day (0-23) of the 'time1' timestamp, read with the vectorised .dt
# accessor instead of a per-row lambda
train_df_newfeatures['hour'] = train_df['time1'].dt.hour
test_df_newfeatures['hour'] = test_df['time1'].dt.hour

In [12]:
# Morning flag: +0.5 when the hour is strictly between 5 and 13, otherwise -0.5.
# Each frame is tested against its own 'hour' column. Mixing in the training
# hours for the test frame would align two masks on unrelated session ids and
# corrupt the test feature.
train_df_newfeatures['morning'] = ((train_df_newfeatures['hour'] > 5) & (train_df_newfeatures['hour'] < 13)) * 1 - 0.5
test_df_newfeatures['morning'] = ((test_df_newfeatures['hour'] > 5) & (test_df_newfeatures['hour'] < 13)) * 1 - 0.5

In [13]:
times = ['time%s' % i for i in range(1, 11)]

# Session length in whole seconds: latest minus earliest timestamp in the row.
# total_seconds() is used because Timedelta.seconds is only the seconds
# component and wraps around for spans of a day or more.
# The column name keeps its original spelling ('session_lenght') because later
# cells refer to it.
train_df_newfeatures['session_lenght'] = (train_df[times].max(axis=1) - train_df[times].min(axis=1)).dt.total_seconds().astype(int)
test_df_newfeatures['session_lenght'] = (test_df[times].max(axis=1) - test_df[times].min(axis=1)).dt.total_seconds().astype(int)

In [14]:
# One scaler object, re-fitted for each feature below; the arguments spell out
# the defaults (centre on the mean, scale to unit variance, work on a copy)
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [15]:
# Learn the scaling statistics for 'year' from the training sessions.
# The scaler must not be fitted on the test frame: test data would then define
# the scaling applied to the training features.
scaler.fit(train_df_newfeatures['year'].values.reshape(-1, 1))

StandardScaler(copy=True, with_mean=True, with_std=True)

In [16]:
# Standardise 'year': the statistics come from the training column and are
# reused unchanged for the test column. Selecting with [['year']] yields the
# (n, 1) array the scaler expects.
train_df_newfeatures['year_scaled'] = scaler.fit_transform(train_df_newfeatures[['year']].values)
test_df_newfeatures['year_scaled'] = scaler.transform(test_df_newfeatures[['year']].values)

## Scale hours

In [17]:
# Learn the scaling statistics for 'hour' from the training sessions.
# The scaler must not be fitted on the test frame: test data would then define
# the scaling applied to the training features.
scaler.fit(train_df_newfeatures['hour'].values.reshape(-1, 1))

StandardScaler(copy=True, with_mean=True, with_std=True)

In [18]:
# Standardise 'hour': the statistics come from the training column and are
# reused unchanged for the test column. Selecting with [['hour']] yields the
# (n, 1) array the scaler expects.
train_df_newfeatures['hour_scaled'] = scaler.fit_transform(train_df_newfeatures[['hour']].values)
test_df_newfeatures['hour_scaled'] = scaler.transform(test_df_newfeatures[['hour']].values)

## Scale session length

In [19]:
# Learn the scaling statistics for 'session_lenght' from the training sessions.
# The scaler must not be fitted on the test frame: test data would then define
# the scaling applied to the training features.
scaler.fit(train_df_newfeatures['session_lenght'].values.reshape(-1, 1))

StandardScaler(copy=True, with_mean=True, with_std=True)

In [20]:
# Standardise 'session_lenght': the statistics come from the training column
# and are reused unchanged for the test column. Selecting with
# [['session_lenght']] yields the (n, 1) array the scaler expects.
train_df_newfeatures['session_lenght_scaled'] = scaler.fit_transform(train_df_newfeatures[['session_lenght']].values)
test_df_newfeatures['session_lenght_scaled'] = scaler.transform(test_df_newfeatures[['session_lenght']].values)

## Add new features to dataset

In [21]:
# Append the four engineered columns to the right of the n-gram count matrices
# and store the result in CSR format
X_train_new = hstack([
    X_train,
    train_df_newfeatures[['year_scaled', 'hour_scaled', 'morning', 'session_lenght_scaled']],
]).tocsr()
X_test_new = hstack([
    X_test,
    test_df_newfeatures[['year_scaled', 'hour_scaled', 'morning', 'session_lenght_scaled']],
]).tocsr()

In [22]:
# Shapes before / after adding the engineered columns (training matrices)
tuple(matrix.shape for matrix in (X_train, X_train_new))

((253561, 50000), (253561, 50004))

In [23]:
# Shapes before / after adding the engineered columns (test matrices)
tuple(matrix.shape for matrix in (X_test, X_test_new))

((82797, 50000), (82797, 50004))

In [24]:
# Validation ROC AUC with the engineered columns included (default C)
get_auc_lr_valid(X=X_train_new, y=y_train)

0.9547832159471328

In [25]:
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    """Write predictions to a CSV file with a 1-based row id column.

    Parameters
    ----------
    predicted_labels : array with a ``shape`` attribute; one prediction per row.
    out_file : path or writable text buffer, passed on to ``DataFrame.to_csv``.
    target : header of the prediction column.
    index_label : header of the id column; ids run from 1 to the number of rows.
    """
    # Row ids are consecutive integers starting at 1, in the order given
    row_ids = np.arange(predicted_labels.shape[0]) + 1
    submission = pd.DataFrame(data=predicted_labels, index=row_ids, columns=[target])
    submission.to_csv(out_file, index_label=index_label)

In [26]:
# Fit on every training session, using the extended feature matrix
lr = LogisticRegression(C=1.0, random_state=17, solver='lbfgs', max_iter=500)
lr.fit(X_train_new, y_train)

# Probability of the positive class for each test session
y_test = lr.predict_proba(X_test_new)[:, 1]

# Save the predictions in submission format
write_to_submission_file(predicted_labels=y_test, out_file='baseline_1.csv')

In [27]:
# Second model: same setup but C=0.01 and up to 2000 lbfgs iterations
lr1 = LogisticRegression(
    C=0.01, max_iter=2000, solver='lbfgs', random_state=17
).fit(X=X_train_new, y=y_train)

In [28]:
# Positive-class probabilities from the C=0.01 model (lr1). Predicting with
# `lr` here would only reproduce the first submission.
y_test1 = lr1.predict_proba(X_test_new)[:, 1]

In [29]:
# NOTE(review): same file name as the earlier submission, so this call
# overwrites 'baseline_1.csv'. Confirm that is intended.
write_to_submission_file(predicted_labels=y_test1, out_file='baseline_1.csv')

In [33]:
# Validation ROC AUC with the engineered columns and C=10
get_auc_lr_valid(X=X_train_new, y=y_train, C=10)

0.9240884454299088

In [30]:
# 0.9537978270268442 - hour
# 0.9546807898448859 - hour + year
# 0.9547075096976458 - hour + year + morning