In [31]:
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

import pickle
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

In [32]:
# Read the training and test data sets
train_df = pd.read_csv('train_sessions.csv',
                       index_col='session_id', parse_dates=['time1'])
test_df = pd.read_csv('test_sessions.csv',
                      index_col='session_id', parse_dates=['time1'])

# Sort the data by time
train_df = train_df.sort_values(by='time1')


FileNotFoundError: [Errno 2] File b'../input/train_sessions.csv' does not exist: b'../input/train_sessions.csv'

In [33]:
sites = ['site%s' % i for i in range(1, 11)]
train_df[sites] = train_df[sites].fillna(0).astype('int')
test_df[sites] = test_df[sites].fillna(0).astype('int')

# Load websites dictionary
with open(r"../input/site_dic.pkl", "rb") as input_file:
    site_dict = pickle.load(input_file)

# Create dataframe for the dictionary
sites_dict = pd.DataFrame(list(site_dict.keys()), index=list(site_dict.values()), columns=['site'])
print(u'Websites total:', sites_dict.shape[0])
sites_dict.head()

FileNotFoundError: [Errno 2] No such file or directory: '../input/site_dic.pkl'

In [7]:
test_df.shape, train_df.shape

((82797, 20), (253561, 21))

In [8]:
y_train = train_df['target']

In [9]:
y_train.head()

session_id
21669     0
54843     0
77292     0
114021    0
146670    0
Name: target, dtype: int64

In [10]:
full_df = pd.concat([train_df.drop('target', axis=1), test_df])

In [11]:
idx_split = train_df.shape[0]

In [12]:
train_df[sites].fillna(0).to_csv('train_sessions_text.txt', 
                                 sep=' ', index=None, header=None)
test_df[sites].fillna(0).to_csv('test_sessions_text.txt', 
                                sep=' ', index=None, header=None)

In [13]:
%%time
cv = CountVectorizer(ngram_range=(1, 3), max_features=100000)
with open('train_sessions_text.txt') as inp_train_file:
    X_train = cv.fit_transform(inp_train_file)
with open('test_sessions_text.txt') as inp_test_file:
    X_test = cv.transform(inp_test_file)
print(X_train.shape, X_test.shape)

(253561, 100000) (82797, 100000)
Wall time: 9.63 s


In [14]:
def get_auc_lr_valid(X, y, C=1.0, seed=17, ratio = 0.9):
    # Split the data into the training and validation sets
    idx = int(round(X.shape[0] * ratio))
    # Classifier training
    lr = LogisticRegression(C=C, random_state=seed, solver='lbfgs', max_iter=3000).fit(X[:idx, :], y[:idx])
    # Prediction for validation set
    y_pred = lr.predict_proba(X[idx:, :])[:, 1]
    # Calculate the quality
    score = roc_auc_score(y[idx:], y_pred)
    
    return score

In [15]:
%%time
# Calculate metric on the validation set
print(get_auc_lr_valid(X_train, y_train))

0.9130394089145882
Wall time: 11.4 s


In [18]:
train_df_newfeatures = pd.DataFrame(index=train_df.index)

In [19]:
train_df_newfeatures['year'] = train_df['time1'].apply(lambda ts : ts.year * 100 + ts.month)

In [20]:
train_df_newfeatures.head()

Unnamed: 0_level_0,year
session_id,Unnamed: 1_level_1
21669,201301
54843,201301
77292,201301
114021,201301
146670,201301


In [21]:
test_df_newfeatures = pd.DataFrame(index=test_df.index)

In [22]:
test_df_newfeatures['year'] = test_df['time1'].apply(lambda ts : ts.year * 100 + ts.month)

In [23]:
test_df_newfeatures.head()

Unnamed: 0_level_0,year
session_id,Unnamed: 1_level_1
1,201410
2,201407
3,201412
4,201411
5,201405


In [24]:
scaler = StandardScaler()

In [25]:
scaler.fit(test_df_newfeatures['year'].values.reshape(-1, 1))

StandardScaler(copy=True, with_mean=True, with_std=True)

In [26]:
train_df_newfeatures['year_scaled'] = scaler.transform(train_df_newfeatures['year'].values.reshape(-1,1))

In [27]:
test_df_newfeatures['year_scaled'] = scaler.transform(test_df_newfeatures['year'].values.reshape(-1,1))

In [28]:
X_train_new = csr_matrix(hstack([X_train, train_df_newfeatures['year_scaled'].values.reshape(-1, 1)]))

In [29]:
X_train.shape, X_train_new.shape

((253561, 100000), (253561, 100001))

In [30]:
get_auc_lr_valid(X_train_new, y_train)

0.9118056159394037