In [1]:
import pickle
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
import seaborn as sns
# Apply seaborn's default theme so all later matplotlib figures share one style
sns.set()

In [22]:
# Read the training and test data sets, change paths if needed
train_df = pd.read_csv('../data/alice/train_sessions.csv',
                       index_col='session_id')
test_df = pd.read_csv('../data/alice/test_sessions.csv',
                      index_col='session_id')

# Convert time1, ..., time10 columns to datetime type.
# pd.to_datetime is applied column by column; empty cells become NaT.
# (Done explicitly rather than via read_csv(parse_dates=...) so that an
# unparsable value raises instead of silently leaving the column as object.)
times = ['time%s' % i for i in range(1, 11)]
train_df[times] = train_df[times].apply(pd.to_datetime)
test_df[times] = test_df[times].apply(pd.to_datetime)

# Sort the training data by the time1 column.
# The test set is intentionally left in its original row order.
train_df = train_df.sort_values(by='time1')

# Look at the first rows of the test set (kept small to avoid dumping a huge table)
test_df.head(10)

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,site6,time6,site7,time7,site8,time8,site9,time9,site10,time10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,29,2014-10-04 11:19:53,35.0,2014-10-04 11:19:53,22.0,2014-10-04 11:19:54,321.0,2014-10-04 11:19:54,23.0,2014-10-04 11:19:54,2211.0,2014-10-04 11:19:54,6730.0,2014-10-04 11:19:54,21.0,2014-10-04 11:19:54,44582.0,2014-10-04 11:20:00,15336.0,2014-10-04 11:20:00
2,782,2014-07-03 11:00:28,782.0,2014-07-03 11:00:53,782.0,2014-07-03 11:00:58,782.0,2014-07-03 11:01:06,782.0,2014-07-03 11:01:09,782.0,2014-07-03 11:01:10,782.0,2014-07-03 11:01:23,782.0,2014-07-03 11:01:29,782.0,2014-07-03 11:01:30,782.0,2014-07-03 11:01:53
3,55,2014-12-05 15:55:12,55.0,2014-12-05 15:55:13,55.0,2014-12-05 15:55:14,55.0,2014-12-05 15:56:15,55.0,2014-12-05 15:56:16,55.0,2014-12-05 15:56:17,55.0,2014-12-05 15:56:18,55.0,2014-12-05 15:56:19,1445.0,2014-12-05 15:56:33,1445.0,2014-12-05 15:56:36
4,1023,2014-11-04 10:03:19,1022.0,2014-11-04 10:03:19,50.0,2014-11-04 10:03:20,222.0,2014-11-04 10:03:21,202.0,2014-11-04 10:03:21,3374.0,2014-11-04 10:03:22,50.0,2014-11-04 10:03:22,48.0,2014-11-04 10:03:22,48.0,2014-11-04 10:03:23,3374.0,2014-11-04 10:03:23
5,301,2014-05-16 15:05:31,301.0,2014-05-16 15:05:32,301.0,2014-05-16 15:05:33,66.0,2014-05-16 15:05:39,67.0,2014-05-16 15:05:40,69.0,2014-05-16 15:05:40,70.0,2014-05-16 15:05:40,68.0,2014-05-16 15:05:40,71.0,2014-05-16 15:05:40,167.0,2014-05-16 15:05:44
6,1218,2014-12-05 11:21:52,1220.0,2014-12-05 11:21:52,1202.0,2014-12-05 11:21:52,1221.0,2014-12-05 11:21:52,1222.0,2014-12-05 11:21:52,302.0,2014-12-05 11:21:52,1217.0,2014-12-05 11:21:53,1216.0,2014-12-05 11:21:53,1221.0,2014-12-05 11:21:53,1220.0,2014-12-05 11:21:53
7,43986,2014-05-21 14:10:43,43986.0,2014-05-21 14:10:44,43986.0,2014-05-21 14:10:45,43986.0,2014-05-21 14:10:46,43986.0,2014-05-21 14:11:01,43986.0,2014-05-21 14:11:04,43986.0,2014-05-21 14:11:05,43986.0,2014-05-21 14:11:06,43986.0,2014-05-21 14:11:07,43986.0,2014-05-21 14:11:08
8,3346,2014-05-25 18:32:04,3346.0,2014-05-25 18:33:59,23.0,2014-05-25 18:34:03,3346.0,2014-05-25 18:34:03,3359.0,2014-05-25 18:34:03,,NaT,,NaT,,NaT,,NaT,,NaT
9,1062,2014-06-05 10:25:52,21.0,2014-06-05 10:25:52,23.0,2014-06-05 10:25:52,22.0,2014-06-05 10:25:52,11009.0,2014-06-05 10:25:53,8220.0,2014-06-05 10:25:53,1063.0,2014-06-05 10:25:53,21.0,2014-06-05 10:25:53,2896.0,2014-06-05 10:25:54,292.0,2014-06-05 10:25:54
10,77,2014-05-02 11:48:47,77.0,2014-05-02 11:48:48,75.0,2014-05-02 11:48:48,876.0,2014-05-02 11:48:49,80.0,2014-05-02 11:48:49,76.0,2014-05-02 11:48:49,23.0,2014-05-02 11:48:51,76.0,2014-05-02 11:48:51,75.0,2014-05-02 11:48:51,77.0,2014-05-02 11:48:51


In [7]:
# Names of the ten site-id columns: site1, ..., site10
sites = ['site%s' % idx for idx in range(1, 11)]

# Replace missing site ids with 0 and store the ids as unsigned 16-bit integers,
# for both the training and the test frame
for frame in (train_df, test_df):
    frame[sites] = frame[sites].fillna(0).astype(np.uint16)

# Load websites dictionary.
# NOTE(review): pickle.load can execute arbitrary code - only open trusted files.
with open(r"../data/alice/site_dic.pkl", "rb") as input_file:
    site_dict = pickle.load(input_file)

# Turn the dictionary into a lookup table: index = dictionary values,
# single column 'site' = dictionary keys
sites_dict = pd.DataFrame(
    {'site': list(site_dict.keys())},
    index=list(site_dict.values()),
)
print(u'Websites total:', len(sites_dict))
sites_dict.head()

Websites total: 48371


Unnamed: 0,site
25075,www.abmecatronique.com
13997,groups.live.com
42436,majeureliguefootball.wordpress.com
30911,cdt46.media.tourinsoft.eu
8104,www.hdwallpapers.eu


In [10]:
# Report (rows, columns) of the training frame, then of the test frame
for frame in (train_df, test_df):
    print(frame.shape)

(253561, 21)
(82797, 20)


In [18]:
# Top websites in the training data set.
# All site1..site10 columns are pooled into one flat series of ids;
# value_counts() already returns the counts in descending order,
# so no additional sort is required before taking the top 10.
top_sites = pd.Series(train_df[sites].values.flatten()).value_counts().head(10)
print(top_sites)

# Id 0 is the placeholder used for missing site slots and has no entry in the
# dictionary, so drop it before the lookup. errors='ignore' keeps this cell
# working even when 0 does not make it into the top 10.
sites_dict.loc[top_sites.drop(0, errors='ignore').index]

0       45420
21      37175
782     31755
778     25813
23      24243
1202    20413
29      19580
22      17655
167     15293
80      14261
dtype: int64


Unnamed: 0,site
21,www.google.fr
782,annotathon.org
778,www.ncbi.nlm.nih.gov
23,www.google.com
1202,claroline.univ-bpclermont.fr
29,www.facebook.com
22,apis.google.com
167,www.bing.com
80,s.youtube.com


In [21]:

# Create a separate dataframe where we will work with timestamps.
# Guard against a stale kernel state (e.g. cells run out of order so that
# train_df no longer carries the labels): fail with an actionable message
# instead of a bare KeyError: 'target'.
if 'target' not in train_df.columns:
    raise KeyError(
        "train_df has no 'target' column; re-run the data loading cell "
        "so that train_df is read from train_sessions.csv"
    )

time_df = pd.DataFrame(index=train_df.index)
time_df['target'] = train_df['target']

# Find sessions' starting and ending (row-wise over time1..time10; NaT is skipped)
time_df['min'] = train_df[times].min(axis=1)
time_df['max'] = train_df[times].max(axis=1)

# Calculate sessions' duration in seconds
time_df['seconds'] = (time_df['max'] - time_df['min']) / np.timedelta64(1, 's')

# Preview only the first few rows to keep the notebook output small
time_df.head()

KeyError: 'target'