In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta
import matplotlib.pyplot as plt
import datetime
import seaborn as sns

In [2]:
df = pd.read_json("usersessions/train.json") #load data

In [3]:
df['is_joe'] = df.apply(lambda x: 1 if x['user_id'] == 0 else 0, axis=1)

## 1. UNWRAPPING DATA

# a) unwrap city and country


In [4]:
df[['country','city']] = df['location'].str.split('/', expand=True)
df[['country','city']]

Unnamed: 0,country,city
0,USA,Chicago
1,USA,Chicago
2,Singapore,Singapore
3,USA,Chicago
4,France,Paris
...,...,...
59995,USA,Chicago
59996,New Zealand,Auckland
59997,Netherlands,Amsterdam
59998,France,Paris


# b) unwrap date

In [5]:
df[["dayofweek", "day", "month"]] = df["date"].apply(lambda x: pd.Series([x.dayofweek, x.day, x.month]))

df["hour"] = df["time"].apply(lambda x: pd.to_datetime(x).hour + pd.to_datetime(x).minute/60)



In [6]:
df['locale'].unique() # ru_RU is a mistake and needs to be change to ru-RU

array(['ru_RU', 'pl-PL', 'zh-CN', 'en-SG', 'xh-ZA', 'ja-JP', 'en-NZ',
       'en-CA', 'pt-BR', 'ur-PK', 'bg-BG', 'uk-UA', 'fr-FR', 'ru-RU',
       'ro-RO', 'it-IT', 'pt-PT', 'es-MX', 'en-AU', 'de-DE', 'vi-VN',
       'nl-NL', 'en-GB', 'en-US', 'wo-SN'], dtype=object)

In [7]:
df['locale'] = df['locale'].str.replace('_', '-').replace(" ", "")

# c) unwrap history

In [8]:
history = df['sites'].apply(pd.Series)

df['number_of_sites'] = df["sites"].apply(len)
df['total_session_time'] = history.apply(lambda x: x.str['length']).sum(axis=1)



In [9]:
sites = history.apply(lambda x: x.str['site'])

df_joe = df[df['user_id'] == 0]
history_joe = df_joe['sites'].apply(pd.Series)
sites_joe = history_joe.apply(lambda x: x.str['site'])




In [10]:
unique_sites = (
sites_joe
.groupby([0, 1, 2])
.size()
.reset_index(name="count")
.sort_values(by='count', ascending=False)
.head(15)
)

In [11]:
column_values = unique_sites[[0, 1, 2]].astype(str).values
fav_sites =  np.unique(column_values)
fav_sites

array(['blogadda.com', 'jeuxonline.info', 'lenta.ru', 'mail.google.com',
       'slack.com', 'toptal.com', 'ucoz.net', 'vk.com', 'youtube.com'],
      dtype=object)

In [12]:
def categorial_feature_overview(feature, rotation = 0, hue = None, df=df):
    print(feature, 'has', df[feature].isnull().sum() / len(df) * 100, '% of null values')
    f,ax = plt.subplots(1, 2, figsize=(20, 6))
    ax[0].tick_params(labelrotation = rotation)
    ax[1].tick_params(labelrotation = rotation)
    sns.countplot(data = df, x = feature, ax = ax[0], hue=hue)
    sns.kdeplot(data = df, x = 'total_session_time', ax = ax[1], hue= feature, shade=True)
    plt.show()

In [13]:
df.fillna(0, inplace=True)

In [14]:
timezones= dict()
timezones = {'Chicago' : -6, 'Singapore' : 8,
            'Paris' : 1, 'Berlin' : 1,
            'San Francisco' : -8, 'Shanghai' : 9,
            'Auckland' : 13, 'Kuala Lumpur' : 8,
            'Moscow' : 3, 'Rio de Janeiro' : -3,
            'Toronto' : -5, 'Madrid' : 1,
            'Miami' : -5, 'Delhi' : 5.5,
            'Amsterdam' : 1, 'Tokyo' : 9,
            'London' : 0, 'Rome' : 1,
            'Sydney': 11, 'New York' : -5,
            'Vancouver' : -8 
}

In [15]:
## Getting insights from visited sites order 

In [16]:
(sites
.groupby([0, 1])
.size()
.reset_index(name="count")
.sort_values(by='count', ascending=False)
.head(15)
)

Unnamed: 0,0,1,count
3806,toptal.com,lenta.ru,1128
2321,lenta.ru,toptal.com,916
2634,mail.google.com,vk.com,898
3571,slack.com,youtube.com,864
2642,mail.google.com,youtube.com,763
4319,vk.com,mail.google.com,747
2625,mail.google.com,toptal.com,696
3885,toptal.com,toptal.com,640
3549,slack.com,toptal.com,601
3868,toptal.com,slack.com,598


In [18]:
sites.isna().sum()

0       303
1       932
2      1961
3      4011
4      7781
5     13735
6     21263
7     29612
8     37645
9     45123
10    51688
11    56307
12    58932
dtype: int64

In [19]:
sites.fillna('', inplace=True)

In [20]:
def clean_sites(x):
    if x in fav_sites:
        return x
    elif x == "":
        return "empty"
    else:
        return "other"

In [21]:
for i in sites.columns:
    df["page_"+str(i)] = sites[i].apply(clean_sites)

In [22]:
fav_sites

array(['blogadda.com', 'jeuxonline.info', 'lenta.ru', 'mail.google.com',
       'slack.com', 'toptal.com', 'ucoz.net', 'vk.com', 'youtube.com'],
      dtype=object)

In [23]:
(df
.groupby(['page_0', 'page_1'])
.size()
.reset_index(name="count")
.sort_values(by='count', ascending=False)
# .shape
.head(10)
)

Unnamed: 0,page_0,page_1,count
20,other,other,29694
54,youtube.com,other,2620
25,other,youtube.com,1918
21,other,slack.com,1453
29,slack.com,other,1348
22,other,toptal.com,1154
37,toptal.com,other,1138
35,toptal.com,lenta.ru,1128
6,lenta.ru,toptal.com,916
15,mail.google.com,vk.com,898


In [24]:
from sklearn.model_selection import train_test_split

In [25]:
df.groupby(['page_0','page_1'], sort=False).ngroup()

0         0
1         1
2         1
3         1
4         1
         ..
59995    32
59996    48
59997    17
59998    26
59999     1
Length: 60000, dtype: int64

In [27]:
df

Unnamed: 0,browser,os,locale,user_id,gender,location,sites,time,date,is_joe,...,page_3,page_4,page_5,page_6,page_7,page_8,page_9,page_10,page_11,page_12
0,Firefox,Ubuntu,ru-RU,105,m,USA/Chicago,"[{'site': 'mail.google.com', 'length': 50}, {'...",09:03:00,2017-01-08,0,...,lenta.ru,youtube.com,other,empty,empty,empty,empty,empty,empty,empty
1,Firefox,Windows 8,pl-PL,11,m,USA/Chicago,"[{'site': 'meduza.org', 'length': 40}, {'site'...",13:57:00,2016-10-05,0,...,other,other,other,other,other,other,empty,empty,empty,empty
2,Chrome,Ubuntu,zh-CN,17,m,Singapore/Singapore,"[{'site': 'facebook.net', 'length': 74}, {'sit...",02:06:00,2017-03-28,0,...,other,other,other,other,empty,empty,empty,empty,empty,empty
3,Firefox,Windows 10,en-SG,92,f,USA/Chicago,"[{'site': 'live.com', 'length': 79}, {'site': ...",00:05:00,2016-02-10,0,...,empty,empty,empty,empty,empty,empty,empty,empty,empty,empty
4,Internet Explorer,Windows 8,xh-ZA,120,m,France/Paris,"[{'site': 'cnn.com', 'length': 65}, {'site': '...",14:55:00,2017-03-28,0,...,other,other,other,other,other,other,empty,empty,empty,empty
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,Chrome,Windows 8,en-SG,60,f,USA/Chicago,"[{'site': 'toptal.com', 'length': 191}, {'site...",14:59:00,2016-02-01,0,...,other,other,other,empty,empty,empty,empty,empty,empty,empty
59996,Chrome,Windows 10,pt-PT,178,m,New Zealand/Auckland,"[{'site': 'vk.com', 'length': 126}, {'site': '...",06:02:00,2016-12-30,0,...,other,other,empty,empty,empty,empty,empty,empty,empty,empty
59997,Safari,MacOS,it-IT,32,m,Netherlands/Amsterdam,"[{'site': 'slack.com', 'length': 74}, {'site':...",10:27:00,2017-01-27,0,...,other,empty,empty,empty,empty,empty,empty,empty,empty,empty
59998,Firefox,Windows 10,ru-RU,56,m,France/Paris,"[{'site': 'lenta.ru', 'length': 82}, {'site': ...",14:17:00,2016-12-06,0,...,mail.google.com,youtube.com,empty,empty,empty,empty,empty,empty,empty,empty


In [33]:
df[useful_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Data columns (total 25 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   browser             60000 non-null  object 
 1   os                  60000 non-null  object 
 2   locale              60000 non-null  object 
 3   gender              60000 non-null  object 
 4   country             60000 non-null  object 
 5   city                60000 non-null  object 
 6   dayofweek           60000 non-null  int64  
 7   day                 60000 non-null  int64  
 8   month               60000 non-null  int64  
 9   hour                60000 non-null  float64
 10  number_of_sites     60000 non-null  int64  
 11  total_session_time  60000 non-null  float64
 12  page_0              60000 non-null  object 
 13  page_1              60000 non-null  object 
 14  page_2              60000 non-null  object 
 15  page_3              60000 non-null  object 
 16  page

In [44]:
object_cols = df.select_dtypes(include=['object']).columns
object_cols
# low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 8]
# high_cardinality_cols = list(set(object_cols)-set(low_cardinality_cols))




Index(['location', 'sites', 'time'], dtype='object')

In [45]:
from xgboost import XGBClassifier

Index(['browser', 'os', 'locale', 'gender', 'country', 'city', 'dayofweek',
       'day', 'month', 'hour', 'number_of_sites', 'total_session_time',
       'page_0', 'page_1', 'page_2', 'page_3', 'page_4', 'page_5', 'page_6',
       'page_7', 'page_8', 'page_9', 'page_10', 'page_11', 'page_12'],
      dtype='object')

In [41]:
for col in object_cols:
    df[col] = pd.factorize(df[col])[0]

useful_cols = df.columns.drop(['user_id', 'date', 'is_joe', 'location', 'sites','time'])
y = df['is_joe']

In [47]:
X = df[useful_cols].copy()
X.sample(5)

Unnamed: 0,browser,os,locale,gender,country,city,dayofweek,day,month,hour,...,page_3,page_4,page_5,page_6,page_7,page_8,page_9,page_10,page_11,page_12
46642,0,2,13,1,3,3,4,6,1,6.766667,...,1,1,0,1,1,1,1,1,1,0
31770,2,2,8,1,16,18,4,1,7,21.333333,...,7,1,1,0,0,0,0,0,0,0
41853,3,4,2,1,5,6,4,16,9,3.45,...,1,1,0,1,1,1,3,1,1,1
13360,0,2,17,1,0,4,6,8,1,16.4,...,1,1,0,1,1,1,1,1,0,0
47231,1,2,12,1,8,9,5,3,9,2.616667,...,3,1,0,1,0,0,0,0,0,0


In [102]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=43, stratify=y)

In [103]:
my_model = XGBClassifier()
my_model.fit(X_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [104]:
from sklearn.metrics import roc_auc_score, f1_score

predictions = my_model.predict(X_test)
print("ROC: " + str(roc_auc_score(predictions, y_test)))
print("F1: " + str(f1_score(predictions, y_test)))

ROC: 0.974532174044159
F1: 0.9306930693069307


In [105]:
from eli5 import show_weights
show_weights(my_model)

Weight,Feature
0.4447,locale
0.1057,country
0.0675,page_1
0.0567,gender
0.0527,page_0
0.0482,hour
0.0387,city
0.0330,month
0.0304,page_3
0.0303,page_2


In [106]:
df['total_session_time'].mean()

974.79495

In [107]:
df['number_of_sites'].mean()

7.511783333333334

In [108]:
df_joe['total_session_time'].mean()

1104.3095238095239

In [109]:
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedGroupKFold, StratifiedKFold


In [110]:
print(classification_report(predictions, y_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     20901
           1       0.91      0.95      0.93        99

    accuracy                           1.00     21000
   macro avg       0.96      0.97      0.97     21000
weighted avg       1.00      1.00      1.00     21000

