In [None]:
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)
!gdown --id 1s2ggWGqAjB-wlXkd6Pu7pM_3kqs6RhAF
!gdown --id 1iQCM32OzxqvmDsgL5j4et-yE5W8o6afE
!gdown --id 1bs8PHTExPfItW636-HVRVYwjjPilQVgy

Mounted at /content/gdrive/
Downloading...
From: https://drive.google.com/uc?id=1s2ggWGqAjB-wlXkd6Pu7pM_3kqs6RhAF
To: /content/id_map.parquet
100% 1.20M/1.20M [00:00<00:00, 37.5MB/s]
Downloading...
From: https://drive.google.com/uc?id=1iQCM32OzxqvmDsgL5j4et-yE5W8o6afE
To: /content/train.csv
42.9MB [00:00, 162MB/s]
Downloading...
From: https://drive.google.com/uc?id=1bs8PHTExPfItW636-HVRVYwjjPilQVgy
To: /content/test.csv
24.8MB [00:00, 78.6MB/s]


In [None]:
!pip install imbalanced-learn
!pip install scikit-learn==0.24
!pip install catboost
!pip install tldextract



In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="white")

from scipy.stats import norm
from scipy import stats

from tldextract import extract
import re

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import OrdinalEncoder

import six
import sys
sys.modules['sklearn.externals.six'] = six

import sklearn.neighbors._base
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
sys.modules['sklearn.utils.safe_indexing'] = sklearn.utils._safe_indexing

from imblearn.pipeline import Pipeline as imb_make_pipeline
from imblearn.over_sampling import SMOTE

In [1]:
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import std
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

ModuleNotFoundError: ignored

In [None]:
def merge_data_and_domain(data, id_map):
    for i in range(1,11):
        data = pd.merge(data, id_map.rename(columns={"id":"webpage"+str(i), 
                                                     "domain_enc":"domain_enc"+str(i), 
                                                     "domain_user":"domain_user"+str(i)}).loc[:, ["webpage"+str(i),
                                                                                                  "domain_enc"+str(i),
                                                                                                  "domain_user"+str(i)]], 
                        on="webpage"+str(i), how="left")
    return data

def merge_data_and_suf(data, id_map):
    for i in range(1,11):
        data = pd.merge(data, id_map.rename(columns={"id":"webpage"+str(i), 
                                                     "suf_enc":"suf_enc"+str(i), 
                                                     "suf_user":"suf_user"+str(i)}).loc[:, ["webpage"+str(i),
                                                                                            "suf_enc"+str(i),
                                                                                            "suf_user"+str(i)]], 
                        on="webpage"+str(i), how="left")
    return data

def merge_data_and_sub(data, id_map):
    for i in range(1,11):
        data = pd.merge(data, id_map.rename(columns={"id":"webpage"+str(i), 
                                                     "sub_enc":"sub_enc"+str(i), 
                                                     "sub_user":"sub_user"+str(i)}).loc[:, ["webpage"+str(i),
                                                                                            "sub_enc"+str(i),
                                                                                            "sub_user"+str(i)]], 
                        on="webpage"+str(i), how="left")
    return data

In [None]:
def unique_features(domain_features):
    n_unique = domain_features.assign(
        nans_count=lambda x: x.filter(like='webpage').isna().sum(axis=1),
        # number of unique pages in session
        n_unique_sub=lambda x: x.filter(like='sub_enc').apply(lambda row: row.nunique(), axis=1) / (10 - x.nans_count),
        n_unique_domain=lambda x: x.filter(like='domain_enc').apply(lambda row: row.nunique(), axis=1) / (10 - x.nans_count),
        n_unique_enc=lambda x: x.filter(like='suf_enc').apply(lambda row: row.nunique(), axis=1) / (10 - x.nans_count))
    return n_unique

In [None]:
def time_features_extraction(data):

    data = data.assign(
            # avg hour of day in a session
            avg_hour=lambda x: np.round(x.filter(like='time').apply(lambda x: x.dt.hour).mean(axis=1)),
            # avg day of week in a session
            avg_day=lambda x: x.filter(like='time').apply(lambda x: x.dt.dayofweek).mean(axis=1),
        )

    for i in range(1,10):
        data["delta"+str(i)] = (data["time" + str(i+1)] - data["time" + str(i)]).dt.seconds
    
    data.drop(columns=[f'time{i+1}' for i in range(10)], inplace=True)

    data = data.assign(
        sum_time=lambda x: x.filter(like="delta").sum(axis=1)
    )
    data.loc[data["sum_time"]==0, "sum_time"] = 0.01

    return data

In [None]:
def text_preprocessing_1(text):
    # split text by . or -
    clean_text = re.split("\.|-", text)
    # join text
    clean_text = " ".join(clean_text)
    # remove nums
    clean_text = re.sub("[0-9]", "", clean_text)
    # remove spaces
    clean_text = re.sub(' +', ' ', clean_text)
    # split
    clean_text = re.split(" ", clean_text)
    # seq must be longer then 1
    clean_text = [x for x in clean_text if len(x)>1]
    # join text
    clean_text = " ".join(clean_text)

    return clean_text

In [None]:
train = pd.read_csv("./train.csv", index_col=0, parse_dates=[f'time{i+1}' for i in range(10)])
test = pd.read_csv('./test.csv', index_col=0, parse_dates=[f'time{i+1}' for i in range(10)])
id_map = pd.read_parquet("/content/id_map.parquet")

In [None]:
train

Unnamed: 0_level_0,webpage1,time1,webpage2,time2,webpage3,time3,webpage4,time4,webpage5,time5,webpage6,time6,webpage7,time7,webpage8,time8,webpage9,time9,webpage10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,9486,2019-02-20 05:57:45,,NaT,,NaT,,NaT,,NaT,,NaT,,NaT,,NaT,,NaT,,NaT,0
1,11722,2019-02-22 07:14:50,12385.0,2019-02-22 07:14:50,50163.0,2019-02-22 07:14:51,12385.0,2019-02-22 07:14:51,12398.0,2019-02-22 07:14:51,50150.0,2019-02-22 07:14:51,50163.0,2019-02-22 07:14:52,50150.0,2019-02-22 07:14:52,19860.0,2019-02-22 07:15:15,19886.0,2019-02-22 07:15:16,0
2,192149,2018-12-16 12:35:17,659.0,2018-12-16 12:35:18,192136.0,2018-12-16 12:35:19,192149.0,2018-12-16 12:35:19,633.0,2018-12-16 12:35:19,659.0,2018-12-16 12:35:19,192136.0,2018-12-16 12:35:20,192136.0,2018-12-16 12:35:21,192136.0,2018-12-16 12:35:22,192136.0,2018-12-16 12:35:24,0
3,10591,2019-02-13 12:40:35,451.0,2019-02-13 12:40:35,77580.0,2019-02-13 12:40:35,227821.0,2019-02-13 12:40:35,633.0,2019-02-13 12:41:05,425.0,2019-02-13 12:42:14,10591.0,2019-02-13 12:42:14,227834.0,2019-02-13 12:42:15,227834.0,2019-02-13 12:42:16,227834.0,2019-02-13 12:42:17,0
4,438,2018-04-12 06:22:26,425.0,2018-04-12 06:22:26,529.0,2018-04-12 06:22:28,65685.0,2018-04-12 06:22:29,187638.0,2018-04-12 06:22:29,451.0,2018-04-12 06:22:29,425.0,2018-04-12 06:22:29,65685.0,2018-04-12 06:22:31,187625.0,2018-04-12 06:22:31,187625.0,2018-04-12 06:22:32,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159964,117685,2018-04-12 09:06:57,3935.0,2018-04-12 09:07:01,451.0,2018-04-12 09:07:03,828.0,2018-04-12 09:07:03,451.0,2018-04-12 09:07:06,30975.0,2018-04-12 09:07:06,30975.0,2018-04-12 09:07:07,30975.0,2018-04-12 09:07:09,828.0,2018-04-12 09:07:12,828.0,2018-04-12 09:07:13,0
159965,45314,2018-11-25 06:21:54,45314.0,2018-11-25 06:21:58,1985.0,2018-11-25 06:22:03,31716.0,2018-11-25 06:22:04,1530.0,2018-11-25 06:22:13,31716.0,2018-11-25 06:22:16,31716.0,2018-11-25 06:22:28,31716.0,2018-11-25 06:22:40,31716.0,2018-11-25 06:22:52,2076.0,2018-11-25 06:22:53,0
159966,165603,2018-03-12 11:56:15,165603.0,2018-03-12 11:56:16,28947.0,2018-03-12 11:56:16,646.0,2018-03-12 11:56:17,28947.0,2018-03-12 11:56:17,451.0,2018-03-12 11:56:17,330924.0,2018-03-12 11:56:18,28947.0,2018-03-12 11:56:18,451.0,2018-03-12 11:56:18,425.0,2018-03-12 11:56:18,0
159967,34745,2018-09-12 10:00:03,195204.0,2018-09-12 10:00:10,72458.0,2018-09-12 10:00:10,72458.0,2018-09-12 10:01:29,72458.0,2018-09-12 10:01:30,,NaT,,NaT,,NaT,,NaT,,NaT,0


In [None]:
# get user's webpages 
user_frame = train.loc[train["target"]==1].reset_index(drop=True)

array = np.array([])
for element in ["webpage"+str(x) for x in range(1,11)]:
    unique_array = user_frame.loc[:, element].unique()
    array = np.concatenate([array, unique_array])
array = {x for x in array if x==x}

id_map["target"] = 0
id_map.loc[id_map["id"].isin(array), "target"] = 1

In [None]:
# extract "sub-domain", "domain", "suf" from url of webpage
list_ = [list(extract(id_map.loc[i].webpage)) for i in id_map.index]
id_map.loc[:, ["sub","domain","suf"]] = list_

In [None]:
id_map.loc[:, "domain"] = [text_preprocessing_1(text=x) for x in id_map.loc[:, "domain"].values]
id_map.loc[:, "sub"] = [text_preprocessing_1(text=x) for x in id_map.loc[:, "sub"].values]

In [None]:
encoder_sub = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
encoder_domain = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
encoder_suf = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

In [None]:
id_map.loc[:, "sub_enc"] = encoder_sub.fit_transform(id_map.loc[:, ["sub"]])
id_map.loc[:, "domain_enc"] = encoder_domain.fit_transform(id_map.loc[:, ["domain"]])
id_map.loc[:, "suf_enc"] = encoder_suf.fit_transform(id_map.loc[:, ["suf"]])

In [None]:
sub_enc_user_list = id_map.loc[id_map['target']==1, "sub_enc"].unique()
domain_enc_user_list = id_map.loc[id_map['target']==1, "domain_enc"].unique()
suf_enc_user_list = id_map.loc[id_map['target']==1, "suf_enc"].unique()

id_map["sub_user"] = 0
id_map["domain_user"] = 0
id_map["suf_user"] = 0
id_map.loc[id_map['sub_enc'].isin(sub_enc_user_list), "sub_user"] = 1
id_map.loc[id_map['domain_enc'].isin(sub_enc_user_list), "domain_user"] = 1
id_map.loc[id_map['suf_enc'].isin(sub_enc_user_list), "suf_user"] = 1

In [None]:
train = merge_data_and_domain(train, id_map)
train = merge_data_and_suf(train, id_map)
train = merge_data_and_sub(train, id_map)
train = unique_features(train)
train.drop(columns=["webpage"+str(i) for i in range(1,11)], inplace=True)

In [None]:
test = merge_data_and_domain(test, id_map)
test = merge_data_and_suf(test, id_map)
test = merge_data_and_sub(test, id_map)
test = unique_features(test)
test.drop(columns=["webpage"+str(i) for i in range(1,11)], inplace=True)

In [None]:
train = time_features_extraction(train)

In [None]:
test = time_features_extraction(test)

In [None]:
train.fillna(-1000, inplace=True)

In [None]:
test.fillna(-1000, inplace=True)

In [None]:
train_set, y_train = train.drop(columns=["target"]), train.loc[:, "target"]

In [None]:
sm = SMOTE(random_state=42, k_neighbors=7, sampling_strategy=0.20)
train_resampled, y_train_resampled = sm.fit_resample(train_set, y_train)

In [None]:
model_1 = LGBMClassifier(random_state=42, )
tscv = TimeSeriesSplit()
n_scores = cross_val_score(model, train_resampled, y_train_resampled, scoring='roc_auc', cv=tscv, n_jobs=-1)
# report performance
print('roc_auc: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
print(n_scores)

roc_auc: 0.983 (0.009)
[0.9892943  0.98211946 0.98488744 0.99344375 0.96593099]


In [None]:
model_2 = CatBoostClassifier(learning_rate=0.03, depth=8, l2_leaf_reg=3, iterations=200, 
                           auto_class_weights = "Balanced", loss_function = "Logloss", 
                           random_state=42, silent=True)
tscv = TimeSeriesSplit()
n_scores = cross_val_score(model, train_resampled, y=y_train_resampled, scoring='roc_auc', cv=tscv, n_jobs=-1)
# report performance
print('roc_auc: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
print(n_scores)

roc_auc: 0.982 (0.010)
[0.98536121 0.98569844 0.98380157 0.99300376 0.96315392]


In [None]:
model_3 = XGBClassifier(random_state=42, scale_pos_weight=5, n_estimators=150)
tscv = TimeSeriesSplit()
n_scores = cross_val_score(model, train_resampled, y=y_train_resampled, scoring='roc_auc', cv=tscv, n_jobs=-1)
# report performance
print('roc_auc: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
n_scores

roc_auc: 0.978 (0.012)


array([0.98442861, 0.98185512, 0.98259067, 0.98431449, 0.95464153])

In [None]:
model_1 = LGBMClassifier(random_state=42)
model_2 = CatBoostClassifier(learning_rate=0.03, depth=8, l2_leaf_reg=3, iterations=200, 
                           auto_class_weights = "Balanced", loss_function = "Logloss", 
                           random_state=42, silent=True)
model_3 = XGBClassifier(random_state=42, scale_pos_weight=5, n_estimators=150)

In [None]:
model_1.fit(train_resampled, y_train_resampled)
model_2.fit(train_resampled, y_train_resampled)
model_3.fit(train_resampled, y_train_resampled)

XGBClassifier(n_estimators=150, random_state=42, scale_pos_weight=5)

In [None]:
predict_1 = model_1.predict(train_resampled)
predict_2 = model_2.predict(train_resampled)
predict_3 = model_3.predict(train_resampled)

In [None]:
print(classification_report(y_train_resampled, predict_1))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    158394
           1       1.00      0.98      0.99     31678

    accuracy                           1.00    190072
   macro avg       1.00      0.99      0.99    190072
weighted avg       1.00      1.00      1.00    190072



In [None]:
print(classification_report(y_train_resampled, predict_2))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99    158394
           1       0.95      0.99      0.97     31678

    accuracy                           0.99    190072
   macro avg       0.97      0.99      0.98    190072
weighted avg       0.99      0.99      0.99    190072



In [None]:
print(classification_report(y_train_resampled, predict_3))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99    158394
           1       0.90      0.98      0.94     31678

    accuracy                           0.98    190072
   macro avg       0.95      0.98      0.96    190072
weighted avg       0.98      0.98      0.98    190072



In [None]:
probas_1 = model_1.predict_proba(test.to_numpy())[:, 1]
probas_2 = model_2.predict_proba(test.to_numpy())[:, 1]
probas_3 = model_3.predict_proba(test.to_numpy())[:, 1]

In [None]:
probas = (probas_1 + probas_2 + probas_3)/3

In [None]:
def save_submission(pred, number):
    pd.Series(
        pred, name='target', index=pd.Index(range(len(pred)), name='session_id')
    ).to_csv('/content/gdrive/MyDrive/EPAM/Week 7. Trees/HW/submissions_test/notebook_submission' + str(number) + '.csv')

In [None]:
save_submission(pred=probas, number=8)