In [1]:
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)
!gdown --id 1s2ggWGqAjB-wlXkd6Pu7pM_3kqs6RhAF
!gdown --id 1iQCM32OzxqvmDsgL5j4et-yE5W8o6afE
!gdown --id 1bs8PHTExPfItW636-HVRVYwjjPilQVgy

Mounted at /content/gdrive/
Downloading...
From: https://drive.google.com/uc?id=1s2ggWGqAjB-wlXkd6Pu7pM_3kqs6RhAF
To: /content/id_map.parquet
100% 1.20M/1.20M [00:00<00:00, 78.7MB/s]
Downloading...
From: https://drive.google.com/uc?id=1iQCM32OzxqvmDsgL5j4et-yE5W8o6afE
To: /content/train.csv
42.9MB [00:00, 116MB/s] 
Downloading...
From: https://drive.google.com/uc?id=1bs8PHTExPfItW636-HVRVYwjjPilQVgy
To: /content/test.csv
24.8MB [00:00, 93.9MB/s]


In [2]:
!pip install scikit-learn==0.24
!pip install tldextract
!pip install eli5
!pip install category_encoders
!pip install hyperopt

Collecting scikit-learn==0.24
  Downloading scikit_learn-0.24.0-cp37-cp37m-manylinux2010_x86_64.whl (22.3 MB)
[K     |████████████████████████████████| 22.3 MB 4.5 MB/s 
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-2.2.0-py3-none-any.whl (12 kB)
Installing collected packages: threadpoolctl, scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.22.2.post1
    Uninstalling scikit-learn-0.22.2.post1:
      Successfully uninstalled scikit-learn-0.22.2.post1
Successfully installed scikit-learn-0.24.0 threadpoolctl-2.2.0
Collecting tldextract
  Downloading tldextract-3.1.2-py2.py3-none-any.whl (87 kB)
[K     |████████████████████████████████| 87 kB 2.3 MB/s 
Collecting requests-file>=1.4
  Downloading requests_file-1.5.1-py2.py3-none-any.whl (3.7 kB)
Installing collected packages: requests-file, tldextract
Successfully installed requests-file-1.5.1 tldextract-3.1.2
Collecting eli5
  Downloading eli5-0.11.0-py2.py3-none-any.whl (106

In [10]:
import time
import warnings

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from scipy import stats
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score, TimeSeriesSplit

warnings.filterwarnings("ignore")

In [6]:
# time1..time10 are parsed as datetimes; the first CSV column becomes the index.
time_columns = [f"time{n}" for n in range(1, 11)]

train = pd.read_csv("./train.csv", index_col=0, parse_dates=time_columns)
test = pd.read_csv("./test.csv", index_col=0, parse_dates=time_columns)

In [7]:
def preproc(data, boxcox_lambda=None):
    """Build session-level features from the raw page/timestamp columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``webpage*`` columns and datetime ``time1``..``time10``
        columns. Any other column is passed through unchanged (apart from
        NaN being replaced by 0).
    boxcox_lambda : float, optional
        Box-Cox lambda used to transform ``duration``. When ``None``
        (default) the lambda is estimated from ``data`` itself. Pass the
        lambda fitted on the training frame when transforming any other
        frame, so that ``duration`` is on the same scale everywhere.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) without the ``time*``
        columns and with ``session_start``, ``duration``, ``nans_count``,
        ``n_unique_pages``, ``avg_hour`` and ``avg_day`` added. The lambda
        that was applied is stored in
        ``result.attrs["duration_boxcox_lambda"]``.
    """
    time_columns = [f'time{i+1}' for i in range(10)]

    # `assign` evaluates the callables in order, so later features can read
    # the ones defined before them (e.g. `nans_count`).
    features = data.assign(
        # earliest timestamp of the session
        session_start=lambda x: x.filter(like='time').min(axis=1),
        # difference between 10th page and 1st in seconds, as float so the
        # zero replacement below never has to upcast an integer column.
        # NOTE(review): this is 0 whenever time10 is missing (presumably
        # sessions shorter than 10 pages) - confirm this is intended.
        duration=lambda x: (x.time10 - x.time1).dt.seconds.fillna(0).astype(float),
        # number of missing pages (from 10 max): NaN cells over all columns,
        # halved. NOTE(review): assumes every missing page leaves exactly one
        # webpage cell and one time cell empty and that no other column
        # contains NaN.
        nans_count=lambda x: x.isna().sum(axis=1).div(2),
        # share of distinct pages among the pages present in the session
        n_unique_pages=lambda x:
            x.filter(like='webpage').nunique(axis=1) / (10 - x.nans_count),
        # avg hour of day in a session
        avg_hour=lambda x: x.filter(like='time').apply(lambda col: col.dt.hour).mean(axis=1),
        # avg day of week in a session
        avg_day=lambda x: x.filter(like='time').apply(lambda col: col.dt.dayofweek).mean(axis=1),
    )
    features = features.drop(columns=time_columns).fillna(0.)

    # Box-Cox needs strictly positive input, so zeros get a small epsilon.
    features.loc[features["duration"] == 0, "duration"] = 0.0001
    duration = features["duration"].to_numpy()
    if boxcox_lambda is None:
        # Fit mode: estimate lambda from this frame.
        transformed, boxcox_lambda = stats.boxcox(duration)
    else:
        # Transform mode: reuse a previously fitted lambda.
        transformed = stats.boxcox(duration, lmbda=boxcox_lambda)
    features["duration"] = transformed
    features.attrs["duration_boxcox_lambda"] = float(boxcox_lambda)

    return features


train = preproc(train)
# Apply the lambda fitted on train to test; fitting a second lambda on test
# would put `duration` on a different scale than the one the model sees in training.
test = preproc(test, boxcox_lambda=train.attrs["duration_boxcox_lambda"])

In [8]:
# Put the training sessions in chronological order, keeping session_id as the index.
train = (
    train.reset_index()
    .sort_values(["session_start"], ignore_index=True)
    .set_index("session_id")
)

# Separate the label from the features; session_start was only needed for
# ordering and is removed from both feature frames.
y_train = train["target"]
x_train = train.drop(columns=["target", "session_start"])
test = test.drop(columns=["session_start"])

In [11]:
# 10 time-ordered folds over the chronologically sorted training rows.
time_split = TimeSeriesSplit(n_splits=10)

lgbm_params = dict(
    boosting_type="goss",
    class_weight='balanced',
    num_leaves=31,
    n_estimators=400,
    random_state=42,
)
estimator = LGBMClassifier(**lgbm_params)

# One ROC AUC value per fold.
score = cross_val_score(
    estimator,
    x_train,
    y_train,
    scoring='roc_auc',
    cv=time_split,
)

# Fit on the full training set; this fitted model produces the test predictions.
estimator.fit(x_train, y_train)

LGBMClassifier(boosting_type='goss', class_weight='balanced', n_estimators=400,
               random_state=42)

In [15]:
# Display the feature matrix that is passed to the model.
x_train

Unnamed: 0_level_0,webpage1,webpage2,webpage3,webpage4,webpage5,webpage6,webpage7,webpage8,webpage9,webpage10,duration,nans_count,n_unique_pages,avg_hour,avg_day
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
13694,880,867.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,1.000000,4.0,4.0
34657,880,867.0,880.0,867.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.500000,4.5,4.0
48835,12450,12450.0,12515.0,12450.0,12450.0,12437.0,12476.0,10344.0,12489.0,12450.0,4.0,0.0,0.600000,4.0,4.0
71986,12437,12476.0,12489.0,12476.0,12437.0,12450.0,12463.0,12437.0,12450.0,12450.0,3.0,0.0,0.500000,4.0,4.0
92602,12463,12502.0,12476.0,12463.0,12502.0,12528.0,12450.0,12515.0,12450.0,12463.0,2.0,0.0,0.600000,4.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129531,867,880.0,867.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.666667,19.0,1.0
24530,10344,10344.0,12879.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.666667,19.0,1.0
139265,828,828.0,828.0,1218.0,11540.0,7562.0,438.0,828.0,8173.0,1153.0,186.0,0.0,0.700000,19.0,1.0
43125,1192,17143.0,1140.0,1166.0,1127.0,451.0,7562.0,1218.0,8173.0,1140.0,657.0,0.0,0.900000,19.0,1.0


In [14]:
# Mean ROC AUC across the time-series folds.
np.mean(score)

0.8761325478348233

In [17]:
def save_submission(pred, number, index=None,
                    out_dir='/content/gdrive/MyDrive/EPAM/Week 7. Trees/HW/submissions_test'):
    """Write predictions to ``<out_dir>/notebook_submission<number>.csv``.

    The CSV has two columns: ``session_id`` (the index) and ``target``.

    Parameters
    ----------
    pred : array-like
        Predicted values, one per session.
    number : int or str
        Suffix appended to the file name.
    index : array-like, optional
        Session ids to write next to the predictions. Defaults to
        ``0..len(pred)-1``, which is only correct when the expected ids are
        exactly that range in the same order as ``pred``.
    out_dir : str, optional
        Directory that receives the file; it must already exist. Defaults
        to the Google Drive folder this notebook has always written to.
    """
    import os

    if index is None:
        index = range(len(pred))

    submission = pd.Series(
        pred, name='target', index=pd.Index(index, name='session_id')
    )
    submission.to_csv(os.path.join(out_dir, f'notebook_submission{number}.csv'))

In [18]:
# Keep column 1 of predict_proba (presumably the positive class for a 0/1
# target - confirm) and store it as submission number 32.
class_probabilities = estimator.predict_proba(test)
lgbm_pred = class_probabilities[:, 1]
save_submission(pred=lgbm_pred, number=32)