In [1]:
# Mount Google Drive under /content/gdrive/ (force_remount re-mounts even if it is already mounted).
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)
# Download the three input files by their Google Drive file ids; according to the
# download log of this cell they land in /content as id_map.parquet, train.csv and test.csv.
!gdown --id 1s2ggWGqAjB-wlXkd6Pu7pM_3kqs6RhAF
!gdown --id 1iQCM32OzxqvmDsgL5j4et-yE5W8o6afE
!gdown --id 1bs8PHTExPfItW636-HVRVYwjjPilQVgy

Mounted at /content/gdrive/
Downloading...
From: https://drive.google.com/uc?id=1s2ggWGqAjB-wlXkd6Pu7pM_3kqs6RhAF
To: /content/id_map.parquet
100% 1.20M/1.20M [00:00<00:00, 76.3MB/s]
Downloading...
From: https://drive.google.com/uc?id=1iQCM32OzxqvmDsgL5j4et-yE5W8o6afE
To: /content/train.csv
42.9MB [00:00, 137MB/s] 
Downloading...
From: https://drive.google.com/uc?id=1bs8PHTExPfItW636-HVRVYwjjPilQVgy
To: /content/test.csv
24.8MB [00:00, 116MB/s]


In [2]:
!pip install scikit-learn==0.24
!pip install tldextract
!pip install eli5
!pip install hyperopt
!pip install catboost

Collecting scikit-learn==0.24
  Downloading scikit_learn-0.24.0-cp37-cp37m-manylinux2010_x86_64.whl (22.3 MB)
[K     |████████████████████████████████| 22.3 MB 1.7 MB/s 
[?25hCollecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-2.2.0-py3-none-any.whl (12 kB)
Installing collected packages: threadpoolctl, scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.22.2.post1
    Uninstalling scikit-learn-0.22.2.post1:
      Successfully uninstalled scikit-learn-0.22.2.post1
Successfully installed scikit-learn-0.24.0 threadpoolctl-2.2.0
Collecting tldextract
  Downloading tldextract-3.1.2-py2.py3-none-any.whl (87 kB)
[K     |████████████████████████████████| 87 kB 2.5 MB/s 
Collecting requests-file>=1.4
  Downloading requests_file-1.5.1-py2.py3-none-any.whl (3.7 kB)
Installing collected packages: requests-file, tldextract
Successfully installed requests-file-1.5.1 tldextract-3.1.2
Collecting eli5
  Downloading eli5-0.11.0-py2.py3-none-any.wh

In [3]:
import pandas as pd
import numpy as np
from scipy.stats import norm
from numpy import mean
from numpy import std
from scipy import stats
from scipy.sparse import hstack
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
from functools import partial

import time
from tqdm import tqdm_notebook

from sklearn.model_selection import GridSearchCV, train_test_split, TimeSeriesSplit, cross_val_score
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OrdinalEncoder

import re
from tldextract import extract
import six
from pandas.api.types import is_sparse

import catboost
from catboost import CatBoostClassifier

import warnings
warnings.filterwarnings("ignore")

# Helper functions

In [4]:
def time_feature_extraction(data, train_flag):
    """Derive per-session time and page-count features.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per session. Columns whose name contains ``time`` must be
        datetimes (page visit timestamps) and columns whose name contains
        ``webpage`` hold the visited page ids (NaN for unused slots).
        When ``train_flag`` is true, ``session_id`` must be available as a
        column after ``reset_index()`` (i.e. it is the index or a column).
    train_flag : bool
        If true, the result is additionally sorted chronologically by session
        start (ties broken by ``session_id``) and gets a fresh 0..n-1 index.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the added columns
        ``duration`` (Box-Cox transformed session length in seconds),
        ``start_hour``, ``weekday`` (categorical), ``nans_count`` and
        ``nunique_pages`` (unique pages divided by visited pages).

    Notes
    -----
    The Box-Cox lambda is fitted on whatever frame is passed in, so train and
    test are transformed with different lambdas when processed separately.
    """
    time_cols = data.filter(like='time')
    page_cols = data.filter(like='webpage')

    session_start = time_cols.min(axis=1)
    session_end = time_cols.max(axis=1)
    nans_count = page_cols.isna().sum(axis=1)

    data = data.assign(
        # session start / end time (dropped again before returning)
        session_start=session_start,
        session_end=session_end,
        # total_seconds() keeps the days component; .dt.seconds would wrap
        # around for sessions lasting a day or longer
        duration=(session_end - session_start).dt.total_seconds(),
        # hour of day and day of week of the session start
        start_hour=session_start.dt.hour.astype('int'),
        weekday=session_start.dt.dayofweek.astype('int').astype('category'),
        # number of unused page slots
        nans_count=nans_count,
        # share of distinct pages among the pages actually visited
        nunique_pages=page_cols.nunique(axis=1) / (page_cols.shape[1] - nans_count),
    )

    # Box-Cox is only defined for strictly positive values
    data.loc[data["duration"] == 0, "duration"] = 0.001
    transformed, _ = stats.boxcox(data["duration"].to_numpy())
    data["duration"] = transformed

    # collapse these start hours into one bucket, labelled 17
    data.loc[data["start_hour"].isin([15, 16, 17, 18, 19, 3]), "start_hour"] = 17

    if train_flag:
        data = data.reset_index().sort_values(['session_start', "session_id"], ignore_index=True)

    return data.drop(columns=["session_start", "session_end"])

In [5]:
def intersection(train, id_map):
    """Flag sessions that contain all three YouTube-related pages.

    Parameters
    ----------
    train : pandas.DataFrame
        Sessions with the page-id columns ``webpage1`` .. ``webpage10``.
        No other columns are required.
    id_map : pandas.DataFrame
        Lookup with the columns ``id`` and ``webpage_update`` (cleaned page
        name), as consumed by ``merge_data_and_domain``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``train`` with an integer column ``intersection`` that is 1
        when the session visited ".youtube", "sn gxouxg jqbe.googlevideo" and
        ".ytimg", and 0 otherwise.

    Raises
    ------
    ValueError
        If the merge changes the number of rows (e.g. duplicated ids in
        ``id_map``), because the flag can then no longer be attached row by row.
    """
    page_columns = ['webpage_update%s' % i for i in range(1, 11)]
    required_pages = (".youtube", "sn gxouxg jqbe.googlevideo", ".ytimg")

    pages = merge_data_and_domain(train, id_map).loc[:, page_columns]

    # A session qualifies only if every required page occurs in at least one slot.
    # Unused slots are NaN and never compare equal to a page name.
    visited_all = pd.Series(True, index=pages.index)
    for page in required_pages:
        visited_all &= pages.eq(page).any(axis=1)

    # The left merge keeps the row order of `train`, so attach the flag by position;
    # this stays correct even when `train` does not carry a 0..n-1 index.
    return train.assign(intersection=visited_all.to_numpy().astype(int))

In [6]:
def text_preprocessing_1(text):
    """Normalise one URL part (sub-domain or domain) into space-separated words.

    "." and "-" are treated as word separators, digits are removed, and words
    of a single character (or empty) are dropped.

    Parameters
    ----------
    text : str
        Raw URL part, e.g. ``"r3---sn-gxo5uxg-jqbe"``.

    Returns
    -------
    str
        Remaining words joined by single spaces (``""`` if nothing is left).
    """
    # raw strings: "\." in a plain string literal is an invalid escape sequence
    spaced = re.sub(r"[.-]", " ", text)
    without_digits = re.sub(r"[0-9]", "", spaced)
    # splitting on " " yields empty strings for repeated spaces; the length
    # filter removes them together with single-character words
    words = [word for word in without_digits.split(" ") if len(word) > 1]
    return " ".join(words)

In [7]:
def merge_data_and_domain(data, id_map):
    """Attach the cleaned page name of each of the ten page slots to every session.

    For slot ``i`` (1..10) the ``webpage_update`` value of ``id_map`` is looked up
    by ``id == webpage{i}`` with a left join and stored as ``webpage_update{i}``.
    Page ids without a match in ``id_map`` yield NaN.
    """
    merged = data
    for slot in range(1, 11):
        id_column = "webpage" + str(slot)
        name_column = "webpage_update" + str(slot)
        # per-slot view of the lookup table, renamed so the join key matches the slot
        lookup = id_map.rename(columns={"id": id_column, "webpage_update": name_column})
        lookup = lookup.loc[:, [id_column, name_column]]
        merged = pd.merge(merged, lookup, on=id_column, how="left")
    return merged
    
def tf_idf_features(train, test, vectorizer_params):
    """Fit TF-IDF on the train sessions and transform both train and test.

    Each session becomes one document: its non-missing ``webpage_update1`` ..
    ``webpage_update10`` values joined with "-" (so a tokenizer splitting on
    "-" recovers one token per page).

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing the ten ``webpage_update{i}`` columns.
    vectorizer_params : dict
        Keyword arguments forwarded to ``TfidfVectorizer``.

    Returns
    -------
    tuple
        ``(X_train, X_test, vectorizer)`` - the two TF-IDF matrices and the
        vectorizer fitted on the train documents only.
    """
    webpages_update = ['webpage_update%s' % i for i in range(1, 11)]

    def _session_documents(frame):
        """Join the visited page names of every row into one "-"-separated string."""
        pages = frame.loc[:, webpages_update].fillna("")
        # iterate over the row's values instead of row[i]: integer keys on a
        # label-indexed Series rely on a deprecated positional fallback
        return pages.apply(lambda row: "-".join(page for page in row.to_numpy() if page != ""), axis=1)

    train_tokens = _session_documents(train)
    test_tokens = _session_documents(test)

    vectorizer = TfidfVectorizer(**vectorizer_params)
    X_train = vectorizer.fit_transform(train_tokens)
    X_test = vectorizer.transform(test_tokens)

    return X_train, X_test, vectorizer

In [8]:
def sparse_to_dense(x_train, x_test):
    """Convert every sparse column of both frames to a regular dense column.

    The frames are modified in place (columns are replaced one by one) and the
    same two objects are returned for convenience.

    Parameters
    ----------
    x_train, x_test : pandas.DataFrame
        Frames that may contain columns with a ``pd.SparseDtype``.

    Returns
    -------
    tuple
        ``(x_train, x_test)`` with no sparse columns left.
    """
    for frame in (x_train, x_test):
        # isinstance on the dtype replaces pandas.api.types.is_sparse, which is
        # deprecated in recent pandas releases
        sparse_columns = [
            column for column, dtype in frame.dtypes.items()
            if isinstance(dtype, pd.SparseDtype)
        ]
        for column in sparse_columns:
            # frame[column] = ... swaps in the dense column; a `.loc[:, column] = ...`
            # write may instead be applied to the existing sparse array
            frame[column] = frame[column].sparse.to_dense()
    return x_train, x_test

In [9]:
def extraction(train, x_test, id_map, vectorizer_params, 
               time_features, popular_features, 
               columns_tf_idf, tf_idf, ohe_flag):
    """Build the train/test feature frames used for modelling.

    Depending on the flags, the function adds time features, the YouTube
    "intersection" flag and selected TF-IDF columns, then indexes both frames
    by ``session_id``, splits off the target, drops the raw ``time1``..``time10``
    columns and optionally one-hot encodes ``weekday``.

    Parameters
    ----------
    train : pandas.DataFrame
        Training sessions including the ``target`` column.
    x_test : pandas.DataFrame
        Test sessions with ``session_id`` as a column.
    id_map : pandas.DataFrame
        Page lookup passed on to ``intersection`` / ``merge_data_and_domain``.
    vectorizer_params : dict
        Keyword arguments for the TF-IDF vectorizer (used only if ``tf_idf``).
    time_features : bool
        Run ``time_feature_extraction`` on both frames.
    popular_features : bool
        Run ``intersection`` on both frames.
    columns_tf_idf : list
        TF-IDF feature names to keep (used only if ``tf_idf``).
    tf_idf : bool
        Append the selected TF-IDF columns.
    ohe_flag : bool
        One-hot encode ``weekday`` instead of leaving it as a categorical column.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, vectorizer, columns, cat_columns)`` when
        ``tf_idf`` is true, otherwise ``(x_train, y_train, x_test, columns,
        cat_columns)`` - note that the tuple length depends on the flag.
        ``columns`` are the feature columns before the TF-IDF columns are
        appended; ``cat_columns`` are the categorical column names (the integer
        dummy-column labels when ``ohe_flag`` is true, else ``["weekday"]``).

    Notes
    -----
    ``set_index``/``drop`` are applied in place, so frames passed in may be
    modified when no earlier step has replaced them with a new object.
    NOTE(review): ``session_id`` has to be a column of ``train`` by the time it is
    indexed below; in this function that only seems guaranteed after
    ``time_feature_extraction(train, train_flag=True)`` has reset the index -
    confirm before calling with ``time_features=False``.
    NOTE(review): ``weekday`` is referenced whenever ``ohe_flag`` is true and is
    reported in ``cat_columns`` otherwise, so it presumably also requires
    ``time_features=True`` - confirm.
    """

    if time_features:      
        print("extract time features")    
        # train is additionally sorted chronologically (train_flag=True)
        train = time_feature_extraction(train, train_flag=True)
        x_test = time_feature_extraction(x_test, train_flag=False)

    if popular_features:
        print("intersection features") 
        train = intersection(train.copy(), id_map.copy())
        x_test = intersection(x_test, id_map)

    if tf_idf:
        print("apply tf idf")  
        # page names per session -> TF-IDF matrices (vectorizer is fitted on train only)
        data_train = merge_data_and_domain(train, id_map).loc[:, ['webpage_update%s' % i for i in range(1, 11)]]
        data_test = merge_data_and_domain(x_test, id_map).loc[:, ['webpage_update%s' % i for i in range(1, 11)]]
        train_tf_idf, test_tf_idf, vectorizer = tf_idf_features(train=data_train, test=data_test, vectorizer_params=vectorizer_params)
        # wrap the sparse matrices in frames indexed by session_id and keep only the requested features
        train_tf_idf = pd.DataFrame.sparse.from_spmatrix(train_tf_idf, 
                                                         columns=vectorizer.get_feature_names(), 
                                                         index=train.set_index("session_id").index)
        train_tf_idf = train_tf_idf.loc[:, columns_tf_idf]
        test_tf_idf = pd.DataFrame.sparse.from_spmatrix(test_tf_idf, 
                                                        columns=vectorizer.get_feature_names(), 
                                                        index=x_test.set_index("session_id").index)
        test_tf_idf = test_tf_idf.loc[:, columns_tf_idf]

    # set_index
    train.set_index("session_id", inplace=True)
    x_test.set_index("session_id", inplace=True)

    # separate features from the target (y_train stays a one-column DataFrame)
    x_train, y_train = train.drop(columns=["target"]), train.loc[:, ["target"]]

    # drop useless features
    other_features_list = ["time"+str(i) for i in range(1,11)]
    x_train.drop(columns=other_features_list, inplace=True)
    x_test.drop(columns=other_features_list, inplace=True)

    if ohe_flag:
        # encoder is fitted on train; categories unseen in train become all-zero rows in test
        encoder = OneHotEncoder(handle_unknown="ignore")
        features_dummy_train = encoder.fit_transform(x_train.loc[:, ["weekday"]])
        features_dummy_train = pd.DataFrame.sparse.from_spmatrix(features_dummy_train, index=x_train.index)
        features_dummy_test = encoder.transform(x_test.loc[:, ["weekday"]])
        features_dummy_test = pd.DataFrame.sparse.from_spmatrix(features_dummy_test, index=x_test.index)

        # the dummy columns carry integer labels 0..k-1
        cat_columns = list(range(features_dummy_test.shape[1]))

        x_train.drop(columns=["weekday"], inplace=True)
        x_test.drop(columns=["weekday"], inplace=True)

        x_train = pd.concat([features_dummy_train, x_train], axis=1)
        x_test = pd.concat([features_dummy_test, x_test], axis=1)

    else:
        cat_columns = ["weekday"]

    # feature columns before any TF-IDF columns are appended
    columns = x_train.columns

    if tf_idf:
        x_train = pd.concat([x_train, train_tf_idf], axis=1)
        x_test = pd.concat([x_test, test_tf_idf], axis=1)
        x_train, x_test = sparse_to_dense(x_train, x_test)
        return x_train, y_train, x_test, vectorizer, columns, cat_columns
    else:
        x_train, x_test = sparse_to_dense(x_train, x_test)
        return x_train, y_train, x_test, columns, cat_columns

In [10]:
def objective(space, estimator, x_train, y_train):
    """Hyperopt objective: time-series cross-validated ROC AUC of ``estimator``.

    Parameters
    ----------
    space : dict
        One sampled point of the search space with the keys ``max_depth``,
        ``iterations``, ``l2_leaf_reg`` and ``border_count``.
    estimator : estimator object
        Model exposing ``set_params``; it is reconfigured in place on every call.
    x_train, y_train : array-like
        Training features and target, assumed to be in chronological order
        because the folds come from ``TimeSeriesSplit``.

    Returns
    -------
    dict
        ``loss`` (1 - mean AUC over 4 folds), ``status`` and the evaluated
        ``params`` (kept so that the trial results can be tabulated with
        ``df_results``).
    """
    time_split = TimeSeriesSplit(n_splits=4)

    params = {
        # hp.quniform samples floats such as 5.0; these two are integer-valued parameters
        'max_depth': int(space['max_depth']),
        'iterations': int(space['iterations']),
        'l2_leaf_reg': space['l2_leaf_reg'],
        'border_count': space['border_count']
    }

    # configure the model with the sampled hyperparameters
    estimator.set_params(**params)

    score = cross_val_score(estimator, x_train, y_train, scoring='roc_auc', cv=time_split).mean()
    print("AUC {:.3f} params {}".format(score, params))
    # hyperopt minimises the loss, hence 1 - AUC
    return {'loss': 1 - score, 'status': STATUS_OK, 'params': params}

In [11]:
def df_results(hp_results):
    """
    Present hyperopt results as a DataFrame.

    :hp_results: iterable of result dicts; each must contain the keys
        'loss', 'status' and 'params' (a dict of hyperparameters)
    :return: pandas DataFrame with one row per result - the loss plus one
        column per hyperparameter - sorted by loss in descending order
    """
    rows = []
    for trial in hp_results:
        # flatten the nested hyperparameters next to the loss
        row = dict(trial)
        row.update(trial['params'])
        rows.append(row)

    table = pd.DataFrame(rows).drop(labels=['status', 'params'], axis=1)
    return table.sort_values(by=['loss'], ascending=False)

In [12]:
def save_submission(pred, number,
                    output_dir='/content/gdrive/MyDrive/EPAM/Week 7. Trees/HW/submissions_test'):
    """Write predictions to ``<output_dir>/notebook_submission<number>.csv``.

    Parameters
    ----------
    pred : sequence of float
        Predicted scores, one per test session, in test-set row order.
    number : int or str
        Suffix that distinguishes submission files.
    output_dir : str, optional
        Target folder; defaults to the Google Drive folder this notebook has
        always written to, so existing calls behave as before.

    Notes
    -----
    The file has the columns ``session_id`` and ``target``; ``session_id`` is
    generated as 0..len(pred)-1.
    NOTE(review): confirm that the expected session ids really start at 0.
    """
    import os  # local import: the notebook's import cell does not provide os

    submission = pd.Series(
        pred, name='target', index=pd.Index(range(len(pred)), name='session_id')
    )
    submission.to_csv(os.path.join(output_dir, 'notebook_submission' + str(number) + '.csv'))

# Preprocessing

In [13]:
# TF-IDF feature names kept as model inputs: single page names and space-joined
# n-grams of page names. The combined list is what extraction() receives as columns_tf_idf.
# NOTE(review): presumably list_positive / list_neg hold features with a positive / negative
# association with the target - confirm how they were selected.
list_positive = ['.express', '.info jeunes', '.vk', '.melty', 'fr web img.acsta',
                '.videostep', 'khms.google', '.radio canada', 'api.bing',
                '.banque chalus', '.audienceinsights', 'demotivateur.disqus',
                '.indeed', 'media.melty', '.video', '.jobisjob', '.blogger',
                'gujynbgx.admedia', '.youwatch', 'reviewer.lavoixdunord',
                'img wikia.nocookie', 'static.programme tv', 'static.flickr',
                '.audienceinsights facebook', '.regarder film gratuit', '.bbc',
                'facebook .audienceinsights', 'static.videostep', '.exashare',
                '.blastr', 'docs.google', 'graphics.nytimes', '.reddit']

list_neg = ['.futura sciences', 'clients.google clients.google clients.google',
            'fr mg mail.yahoo', '.ztat', 'fbstatic.akamaihd', '.spin',
            'facebook', 'ba.commentcamarche', 'static.ccm', 'safebrowsing cache.google', 
            '.linkedin', '.wordreference', 'fr.openclassrooms', '.bing', 
            'clients.google clients.google', 'mail.google', 'plus.google']

features_list = list_positive + list_neg

In [14]:
# Google Drive folder, presumably for saved model artifacts (not referenced by the cells
# visible in this part of the notebook - TODO confirm it is still needed).
OUTPUT_FOLDER = "/content/gdrive/MyDrive/EPAM/Week 7. Trees/HW/model_test"

In [15]:
# The ten visit timestamps of a session are parsed as datetimes on load.
time_columns = [f'time{i}' for i in range(1, 11)]

# train keeps the first CSV column as its index; for test it is moved back to a regular column.
train = pd.read_csv("./train.csv", index_col=0, parse_dates=time_columns)
x_test = pd.read_csv('./test.csv', index_col=0, parse_dates=time_columns).reset_index()
id_map = pd.read_parquet("/content/id_map.parquet")

In [16]:
# Collect the ids of all pages that occur in at least one session with target == 1.
user_frame = train.loc[train["target"]==1].reset_index(drop=True)
webpage_columns = ["webpage"+str(x) for x in range(1,11)]
# x == x is False only for NaN, which removes the empty page slots
array = {x for x in user_frame.loc[:, webpage_columns].to_numpy().ravel() if x==x}

# mark those pages in the lookup table
id_map["target"] = 0
id_map.loc[id_map["id"].isin(array), "target"] = 1

# extract "sub-domain", "domain", "suf" from url of webpage
# (fields are read by name, so this does not depend on the result being a 3-item tuple)
list_ = [[parts.subdomain, parts.domain, parts.suffix] for parts in map(extract, id_map["webpage"])]
url_parts = pd.DataFrame(list_, columns=["sub", "domain", "suf"], index=id_map.index)
id_map["sub"] = url_parts["sub"]
id_map["domain"] = url_parts["domain"]
id_map["suf"] = url_parts["suf"]

# url preprocessing
id_map["domain"] = id_map["domain"].map(text_preprocessing_1)
id_map["sub"] = id_map["sub"].map(text_preprocessing_1)
# generic sub-domains carry no information
id_map["sub"] = id_map["sub"].where(~id_map["sub"].isin(["www", "js init"]), "")
# cleaned page name = "<sub>.<domain>"; built from the column labels instead of row
# positions 3 and 4, which only pointed at sub/domain for one specific column layout
id_map["webpage_update"] = id_map["sub"] + "." + id_map["domain"]
# merge the different twitter / facebook hosts into one name each
id_map["webpage_update"] = id_map["webpage_update"].replace({
    ".twitter": "twitter",
    "platform.twitter": "twitter",
    "static ak.facebook": "facebook",
    ".facebook": "facebook",
    "connect.facebook": "facebook",
})

In [17]:
# TF-IDF settings: 1- to 5-grams over page names, at most 30000 features,
# ignoring terms seen in fewer than 2 sessions or in more than 70% of them.
vectorizer_params = dict(
    ngram_range=(1, 5),
    max_features=30000,
    min_df=2,
    max_df=0.7,
    # a session document is a "-"-joined list of page names, so one token == one page
    tokenizer=lambda session: session.split("-"),
)

In [18]:
# Build the modelling frames: time features + intersection flag + selected TF-IDF columns,
# with weekday kept as a categorical column (no one-hot encoding).
x_train, y_train, x_test, vectorizer, columns, cat_columns = extraction(
    train=train.copy(),
    x_test=x_test.copy(),
    id_map=id_map,
    vectorizer_params=vectorizer_params,
    time_features=True,
    popular_features=True,
    columns_tf_idf=features_list,
    tf_idf=True,
    ohe_flag=False,
)

# remaining missing values are replaced by 0
x_train = x_train.fillna(0)
y_train = y_train.fillna(0)
x_test = x_test.fillna(0)

extract time features
intersection features
apply tf idf


## CatBoost

In [21]:
# Model whose hyperparameters are tuned: silent GPU training with class-balanced Logloss.
estimator = CatBoostClassifier(
    verbose=0,
    task_type="GPU",
    loss_function="Logloss",
    cat_features=cat_columns,
    early_stopping_rounds=50,
    auto_class_weights='Balanced',
)

# hyperparameter search space
space = dict(
    max_depth=hp.quniform('max_depth', 3, 12, 1),
    iterations=hp.quniform('iterations', 300, 800, 1),
    l2_leaf_reg=hp.uniform('l2_leaf_reg', 0, 25),
    border_count=hp.choice('border_count', [10, 35, 50]),
)

# container that stores the history of the search
trials = Trials()

best = fmin(
    # function to minimise
    fn=partial(objective, estimator=estimator, x_train=x_train, y_train=y_train),
    # hyperparameter search space
    space=space,
    # search algorithm
    algo=tpe.suggest,
    # number of evaluations
    max_evals=175,
    # where the search history is stored
    trials=trials,
    # random state
    rstate=np.random.RandomState(1),
    # progress bar
    show_progressbar=True,
)

  0%|          | 0/175 [00:00<?, ?it/s, best loss: ?]

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

AUC 0.901 params {'max_depth': 5.0, 'iterations': 775.0, 'l2_leaf_reg': 16.92786178971578, 'border_count': 35}
AUC 0.891 params {'max_depth': 9.0, 'iterations': 488.0, 'l2_leaf_reg': 1.541293295355581, 'border_count': 10}
AUC 0.895 params {'max_depth': 9.0, 'iterations': 315.0, 'l2_leaf_reg': 8.365163630027068, 'border_count': 50}
AUC 0.888 params {'max_depth': 10.0, 'iterations': 685.0, 'l2_leaf_reg': 0.3362262627282536, 'border_count': 50}
AUC 0.897 params {'max_depth': 9.0, 'iterations': 360.0, 'l2_leaf_reg': 10.289324886600843, 'border_count': 50}
AUC 0.896 params {'max_depth': 11.0, 'iterations': 313.0, 'l2_leaf_reg': 8.987800211068214, 'border_count': 50}
AUC 0.898 params {'max_depth': 12.0, 'iterations': 599.0, 'l2_leaf_reg': 10.786469487359934, 'border_count': 50}
AUC 0.894 params {'max_depth': 9.0, 'iterations': 601.0, 'l2_leaf_reg': 2.41797599546566, 'border_count': 35}
AUC 0.898 params {'max_depth': 6.0, 'iterations': 713.0, 'l2_leaf_reg': 18.471985344381327, 'border_count':

In [24]:
# Hyperparameters used for the final models; this replaces the `best` returned by fmin.
best = dict(max_depth=7, iterations=770, l2_leaf_reg=20, border_count=50)

In [25]:
# Evaluate the chosen hyperparameters with a 7-split time-series CV, then refit on all training data.
time_split = TimeSeriesSplit(n_splits=7)

first_model_kwargs = {
    "metric_period": 100,
    "loss_function": "Logloss",
    "cat_features": cat_columns,
    "early_stopping_rounds": 50,
    "auto_class_weights": 'Balanced',
}
estimator = CatBoostClassifier(**first_model_kwargs)
estimator.set_params(**best)

# per-fold ROC AUC
score = cross_val_score(estimator, x_train, y_train, scoring='roc_auc', cv=time_split)
estimator.fit(x_train, y_train)
print("score: ", score)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

0:	learn: 0.6722573	total: 34.8ms	remaining: 26.7s
100:	learn: 0.1312111	total: 2.86s	remaining: 19s
200:	learn: 0.0717992	total: 5.58s	remaining: 15.8s
300:	learn: 0.0453758	total: 8.33s	remaining: 13s
400:	learn: 0.0291294	total: 11.1s	remaining: 10.2s
500:	learn: 0.0194686	total: 13.8s	remaining: 7.42s
600:	learn: 0.0148598	total: 16.5s	remaining: 4.65s
700:	learn: 0.0125069	total: 19.1s	remaining: 1.88s
769:	learn: 0.0111592	total: 20.9s	remaining: 0us
0:	learn: 0.6736035	total: 51.4ms	remaining: 39.5s
100:	learn: 0.1932789	total: 4.79s	remaining: 31.7s
200:	learn: 0.1310423	total: 9.6s	remaining: 27.2s
300:	learn: 0.1010899	total: 14.3s	remaining: 22.3s
400:	learn: 0.0775555	total: 19s	remaining: 17.5s
500:	learn: 0.0589129	total: 23.9s	remaining: 12.8s
600:	learn: 0.0456111	total: 28.6s	remaining: 8.05s
700:	learn: 0.0386059	total: 33.1s	remaining: 3.25s
769:	learn: 0.0348975	total: 36.1s	remaining: 0us
0:	learn: 0.6618247	total: 69.5ms	remaining: 53.5s
100:	learn: 0.1919601	tota

In [26]:
# Second model: same configuration as `estimator` but with a fixed random_state of 42.
time_split = TimeSeriesSplit(n_splits=7)

estimator_1 = CatBoostClassifier(
    metric_period=100,
    loss_function="Logloss",
    cat_features=cat_columns,
    early_stopping_rounds=50,
    auto_class_weights='Balanced',
    random_state=42,
)
estimator_1.set_params(**best)

# per-fold ROC AUC, followed by the final fit on all training data
score = cross_val_score(estimator_1, x_train, y_train, scoring='roc_auc', cv=time_split)
estimator_1.fit(x_train, y_train)
print("score: ", score)

0:	learn: 0.6653765	total: 31.4ms	remaining: 24.2s
100:	learn: 0.1248502	total: 2.81s	remaining: 18.6s
200:	learn: 0.0694944	total: 5.64s	remaining: 16s
300:	learn: 0.0441423	total: 8.45s	remaining: 13.2s
400:	learn: 0.0284326	total: 11.2s	remaining: 10.3s
500:	learn: 0.0184918	total: 14s	remaining: 7.49s
600:	learn: 0.0139421	total: 16.6s	remaining: 4.66s
700:	learn: 0.0120586	total: 18.9s	remaining: 1.86s
769:	learn: 0.0109418	total: 20.6s	remaining: 0us
0:	learn: 0.6705538	total: 56.1ms	remaining: 43.1s
100:	learn: 0.1909971	total: 4.66s	remaining: 30.9s
200:	learn: 0.1311202	total: 9.4s	remaining: 26.6s
300:	learn: 0.1036623	total: 14s	remaining: 21.9s
400:	learn: 0.0796006	total: 18.8s	remaining: 17.3s
500:	learn: 0.0600312	total: 23.6s	remaining: 12.7s
600:	learn: 0.0480354	total: 28.5s	remaining: 8s
700:	learn: 0.0398458	total: 33.2s	remaining: 3.27s
769:	learn: 0.0345220	total: 36.5s	remaining: 0us
0:	learn: 0.6658412	total: 67.6ms	remaining: 52s
100:	learn: 0.1884970	total: 6.

In [29]:
# Positive-class probabilities (second column of predict_proba) from both fitted models
cat_pred_1 = estimator.predict_proba(x_test)[:, 1]
cat_pred_2 = estimator_1.predict_proba(x_test)[:, 1]

# final prediction: element-wise average of the two models
cat_pred = np.mean([cat_pred_1, cat_pred_2], axis=0)

In [30]:
# Write the averaged predictions as submission file number 52
save_submission(pred=cat_pred, number=52)