In [1]:
# Colab-only setup: mount Google Drive; later cells write the word2vec artefacts and
# the submission files under /content/gdrive/MyDrive.
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)
# Download the three input files by Drive file id. According to the cell output they
# are saved in /content as id_map.parquet, train.csv and test.csv (in that order).
!gdown --id 1s2ggWGqAjB-wlXkd6Pu7pM_3kqs6RhAF
!gdown --id 1iQCM32OzxqvmDsgL5j4et-yE5W8o6afE
!gdown --id 1bs8PHTExPfItW636-HVRVYwjjPilQVgy

Mounted at /content/gdrive/
Downloading...
From: https://drive.google.com/uc?id=1s2ggWGqAjB-wlXkd6Pu7pM_3kqs6RhAF
To: /content/id_map.parquet
100% 1.20M/1.20M [00:00<00:00, 7.35MB/s]
Downloading...
From: https://drive.google.com/uc?id=1iQCM32OzxqvmDsgL5j4et-yE5W8o6afE
To: /content/train.csv
42.9MB [00:01, 30.3MB/s]
Downloading...
From: https://drive.google.com/uc?id=1bs8PHTExPfItW636-HVRVYwjjPilQVgy
To: /content/test.csv
24.8MB [00:01, 23.3MB/s]


In [2]:
# Runtime dependencies. Only scikit-learn is pinned; every other package resolves to
# whatever release is current at install time, so a fresh run may behave differently.
# TODO(review): pin the remaining versions (the captured log shows tldextract 3.1.2
# and eli5 0.11.0 were installed for this run).
!pip install scikit-learn==0.24
!pip install tldextract
# NOTE(review): eli5 is installed here but not imported in the cells visible below.
!pip install eli5
!pip install category_encoders
!pip install hyperopt
!pip install catboost

Collecting scikit-learn==0.24
  Downloading scikit_learn-0.24.0-cp37-cp37m-manylinux2010_x86_64.whl (22.3 MB)
[K     |████████████████████████████████| 22.3 MB 1.2 MB/s 
[?25hCollecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-2.2.0-py3-none-any.whl (12 kB)
Installing collected packages: threadpoolctl, scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.22.2.post1
    Uninstalling scikit-learn-0.22.2.post1:
      Successfully uninstalled scikit-learn-0.22.2.post1
Successfully installed scikit-learn-0.24.0 threadpoolctl-2.2.0
Collecting tldextract
  Downloading tldextract-3.1.2-py2.py3-none-any.whl (87 kB)
[K     |████████████████████████████████| 87 kB 4.7 MB/s 
Collecting requests-file>=1.4
  Downloading requests_file-1.5.1-py2.py3-none-any.whl (3.7 kB)
Installing collected packages: requests-file, tldextract
Successfully installed requests-file-1.5.1 tldextract-3.1.2
Collecting eli5
  Downloading eli5-0.11.0-py2.py3-none-any.wh

In [3]:
import pandas as pd
import numpy as np
from scipy.stats import norm
from scipy import stats
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK, space_eval

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

import time
from tqdm import tqdm_notebook

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OrdinalEncoder

import re
from wordcloud import WordCloud
from tldextract import extract
from gensim.models import Word2Vec

import warnings
warnings.filterwarnings("ignore")

import six
from pandas.api.types import is_sparse
from category_encoders import TargetEncoder

In [4]:
import catboost
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, TimeSeriesSplit
from numpy import mean
from numpy import std
from scipy.sparse import hstack
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from functools import partial

# Support

In [5]:
def time_feature_extraction(data, train_flag):
    """Derive session-level time and page-count features.

    Columns whose name contains ``time`` are treated as visit timestamps and
    columns whose name contains ``webpage`` as the visited page ids.

    Added columns:
        duration       Box-Cox transformed session length in seconds.
        start_hour     hour of the first visit (categorical).
        weekday        day of week of the first visit (categorical).
        nans_count     number of empty page slots in the session.
        nunique_pages  distinct pages divided by the number of filled slots
                       (the code assumes 10 slots per session).
        time           coarse bucket of ``start_hour``
                       ("first" ... "fifth", or "none" for any other hour).

    Parameters:
        data: session frame. With ``train_flag=True`` the index is expected to
            be named ``session_id`` because it is reset and used as a sort key.
        train_flag: when True, rows are ordered by session start (ties broken
            by ``session_id``) and the index is replaced by a fresh RangeIndex.

    Returns:
        A new DataFrame with the features above; the helper columns
        ``session_start`` / ``session_end`` are removed.

    NOTE: the Box-Cox lambda is re-estimated on every call, so frames processed
    by separate calls are transformed with different lambdas.
    """
    # hour -> bucket definition; the hour sets are disjoint, so order is irrelevant
    time_buckets = {
        "first": [15, 16, 17, 18, 19, 3],
        "second": [4, 5, 6, 7],
        "third": [8, 9],
        "fourth": [10, 11],
        "fifth": [12, 13, 14],
    }

    data = data.assign(
        # session start time
        session_start=lambda x: x.filter(like='time').min(axis=1),
        # session end time
        session_end=lambda x: x.filter(like='time').max(axis=1),
        # session duration; total_seconds() keeps the day component that
        # `.dt.seconds` silently discards, and yields floats
        duration=lambda x: (x.session_end - x.session_start).dt.total_seconds(),
        # start hour in a session
        start_hour=lambda x: x.session_start.dt.hour.astype('category'),
        # day of week in a session
        weekday=lambda x: x.session_start.dt.dayofweek.astype('category'),
        # count of empty page slots
        nans_count=lambda x: x.filter(like='webpage').isna().sum(axis=1),
        # share of distinct pages among the filled slots
        nunique_pages=lambda x: x.filter(like='webpage').nunique(axis=1) / (10 - x.nans_count)
        )

    # Box-Cox needs strictly positive input, so zero-length sessions get a small
    # positive stand-in. The column is already float, so no dtype upcast happens.
    data.loc[data["duration"] == 0, "duration"] = 0.01
    data["duration"], _ = stats.boxcox(data["duration"].to_numpy())

    data["time"] = "none"
    for bucket, hours in time_buckets.items():
        data.loc[data["start_hour"].isin(hours), "time"] = bucket

    if train_flag:
        # chronological order is required by the time-series cross-validation
        data = data.reset_index().sort_values(['session_start', "session_id"], ignore_index=True)

    return data.drop(columns=["session_start", "session_end"])

In [6]:
def intersection(train, id_map):
    """Append an ``intersection`` flag to ``train``.

    The flag is 1 for sessions whose domain tokens contain all three of
    ".youtube", "sn gxouxg jqbe.googlevideo" and ".ytimg", and 0 otherwise.
    Domain names are looked up through ``merge_data_and_domain``; the merged
    frame must also carry ``start_hour`` and ``weekday``.

    The flag frame is concatenated column-wise, so ``train`` is expected to
    carry the same (default) index as the merge result.
    """
    domain_columns = ['webpage_update%s' % i for i in range(1, 11)]
    marker_tokens = [".youtube", "sn gxouxg jqbe.googlevideo", ".ytimg"]

    merged = merge_data_and_domain(train, id_map).loc[:, domain_columns + ["start_hour", "weekday"]]
    visited = merged.loc[:, domain_columns].fillna("")

    def split_session(row):
        # join the non-empty domain names with "-" and split again on "-"
        joined = "-".join([name for name in row if name != ""])
        return joined.split('-')

    flags = []
    for row in visited.itertuples(index=False, name=None):
        tokens = split_session(row)
        has_all_markers = all(marker in tokens for marker in marker_tokens)
        flags.append(1 if has_all_markers else 0)

    flag_frame = pd.DataFrame(
        {"intersection": pd.Series(flags, index=visited.index, dtype="int64")}
    )

    return pd.concat([train, flag_frame], axis=1)

In [7]:
def popular_webpage_features_train(data):
    """Build "popular page" indicator features from the training sessions.

    For every page id the target share (mean of ``target`` over all visits) and
    the target count (sum of ``target``) are computed. Their product is the
    page score and its natural log is ``target_value_log``.

    * Pages with share > 0.9 are "important" and are always flagged.
    * The remaining pages with share in (0, 0.9] are ranked by
      ``target_value_log``; the 0.4 / 0.7 / 0.95 quantiles of that score define
      three thresholds.
    * ``popular_pages_k`` (k = 1..3) is 1 when any of the ten page slots holds
      an important page or a page scoring above the k-th threshold.

    Parameters:
        data: frame with ``session_id``, ``webpage1`` .. ``webpage10`` and
            ``target``. The indicator columns are added to it in place.

    Returns:
        (data, quantiles, webpage, important_pages), where ``webpage`` is the
        per-page statistics table (columns ``target_x`` = share, ``target_y`` =
        count, ``target_value``, ``target_value_log``). The last three items are
        what ``popular_webpage_features_test`` expects.
    """
    page_columns = [f'webpage{i}' for i in range(1, 11)]

    # long format: one row per (session, slot) visit
    visits = pd.concat(
        [
            data[['session_id', column, 'target']].rename(columns={column: 'webpage'})
            for column in page_columns
        ],
        ignore_index=True,
    )

    per_page_target = visits.groupby('webpage')['target']
    webpage = pd.DataFrame({
        # target share per page
        'target_x': per_page_target.mean(),
        # number of target visits per page
        'target_y': per_page_target.sum(),
    })
    # share * sum
    webpage["target_value"] = webpage["target_x"] * webpage["target_y"]
    webpage = webpage.sort_values(by=["target_value"])
    # pages never visited in a target session carry no signal (and log(0) is -inf)
    webpage = webpage.loc[webpage["target_x"] > 0].copy()
    webpage["target_value_log"] = np.log(webpage["target_value"])

    important_pages = webpage.loc[webpage["target_x"] > 0.9].index.astype(int).values
    # `<=` so that pages with a share of exactly 0.9 are ranked with the ordinary
    # pages instead of falling between the two filters and being dropped
    webpage = webpage.loc[webpage["target_x"] <= 0.9]
    quantiles = webpage["target_value_log"].quantile([.4, 0.7, 0.95]).values

    for count, quantile in enumerate(quantiles, start=1):
        pages = webpage.loc[webpage["target_value_log"] > quantile].index.astype(int).values
        pages = list(pages) + list(important_pages)
        flag = f'popular_pages_{count}'
        data[flag] = 0
        data.loc[data[page_columns].isin(pages).any(axis=1), flag] = 1

    return data, quantiles, webpage, important_pages

def popular_webpage_features_test(data, quantiles, webpage, important_pages):
    """Add the ``popular_pages_k`` indicators using statistics fitted on train.

    For the k-th threshold in ``quantiles`` the set of popular pages is every
    page in ``webpage`` whose ``target_value_log`` exceeds the threshold, plus
    all ``important_pages``. ``popular_pages_k`` is 1 when any of
    ``webpage1`` .. ``webpage10`` holds such a page. ``data`` is modified in
    place and returned.
    """
    slot_columns = [f"webpage{i}" for i in range(1, 11)]

    for level, threshold in enumerate(quantiles, start=1):
        above_threshold = (webpage["target_value_log"] > threshold).to_numpy()
        selected = list(webpage.index[above_threshold].astype(int).values)
        selected.extend(important_pages)

        flag_column = f'popular_pages_{level}'
        data.loc[:, flag_column] = 0
        for slot in slot_columns:
            hits = data[slot].isin(selected)
            data.loc[hits, flag_column] = 1

    return data

In [8]:
def remove_days(data):
    """Keep only the sessions that start in one of the "good" hours.

    Good hours are 4, 5, 7-14; bad hours are 3, 6, 15-19. Sessions starting at
    any other hour belong to neither group and are therefore dropped as well.

    Returns:
        (filtered frame with ``session_id`` restored as a column,
         index of kept session ids, index of session ids in the bad hours)
    """
    kept_hours = [4, 5, 7, 8, 9, 10, 11, 12, 13, 14]
    dropped_hours = [3, 6, 15, 16, 17, 18, 19]

    by_session = data.set_index('session_id')
    start_hours = by_session.start_hour

    good_session_id = by_session.index[start_hours.isin(kept_hours).to_numpy()]
    bad_session_id = by_session.index[start_hours.isin(dropped_hours).to_numpy()]

    filtered = by_session.loc[good_session_id].reset_index()
    return filtered, good_session_id, bad_session_id

In [9]:
def text_preprocessing_1(text):
    """Normalise one URL component into space-separated word tokens.

    Dots and hyphens act as separators, digits are removed, and fragments
    shorter than two characters are discarded.

    Parameters:
        text: raw string, e.g. a sub-domain or domain name.

    Returns:
        The remaining fragments joined by single spaces ("" if none survive).
    """
    # raw-string patterns: "\." in a normal string literal is an invalid escape
    separated = re.sub(r"[.-]", " ", text)
    # remove digits
    without_digits = re.sub(r"[0-9]", "", separated)
    # Splitting on a single space yields empty strings for runs of spaces; the
    # length filter removes them, so no separate whitespace-collapsing pass is needed.
    fragments = [part for part in without_digits.split(" ") if len(part) > 1]
    return " ".join(fragments)

In [10]:
def merge_data_and_domain(data, id_map):
    """Attach the normalised domain name of every page slot.

    For each slot i in 1..10, ``id_map`` (columns ``id`` and
    ``webpage_update``) is left-joined on ``webpage{i}``, which adds a column
    ``webpage_update{i}``. Slots without a match get NaN.
    """
    lookup = id_map.loc[:, ["id", "webpage_update"]]
    for slot in range(1, 11):
        page_column = "webpage" + str(slot)
        domain_column = "webpage_update" + str(slot)
        slot_lookup = lookup.rename(columns={"id": page_column,
                                             "webpage_update": domain_column})
        data = data.merge(slot_lookup, on=page_column, how="left")
    return data
    
def tf_idf_features(train, test, vectorizer_params):
    """Fit a TF-IDF vectorizer on train sessions and transform both frames.

    Every session becomes one document: its non-empty ``webpage_update1`` ..
    ``webpage_update10`` values joined with "-".

    Returns:
        (train matrix, test matrix, fitted vectorizer)
    """
    domain_columns = ['webpage_update%s' % i for i in range(1, 11)]

    def to_documents(frame):
        filled = frame.loc[:, domain_columns].fillna("")
        return filled.apply(
            lambda row: "-".join([name for name in row if name != ""]),
            axis=1,
        )

    train_documents = to_documents(train)
    test_documents = to_documents(test)

    vectorizer = TfidfVectorizer(**vectorizer_params)
    X_train = vectorizer.fit_transform(train_documents)
    X_test = vectorizer.transform(test_documents)

    return X_train, X_test, vectorizer

In [11]:
def train_word2vec(train_domain_features, test_domain_features, 
                   size, window, min_count, workers, sg):
    """Train Word2Vec on the train sessions and embed train and test sessions.

    A session is tokenised into its non-empty ``webpage_update1`` ..
    ``webpage_update10`` values. The session embedding is the mean of the
    vectors of its tokens that are present in the model vocabulary; sessions
    without any known token get an all-zero vector.

    Side effects:
        * saves the model to OUTPUT_FOLDER + '/word2vec_<size>.model'
        * writes the train embeddings to OUTPUT_FOLDER + '/train_review_word2vec.csv'

    Parameters:
        train_domain_features, test_domain_features: frames holding the ten
            ``webpage_update`` columns.
        size, window, min_count, workers, sg: forwarded to ``Word2Vec``.
            NOTE(review): the ``size=`` keyword matches the gensim 3.x API the
            rest of this notebook appears to target - confirm before upgrading.

    Returns:
        (word2vec_train, word2vec_test): one row per session and ``size``
        float columns. Train columns are labelled "0", "1", ... (strings, as
        when read back from the CSV); test columns keep integer labels 0, 1, ...
    """
    webpages_update = ['webpage_update%s' % i for i in range(1, 11)]

    def _session_tokens(frame):
        # one token list per session; join + split mirrors the "-" separated
        # documents used for TF-IDF (an all-empty session yields [""])
        sessions = frame.loc[:, webpages_update].fillna("")
        return [
            "-".join([name for name in row if name != ""]).split('-')
            for row in sessions.itertuples(index=False, name=None)
        ]

    def _mean_vectors(token_lists, keyed_vectors):
        # Pre-allocated output instead of repeated np.vstack. Unknown tokens are
        # skipped, so min_count > 1 no longer raises KeyError.
        vectors = np.zeros((len(token_lists), size), dtype=np.float64)
        for position, tokens in enumerate(tqdm_notebook(token_lists)):
            known = [keyed_vectors[token] for token in tokens if token in keyed_vectors]
            if known:
                vectors[position] = np.mean(known, axis=0)
        return vectors

    train_tokens = _session_tokens(train_domain_features)
    test_tokens = _session_tokens(test_domain_features)

    # Store the model in following file
    word2vec_model_file = OUTPUT_FOLDER + '/word2vec_' + str(size) + '.model'
    # Store the vectors for train data in following file
    word2vec_filename = OUTPUT_FOLDER + '/train_review_word2vec.csv'

    start_time = time.time()
    # Train the Word2Vec Model
    w2v_model = Word2Vec(train_tokens, min_count=min_count, size=size, workers=workers, window=window, sg=sg)
    print(f"Time taken to train word2vec model: {(time.time() - start_time)/60:0.03} min")
    # save model to file
    w2v_model.save(word2vec_model_file)

    print("data shape: ", len(train_tokens))
    word2vec_train = pd.DataFrame(
        _mean_vectors(train_tokens, w2v_model.wv),
        columns=[str(i) for i in range(size)],
    )
    # The header is always written, regardless of the index labels of the input frame.
    word2vec_train.to_csv(word2vec_filename, index=False)

    print("data shape: ", len(test_tokens))
    word2vec_test = pd.DataFrame(_mean_vectors(test_tokens, w2v_model.wv))

    return word2vec_train, word2vec_test

In [12]:
def sparse_to_dense(x_train, x_test):
    """Convert every sparse column of both frames to a dense column.

    The frames are modified in place and also returned, so both
    ``sparse_to_dense(a, b)`` and ``a, b = sparse_to_dense(a, b)`` work.

    Returns:
        (x_train, x_test) - the same objects that were passed in.
    """
    for frame in (x_train, x_test):
        sparse_columns = [
            column
            for column, dtype in frame.dtypes.items()
            if isinstance(dtype, pd.SparseDtype)
        ]
        for column in sparse_columns:
            # Plain column assignment replaces the column. `.loc[:, column] = ...`
            # writes into the existing sparse column on recent pandas and would
            # leave the dtype sparse.
            frame[column] = frame[column].sparse.to_dense()
    return x_train, x_test

In [13]:
def extraction(train, x_test, id_map, vectorizer_params, 
               time_features, popular_features, remove_obs, 
               columns_tf_idf, tf_idf, word2vec, tf_w2v):
    """Run the whole feature pipeline and return model-ready train/test frames.

    Parameters:
        train, x_test: raw session frames; both are indexed by ``session_id``
            at the end, so they must expose that column by then.
        id_map: page-id lookup passed to ``intersection`` / ``merge_data_and_domain``.
        vectorizer_params: keyword arguments for the TF-IDF vectorizer.
        time_features: add the time features and the ``intersection`` flag.
        popular_features: add ``popular_pages_2`` / ``popular_pages_3``
            (``popular_pages_1`` is computed and dropped again).
        remove_obs: filter train sessions with ``remove_days`` (train only).
        columns_tf_idf: TF-IDF vocabulary terms to keep as features.
        tf_idf, word2vec, tf_w2v: select TF-IDF features, word2vec features, or both.

    Returns:
        The tuple layout depends on the flags:
        * tf_w2v:    (x_train, y_train, x_test, vectorizer, columns, columns_2, cat_columns)
        * tf_idf:    (x_train, y_train, x_test, vectorizer, columns, cat_columns)
        * word2vec:  (x_train, y_train, x_test, columns, columns_2, cat_columns)
        * otherwise: (x_train, y_train, x_test, columns, cat_columns)
        ``columns`` are the hand-crafted feature names, ``columns_2`` the
        word2vec column names, ``cat_columns`` the categorical feature names.

    NOTE(review): "start_hour" is dropped unconditionally further down, so a call
    with time_features=False looks like it would fail - confirm before relying on it.
    """

    if time_features:      
        print("extract time features")    
        train = time_feature_extraction(train, train_flag=True)
        x_test = time_feature_extraction(x_test, train_flag=False)

        print("intersection features") 
        train = intersection(train.copy(), id_map.copy())
        x_test = intersection(x_test, id_map)

    if popular_features:
        print("extract popularity features")  
        # thresholds and page statistics are fitted on train and re-used for test
        train, quantiles, webpage, important_pages = popular_webpage_features_train(train)
        train.drop(columns=['popular_pages_1'], inplace=True)
        x_test = popular_webpage_features_test(x_test, quantiles, webpage, important_pages)
        x_test.drop(columns=['popular_pages_1'], inplace=True)

    if remove_obs:
        print("remove_obs")  
        # 3 more hours could be dropped at the cost of losing 5 observations of the class
        train, good_ids_train, bad_ids_train = remove_days(train.copy())

    # `|` on two booleans acts as a logical "or" here
    if tf_idf | tf_w2v:
        print("apply tf idf")  
        data_train = merge_data_and_domain(train, id_map).loc[:, ['webpage_update%s' % i for i in range(1, 11)]]
        data_test = merge_data_and_domain(x_test, id_map).loc[:, ['webpage_update%s' % i for i in range(1, 11)]]
        train_tf_idf, test_tf_idf, vectorizer = tf_idf_features(train=data_train, test=data_test, vectorizer_params=vectorizer_params)
        train_tf_idf = pd.DataFrame.sparse.from_spmatrix(train_tf_idf, 
                                                         columns=vectorizer.get_feature_names(), 
                                                         index=train.set_index("session_id").index)
        # keep only the requested vocabulary terms
        train_tf_idf = train_tf_idf.loc[:, columns_tf_idf]
        test_tf_idf = pd.DataFrame.sparse.from_spmatrix(test_tf_idf, 
                                                        columns=vectorizer.get_feature_names(), 
                                                        index=x_test.set_index("session_id").index)
        test_tf_idf = test_tf_idf.loc[:, columns_tf_idf]

    if word2vec | tf_w2v:
        print("word2vec")  
        data_train = merge_data_and_domain(train, id_map).loc[:, ['webpage_update%s' % i for i in range(1, 11)]]
        data_test = merge_data_and_domain(x_test, id_map).loc[:, ['webpage_update%s' % i for i in range(1, 11)]]
        word2vec_train, word2vec_test = train_word2vec(data_train, data_test, size=175, window=2, 
                                                       min_count=1, workers=3, sg=1)
        columns_2 = word2vec_train.columns
        # re-index by session id so the later column-wise concat aligns rows
        word2vec_train.index = train.set_index("session_id").index
        word2vec_test.index = x_test.set_index("session_id").index

    # set_index
    train.set_index("session_id", inplace=True)
    x_test.set_index("session_id", inplace=True)

    x_train, y_train = train.drop(columns=["target"]), train.loc[:, ["target"]]

    # drop useless features
    other_features_list = ["webpage"+str(i) for i in range(1,11)] + ["time"+str(i) for i in range(1,11)]
    x_train.drop(columns=other_features_list, inplace=True)
    x_test.drop(columns=other_features_list, inplace=True)

    cat_columns = ["weekday", "nans_count", "time"]
    # start_hour is only represented through the coarser "time" bucket
    x_train.drop(columns=["start_hour", ], inplace=True)
    x_test.drop(columns=["start_hour"], inplace=True)
    # names of the hand-crafted features, captured before text features are appended
    columns = x_train.columns

    if tf_w2v:
        x_train = pd.concat([x_train, train_tf_idf], axis=1)
        x_test = pd.concat([x_test, test_tf_idf], axis=1)

        x_train = pd.concat([x_train, word2vec_train], axis=1)
        x_test = pd.concat([x_test, word2vec_test], axis=1)
        x_train, x_test = sparse_to_dense(x_train, x_test)
        return x_train, y_train, x_test, vectorizer, columns, columns_2, cat_columns
    if tf_idf:
        x_train = pd.concat([x_train, train_tf_idf], axis=1)
        x_test = pd.concat([x_test, test_tf_idf], axis=1)
        x_train, x_test = sparse_to_dense(x_train, x_test)
        return x_train, y_train, x_test, vectorizer, columns, cat_columns
    elif word2vec:
        x_train = pd.concat([x_train, word2vec_train], axis=1)
        x_test = pd.concat([x_test, word2vec_test], axis=1)
        x_train, x_test = sparse_to_dense(x_train, x_test)
        return x_train, y_train, x_test, columns, columns_2, cat_columns
    else:
        x_train, x_test = sparse_to_dense(x_train, x_test)
        return x_train, y_train, x_test, columns, cat_columns

In [14]:
def save_submission(pred, number):
    """Write predictions to the numbered submission CSV on Google Drive.

    The file has two columns: ``session_id`` (0 .. len(pred) - 1, taken from
    the position of each prediction) and ``target`` (the prediction itself).
    """
    destination = ('/content/gdrive/MyDrive/EPAM/Week 7. Trees/HW/submissions_test/notebook_submission'
                   + str(number) + '.csv')
    session_ids = pd.Index(range(len(pred)), name='session_id')
    submission = pd.Series(pred, name='target', index=session_ids)
    submission.to_csv(destination)

In [38]:
def objective(space, estimator, x_train, y_train):
    """Hyperopt objective: time-series cross-validated ROC AUC of ``estimator``.

    Parameters:
        space: one sampled point of the search space (dict of hyperparameters).
        estimator: model exposing ``set_params``; it is reconfigured in place.
        x_train, y_train: training data; rows must already be in time order
            because a 4-fold ``TimeSeriesSplit`` is used.

    Returns:
        dict with ``loss`` (1 - mean AUC), ``status`` and ``params``. The
        ``params`` entry is what ``df_results`` reads from each trial result.
    """
    time_split = TimeSeriesSplit(n_splits=4)

    tuned_names = [
        'learning_rate',
        'max_depth',
        'min_data_in_leaf',
        'iterations',
        'l2_leaf_reg',
        'bagging_temperature',
        'auto_class_weights',
    ]
    params = {name: space[name] for name in tuned_names}

    # apply the sampled hyperparameters to the model
    estimator.set_params(**params)

    score = cross_val_score(estimator, x_train, y_train, scoring='roc_auc', cv=time_split).mean()
    print("AUC {:.3f} params {}".format(score, params))
    # hyperopt minimises the loss, hence 1 - AUC
    return {'loss': 1 - score, 'status': STATUS_OK, 'params': params}

In [16]:
def df_results(hp_results):
    """
    Present hyperopt results as a DataFrame.

    :hp_results: iterable of trial result dicts; each one must contain the keys
                 'loss', 'status' and 'params' (a dict of hyperparameters)
    :return: pandas DataFrame with 'loss' plus one column per hyperparameter,
             sorted by 'loss' in descending order
    """
    flattened = []
    for trial in hp_results:
        record = dict(trial)
        # lift the hyperparameters to top-level columns
        record.update(trial['params'])
        flattened.append(record)

    table = pd.DataFrame(flattened).drop(columns=['status', 'params'])
    return table.sort_values(by=['loss'], ascending=False)

# Preprocessing

In [17]:
# Hand-picked TF-IDF vocabulary terms (normalised "sub.domain" names and n-grams of them).
# NOTE(review): presumably terms associated with the target user; the selection
# procedure is not shown in this notebook - confirm.
list_positive = ['.express', '.info jeunes', '.vk', '.melty', 'fr web img.acsta',
       '.videostep', 'khms.google', '.radio canada', 'api.bing',
       '.banque chalus', '.audienceinsights', 'demotivateur.disqus',
       '.indeed', 'media.melty', '.video', '.jobisjob', '.blogger',
       'gujynbgx.admedia', '.youwatch', 'reviewer.lavoixdunord',
       'img wikia.nocookie', 'static.programme tv', 'static.flickr',
       '.audienceinsights facebook', '.regarder film gratuit', '.bbc',
       '.', 'facebook .audienceinsights', 'static.videostep', '.exashare',
       '.blastr', '.brgm', 'docs.google', 'graphics.nytimes', '.reddit',
       'id.google', 'fr glee.wikia', '.caf','cdn.freepik', 'betacie.cachefly',
       'clients.google .google .google .google', 'sn gkued.googlevideo',
       'twitter .demotivateur', '.demotivateur', 'storage.canalblog',
       '.ajoutezvotresite', '.hellopro', 'wwwd.caf', 'www auvergne.afpa']

# NOTE(review): presumably terms associated with the other users - confirm.
list_neg = ['.futura sciences', 'clients.google clients.google clients.google',
'fr mg mail.yahoo', '.ztat', 'fbstatic.akamaihd', '.spin',
'facebook', 'ba.commentcamarche', 'static.ccm',
'safebrowsing cache.google', '.linkedin', '.wordreference',
'fr.openclassrooms', '.bing', 'clients.google clients.google', 'mail.google', 'plus.google']

# Passed to extraction() as columns_tf_idf: only these TF-IDF columns are kept as features.
features_list = list_positive + list_neg

In [18]:
# Google Drive folder where train_word2vec saves the Word2Vec model and the
# train-embedding CSV; Drive must be mounted before that function is called.
OUTPUT_FOLDER = "/content/gdrive/MyDrive/EPAM/Week 7. Trees/HW/model_test"

In [19]:
# Load the raw sessions; the first CSV column becomes the index and time1..time10
# are parsed as datetimes.
train = pd.read_csv("./train.csv", index_col=0, parse_dates=[f'time{i+1}' for i in range(10)])
x_test = pd.read_csv('./test.csv', index_col=0, parse_dates=[f'time{i+1}' for i in range(10)])
# Test keeps its id as a regular column, while train keeps it as the index until
# time_feature_extraction(train_flag=True) resets it.
x_test.reset_index(inplace=True)
# Page-id lookup table (the columns `id` and `webpage` are used below).
id_map = pd.read_parquet("/content/id_map.parquet")

In [20]:
# Collect the ids of all pages visited in sessions of the target user (target == 1)
user_frame = train.loc[train["target"]==1].reset_index(drop=True)

array = np.array([])
for element in ["webpage"+str(x) for x in range(1,11)]:
    unique_array = user_frame.loc[:, element].unique()
    array = np.concatenate([array, unique_array])
# x == x is False only for NaN, so this drops the empty page slots
array = {x for x in array if x==x}

# id_map["target"] is 1 for pages the target user visited at least once
id_map["target"] = 0
id_map.loc[id_map["id"].isin(array), "target"] = 1

# Extract "sub-domain", "domain", "suf" from the url of every webpage.
# Attribute access keeps this independent of how many fields the tldextract
# result carries, and iterating the column avoids one `.loc` lookup per row.
list_ = [[parts.subdomain, parts.domain, parts.suffix]
         for parts in map(extract, id_map["webpage"])]
url_parts = pd.DataFrame(list_, columns=["sub", "domain", "suf"], index=id_map.index)
for part_name in ["sub", "domain", "suf"]:
    id_map[part_name] = url_parts[part_name]

# url preprocessing
id_map["domain"] = id_map["domain"].map(text_preprocessing_1)
id_map["sub"] = id_map["sub"].map(text_preprocessing_1)
# uninformative sub-domains are blanked out
id_map["sub"] = id_map["sub"].apply(lambda x: "" if x in ["www", "js init"] else x)

# "<sub>.<domain>", addressed by column name. The previous positional x[3] / x[4]
# resolved to these two columns only while id_map had exactly the column order
# id, webpage, target, sub, domain, suf.
# NOTE(review): assumes id_map is loaded with just `id` and `webpage` - confirm.
id_map["webpage_update"] = id_map["sub"] + "." + id_map["domain"]

# collapse the different twitter / facebook hosts into one token each
aliases = {
    ".twitter": "twitter",
    "platform.twitter": "twitter",
    "static ak.facebook": "facebook",
    ".facebook": "facebook",
    "connect.facebook": "facebook",
}
id_map["webpage_update"] = id_map["webpage_update"].map(lambda name: aliases.get(name, name))

In [21]:
# TF-IDF settings. Session documents are "-"-joined domain names, so the tokenizer
# splits on "-" and n-grams of 1..5 consecutive domains are counted.
# Terms seen in fewer than 2 sessions or in more than 70% of sessions are ignored,
# and at most 30000 terms are kept.
vectorizer_params={'ngram_range': (1, 5), 'max_features': 30000,
                   "min_df": 2, "max_df": 0.7,
                   'tokenizer': lambda s: s.split("-")
                   }

In [22]:
# Feature set 1: time + intersection + popularity features, plus the TF-IDF columns
# listed in features_list (no word2vec). With tf_idf=True extraction returns six values.
x_train1, y_train1, x_test1, vectorizer1, columns1, cat_columns1 = extraction(train=train.copy(), x_test=x_test.copy(), id_map=id_map, 
                                                                            vectorizer_params=vectorizer_params, time_features=True,
                                                                            popular_features=True, remove_obs=False, columns_tf_idf = features_list,
                                                                            tf_idf=True, word2vec=False, tf_w2v=False)

extract time features
intersection features
extract popularity features
apply tf idf


In [None]:
# Feature set 2: same hand-crafted features as set 1, plus both the TF-IDF columns and
# the 175-dimensional word2vec session embeddings (tf_w2v=True returns seven values).
x_train2, y_train2, x_test2, vectorizer2, columns2_1, columns2_2, cat_columns2 = extraction(train=train.copy(), x_test=x_test.copy(), id_map=id_map, 
                                                                                            vectorizer_params=vectorizer_params, time_features=True,
                                                                                            popular_features=True, remove_obs=False, columns_tf_idf = features_list,
                                                                                            tf_idf=False, word2vec=False, tf_w2v=True)

extract time features
intersection features
extract popularity features
apply tf idf
word2vec
Time taken to train word2vec model: 0.352 min
data shape:  159969


0it [00:00, ?it/s]

data shape:  93338


0it [00:00, ?it/s]

## Catboost

In [None]:
# CatBoost model tuned on feature set 1; cat_columns1 names its categorical features.
estimator = CatBoostClassifier(verbose=0, task_type="GPU", loss_function="Logloss", 
                               cat_features=cat_columns1, early_stopping_rounds=20)

# Search space. learning_rate is log-uniform between 1e-4 and 1e-1; the quniform
# parameters are sampled as whole-valued floats (see e.g. 'max_depth': 11.0 in the log).
space = {
    'learning_rate': hp.loguniform('learning_rate', low=-4*np.log(10), high=-1*np.log(10)),
    'max_depth': hp.quniform('max_depth', 3, 15, 1),
    'min_data_in_leaf' : hp.quniform('min_data_in_leaf', 1, 75, 1),
    'iterations': hp.quniform('iterations', 250, 750,1),
    'l2_leaf_reg' : hp.uniform('l2_leaf_reg', 0, 10),
    'bagging_temperature': hp.uniform('bagging_temperature', 0.0, 10),
    'auto_class_weights': hp.choice('auto_class_weights', ['Balanced', 'SqrtBalanced'])
    }

# collects the result of every evaluation
trials = Trials()

best = fmin(
    # function to minimise
    fn=partial(objective, estimator=estimator, x_train=x_train1, y_train=y_train1),
    # hyperparameter search space
    space=space,
    # search algorithm
    algo=tpe.suggest,
    # number of evaluations
    max_evals=50,
    # where the search history is stored
    trials=trials,
    # random state
    rstate=np.random.RandomState(1),
    # progressbar
    show_progressbar=True
    )

  0%|          | 0/50 [00:00<?, ?it/s, best loss: ?]

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

AUC 0.927 params {'learning_rate': 0.07040123201241501, 'max_depth': 11.0, 'min_data_in_leaf': 19.0, 'iterations': 539.0, 'l2_leaf_reg': 3.832111930100426, 'bagging_temperature': 0.9057019926571874, 'auto_class_weights': 'Balanced'}
AUC 0.906 params {'learning_rate': 0.0013510630867871877, 'max_depth': 4.0, 'min_data_in_leaf': 53.0, 'iterations': 497.0, 'l2_leaf_reg': 1.4564339539751736, 'bagging_temperature': 8.889254049717005, 'auto_class_weights': 'Balanced'}
AUC 0.893 params {'learning_rate': 0.00012252759808604552, 'max_depth': 7.0, 'min_data_in_leaf': 52.0, 'iterations': 452.0, 'l2_leaf_reg': 4.347750061961953, 'bagging_temperature': 7.4643788472382875, 'auto_class_weights': 'Balanced'}
AUC 0.931 params {'learning_rate': 0.020398755272921575, 'max_depth': 3.0, 'min_data_in_leaf': 62.0, 'iterations': 474.0, 'l2_leaf_reg': 2.801205621797558, 'bagging_temperature': 6.897676461199843, 'auto_class_weights': 'Balanced'}
AUC 0.891 params {'learning_rate': 0.00023054471703432773, 'max_de

In [None]:
# Best assignment returned by fmin.
# NOTE(review): hyperopt is expected to report hp.choice entries (auto_class_weights)
# as an index into the option list rather than the option itself - confirm.
best

In [None]:
# cross_val_score inside `objective` fits clones of `estimator`, so the estimator
# object itself is still unfitted after the search. Configure it with the best
# hyperparameters and fit it on the full training set before predicting.
# space_eval maps the raw fmin result (hp.choice entries are indices) back to values.
best_params = space_eval(space, best)
estimator.set_params(**best_params)
estimator.fit(x_train1, y_train1)

# The variable name is kept for compatibility; the model is CatBoost, not LightGBM.
lgbm_pred = estimator.predict_proba(x_test1)[:, 1]
save_submission(lgbm_pred, number=27)