In [1]:
import pandas as pd
import numpy as np
import requests
from tqdm import tqdm
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
tqdm.pandas()

In [2]:
# Raw host list: a header-less CSV whose only column is named "url" at read time.
hosts_original = pd.read_csv("data/host.csv", header=None, names=["url"])

In [3]:
# Peek at the first rows of the raw host list.
hosts_original.head()

Unnamed: 0,url
0,api.youla.io
1,favicon.yandex.net
2,w-74721.fp.kaspersky-labs.com
3,questtime.net
4,passport-authproxy.taxi.yandex.net


### Анализ датасета
* Очень много хостов, которые являются облаками крупных компаний
* Явный дисбаланс в сторону "технических" хостов

#### В качестве train сета будем использовать собранный командой датасет
target: 
* 0 - ответ, привычный для пользователя
* 1 - ответ, соответствующий служебному (техническому) взаимодействию 

Также, чтобы уменьшить дисбаланс в обучающей выборке, был добавлен датасет с обычными пользовательскими сайтами (`data/train_zeros.csv`)

In [4]:
# Hand-labelled parts collected by the team. Each part ends up with the columns
# ["url", "target"] (target: 0 = user-facing response, 1 = technical host).
train_1 = pd.read_csv("data/train_dan.csv", sep=";")

# This part has no header row, so the column names are assigned explicitly.
train_2 = pd.read_csv("data/train_den.csv", sep=";", header=None)
train_2.columns = ["url", "target"]

train_3 = pd.read_csv("data/train_kon.csv")
train_3.columns = ["url", "target"]
# Unlabelled rows cannot be used for training; drop them before casting to int.
train_3 = train_3[train_3["target"].notna()].reset_index(drop=True)
train_3["target"] = train_3["target"].astype(int)

# Extra ordinary user-facing sites, added to reduce the class imbalance.
# Built without in-place mutation so the cell can be re-run safely.
train_zeros = pd.read_csv("data/train_zeros.csv").dropna()
train_zeros = train_zeros.assign(target=train_zeros["target"].astype(int))

# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# every part keeps its own labels and the result has duplicate row labels.
train = pd.concat([train_1, train_2, train_3, train_zeros], ignore_index=True)

In [5]:
# Preview the first rows of the combined training frame.
train.head()

Unnamed: 0,url,target
0,api.youla.io,0
1,favicon.yandex.net,1
2,w-74721.fp.kaspersky-labs.com,1
3,questtime.net,0
4,passport-authproxy.taxi.yandex.net,1


In [6]:
# (rows, columns) of the combined training frame.
train.shape

(966, 2)

#### Посмотрим, какие компоненты чаще всего встречаются в url'ах

In [7]:
def get_top_components(urls):
    """Count how often each dot-separated component occurs across the hosts.

    Args:
        urls: iterable of host strings; iteration is wrapped in a tqdm
            progress bar.

    Returns:
        The result of ``value_counts()`` on all components: a Series indexed
        by component whose values are the occurrence counts.
    """
    components = [part for host in tqdm(urls) for part in host.split(".")]
    return pd.Series(components).value_counts()

In [8]:
# Component frequencies over the full host list.
top_components = get_top_components(hosts_original["url"])

100%|████████████████████████████████████████████████████████████████████| 1000000/1000000 [00:01<00:00, 745073.75it/s]


In [9]:
# Show the first 30 entries of the component counts.
top_components.head(30)

com                  520391
ru                   172788
net                  153952
yandex                63034
userapi               46256
me                    45263
mycdn                 37291
googlevideo           32175
www                   24163
cdn                   23947
googleapis            23779
tiktokcdn             23406
api                   21447
googlesyndication     21416
apple                 20303
safeframe             19802
google                18399
img                   17551
st                    17017
strm                  16765
org                   16732
io                    16224
avito                 16071
mts                   14195
0                     11399
fbcdn                 10349
mail                   9982
push                   9884
match                  9682
akadns                 9592
dtype: int64

#### Для train датасета добавим фичи, которые не требуют выхода в онлайн

In [10]:
def get_offline_features(t):
    """Derive features that can be computed from the host string alone.

    Args:
        t: DataFrame with a string column "url". It is not modified.

    Returns:
        A copy of ``t`` with these columns added, in this order:
        start_with_api, has_userapi, has_googleapis (0/1 indicators),
        size_of_url (character count), size_of_url_split (number of
        dot-separated parts), clear_url (host with dots replaced by spaces)
        and minus_count (number of "-" characters).
    """
    hosts = t["url"]
    return t.assign(
        start_with_api=hosts.str.contains("^api", regex=True).astype(int),
        has_userapi=hosts.str.contains("userapi").astype(int),
        has_googleapis=hosts.str.contains("googleapis").astype(int),
        size_of_url=hosts.str.len(),
        size_of_url_split=hosts.str.split(".").str.len(),
        # Literal replacement: "." must not be treated as a regex wildcard.
        clear_url=hosts.str.replace(".", " ", regex=False),
        minus_count=hosts.str.count("-"),
    )

In [11]:
# Add the offline (string-only) features to the training frame.
train = get_offline_features(train)

In [12]:
# Preview the training frame with the offline features added.
train.head()

Unnamed: 0,url,target,start_with_api,has_userapi,has_googleapis,size_of_url,size_of_url_split,clear_url,minus_count
0,api.youla.io,0,1,0,0,12,3,api youla io,0
1,favicon.yandex.net,1,0,0,0,18,3,favicon yandex net,0
2,w-74721.fp.kaspersky-labs.com,1,0,0,0,29,4,w-74721 fp kaspersky-labs com,2
3,questtime.net,0,0,0,0,13,2,questtime net,0
4,passport-authproxy.taxi.yandex.net,1,0,0,0,34,4,passport-authproxy taxi yandex net,1


#### Теперь создадим модель на основе CatBoost
Обучать модель мы будем несколько раз, поэтому обернём её в функцию

In [13]:
def _letter_ngram_dictionary(dictionary_id, gram_order):
    """Build a CatBoost dictionary spec for character n-grams of the given order."""
    return {
        "dictionary_id": dictionary_id,
        "token_level_type": "Letter",
        "max_dictionary_size": "150000",
        "occurrence_lower_bound": "1",
        "gram_order": str(gram_order),
    }


def _build_text_processing():
    """Return the ``text_processing`` options applied to the text columns."""
    all_dictionaries = ["BiGram", "Trigram", "Fourgram", "Word", "Fivegram", "Sixgram"]
    letter_dictionaries = ["BiGram", "Trigram", "Fourgram", "Fivegram", "Sixgram"]
    return {
        # The text column is already space-separated, so split on single spaces.
        "tokenizers": [{
            "tokenizer_id": "Space",
            "separator_type": "ByDelimiter",
            "delimiter": " ",
        }],
        "dictionaries": [
            _letter_ngram_dictionary("BiGram", 2),
            _letter_ngram_dictionary("Trigram", 3),
            _letter_ngram_dictionary("Fourgram", 4),
            {
                "dictionary_id": "Word",
                "max_dictionary_size": "30000",
                "occurrence_lower_bound": "3",
                "gram_order": "1",
            },
            _letter_ngram_dictionary("Fivegram", 5),
            _letter_ngram_dictionary("Sixgram", 6),
        ],
        "feature_processing": {
            "default": [
                {
                    "dictionaries_names": list(all_dictionaries),
                    "feature_calcers": ["BoW"],
                    "tokenizers_names": ["Space"],
                },
                {
                    "dictionaries_names": list(all_dictionaries),
                    "feature_calcers": ["NaiveBayes"],
                    "tokenizers_names": ["Space"],
                },
                {
                    # BM25 is configured on the letter n-gram dictionaries only.
                    "dictionaries_names": list(letter_dictionaries),
                    "feature_calcers": ["BM25"],
                    "tokenizers_names": ["Space"],
                },
            ],
        },
    }


def get_model(train, cat_features, text_features, random_state=None, task_type="GPU"):
    """Split ``train`` into train/validation parts and fit a CatBoost classifier.

    Args:
        train: DataFrame with the columns "url" and "target" plus feature
            columns. "url" and "target" are dropped from the feature matrix.
        cat_features: names of the categorical feature columns.
        text_features: names of the text feature columns. The original code
            ignored this argument and always used ["clear_url"]; it is now
            passed to the classifier as given.
        random_state: seed for the train/validation split. The default (None)
            keeps the previous behaviour of a different split on every call;
            pass an int to make runs comparable.
        task_type: CatBoost ``task_type``; defaults to "GPU" as before.

    Returns:
        The fitted CatBoostClassifier.

    Note:
        The validation part is also used as ``eval_set`` with
        ``use_best_model=True``, so the printed best F1 is selected on the
        same data it is measured on.
    """
    features = train.drop(columns=["url", "target"])
    labels = train["target"]

    # Stratified hold-out split: one third of the rows goes to validation.
    x_train, x_validation, y_train, y_validation = train_test_split(
        features,
        labels,
        stratify=labels,
        test_size=0.33,
        random_state=random_state,
    )

    # CatBoost handles the text preprocessing and the categorical features
    # itself; it only needs the column names and the text-processing settings.
    model = CatBoostClassifier(
        iterations=500,
        learning_rate=0.07,
        eval_metric="F1",
        loss_function="Logloss",
        task_type=task_type,
        leaf_estimation_iterations=10,
        max_ctr_complexity=5,
        random_seed=127,
        cat_features=cat_features,
        text_features=text_features,
        text_processing=_build_text_processing(),
    )

    # Train with the validation part as the evaluation set.
    model.fit(
        x_train,
        y_train,
        eval_set=(x_validation, y_validation),
        use_best_model=True,
        early_stopping_rounds=300,
        verbose=50,
    )

    # Baseline for comparison: F1 of always predicting class 1.
    print("\nVS\nConst 1: ", f1_score(y_validation, np.ones(len(y_validation))))

    return model

In [14]:
# The three 0/1 indicator columns are declared as categorical features.
cat_features = ['start_with_api', 'has_userapi', 'has_googleapis']
# Text column: the host name with dots replaced by spaces.
text_features = ["clear_url"]

# Baseline model trained on the offline features only.
model = get_model(train, cat_features, text_features)

0:	learn: 0.8769634	test: 0.8347339	best: 0.8347339 (0)	total: 144ms	remaining: 1m 11s
50:	learn: 0.9326425	test: 0.8770053	best: 0.9030612 (6)	total: 4.92s	remaining: 43.3s
100:	learn: 0.9516340	test: 0.8750000	best: 0.9030612 (6)	total: 9.29s	remaining: 36.7s
150:	learn: 0.9711286	test: 0.8804348	best: 0.9030612 (6)	total: 13.5s	remaining: 31.3s
200:	learn: 0.9749009	test: 0.8821918	best: 0.9030612 (6)	total: 17.8s	remaining: 26.5s
250:	learn: 0.9827357	test: 0.8712329	best: 0.9030612 (6)	total: 22.2s	remaining: 22s
300:	learn: 0.9866667	test: 0.8657534	best: 0.9030612 (6)	total: 26.6s	remaining: 17.6s
bestTest = 0.9030612245
bestIteration = 6
Shrink model to first 7 iterations.

VS
Const 1:  0.7290836653386454


#### Результат: 0.90306
Константное предсказание 1: 0.729083


#### Теперь попробуем добавить в датасет фичи, которые можно получить, обратившись к хосту

In [None]:
def _probe_host(url, timeout):
    """Request ``http://<url>`` once and summarise the response as a feature dict."""
    try:
        response = requests.get("http://" + url, timeout=timeout)
    except Exception:
        # Best effort: an unreachable or malformed host yields sentinel values.
        # KeyboardInterrupt is not an Exception subclass, so a long crawl can
        # still be interrupted (the previous bare `except:` swallowed it).
        return {
            "content_type": "none",
            "status_code": -1,
            "is_json": -1,
            "is_redirect": -1,
            "encoding": "none",
            "size_of_cookies": -1,
        }

    content_type = response.headers.get("Content-Type")

    try:
        response.json()
        is_json = 1
    except Exception:
        # The body is not parseable as JSON.
        is_json = 0

    encoding = response.encoding

    return {
        # Keep only the media type, dropping parameters such as "; charset=...".
        "content_type": content_type.split(";")[0] if content_type is not None else "none",
        "status_code": response.status_code,
        "is_json": is_json,
        # requests follows redirects by default, so the final response is never
        # itself a redirect; a non-empty history is what shows a redirect happened.
        "is_redirect": int(bool(response.history) or bool(response.is_redirect)),
        "encoding": encoding.lower() if encoding else "none",
        "size_of_cookies": len(response.cookies),
    }


def get_online_features(t, timeout=0.5):
    """Add features obtained by sending an HTTP GET request to every host.

    Args:
        t: DataFrame with a string column "url" (bare host names). It is not
            modified.
        timeout: per-request timeout in seconds passed to ``requests.get``.

    Returns:
        A copy of ``t`` with the columns content_type, status_code, is_json,
        is_redirect, encoding and size_of_cookies added. Hosts whose request
        failed get -1 in the numeric columns and "none" in the string columns.
    """
    df = t.copy()

    # One record per host keeps all features of a row together, so the columns
    # cannot drift out of alignment.
    records = [_probe_host(url, timeout) for url in tqdm(df["url"], total=len(df))]

    # Assign by position (plain lists) so the result does not depend on the
    # frame's index labels.
    for column in ("content_type", "status_code", "is_json",
                   "is_redirect", "encoding", "size_of_cookies"):
        df[column] = [record[column] for record in records]
    return df

In [None]:
# Add the online features; this sends one HTTP request per row of the frame.
train = get_online_features(train)

In [None]:
# Preview the training frame after the online features were added.
train.head()

#### Снова обучим модель

In [None]:
# Offline indicators plus the online features treated as categorical.
# size_of_cookies is not listed here, so it is not declared categorical.
cat_features = ['start_with_api', 'has_userapi', 'has_googleapis', 'content_type', 'status_code',
       'is_json', 'is_redirect', 'encoding']
text_features = ["clear_url"]

# Second model: offline + online features.
model_v2 = get_model(train, cat_features, text_features)

In [None]:
# Show the feature importances reported by the model trained with online features.
model_v2.get_feature_importance(prettified=True)

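#### Видим улучшение результата: 0.91623 (против 0.90306 без online-фич), и оно появилось именно благодаря online-фичам

Замечание: разбиение на train/validation не зафиксировано (`random_state` не задан), поэтому разницу стоит перепроверить на одном и том же разбиении.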