In [1]:
import pandas as pd
import numpy as np
import requests
from tqdm import tqdm
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
tqdm.pandas()

In [2]:
# The hosts file has no header row and a single column, so name it while reading.
hosts = pd.read_csv("data/host.csv", header=None, names=["url"])

In [3]:
hosts.head()

Unnamed: 0,url
0,api.youla.io
1,favicon.yandex.net
2,w-74721.fp.kaspersky-labs.com
3,questtime.net
4,passport-authproxy.taxi.yandex.net


In [4]:
hosts.shape

(199944, 1)

### Анализ датасета
* Очень много хостов, которые являются облаками крупных компаний
* Явный дисбаланс в сторону "технических" хостов

#### В качестве train сета будем использовать собранный командой датасет
target: 
* 0 - ответ, привычный для пользователя
* 1 - ответ, соответствующий служебному (техническому) взаимодействию 

Также в попытке устранить дисбаланс в обучающей выборке был добавлен сабсет, содержащий обычные пользовательские сайты (спарсили по запросу популярных сайтов в поисковике)

In [5]:
# Load the hand-labelled training set (columns: url, target).
train = pd.read_csv("data/train.csv")

In [6]:
train.head()

Unnamed: 0,url,target
0,api.youla.io,0
1,favicon.yandex.net,1
2,w-74721.fp.kaspersky-labs.com,1
3,questtime.net,0
4,passport-authproxy.taxi.yandex.net,1


In [7]:
train.shape

(1426, 2)

#### Посмотрим, какие компоненты чаще всего встречаются в url'ах

In [8]:
def get_top_components(urls):
    """Count how often each dot-separated component occurs across ``urls``.

    Args:
        urls: Iterable of host-name strings.

    Returns:
        pandas.Series indexed by component whose values are the occurrence
        counts, as produced by ``Series.value_counts``.
    """
    # tqdm only wraps the iteration to display a progress bar.
    components = [part for host in tqdm(urls) for part in host.split(".")]
    return pd.Series(components).value_counts()

In [9]:
# Component frequencies computed over the url column of the hosts frame.
top_components = get_top_components(hosts["url"])

100%|██████████████████████████████████████████████████████████████████████| 199944/199944 [00:00<00:00, 891005.46it/s]


In [10]:
top_components.head(30)

com                  81315
ru                   47350
net                  22317
googlesyndication    19807
safeframe            19802
yandex               10472
0                     8436
mts                   8194
match                 8013
verify                7940
whiteboxdigital       7111
mitdmp                7110
org                   5992
local                 5457
me                    4779
io                    4689
www                   4653
cc                    4140
googlevideo           4137
pushy                 3661
biz                   3570
in                    3373
cdn                   2767
2                     2194
arpa                  2110
in-addr               2109
1                     2103
v1                    1787
a                     1750
e                     1747
dtype: int64

#### Для train датасета добавим фичи, которые не требуют выхода в онлайн

In [11]:
def get_offline_features(t):
    """Return a copy of ``t`` extended with features computed from the URL text.

    Args:
        t: DataFrame with a string ``url`` column. It is not modified.

    Returns:
        A new DataFrame with the original columns plus, in this order:
        ``start_with_api``, ``has_userapi``, ``has_googleapis`` (0/1 flags),
        ``size_of_url`` (character count), ``size_of_url_split`` (number of
        dot-separated components), ``clear_url`` (dots replaced by spaces) and
        ``minus_count`` (number of ``-`` characters).
    """
    urls = t["url"]
    return t.assign(
        start_with_api=urls.str.contains("^api", regex=True).astype(int),
        has_userapi=urls.str.contains("userapi").astype(int),
        has_googleapis=urls.str.contains("googleapis").astype(int),
        size_of_url=urls.map(len),
        # Number of components == number of dots + 1.
        size_of_url_split=urls.map(lambda host: host.count(".") + 1),
        clear_url=urls.map(lambda host: host.replace(".", " ")),
        minus_count=urls.str.count("-"),
    )

In [12]:
# Add the URL-derived (offline) features to the labelled set.
train = get_offline_features(train)

In [13]:
train.head()

Unnamed: 0,url,target,start_with_api,has_userapi,has_googleapis,size_of_url,size_of_url_split,clear_url,minus_count
0,api.youla.io,0,1,0,0,12,3,api youla io,0
1,favicon.yandex.net,1,0,0,0,18,3,favicon yandex net,0
2,w-74721.fp.kaspersky-labs.com,1,0,0,0,29,4,w-74721 fp kaspersky-labs com,2
3,questtime.net,0,0,0,0,13,2,questtime net,0
4,passport-authproxy.taxi.yandex.net,1,0,0,0,34,4,passport-authproxy taxi yandex net,1


#### Теперь создадим модель на основе CatBoost
Обучать модель мы будем несколько раз, поэтому обернём её в функцию

In [14]:
def get_model(train, cat_features, text_features, task_type="GPU"):
    """Fit a CatBoost classifier on a stratified split of ``train`` and return it.

    Args:
        train: DataFrame with a ``url`` column, a ``target`` column and the
            feature columns. ``url`` and ``target`` are dropped before fitting.
        cat_features: Names of the columns CatBoost should treat as categorical.
        text_features: Names of the columns CatBoost should treat as text.
            Previously this argument was ignored and ``["clear_url"]`` was
            always used; it is now passed through to the classifier.
        task_type: Value forwarded to ``CatBoostClassifier(task_type=...)``.
            Defaults to ``"GPU"``, the value that used to be hard-coded.

    Returns:
        The fitted ``CatBoostClassifier``. It is fitted with
        ``use_best_model=True`` against the held-out 33% validation split.

    Side effects:
        Prints the F1 score of an all-ones prediction on the validation split
        as a baseline for comparison with the training log.
    """
    # Hold out a stratified third of the data for validation.
    x_train, x_validation, y_train, y_validation = train_test_split(
        train.drop(columns=["url", "target"]),
        train["target"],
        stratify=train["target"],
        test_size=0.33,
        random_state=127,
    )

    # CatBoost does the text preprocessing itself; we only describe how.
    # Text is split on spaces, then letter n-gram dictionaries (orders 2-6)
    # and a "Word" dictionary are built from the tokens.
    text_processing = {
        "tokenizers": [{
            "tokenizer_id": "Space",
            "separator_type": "ByDelimiter",
            "delimiter": " ",
        }],
        "dictionaries": [{
            "dictionary_id": "BiGram",
            "token_level_type": "Letter",
            "max_dictionary_size": "150000",
            "occurrence_lower_bound": "1",
            "gram_order": "2",
        }, {
            "dictionary_id": "Trigram",
            "max_dictionary_size": "150000",
            "token_level_type": "Letter",
            "occurrence_lower_bound": "1",
            "gram_order": "3",
        }, {
            "dictionary_id": "Fourgram",
            "max_dictionary_size": "150000",
            "token_level_type": "Letter",
            "occurrence_lower_bound": "1",
            "gram_order": "4",
        }, {
            "dictionary_id": "Word",
            "max_dictionary_size": "30000",
            "occurrence_lower_bound": "3",
            "gram_order": "1",
        }, {
            "dictionary_id": "Fivegram",
            "max_dictionary_size": "150000",
            "token_level_type": "Letter",
            "occurrence_lower_bound": "1",
            "gram_order": "5",
        }, {
            "dictionary_id": "Sixgram",
            "max_dictionary_size": "150000",
            "token_level_type": "Letter",
            "occurrence_lower_bound": "1",
            "gram_order": "6",
        }],
        "feature_processing": {
            "default": [{
                "dictionaries_names": ["BiGram", "Trigram", "Fourgram", "Word", "Fivegram", "Sixgram"],
                "feature_calcers": ["BoW"],
                "tokenizers_names": ["Space"],
            }, {
                "dictionaries_names": ["BiGram", "Trigram", "Fourgram", "Word", "Fivegram", "Sixgram"],
                "feature_calcers": ["NaiveBayes"],
                "tokenizers_names": ["Space"],
            }, {
                # The "Word" dictionary is not used for the BM25 calcer.
                "dictionaries_names": ["BiGram", "Trigram", "Fourgram", "Fivegram", "Sixgram"],
                "feature_calcers": ["BM25"],
                "tokenizers_names": ["Space"],
            }],
        },
    }

    # Categorical columns likewise need no manual encoding: CatBoost handles
    # them once their names are passed in.
    model = CatBoostClassifier(
        iterations=500,
        learning_rate=0.07,
        eval_metric="F1",
        loss_function="Logloss",
        task_type=task_type,
        leaf_estimation_iterations=10,
        max_ctr_complexity=5,
        random_seed=127,
        cat_features=cat_features,
        text_features=text_features,
        text_processing=text_processing,
    )

    # Train, tracking F1 on the validation split and logging every 50 iterations.
    model.fit(
        x_train,
        y_train,
        eval_set=(x_validation, y_validation),
        use_best_model=True,
        verbose=50,
    )

    # Baseline: F1 obtained by predicting class 1 for every validation row.
    print("\nVS\nConst 1: ", f1_score(y_validation, np.ones(len(y_validation))))

    return model

In [15]:
# Binary URL flags are treated as categorical; the space-separated URL is the text feature.
cat_features = ["start_with_api", "has_userapi", "has_googleapis"]
text_features = ["clear_url"]

model_v1 = get_model(train, cat_features=cat_features, text_features=text_features)

0:	learn: 0.8788660	test: 0.8825065	best: 0.8825065 (0)	total: 116ms	remaining: 57.7s
50:	learn: 0.9502762	test: 0.8975069	best: 0.9005525 (40)	total: 4.86s	remaining: 42.8s
100:	learn: 0.9625520	test: 0.9060773	best: 0.9085873 (78)	total: 8.79s	remaining: 34.7s
150:	learn: 0.9762238	test: 0.9060773	best: 0.9090909 (104)	total: 12.9s	remaining: 29.9s
200:	learn: 0.9817159	test: 0.8938547	best: 0.9090909 (104)	total: 17.2s	remaining: 25.6s
250:	learn: 0.9915254	test: 0.8969359	best: 0.9090909 (104)	total: 21.4s	remaining: 21.3s
300:	learn: 0.9929279	test: 0.8932584	best: 0.9090909 (104)	total: 25.6s	remaining: 17s
350:	learn: 0.9971671	test: 0.8938547	best: 0.9090909 (104)	total: 30.1s	remaining: 12.8s
400:	learn: 0.9971671	test: 0.8907563	best: 0.9090909 (104)	total: 34.5s	remaining: 8.53s
450:	learn: 0.9985856	test: 0.8901408	best: 0.9090909 (104)	total: 38.9s	remaining: 4.22s
499:	learn: 0.9985856	test: 0.8901408	best: 0.9090909 (104)	total: 43.2s	remaining: 0us
bestTest = 0.90909090

In [16]:
model_v1.get_feature_importance(prettified=True).head()

Unnamed: 0,Feature Id,Importances
0,size_of_url_split,48.573407
1,clear_url,43.57842
2,size_of_url,7.577916
3,minus_count,0.270257
4,start_with_api,0.0


#### Средний результат нескольких запусков: 0.89032
* Против 0.54180 при константном предсказании 1


### 1 вариант улучшения модели
- Используя обученную модель, предсказываем метки класса оставшихся записей из hosts
- Используем полученный датасет для обучения новой модели
- Проверяем её качество на исходном датасете (собранном участниками команды)

In [17]:
# Hosts that are absent from the labelled set get offline features
# and are then pseudo-labelled by the first model.
unlabeled_mask = ~hosts["url"].isin(train["url"])
train_v2 = get_offline_features(hosts[unlabeled_mask])
print("Shape: ",train_v2.shape)
predict = model_v1.predict(train_v2.drop(columns=["url"]))
train_v2["target"] = predict

cat_features = ["start_with_api", "has_userapi", "has_googleapis"]
text_features = ["clear_url"]

Shape:  (199058, 8)


In [18]:
model_v2 = get_model(train_v2, cat_features, text_features)

# Score the pseudo-label model on the hand-labelled set.
# NOTE(review): the pseudo-labels come from model_v1, which was itself fitted on
# a split of `train`, so this is not a fully independent estimate.
manual_features = train.drop(columns=["url", "target"])
manual_labels = train["target"]
predicts = model_v2.predict(manual_features)
print("\n\nFinal score: ", f1_score(manual_labels, predicts))
print("Constant prediction 1: ", f1_score(manual_labels, np.ones(len(manual_labels))))

0:	learn: 0.9664572	test: 0.9691368	best: 0.9691368 (0)	total: 114ms	remaining: 56.7s
50:	learn: 0.9740429	test: 0.9755576	best: 0.9755576 (50)	total: 4.83s	remaining: 42.5s
100:	learn: 0.9772020	test: 0.9779606	best: 0.9779606 (99)	total: 8.87s	remaining: 35s
150:	learn: 0.9788014	test: 0.9796558	best: 0.9796558 (150)	total: 12.9s	remaining: 29.8s
200:	learn: 0.9803010	test: 0.9809753	best: 0.9809753 (200)	total: 16.9s	remaining: 25.1s
250:	learn: 0.9813624	test: 0.9816421	best: 0.9816421 (250)	total: 20.7s	remaining: 20.5s
300:	learn: 0.9823618	test: 0.9822627	best: 0.9822627 (300)	total: 24.5s	remaining: 16.2s
350:	learn: 0.9829798	test: 0.9827042	best: 0.9827042 (350)	total: 28.4s	remaining: 12.1s
400:	learn: 0.9836181	test: 0.9830171	best: 0.9830282 (395)	total: 32.2s	remaining: 7.95s
450:	learn: 0.9842085	test: 0.9833707	best: 0.9833707 (450)	total: 35.9s	remaining: 3.9s
499:	learn: 0.9846707	test: 0.9835768	best: 0.9836177 (488)	total: 39.5s	remaining: 0us
bestTest = 0.983617665

#### Средний результат нескольких запусков: 0.89840
* Против 0.89032 предыдущей модели
* Против 0.54118 при константном предсказании 1

### 2 вариант улучшения модели
Попробуем добавить в исходный обучающий датасет фичи, которые можно получить, обратившись к хосту  
Из них мы выбрали:
* Content-Type ответа
* Status code
* Можно ли спарсить ответ JSON'ом
* и другие


In [19]:
def get_online_features(t, timeout=0.5):
    """Return a copy of ``t`` extended with features from an HTTP GET to each host.

    Every value in ``t["url"]`` is requested as ``"http://" + url``. The crawl
    is best-effort: a host that cannot be fetched gets placeholder values
    instead of raising. Only ``Exception`` subclasses are swallowed, so
    ``KeyboardInterrupt`` still stops a long crawl.

    Args:
        t: DataFrame with a ``url`` column. It is not modified.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        A new DataFrame with the original columns plus, in this order:
        ``content_type`` (Content-Type header up to the first ``;``, or
        ``"none"``), ``status_code``, ``is_json`` (1 if the body parses as
        JSON, else 0), ``is_redirect`` (0/1), ``encoding`` (lower-cased, or
        ``"none"``) and ``size_of_cookies`` (number of cookies). For a host
        whose request failed, the numeric columns are -1 and the string
        columns are ``"none"``.
    """
    columns = [
        "content_type", "status_code", "is_json",
        "is_redirect", "encoding", "size_of_cookies",
    ]
    # Placeholder row for hosts whose request failed altogether.
    unreachable = {
        "content_type": "none",
        "status_code": -1,
        "is_json": -1,
        "is_redirect": -1,
        "encoding": "none",
        "size_of_cookies": -1,
    }

    def process_url(url):
        """Fetch one host and return its feature values as a dict."""
        try:
            response = requests.get("http://" + url, timeout=timeout)
        except Exception:
            # Any failure to obtain a response (DNS, timeout, malformed host,
            # non-string url, ...) is recorded as "unreachable".
            return dict(unreachable)

        content_type = response.headers.get("Content-Type")
        try:
            response.json()
            is_json = 1
        except ValueError:
            # JSON decoding errors are ValueError subclasses.
            is_json = 0
        encoding = response.encoding

        return {
            "content_type": content_type.split(";")[0] if content_type is not None else "none",
            "status_code": response.status_code,
            "is_json": is_json,
            "is_redirect": int(response.is_redirect),
            # The encoding may be missing or empty.
            "encoding": encoding.lower() if encoding else "none",
            "size_of_cookies": len(response.cookies),
        }

    df = t.copy()
    # One record per row, in row order; tqdm only adds the progress bar.
    records = [process_url(url) for url in tqdm(df["url"])]
    # Assign positionally (plain lists) so the index of ``t`` does not matter.
    for column in columns:
        df[column] = [record[column] for record in records]
    return df

In [20]:
# Issues an HTTP request per labelled host; the recorded run took ~18 minutes for 1426 URLs.
train_v3 = get_online_features(train)

100%|██████████████████████████████████████████████████████████████████████████████| 1426/1426 [17:44<00:00,  1.34it/s]


In [21]:
train_v3.head()

Unnamed: 0,url,target,start_with_api,has_userapi,has_googleapis,size_of_url,size_of_url_split,clear_url,minus_count,content_type,status_code,is_json,is_redirect,encoding,size_of_cookies
0,api.youla.io,0,1,0,0,12,3,api youla io,0,text/html,200,0,0,utf-8,0
1,favicon.yandex.net,1,0,0,0,18,3,favicon yandex net,0,none,404,0,0,none,0
2,w-74721.fp.kaspersky-labs.com,1,0,0,0,29,4,w-74721 fp kaspersky-labs com,2,none,-1,-1,-1,none,-1
3,questtime.net,0,0,0,0,13,2,questtime net,0,text/html,200,0,0,utf-8,1
4,passport-authproxy.taxi.yandex.net,1,0,0,0,34,4,passport-authproxy taxi yandex net,1,application/json,404,1,0,utf-8,0


#### Снова обучим модель

In [22]:
# Offline URL flags plus the response-derived columns are treated as categorical.
cat_features = [
    "start_with_api", "has_userapi", "has_googleapis",
    "content_type", "status_code", "is_json", "is_redirect", "encoding",
]
text_features = ["clear_url"]

model_v3 = get_model(train_v3, cat_features=cat_features, text_features=text_features)

0:	learn: 0.8056738	test: 0.7635328	best: 0.7635328 (0)	total: 136ms	remaining: 1m 7s
50:	learn: 0.9116022	test: 0.8994413	best: 0.8994413 (48)	total: 4.4s	remaining: 38.8s
100:	learn: 0.9279778	test: 0.9025070	best: 0.9025070 (96)	total: 8.43s	remaining: 33.3s
150:	learn: 0.9374131	test: 0.8975069	best: 0.9030471 (121)	total: 12.7s	remaining: 29.3s
200:	learn: 0.9415042	test: 0.9030471	best: 0.9055556 (199)	total: 17s	remaining: 25.2s
250:	learn: 0.9415042	test: 0.9000000	best: 0.9055556 (199)	total: 20.7s	remaining: 20.5s
300:	learn: 0.9413408	test: 0.8938547	best: 0.9055556 (199)	total: 24.4s	remaining: 16.1s
350:	learn: 0.9439776	test: 0.8938547	best: 0.9055556 (199)	total: 27.8s	remaining: 11.8s
400:	learn: 0.9466292	test: 0.8938547	best: 0.9055556 (199)	total: 31.5s	remaining: 7.78s
450:	learn: 0.9523810	test: 0.8938547	best: 0.9055556 (199)	total: 34.9s	remaining: 3.79s
499:	learn: 0.9523810	test: 0.8963585	best: 0.9055556 (199)	total: 38.3s	remaining: 0us
bestTest = 0.905555555

In [23]:
model_v3.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,size_of_url_split,31.611967
1,clear_url,26.815538
2,status_code,16.79977
3,content_type,14.643261
4,encoding,5.347574
5,size_of_url,2.715102
6,is_json,2.066788
7,start_with_api,0.0
8,has_userapi,0.0
9,has_googleapis,0.0


#### Средний результат после нескольких запусков: 0.90181
* Против 0.89032 первой модели
* Против 0.89840 второй модели
* Против 0.54118 константного предсказания 1