In [1]:
import pandas as pd
import numpy as np
import requests
from tqdm import tqdm
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
tqdm.pandas()

In [2]:
# Raw host list: a header-less, single-column CSV; the column is named "url".
hosts_original = pd.read_csv("data/host.csv", header=None, names=["url"])

In [3]:
# Preview the first rows of the raw host list.
hosts_original.head()

Unnamed: 0,url
0,api.youla.io
1,favicon.yandex.net
2,w-74721.fp.kaspersky-labs.com
3,questtime.net
4,passport-authproxy.taxi.yandex.net


In [4]:
# (rows, columns) of the raw host list.
hosts_original.shape

(1000000, 1)

### Анализ датасета
* Очень много хостов, которые являются облаками крупных компаний
* Явный дисбаланс в сторону "технических" хостов

#### В качестве train сета будем использовать собранный командой датасет
target: 
* 0 - ответ, привычный для пользователя
* 1 - ответ, соответствующий служебному (техническому) взаимодействию 

Также, чтобы уменьшить дисбаланс в обучающей выборке, в неё был добавлен датасет с обычными пользовательскими сайтами (`data/train_zeros.csv`)

In [5]:
# Hand-labelled datasets collected by the team. The files differ in separator
# and header layout, so each is read with its own options.
# NOTE(review): train_1 and train_zeros are assumed to already have "url" and
# "target" header columns -- confirm against the CSV files.
train_1 = pd.read_csv("data/train_dan.csv", sep=";")

train_2 = pd.read_csv("data/train_den.csv", sep=";", header=None)
train_2.columns = ["url", "target"]

train_3 = pd.read_csv("data/train_kon.csv")
train_3.columns = ["url", "target"]
# Keep only labelled rows, then store the label as int.
train_3 = train_3[train_3["target"].notna()].reset_index(drop=True)
train_3["target"] = train_3["target"].astype(int)

# Extra ordinary user-facing sites, added to reduce the class imbalance.
# Chained instead of dropna(inplace=True) so re-running the cell is side-effect free.
train_zeros = (
    pd.read_csv("data/train_zeros.csv")
    .dropna()
    .assign(target=lambda frame: frame["target"].astype(int))
)

# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# every source frame contributes its own 0-based labels and the index contains
# duplicates, which makes label-based lookups and aligned assignment ambiguous.
train = pd.concat([train_1, train_2, train_3, train_zeros], ignore_index=True)

In [6]:
# Preview the combined labelled dataset.
train.head()

Unnamed: 0,url,target
0,api.youla.io,0
1,favicon.yandex.net,1
2,w-74721.fp.kaspersky-labs.com,1
3,questtime.net,0
4,passport-authproxy.taxi.yandex.net,1


In [7]:
# (rows, columns) of the combined labelled dataset.
train.shape

(966, 2)

#### Посмотрим, какие компоненты чаще всего встречаются в url'ах

In [8]:
def get_top_components(urls):
    """Count how often each dot-separated component occurs across ``urls``.

    Every host name is split on "." and all resulting parts are pooled.

    Args:
        urls: iterable of host-name strings.

    Returns:
        pandas Series indexed by component, holding its number of
        occurrences, as produced by ``Series.value_counts()``.
    """
    components = [part for host in tqdm(urls) for part in host.split(".")]
    return pd.Series(components).value_counts()

In [9]:
# Component frequencies over the whole raw host list.
top_components = get_top_components(hosts_original["url"])

100%|███████████████████████████████████████████████████████████████████| 1000000/1000000 [00:00<00:00, 1618128.52it/s]


In [10]:
# Show the 30 most frequent components.
top_components.head(30)

com                  520391
ru                   172788
net                  153952
yandex                63034
userapi               46256
me                    45263
mycdn                 37291
googlevideo           32175
www                   24163
cdn                   23947
googleapis            23779
tiktokcdn             23406
api                   21447
googlesyndication     21416
apple                 20303
safeframe             19802
google                18399
img                   17551
st                    17017
strm                  16765
org                   16732
io                    16224
avito                 16071
mts                   14195
0                     11399
fbcdn                 10349
mail                   9982
push                   9884
match                  9682
akadns                 9592
dtype: int64

#### Для train датасета добавим фичи, которые не требуют выхода в онлайн

In [11]:
def get_offline_features(t):
    """Return a copy of ``t`` extended with features derived from the host name only.

    No network access is needed: every feature is computed from the "url"
    string column.

    Added columns, in this order:
        start_with_api    -- 1 if the host matches the regex "^api", else 0
        has_userapi       -- 1 if the host contains "userapi", else 0
        has_googleapis    -- 1 if the host contains "googleapis", else 0
        size_of_url       -- length of the host string
        size_of_url_split -- number of dot-separated components
        clear_url         -- the host with "." replaced by " "
        minus_count       -- number of "-" characters

    Args:
        t: DataFrame with a string column "url". It is not modified.

    Returns:
        A new DataFrame: the columns of ``t`` followed by the features above.
    """
    url = t["url"]
    return t.copy().assign(
        start_with_api=url.str.contains("^api", regex=True).astype(int),
        has_userapi=url.str.contains("userapi").astype(int),
        has_googleapis=url.str.contains("googleapis").astype(int),
        size_of_url=url.str.len(),
        size_of_url_split=url.str.split(".").str.len(),
        clear_url=url.str.replace(".", " ", regex=False),
        minus_count=url.str.count("-"),
    )

In [12]:
# Add the host-name-derived features to the labelled dataset.
train = get_offline_features(train)

In [13]:
# Preview the labelled dataset together with the new feature columns.
train.head()

Unnamed: 0,url,target,start_with_api,has_userapi,has_googleapis,size_of_url,size_of_url_split,clear_url,minus_count
0,api.youla.io,0,1,0,0,12,3,api youla io,0
1,favicon.yandex.net,1,0,0,0,18,3,favicon yandex net,0
2,w-74721.fp.kaspersky-labs.com,1,0,0,0,29,4,w-74721 fp kaspersky-labs com,2
3,questtime.net,0,0,0,0,13,2,questtime net,0
4,passport-authproxy.taxi.yandex.net,1,0,0,0,34,4,passport-authproxy taxi yandex net,1


#### Теперь создадим модель на основе CatBoost
Обучать модель мы будем несколько раз, поэтому обернём её в функцию

In [14]:
def get_model(train, cat_features, text_features):
    """Fit a CatBoost classifier on ``train`` and return the fitted model.

    The frame is split into a fitting part and a validation part (33% of the
    rows, stratified by "target", fixed ``random_state``). The validation part
    is passed to CatBoost as the eval set. After fitting, the F1 score of the
    constant prediction "1" on the validation part is printed as a baseline.

    Args:
        train: DataFrame with "url" and "target" columns plus the feature
            columns. "url" and "target" are dropped before fitting; "target"
            is the label.
        cat_features: names of the feature columns CatBoost should treat as
            categorical.
        text_features: names of the feature columns CatBoost should treat as
            text (tokenised on spaces, see ``text_processing`` below).

    Returns:
        The fitted ``CatBoostClassifier``.
    """

    def letter_ngram_dictionary(dictionary_id, gram_order):
        # One letter-level n-gram dictionary; only the id and the order vary.
        return {
            "dictionary_id": dictionary_id,
            "token_level_type": "Letter",
            "max_dictionary_size": "150000",
            "occurrence_lower_bound": "1",
            "gram_order": gram_order,
        }

    # Split the sample into a fitting part and a validation part.
    features = train.drop(columns=["url", "target"])
    labels = train["target"]
    x_train, x_validation, y_train, y_validation = train_test_split(
        features,
        labels,
        stratify=labels,
        test_size=0.33,
        random_state=778,
    )

    # CatBoost does the text preprocessing itself; we only describe how to
    # tokenise the text and which dictionaries / feature calcers to build.
    # Categorical columns likewise need no manual encoding.
    letter_dictionaries = ["BiGram", "Trigram", "Fourgram", "Fivegram", "Sixgram"]
    all_dictionaries = ["BiGram", "Trigram", "Fourgram", "Word", "Fivegram", "Sixgram"]
    text_processing = {
        "tokenizers": [{
            "tokenizer_id": "Space",
            "separator_type": "ByDelimiter",
            "delimiter": " ",
        }],
        "dictionaries": [
            letter_ngram_dictionary("BiGram", "2"),
            letter_ngram_dictionary("Trigram", "3"),
            letter_ngram_dictionary("Fourgram", "4"),
            {
                # No token_level_type here, so CatBoost's default level is used.
                "dictionary_id": "Word",
                "max_dictionary_size": "30000",
                "occurrence_lower_bound": "3",
                "gram_order": "1",
            },
            letter_ngram_dictionary("Fivegram", "5"),
            letter_ngram_dictionary("Sixgram", "6"),
        ],
        "feature_processing": {
            "default": [
                {
                    "dictionaries_names": all_dictionaries,
                    "feature_calcers": ["BoW"],
                    "tokenizers_names": ["Space"],
                },
                {
                    "dictionaries_names": all_dictionaries,
                    "feature_calcers": ["NaiveBayes"],
                    "tokenizers_names": ["Space"],
                },
                {
                    # BM25 is configured on the letter n-gram dictionaries only.
                    "dictionaries_names": letter_dictionaries,
                    "feature_calcers": ["BM25"],
                    "tokenizers_names": ["Space"],
                },
            ],
        },
    }

    model = CatBoostClassifier(
        iterations=500,
        learning_rate=0.07,
        eval_metric="F1",
        loss_function="Logloss",
        task_type="GPU",
        leaf_estimation_iterations=10,
        max_ctr_complexity=5,
        random_seed=127,
        cat_features=cat_features,
        # Previously hard-coded to ["clear_url"], which silently ignored the
        # caller's argument.
        text_features=text_features,
        text_processing=text_processing,
    )

    # Fit the model. use_best_model=True keeps the iteration that scored best
    # on the eval set, so the reported best F1 is selected on the validation
    # rows themselves and is an optimistic estimate.
    model.fit(
        x_train,
        y_train,
        eval_set=(x_validation, y_validation),
        use_best_model=True,
        verbose=50,
    )

    # Baseline for comparison: always predicting class 1 on the validation part.
    print("\nVS\nConst 1: ", f1_score(y_validation, np.ones(len(y_validation))))

    return model

In [15]:
# Binary indicator columns are passed to CatBoost as categorical features.
cat_features = ['start_with_api', 'has_userapi', 'has_googleapis']
# Space-separated host components, handled by CatBoost's text processing.
text_features = ["clear_url"]

# Baseline model: offline (host-name-only) features on the labelled dataset.
model_v1 = get_model(train, cat_features, text_features)

0:	learn: 0.8719899	test: 0.8785530	best: 0.8785530 (0)	total: 105ms	remaining: 52.6s
50:	learn: 0.9408673	test: 0.8810811	best: 0.9052632 (13)	total: 3.78s	remaining: 33.3s
100:	learn: 0.9614874	test: 0.8750000	best: 0.9052632 (13)	total: 7.2s	remaining: 28.4s
150:	learn: 0.9748344	test: 0.8719346	best: 0.9052632 (13)	total: 10.8s	remaining: 24.9s
200:	learn: 0.9787798	test: 0.8695652	best: 0.9052632 (13)	total: 14.3s	remaining: 21.3s
250:	learn: 0.9814324	test: 0.8695652	best: 0.9052632 (13)	total: 17.9s	remaining: 17.8s
300:	learn: 0.9880160	test: 0.8664850	best: 0.9052632 (13)	total: 21.2s	remaining: 14s
350:	learn: 0.9933244	test: 0.8726287	best: 0.9052632 (13)	total: 24.6s	remaining: 10.5s
400:	learn: 0.9933244	test: 0.8726287	best: 0.9052632 (13)	total: 28.1s	remaining: 6.93s
450:	learn: 0.9946524	test: 0.8726287	best: 0.9052632 (13)	total: 31.7s	remaining: 3.45s
499:	learn: 0.9959839	test: 0.8726287	best: 0.9052632 (13)	total: 35.2s	remaining: 0us
bestTest = 0.9052631579
bestIt

In [16]:
# Top rows of the feature-importance table of the baseline model.
model_v1.get_feature_importance(prettified=True).head()

Unnamed: 0,Feature Id,Importances
0,size_of_url_split,51.083917
1,clear_url,36.843421
2,size_of_url,11.898553
3,start_with_api,0.174108
4,has_userapi,0.0


#### Результат: 0.90526
* Против 0.72908 при константном предсказании 1


### 1 вариант улучшения модели
- Используя обученную модель, предсказываем метки класса для 500к записей из hosts_original
- Используем полученный датасет для обучения новой модели
- Проверяем её качество на исходном датасете (собранном участниками команды)

In [17]:
# Variant 1: pseudo-labelling. Label the last 500k raw hosts with model_v1
# and train a second model on those predicted labels.
train_v2 = get_offline_features(hosts_original.tail(500000))
predict = model_v1.predict(train_v2.drop(columns=["url"]))
train_v2["target"] = predict

cat_features = ['start_with_api', 'has_userapi', 'has_googleapis']
text_features = ["clear_url"]

model_v2 = get_model(train_v2, cat_features, text_features)

# Score on the whole hand-labelled dataset. This includes the rows model_v1
# was fitted on, so it is not directly comparable with model_v1's validation F1.
predicts = model_v2.predict(train.drop(columns=["url", "target"]))
print("\n\nFinal score: ", f1_score(train["target"], predicts))
print("Constant prediction 1: ", f1_score(train["target"], np.ones(len(train["target"]))))

# Like-for-like comparison: re-create the validation split used inside
# get_model for model_v1 (same stratification, test_size and random_state
# select the same rows) and score model_v2 on those rows only.
_, holdout = train_test_split(
    train,
    stratify=train["target"],
    test_size=0.33,
    random_state=778,
)
holdout_predicts = model_v2.predict(holdout.drop(columns=["url", "target"]))
print("Held-out score (model_v1 validation rows): ", f1_score(holdout["target"], holdout_predicts))

0:	learn: 0.9857582	test: 0.9878400	best: 0.9878400 (0)	total: 184ms	remaining: 1m 31s
50:	learn: 0.9925805	test: 0.9933972	best: 0.9933972 (50)	total: 9.3s	remaining: 1m 21s
100:	learn: 0.9941210	test: 0.9946154	best: 0.9946154 (100)	total: 16.7s	remaining: 1m 5s
150:	learn: 0.9949634	test: 0.9954007	best: 0.9954007 (150)	total: 23.5s	remaining: 54.4s
200:	learn: 0.9955555	test: 0.9958326	best: 0.9958401 (198)	total: 30.6s	remaining: 45.5s
250:	learn: 0.9959861	test: 0.9962391	best: 0.9962391 (250)	total: 37.3s	remaining: 37s
300:	learn: 0.9963712	test: 0.9966114	best: 0.9966114 (300)	total: 44.2s	remaining: 29.2s
350:	learn: 0.9966537	test: 0.9967864	best: 0.9967901 (349)	total: 50.9s	remaining: 21.6s
400:	learn: 0.9969159	test: 0.9969165	best: 0.9969615 (374)	total: 57.5s	remaining: 14.2s
450:	learn: 0.9970975	test: 0.9971141	best: 0.9971141 (446)	total: 1m 4s	remaining: 6.97s
499:	learn: 0.9972442	test: 0.9972073	best: 0.9972073 (488)	total: 1m 10s	remaining: 0us
bestTest = 0.99720

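#### Результат: 0.92121 
* Против 0.90526 предыдущей модели
* Против 0.72978 при константном предсказании
* Важно: 0.92121 посчитан на всём размеченном датасете (включая строки, на которых обучалась первая модель), а 0.90526 — только на её валидационной части, поэтому напрямую эти числа сравнивать некорректно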

### 2 вариант улучшения модели
Попробуем добавить в исходный обучающий датасет фичи, которые можно получить, обратившись к хосту  
Из них мы выбрали:
* Content-Type ответа
* Status code
* Можно ли спарсить ответ JSON'ом
* и другие


In [18]:
def get_online_features(t):
    """Return a copy of ``t`` extended with features obtained by querying each host.

    For every value of the "url" column one ``GET http://<url>`` request is
    issued with a 0.5 s timeout. This is best-effort: a host that cannot be
    reached yields sentinel values instead of raising.

    Added columns, in this order:
        content_type    -- Content-Type header up to the first ";",
                           "none" if the header is missing or the request failed
        status_code     -- HTTP status code, -1 if the request failed
        is_json         -- 1 if the body parses as JSON, 0 if not,
                           -1 if the request failed
        is_redirect     -- 1 if the host answered with a redirect, 0 if not,
                           -1 if the request failed
        encoding        -- lower-cased response encoding, "none" if unknown
                           or the request failed
        size_of_cookies -- number of cookies on the response, -1 if the
                           request failed

    Args:
        t: DataFrame with a string column "url". It is not modified.

    Returns:
        A new DataFrame: the columns of ``t`` followed by the features above.
    """
    columns = ["content_type", "status_code", "is_json", "is_redirect", "encoding", "size_of_cookies"]
    # Row used when the host could not be queried at all.
    failed = ("none", -1, -1, -1, "none", -1)

    def process_url(url):
        # Query one host and return its feature tuple in ``columns`` order.
        try:
            response = requests.get("http://" + url, timeout=0.5)
        except Exception:
            # `except Exception` rather than a bare `except:` so Ctrl-C
            # (KeyboardInterrupt) can still stop a long crawl.
            return failed

        content_type = response.headers.get("Content-Type")
        content_type = content_type.split(";")[0] if content_type is not None else "none"

        try:
            response.json()
            is_json = 1
        except Exception:
            # Body is not valid JSON or could not be decoded.
            is_json = 0

        # requests follows redirects by default, so the final response is
        # practically never a redirect itself; the followed hops are recorded
        # in response.history.
        is_redirect = int(bool(response.history) or response.is_redirect)

        # response.encoding may be None or empty when no charset is known.
        encoding = response.encoding.lower() if response.encoding else "none"

        return (content_type, response.status_code, is_json, is_redirect, encoding, len(response.cookies))

    df = t.copy()
    rows = [process_url(url) for url in tqdm(df["url"])]
    online = pd.DataFrame(rows, columns=columns)
    for name in columns:
        # Assign plain lists (positional) so a non-unique index on ``t`` does
        # not cause index alignment problems.
        df[name] = online[name].tolist()
    return df

In [19]:
# Add the network-derived features to the labelled dataset.
# Issues one HTTP request per row, so this cell is slow (network-bound).
train_v3 = get_online_features(train)

100%|████████████████████████████████████████████████████████████████████████████████| 966/966 [07:50<00:00,  2.05it/s]


In [20]:
# Preview the labelled dataset with both offline and online features.
train_v3.head()

Unnamed: 0,url,target,start_with_api,has_userapi,has_googleapis,size_of_url,size_of_url_split,clear_url,minus_count,content_type,status_code,is_json,is_redirect,encoding,size_of_cookies
0,api.youla.io,0,1,0,0,12,3,api youla io,0,text/html,200,0,0,utf-8,0
1,favicon.yandex.net,1,0,0,0,18,3,favicon yandex net,0,none,404,0,0,none,0
2,w-74721.fp.kaspersky-labs.com,1,0,0,0,29,4,w-74721 fp kaspersky-labs com,2,none,-1,-1,-1,none,-1
3,questtime.net,0,0,0,0,13,2,questtime net,0,text/html,200,0,0,utf-8,1
4,passport-authproxy.taxi.yandex.net,1,0,0,0,34,4,passport-authproxy taxi yandex net,1,application/json,404,1,0,utf-8,0


#### Снова обучим модель

In [21]:
# Offline indicators plus the online response attributes are passed as
# categorical features. status_code, is_json and is_redirect hold integers but
# are listed as categorical here; size_of_cookies is not listed, so it is not
# declared categorical.
cat_features = ['start_with_api', 'has_userapi', 'has_googleapis', 'content_type', 'status_code',
       'is_json', 'is_redirect', 'encoding']
# Space-separated host components, handled by CatBoost's text processing.
text_features = ["clear_url"]

# Variant 2: same training routine on the dataset enriched with online features.
model_v3 = get_model(train_v3, cat_features, text_features)

0:	learn: 0.8224299	test: 0.7894737	best: 0.7894737 (0)	total: 114ms	remaining: 56.7s
50:	learn: 0.9081365	test: 0.8972973	best: 0.9090909 (9)	total: 4.31s	remaining: 38s
100:	learn: 0.9192053	test: 0.8967391	best: 0.9090909 (9)	total: 8.16s	remaining: 32.3s
150:	learn: 0.9306667	test: 0.8937330	best: 0.9090909 (9)	total: 12.4s	remaining: 28.6s
200:	learn: 0.9399199	test: 0.8888889	best: 0.9090909 (9)	total: 16.6s	remaining: 24.6s
250:	learn: 0.9438503	test: 0.8888889	best: 0.9090909 (9)	total: 20.9s	remaining: 20.7s
300:	learn: 0.9465241	test: 0.8858696	best: 0.9090909 (9)	total: 24.7s	remaining: 16.4s
350:	learn: 0.9518717	test: 0.8828338	best: 0.9090909 (9)	total: 28.6s	remaining: 12.1s
400:	learn: 0.9572193	test: 0.8828338	best: 0.9090909 (9)	total: 32.1s	remaining: 7.92s
450:	learn: 0.9598930	test: 0.8797814	best: 0.9090909 (9)	total: 35.7s	remaining: 3.88s
499:	learn: 0.9598930	test: 0.8804348	best: 0.9090909 (9)	total: 39.3s	remaining: 0us
bestTest = 0.9090909091
bestIteration =

In [22]:
# Full feature-importance table of the model trained with online features.
model_v3.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,size_of_url_split,51.168318
1,status_code,21.112203
2,content_type,15.993433
3,size_of_url,5.589089
4,encoding,4.214969
5,is_json,1.921989
6,start_with_api,0.0
7,has_userapi,0.0
8,has_googleapis,0.0
9,clear_url,0.0


#### Результат 0.90909
* Против 0.90526 первой модели
* Против 0.92121 второй модели
* Против 0.72908 константного предсказания 1