# Classification hosts on users and tech

In [1]:
import json
import re

import pandas as pd
import validators
from tqdm.notebook import trange, tqdm

# Dataset

In [2]:
df = pd.read_csv('data/train_v3.csv')  # выборка после паука: парсинг сайтов

In [3]:
df[['host', 'is_tech_v3']].head()

Unnamed: 0,host,is_tech_v3
0,api.youla.io,True
1,favicon.yandex.net,True
2,w-74721.fp.kaspersky-labs.com,True
3,questtime.net,False
4,passport-authproxy.taxi.yandex.net,True


host: имя хоста  
is_tech (target): 1(True) - если хост технический, 0(False) - пользовательский

### Набор правил для разметки технических и пользовательских хостов

In [4]:
with open('data/good_hosts.txt') as f:  # kaggle dataset
    good_hosts = set(json.load(f))

good_hosts_v2 = {
    h.replace('www.', '')
    for h in good_hosts
}

with open('data/rambler.json') as f:  # nice host list
    rambler = json.load(f)

rambler_wo_www = {
    h.replace('www.', '')
    for h in rambler
}
rambler.extend(rambler_wo_www)

with open('data/tlds-alpha-by-domain.txt') as f:  # domain endings
    domain_root_zones = f.readlines()
    domain_root_zones = [
        d[:-1].lower()
        for d in domain_root_zones
    ]

with open('data/all_english_words.json') as f:  # english words
    english_words = json.load(f)

sites = pd.read_csv('data/sites.csv', names=['URL'])  # # nice host list
good_hosts_v3_sites_wo_www = {
    h.replace('www.', '')
    for h in sites['URL'].values
}
good_hosts_v3 = list(sites['URL'].values)
good_hosts_v3.extend(good_hosts_v3_sites_wo_www)


def is_contains_english_words(s):
    s = s.lower()
    return any(
        w in s
        for w in english_words
    )


re_is_ip = re.compile('^((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$') # ip adress
re_m = re.compile('^m[0-9]+') # m + number
re_2numbers = re.compile('\d{3}')
re_dynamic_prefix = re.compile('^[a-z]{1,2}[0-9]{1,}[\-\._]')
re_cache_prefix = re.compile('cache-?[0-9]{1,2}')
re_digits_only = re.compile('^[0-9]+$')
re_infra_prefix = re.compile('^infra-[0-9]+')

www_ignore_pattern = [
    'www.gstatic.com',
    'www.googleadservices.com',
    'www.googleapis.com',
    'www.google-analytics.com',
    'www.googletagmanager.com',
    'www.tns-counter.ru',
    'www.googletagservices.com',
    'cdn'
]

tech_patterns_contains = [
    'api.',
    '.api',
    'cdn',
    'ad.',
    'ads.',
    'static.',
    's3.',
    'cache.',
    'stat.',
    'logs.',
    'log.',
    'stats.',
    'auth.',
    'sentry.',
    'script.',
    'storage.',
    '--',
    'an.yandex.ru',
    'app-measurement.com',
    'tpc.googlesyndication.com',
    'tpc.googlesyndication.com',
    'favicon.yandex.net',
    'googlesyndication.com',
]
tech_pattern_starts = [
    'api',
    'proxy',
    'log',
    'static',
    'counter',
    'sync.',
    's.',
    'a.',
    'c.',
    'pixel.',
    'v1.',
    'ssp.',
    'img.',
    'rtb.',
    'code.',
    'cm.',
    't.',
    'app.',
    'grs.',
    'analytics.',
    'match.',
    'adservice.',
    'data.',
    'd.',
    'mc.',
    'track.',
    'assets',
    'st.',
    'js',
    'connect.',
    'media.',
    'pagead2.',
    'dl.',
    'ajax.',
    'content.',
    'i.',
    'tracking.',
    'graph.',
    'banners.',
    'widget.',
    'abtest.',
    'strm.yandex.ru',
    'yabs.yandex.ru',
    'push.yandex.ru',
    'bs.yandex.ru',
    'statistics.',
    'tags.',
    'cs',
    'adx',
    'img',
    'image',
    'ads',
    'ct.',
    'pics.',
    'clk.',
    'notify.',
    'data',
    'ocsp.',
    'files.',
    'dl-',
    'token.',
    'graphql.',
    'pushserver',
    'balancer.',
    'go.',
    'informer.',
    'clck.',
    'clicks.',
    'click.',
    'target.',
    'xray.',
    'tiles.',
    'gridserver.',
    'metrika.',
    'ntp.',
    'fronterr.',
    'lib.',
    'tracker',
    'appgateway',
    'frontend.',
    'mfa.',
    'gate.',
    'edge.',
    'chat.',
    'config.',
    'amp.',
    'widgets.',
    'dev.',
    'admin.',
    'health.',
    'callback.',
    'post.',
    'xxx-files',
    'cluster.',
    'ext.',
    'file.',
    'links.',
    'metrics',
]

tech_patterns_ends = [
    '.local',
    'googleapis.com',
    'googleusercontent.com',
    'vkuser.net',
    '.akamai',
    '.link',
    '.googleadservices',
    '.googleadserv',
]

non_tech_pattern_starts = set([
    'www.',
    'm.',
    "maps",
    "video",
    "online",
    "news",
    "forum",
    "berezniki",
    "mobile",
    "mail",
    'web.',
    'pda.',
    'wap.',
])


def predict_baseline(s):
    # non tech start patterns
    if any(s.startswith(p) for p in non_tech_pattern_starts):
        return False
    
    # ip adress = tech
    if re_is_ip.search(s) is not None:
        return True

    # if host is not exist = tech
    if not (validators.domain(s) is True):
        return True
    

    # if is not usual domain ending = tech
    if not any(s.endswith(p) for p in domain_root_zones):
        return True

    # tech hosts
    if any(p in s for p in www_ignore_pattern):
        return True

    # tech endings
    if any(s.endswith(p) for p in tech_patterns_ends):
        return True

    # non tech hosts 
    if s in good_hosts:
        return False
    if s in good_hosts_v2:
        return False
    if s in good_hosts_v3:
        return False
    if s in rambler:
        return False

    # TECH:
    # tech contains patterns
    return any(p in s for p in tech_patterns_contains) or (
            # len parts of host > 3 and wo 'www', 'm.'
            len(s.split('.')) > 3 and not s.startswith('www.') and not s.startswith('m.')
    # tech starts
    ) or any(s.startswith(p) for p in tech_pattern_starts) or (
                    # len first part > 10 and len all parts >= 3
                   len(s.split('.')[0]) > 10 and len(s.split('.')) >= 3
            # has m + number
           ) or re_m.search(s) is not None or (
                    # has 'google.com' but not for user
                   s.endswith('google.com') and s != 'www.google.com'
           ) or ( # 2 parts, len >25, 2 numbers together, has '-'
                   len(s.split('.')) == 2 and len(s) > 25 and re_2numbers.search(s) is not None and '-' in s
           ) or ( # >2 parts, has 'api'
                   len(s.split('.')) > 2 and 'api' in s.split('.')[0]
           ) or ( # >2 parts, has 'node'
                   len(s.split('.')) > 2 and 'node' in s.split('.')[0]
           ) or ( # >2 parts, start with 's'
                   len(s.split('.')) > 2 and s.split('.')[0].startswith('s')
           ) or ( # >2 parts, has letter + number
                   len(s.split('.')) > 2 and re_dynamic_prefix.search(s) is not None
           ) or ( # >2 parts, has 'cache'
                   len(s.split('.')) > 2 and re_cache_prefix.search(s) is not None
           ) or ( # >2 parts, has numbers in first part
                   len(s.split('.')) > 2 and re_digits_only.search(s.split('.')[0]) is not None
           ) or ( # >2 parts, len first part = 1, and it's not 'm' (like mobile)
                   len(s.split('.')) > 2 and len(s.split('.')[0]) == 1 and s.split('.')[0] != 'm'
           ) or ( # >2 parts, len first part = 2, and it's not lang abb
                   len(s.split('.')) > 2 and len(s.split('.')[0]) == 2 and s.split('.')[0] not in {
               'ru', 'en', 'de', 'us'
           } 
           ) or ( # >2 parts, has 'infra' + number in first part
                   len(s.split('.')) > 2 and re_infra_prefix.search(s.split('.')[0]) is not None
           ) or ( # >2 parts, has 'auth' + number in first part
                   len(s.split('.')) > 2 and 'auth' in s.split('.')[0]
           ) or ( # >2 parts, has ... in first part
                   len(s.split('.')) > 2 and s.split('.')[0] in {
               'us-east-2',
               'us-east-1',
               'us-west-1',
               'us-west-2',
               'af-south-1',
               'ap-east-1',
               'ap-south-1',
               'ap-northeast-3',
               'ap-northeast-2',
               'ap-southeast-1',
               'ap-southeast-2',
               'ap-northeast-1',
               'ca-central-1',
               'eu-central-1',
               'eu-west-1',
               'eu-west-2',
               'eu-south-1',
               'eu-west-3',
               'eu-north-1',
               'me-south-1',
               'sa-east-1',
               'us-gov-east-1',
               'us-gov-west-1'
           }
           ) or ( # >2 parts, has 'counter' in first part
                   len(s.split('.')) > 2 and 'counter' in s.split('.')[0]
           ) or ( # >2 parts, has not eng words in first part
                   len(s.split('.')) > 2 and not is_contains_english_words(s.split('.')[0])
           ) or ( # has not eng words in all not ending parts
               not is_contains_english_words(
                   ''.join(s.split('.')[:-1])
               )
           )


In [5]:
hosts = df['host'].values.tolist()
is_tech = [
    predict_baseline(hosts[host])
    for host in trange(len(hosts))
]

  0%|          | 0/199944 [00:00<?, ?it/s]

In [6]:
df['is_tech'] = is_tech

In [7]:
df['is_tech'].sum() / len(df['is_tech'])  # 84% hosts is tech

0.8405403512983636

-----

# Model

In [8]:
import dill
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import eli5
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
X, y = df[['host']], df['is_tech'].values.astype(int).tolist()

In [10]:
# change because of metrics (users hosts more important)
y = pd.Series(y).map({
                         1: 0,
                         0: 1
                     })  # 1 - users, 0 - tech

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    X[['host']], y, test_size=0.33, random_state=42)

In [12]:
# 3 ngrams
def create_ngrams():
    vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3, 3))
    X_train_tfidf = vectorizer.fit_transform(X_train['host'].str.replace('.', ''))
    vectorizer_model = LogisticRegression(C=5e1, solver='liblinear', random_state=17, n_jobs=1)
    vectorizer_model.fit(X_train_tfidf, [int(y_i) for y_i in y_train])

    vectorizer_model_weights = eli5.formatters.as_dataframe.explain_weights_df(
        estimator=vectorizer_model,
        feature_names=list(vectorizer.get_feature_names()),
        top=(100, 10)
    )
    return vectorizer_model_weights.set_index('feature')['weight'].to_dict()

ngrams = create_ngrams()

In [13]:
def create_features(hosts, ngram):
    import pandas as pd
    import re

    df_features = pd.DataFrame()
    df_features['host'] = hosts
    df_features['url_len'] = df_features['host'].apply(lambda s: len(s))
    df_features['max_domain_level'] = df_features['host'].apply(lambda s: len(s.split('.')))
    df_features['max_domain_part_len'] = df_features['host'].apply(
        lambda s: max((len(s_i) for s_i in s.split('.'))))

    df_features['ngram_max'] = df_features['host'].apply(lambda s: max([
                                                                           ngrams[ngram]
                                                                           for ngram in ngrams.keys()
                                                                           if ngram in s
                                                                       ] + [0]))

    df_features['ngram_min'] = df_features['host'].apply(lambda s: min([
                                                                           ngrams[ngram]
                                                                           for ngram in ngrams.keys()
                                                                           if ngram in s
                                                                       ] + [0]))
    df_features['users_start'] = df_features['host'].apply(lambda s: (s.startswith('www.')) or 
                                                           (s.startswith('m.')))

    re_digit = re.compile('\D')
    df_features['digits_count'] = df_features['host'].apply(lambda s: len(re_digit.sub('', s)))

    del df_features['host']

    return df_features

In [14]:
X_train = create_features(X_train['host'].tolist(), ngrams)
X_test = create_features(X_test['host'].tolist(), ngrams)

In [21]:
model = CatBoostClassifier(random_state=0, verbose=0, max_depth=3, n_estimators=5)
booster = model.fit(X_train, y_train)
print(pd.Series(dict(zip(booster.feature_names_, booster.feature_importances_))).sort_values(ascending=False))
predicts = booster.predict(X_test)
print(f'precision={round(precision_score(y_test, predicts), 2)}')
print(f'recall={round(recall_score(y_test, predicts), 2)}')

digits_count           32.667387
max_domain_level       31.093086
url_len                11.334012
users_start            10.233795
ngram_min               5.532076
max_domain_part_len     4.684761
ngram_max               4.454882
dtype: float64
precision=0.93
recall=0.61


In [22]:
golden_url = ['yandex.ru', 'api.yandex.ru', 'cdn.vtb.ru', 'no-cdn.vtb.ru', 'rbc.ru', 'ya.ru']

booster.predict_proba(create_features(golden_url, ngrams))[:, 1]

array([0.92134499, 0.0854344 , 0.0854344 , 0.0854344 , 0.92134499,
       0.92134499])

# Validation

In [23]:
# hosts by hands
df_val_1 = pd.read_csv('validation_manual/df_non_tech_manual.csv')
df_val_1 = df_val_1[['host', 'is_tech_manual']]
df_val_1.loc[df_val_1['is_tech_manual'] == ' ', 'is_tech_manual'] = 1
df_val_1.dropna(inplace=True)
df_val_1['is_tech_manual'] = df_val_1['is_tech_manual'].astype('int')

df_val_2 = pd.read_csv('validation_manual/output_AM.csv')
df_val_2 = df_val_2[['host', 'is_tech_manual']]
df_val_2['is_tech_manual'] = df_val_2['is_tech_manual'].astype('int')

df_val = pd.concat([df_val_1, df_val_2])

In [24]:
X_val = df_val['host']
y_val = df_val['is_tech_manual'].map({
                                         1: 0,
                                         0: 1
                                     })

In [25]:
X_val = create_features(X_val.tolist(), ngrams)

In [26]:
predicts = booster.predict(X_val)
print(f'precision={round(precision_score(y_val, predicts), 2)}')
print(f'recall={round(recall_score(y_val, predicts), 2)}')

precision=0.73
recall=0.59


# Export model

In [27]:
class SharedPredictor:
    def __init__(self, booster, ngrams):
        self._booster = booster
        self._ngrams = ngrams

    def predict(self, hosts):
        return self._predict(self._booster, self._create_features(hosts, self._ngrams))

    @classmethod
    def _create_features(cls, hosts, ngrams):
        # copy features extract pipeline
        import pandas as pd
        import re

        df_features = pd.DataFrame()
        df_features['host'] = hosts
        df_features['url_len'] = df_features['host'].apply(lambda s: len(s))
        df_features['max_domain_level'] = df_features['host'].apply(lambda s: len(s.split('.')))
        df_features['max_domain_part_len'] = df_features['host'].apply(
            lambda s: max((len(s_i) for s_i in s.split('.'))))

        df_features['ngram_max'] = df_features['host'].apply(lambda s: max([
                                                                               ngrams[ngram]
                                                                               for ngram in ngrams.keys()
                                                                               if ngram in s
                                                                           ] + [0]))

        df_features['ngram_min'] = df_features['host'].apply(lambda s: min([
                                                                               ngrams[ngram]
                                                                               for ngram in ngrams.keys()
                                                                               if ngram in s
                                                                           ] + [0]))
        df_features['users_start'] = df_features['host'].apply(lambda s: (s.startswith('www.')) or 
                                                           (s.startswith('m.')))

        re_digit = re.compile('\D')
        df_features['digits_count'] = df_features['host'].apply(lambda s: len(re_digit.sub('', s)))

        del df_features['host']

        return df_features

    @classmethod
    def _predict(cls, booster, df_features):
        import shap

        russian_names = {
            'url_len': 'Длина хоста',
            'max_domain_level': 'Количество поддоменов',
            'max_domain_part_len': 'Максимальная длина поддомена',
            'ngram_max': 'Максимальный вес n-граммы',
            'ngram_min': 'Минимальный вес n-граммы',
            'digits_count': 'Число цифр в хосте',
            'users_start' : 'Начало с www. или с m.'
        }

        explainer = shap.Explainer(booster)
        predicted_proba = round(booster.predict_proba(df_features)[0][1], 2)
        shap_values = explainer(df_features)
        shap_feature_importance = dict(zip(
            [russian_names.get(n, n) for n in shap_values.feature_names],
            [
                round(x, 2)
                for x in shap_values.values[0].tolist()
            ]
        ))
        predicted_proba = float(predicted_proba)

        return {
            'predict': predicted_proba > 0.5,
            'predicted_proba': predicted_proba,
            'shap_feature_importance': shap_feature_importance
        }


In [28]:
booster_export = model.fit(create_features(X['host'].tolist(), ngrams), y)
predictor = SharedPredictor(booster_export, ngrams)
with open('data/model.bin', 'wb') as f:
    dill.dump(predictor, f)

In [29]:
# testing
predictor.predict(['api.yandex.ru']) # False = tech host

{'predict': False,
 'predicted_proba': 0.09,
 'shap_feature_importance': {'Длина хоста': 0.37,
  'Количество поддоменов': -0.23,
  'Максимальная длина поддомена': 0.0,
  'Максимальный вес n-граммы': -0.07,
  'Минимальный вес n-граммы': -0.94,
  'Начало с www. или с m.': -0.07,
  'Число цифр в хосте': 0.66}}

# check errors
Анализ ошибок модели, на основе которого можно далее создавать новые фичи для улучшения качества предсказания

In [30]:
import numpy as np

In [31]:
check = pd.DataFrame(np.vstack([y_val, pd.Series(predicts)])).T
check.columns = ['true', 'pred']

In [32]:
check = df_val.join(check)

In [34]:
check[check['true'] != check['pred']][['host', 'true', 'pred']]

Unnamed: 0,host,true,pred
4,mitdmp.whiteboxdigital.ru,1,0
4,mitdmp.whiteboxdigital.ru,1,0
7,dsp.e-contenta.com,1,0
7,dsp.e-contenta.com,1,0
8,mobile.yandexadexchange.net,1,0
...,...,...,...
183,m.vk.com,1,0
184,v-fall.net,1,0
190,lookaside.facebook.com,0,1
195,ice.360yield.com,0,1
