In [1]:
import json
import re

import pandas as pd
import validators
from tqdm import tqdm_notebook
from tqdm.notebook import tqdm

# Dataset

In [2]:
df = pd.read_csv('data/train_v3.csv')  # sample produced by the spider: parsed sites

In [3]:
def _strip_www(host):
    """Return ``host`` without a leading ``www.``; any other ``www.`` occurrence is kept."""
    return host[4:] if host.startswith('www.') else host


with open('data/good_hosts.txt') as f:  # kaggle dataset
    good_hosts = set(json.load(f))

good_hosts_v2 = {_strip_www(h) for h in good_hosts}

with open('data/rambler.json') as f:  # nice host list
    rambler = json.load(f)

rambler_wo_www = {_strip_www(h) for h in rambler}
# Stored as a set: the collection is only probed with `in`, once per host.
rambler = set(rambler) | rambler_wo_www

with open('data/tlds-alpha-by-domain.txt') as f:  # domain endings
    # strip() instead of d[:-1]: a last line without a trailing newline no longer
    # loses a character, and blank lines no longer yield an empty pattern (an
    # empty string is a suffix of every host).
    # NOTE(review): '#' lines are skipped on the assumption that they are
    # comments/header lines of the TLD list — confirm against the data file.
    domain_root_zones = [
        line.strip().lower()
        for line in f
        if line.strip() and not line.lstrip().startswith('#')
    ]

with open('data/all_english_words.json') as f:  # english words
    english_words = json.load(f)

sites = pd.read_excel('data/sites.xls')  # nice host list
good_hosts_v3_sites_wo_www = {_strip_www(h) for h in sites['URL'].values}
# Original URLs plus their www-less variants, as a set for fast membership tests.
good_hosts_v3 = set(sites['URL'].values) | good_hosts_v3_sites_wo_www


def is_contains_english_words(s, words=None):
    """Tell whether any vocabulary word occurs as a substring of ``s``.

    Args:
        s: text to inspect; it is lower-cased before matching.
        words: optional iterable of words to look for. When omitted, the
            module-level ``english_words`` collection is used.

    Returns:
        True if at least one word is contained in the lower-cased ``s``,
        otherwise False (also False for an empty vocabulary).

    Note:
        Only ``s`` is lower-cased; vocabulary entries are compared as given.
    """
    if words is None:
        words = english_words
    lowered = s.lower()
    return any(word in lowered for word in words)


# Raw string literals keep the regex escapes (\. \d \-) intact without relying on
# Python passing unknown string escapes through, which emits a warning on
# current interpreters. The compiled patterns are unchanged.

# Dotted-quad IPv4 address, each octet in 0-255.
re_is_ip = re.compile(r'^((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$')
# Host starting with "m" followed by digits.
re_m = re.compile(r'^m[0-9]+')
# Three consecutive digits anywhere (despite the "2numbers" name).
re_2numbers = re.compile(r'\d{3}')
# One or two lowercase letters, then digits, then "-", "." or "_" at the start.
re_dynamic_prefix = re.compile(r'^[a-z]{1,2}[0-9]{1,}[\-\._]')
# "cache" optionally followed by "-" and then one or two digits, anywhere.
re_cache_prefix = re.compile(r'cache-?[0-9]{1,2}')
# String consisting of ASCII digits only.
re_digits_only = re.compile(r'^[0-9]+$')
# "infra-" followed by digits at the start.
re_infra_prefix = re.compile(r'^infra-[0-9]+')

# Patterns that mark a host as technical wherever they occur in it:
# predict_baseline tests each entry with `in`, so the bare 'cdn' entry matches
# any host containing those three letters, not only "www." hosts.
www_ignore_pattern = [
    'www.gstatic.com',
    'www.googleadservices.com',
    'www.googleapis.com',
    'www.google-analytics.com',
    'www.googletagmanager.com',
    'www.tns-counter.ru',
    'www.googletagservices.com',
    'cdn'
]

# Substrings that mark a host as technical when found anywhere in it
# (predict_baseline checks them with `in`). Each entry is listed once.
tech_patterns_contains = [
    'api.',
    '.api',
    'cdn',
    'ad.',
    'ads.',
    'static.',
    's3.',
    'cache.',
    'stat.',
    'logs.',
    'log.',
    'stats.',
    'auth.',
    'sentry.',
    'script.',
    'storage.',
    '--',
    'an.yandex.ru',
    'app-measurement.com',
    'tpc.googlesyndication.com',
    'favicon.yandex.net',
    'googlesyndication.com',
]
# Prefixes that mark a host as technical; predict_baseline applies them with
# str.startswith. Entries without a trailing dot (e.g. 'api', 'js', 'cs', 'data',
# 'img') match any host that merely begins with those letters, which also makes
# the dotted variants 'img.' and 'data.' redundant.
tech_pattern_starts = [
    'api',
    'proxy',
    'log',
    'static',
    'counter',
    'sync.',
    's.',
    'a.',
    'c.',
    'pixel.',
    'v1.',
    'ssp.',
    'img.',
    'rtb.',
    'code.',
    'cm.',
    't.',
    'app.',
    'grs.',
    'analytics.',
    'match.',
    'adservice.',
    'data.',
    'd.',
    'mc.',
    'track.',
    'assets',
    'st.',
    'js',
    'connect.',
    'media.',
    'pagead2.',
    'dl.',
    'ajax.',
    'content.',
    'i.',
    'tracking.',
    'graph.',
    'banners.',
    'widget.',
    'abtest.',
    'strm.yandex.ru',
    'yabs.yandex.ru',
    'push.yandex.ru',
    'bs.yandex.ru',
    'statistics.',
    'tags.',
    'cs',
    'adx',
    'img',
    'image',
    'ads',
    'ct.',
    'pics.',
    'clk.',
    'notify.',
    'data',
    'ocsp.',
    'files.',
    'dl-',
    'token.',
    'graphql.',
    'pushserver',
    'balancer.',
    'go.',
    'informer.',
    'clck.',
    'clicks.',
    'click.',
    'target.',
    'xray.',
    'tiles.',
    'gridserver.',
    'metrika.',
    'ntp.',
    'fronterr.',
    'lib.',
    'tracker',
    'appgateway',
    'frontend.',
    'mfa.',
    'gate.',
    'edge.',
    'chat.',
    'config.',
    'amp.',
    'widgets.',
    'dev.',
    'admin.',
    'health.',
    'callback.',
    'post.',
    'xxx-files',
    'cluster.',
    'ext.',
    'file.',
    'links.',
    'metrics',
]

# Suffixes that mark a host as technical; predict_baseline applies them with
# str.endswith after the TLD check.
# NOTE(review): entries such as '.local', '.akamai', '.googleadservices' and
# '.googleadserv' only matter if the TLD list accepts hosts with those endings —
# confirm against data/tlds-alpha-by-domain.txt.
tech_patterns_ends = [
    '.local',
    'googleapis.com',
    'googleusercontent.com',
    'vkuser.net',
    '.akamai',
    '.link',
    '.googleadservices',
    '.googleadserv',
]

# Host prefixes that identify a user-facing (non-technical) site; checked with
# str.startswith by predict_baseline.
non_tech_pattern_starts = {
    # dotted sub-domain labels
    'www.', 'm.', 'web.', 'pda.', 'wap.',
    # plain prefixes (match any host beginning with these letters)
    'maps', 'video', 'online', 'news', 'forum', 'berezniki', 'mobile', 'mail',
}


# First labels that are treated as technical when the host has more than two
# labels (region-style names such as 'eu-west-1').
_REGION_LABELS = frozenset({
    'us-east-2',
    'us-east-1',
    'us-west-1',
    'us-west-2',
    'af-south-1',
    'ap-east-1',
    'ap-south-1',
    'ap-northeast-3',
    'ap-northeast-2',
    'ap-southeast-1',
    'ap-southeast-2',
    'ap-northeast-1',
    'ca-central-1',
    'eu-central-1',
    'eu-west-1',
    'eu-west-2',
    'eu-south-1',
    'eu-west-3',
    'eu-north-1',
    'me-south-1',
    'sa-east-1',
    'us-gov-east-1',
    'us-gov-west-1',
})

# Two-letter first labels that are NOT treated as technical.
_ALLOWED_TWO_LETTER_LABELS = frozenset({'ru', 'en', 'de', 'us'})


def predict_baseline(s):
    """Rule-based labelling of a host name as technical or not.

    The checks run in a fixed order and the first one that fires decides:

    1. Immediately technical: IPv4 literals, strings rejected by
       ``validators.domain``, hosts whose last label is not in
       ``domain_root_zones``, hosts matching ``www_ignore_pattern`` or
       ``tech_patterns_ends``.
    2. Immediately non-technical: hosts starting with an entry of
       ``non_tech_pattern_starts`` or present in one of the known-good host
       collections (``good_hosts``, ``good_hosts_v2``, ``good_hosts_v3``,
       ``rambler``).
    3. Otherwise a series of heuristics on the host and its first label; if
       none fires, the host is technical exactly when the host without its
       last label contains no English word.

    Args:
        s: host name. It is compared as given (no lower-casing here).

    Returns:
        bool: True when the host is classified as technical.
    """
    # --- 1. definitely technical -------------------------------------------
    if re_is_ip.search(s) is not None:
        return True

    if validators.domain(s) is not True:
        return True

    # Compare the whole last label with the zone list. A plain endswith() has no
    # label boundary (e.g. "...xcom" would pass for "com") and an empty zone
    # entry would accept every host.
    if s.rsplit('.', 1)[-1] not in domain_root_zones:
        return True

    if any(p in s for p in www_ignore_pattern):
        return True

    if any(s.endswith(p) for p in tech_patterns_ends):
        return True

    # --- 2. definitely not technical ---------------------------------------
    if any(s.startswith(p) for p in non_tech_pattern_starts):
        return False

    if s in good_hosts or s in good_hosts_v2 or s in good_hosts_v3 or s in rambler:
        return False

    # --- 3. heuristics -------------------------------------------------------
    labels = s.split('.')  # computed once instead of once per rule
    depth = len(labels)
    first = labels[0]

    if any(p in s for p in tech_patterns_contains):
        return True
    if depth > 3 and not s.startswith('www.') and not s.startswith('m.'):
        return True
    if any(s.startswith(p) for p in tech_pattern_starts):
        return True
    if len(first) > 10 and depth >= 3:
        return True
    if re_m.search(s) is not None:
        return True
    if s.endswith('google.com') and s != 'www.google.com':
        return True
    # Long two-label host containing both a three-digit run and a hyphen.
    if depth == 2 and len(s) > 25 and re_2numbers.search(s) is not None and '-' in s:
        return True

    # Rules that only apply to hosts with a sub-domain (more than two labels).
    if depth > 2:
        if 'api' in first or 'node' in first or first.startswith('s'):
            return True
        if re_dynamic_prefix.search(s) is not None:
            return True
        if re_cache_prefix.search(s) is not None:
            return True
        if re_digits_only.search(first) is not None:
            return True
        if len(first) == 1 and first != 'm':
            return True
        if len(first) == 2 and first not in _ALLOWED_TWO_LETTER_LABELS:
            return True
        if re_infra_prefix.search(first) is not None:
            return True
        if 'auth' in first:
            return True
        if first in _REGION_LABELS:
            return True
        if 'counter' in first:
            return True
        # Most expensive check of the group, kept last as in the original order.
        if not is_contains_english_words(first):
            return True

    # Final fallback: no English word anywhere before the last label.
    return not is_contains_english_words(''.join(labels[:-1]))


In [4]:
# Label every crawled host with the rule-based baseline.
# `tqdm.notebook.tqdm` replaces the deprecated `tqdm.tqdm_notebook` alias.
hosts = df['host'].values.tolist()
is_tech = [
    predict_baseline(host)
    for host in tqdm(hosts)
]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for host in tqdm_notebook(hosts)


  0%|          | 0/199944 [00:00<?, ?it/s]

In [5]:
# Attach the rule-based labels as a new column (this mutates `df` in place).
df['is_tech'] = is_tech

In [6]:
df['is_tech'].mean()  # share of hosts labelled technical: 84% is tech

0.8425859240587364

-----

# Model

In [7]:

import dill
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import eli5
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
# Features: the single 'host' column. Target: the 'is_tech_v3' column as a list of ints.
# NOTE(review): this reads 'is_tech_v3', not the 'is_tech' column computed by
# predict_baseline earlier in this notebook — presumably 'is_tech_v3' already
# exists in data/train_v3.csv; confirm which label column is intended.
X, y = df[['host']], df['is_tech_v3'].values.astype(int).tolist()

In [9]:
# Flip the label so that the positive class is "users".
# The target is rebuilt from `df` instead of from the previous value of `y`, so
# re-running this cell cannot flip the labels back.
y = pd.Series(df['is_tech_v3'].values.astype(int).tolist()).map({
    1: 0,
    0: 1
})  # 1 - users, 0 - tech

In [11]:
# Hold out 33% of the hosts for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X[['host']],
    y,
    test_size=0.33,
    random_state=42,
)

In [12]:
def create_ngrams():
    """Learn weights for character 3-grams of the training hosts.

    Fits a TF-IDF character 3-gram vectorizer on ``X_train['host']`` with the
    dots removed, trains a logistic regression on ``y_train`` and asks eli5 for
    the strongest weights (``top=(100, 10)``).

    Reads the notebook-level ``X_train`` and ``y_train``; ``X_train`` must still
    contain the raw ``'host'`` column when this is called.

    Returns:
        dict mapping feature name (n-gram) to its logistic-regression weight.
    """
    vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3, 3))
    # regex=False: the dot is a literal character to strip, never a regex wildcard,
    # regardless of the pandas version's default for `regex`.
    hosts_without_dots = X_train['host'].str.replace('.', '', regex=False)
    X_train_tfidf = vectorizer.fit_transform(hosts_without_dots)

    vectorizer_model = LogisticRegression(C=5e1, solver='liblinear', random_state=17, n_jobs=1)
    vectorizer_model.fit(X_train_tfidf, [int(y_i) for y_i in y_train])

    # get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
    # and fall back for older releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = list(vectorizer.get_feature_names())

    vectorizer_model_weights = eli5.formatters.as_dataframe.explain_weights_df(
        estimator=vectorizer_model,
        feature_names=feature_names,
        top=(100, 10)
    )
    return vectorizer_model_weights.set_index('feature')['weight'].to_dict()


ngrams = create_ngrams()

In [13]:
def create_features(hosts, ngram):
    """Build the numeric feature frame for a collection of host names.

    Args:
        hosts: host names (strings), e.g. a list.
        ngram: mapping from n-gram (substring) to its weight. This argument is
            now actually used; previously the function silently read the
            notebook-level ``ngrams`` variable instead.

    Returns:
        pandas.DataFrame with one row per host and the columns, in this order:
        ``url_len`` (number of characters), ``max_domain_level`` (number of
        dot-separated labels), ``max_domain_part_len`` (length of the longest
        label), ``ngram_max`` / ``ngram_min`` (largest / smallest weight among
        the n-grams contained in the host, with 0 always included as a
        candidate) and ``digits_count`` (number of digit characters).
    """
    non_digit = re.compile(r'\D')

    def matched_weights(host):
        # Weights of every known n-gram occurring in the host, plus the 0 baseline.
        return [weight for gram, weight in ngram.items() if gram in host] + [0]

    df_features = pd.DataFrame()
    df_features['host'] = hosts
    host_col = df_features['host']

    df_features['url_len'] = host_col.apply(len)
    df_features['max_domain_level'] = host_col.apply(lambda s: len(s.split('.')))
    df_features['max_domain_part_len'] = host_col.apply(
        lambda s: max(len(part) for part in s.split('.')))

    # Scan the n-gram table once per host and reuse the result for max and min.
    weights = host_col.apply(matched_weights)
    df_features['ngram_max'] = weights.apply(max)
    df_features['ngram_min'] = weights.apply(min)

    # Dropping every non-digit leaves only the digits; their count is the feature.
    df_features['digits_count'] = host_col.apply(lambda s: len(non_digit.sub('', s)))

    del df_features['host']

    return df_features

In [14]:
# Replace the raw host frames with their numeric feature frames.
# NOTE(review): this overwrites X_train / X_test, so the cell is not re-runnable
# (the 'host' column is gone afterwards) and create_ngrams(), which reads
# X_train['host'], can no longer be re-run after it either.
X_train = create_features(X_train['host'].tolist(), ngrams)
X_test = create_features(X_test['host'].tolist(), ngrams)

In [15]:
# Small gradient-boosting model: 5 trees of depth 2, fixed seed, silent training.
model = CatBoostClassifier(random_state=0, verbose=0, max_depth=2, n_estimators=5)
booster = model.fit(X_train, y_train)

# Feature importances, most important first.
importances = pd.Series(dict(zip(booster.feature_names_, booster.feature_importances_)))
print(importances.sort_values(ascending=False))

# Hold-out quality.
predicts = booster.predict(X_test)
for metric_name, metric in (('precision', precision_score), ('recall', recall_score)):
    print(f'{metric_name}={round(metric(y_test, predicts), 2)}')

max_domain_level       76.847973
url_len                14.649828
digits_count            7.181749
max_domain_part_len     1.320449
ngram_min               0.000000
ngram_max               0.000000
dtype: float64
precision=0.92
recall=0.98


In [16]:
# Hand-picked hosts used as a quick sanity check of the fitted model.
golden_url = ['yandex.ru', 'api.yandex.ru', 'cdn.vtb.ru', 'no-cdn.vtb.ru', 'rbc.ru', 'ya.ru']

# Second column of predict_proba: probability of class 1.
booster.predict_proba(create_features(golden_url, ngrams))[:, 1]

array([0.9013329 , 0.03545867, 0.03595288, 0.03545867, 0.70261085,
       0.70261085])

# Validation

In [17]:
# Manually labelled hosts, file 1. A label consisting of a single space is set
# to 1, rows that still have missing values are dropped, labels become ints.
# `.copy()` and `.assign()` keep every step on an independent frame, so nothing
# is written through a column slice and nothing is mutated with inplace=True.
df_val_1 = pd.read_csv('validation_manual/df_non_tech_manual.csv')[['host', 'is_tech_manual']].copy()
df_val_1.loc[df_val_1['is_tech_manual'] == ' ', 'is_tech_manual'] = 1
df_val_1 = df_val_1.dropna().assign(
    is_tech_manual=lambda frame: frame['is_tech_manual'].astype('int')
)

# Manually labelled hosts, file 2: only the label dtype needs fixing.
df_val_2 = pd.read_csv('validation_manual/output_AM.csv')[['host', 'is_tech_manual']].assign(
    is_tech_manual=lambda frame: frame['is_tech_manual'].astype('int')
)

# Rows of both files stacked; the original row indexes are kept as they are.
df_val = pd.concat([df_val_1, df_val_2])

In [18]:
X_val = df_val['host']
# Flip the manual labels (1 -> 0, 0 -> 1); any other value maps to NaN.
y_val = df_val['is_tech_manual'].map({1: 0, 0: 1})

In [19]:
# Build the features from df_val directly rather than from the previous value of
# X_val, so the cell can be re-run after X_val has become a feature frame.
X_val = create_features(df_val['host'].tolist(), ngrams)

In [20]:
# Quality on the manually labelled validation hosts.
# NOTE(review): this reassigns `predicts`, which earlier held the hold-out predictions.
predicts = booster.predict(X_val)
print(f'precision={round(precision_score(y_val, predicts), 2)}')
print(f'recall={round(recall_score(y_val, predicts), 2)}')

precision=0.72
recall=0.39


# Export model

In [21]:
class SharedPredictor:
    """Self-contained predictor bundling a fitted model with its n-gram weights.

    ``predict`` turns host names into features, scores them with the model and
    explains the score with SHAP values.

    NOTE(review): the helper methods import their dependencies locally,
    presumably so that the object stays usable after being serialized and loaded
    in another process — confirm before moving those imports to module level.
    """

    def __init__(self, booster, ngrams):
        """Store the fitted model and the n-gram -> weight mapping."""
        self._booster = booster
        self._ngrams = ngrams

    def predict(self, hosts):
        """Score ``hosts`` and explain the result.

        Args:
            hosts: list of host names. Features are built for all of them, but
                only the FIRST host's probability and SHAP values are returned.

        Returns:
            dict with keys ``'predict'`` (bool), ``'predicted_proba'`` (float
            rounded to 2 decimals) and ``'shap_feature_importance'`` (dict of
            display name -> SHAP value rounded to 2 decimals).
        """
        return self._predict(self._booster, self._create_features(hosts, self._ngrams))

    @classmethod
    def _create_features(cls, hosts, ngrams):
        """Build the feature frame; a copy of the training feature pipeline.

        Columns, in order: ``url_len``, ``max_domain_level``,
        ``max_domain_part_len``, ``ngram_max``, ``ngram_min``, ``digits_count``.
        """
        # copy features extract pipeline
        import pandas as pd
        import re

        # Raw string: '\D' in a normal literal is an invalid escape sequence.
        non_digit = re.compile(r'\D')

        def matched_weights(host):
            # Weights of every known n-gram occurring in the host, plus the 0 baseline.
            return [weight for gram, weight in ngrams.items() if gram in host] + [0]

        df_features = pd.DataFrame()
        df_features['host'] = hosts
        host_col = df_features['host']

        df_features['url_len'] = host_col.apply(len)
        df_features['max_domain_level'] = host_col.apply(lambda s: len(s.split('.')))
        df_features['max_domain_part_len'] = host_col.apply(
            lambda s: max(len(part) for part in s.split('.')))

        # Scan the n-gram table once per host and reuse the result for max and min.
        weights = host_col.apply(matched_weights)
        df_features['ngram_max'] = weights.apply(max)
        df_features['ngram_min'] = weights.apply(min)

        # Dropping every non-digit leaves only the digits; their count is the feature.
        df_features['digits_count'] = host_col.apply(lambda s: len(non_digit.sub('', s)))

        del df_features['host']

        return df_features

    @classmethod
    def _predict(cls, booster, df_features):
        """Score the first row of ``df_features`` and attach its SHAP explanation."""
        import shap

        # Display names (Russian) for the feature columns; unknown names pass through.
        russian_names = {
            'url_len': 'Длина хоста',
            'max_domain_level': 'Количество поддоменов',
            'max_domain_part_len': 'Максимальная длина поддомена',
            'ngram_max': 'Максимальный вес 3х буквенной n-граммы',
            'ngram_min': 'Минимальный вес 3х буквенной n-граммы',
            'digits_count': 'Число цифр в хосте'
        }

        explainer = shap.Explainer(booster)
        # Probability of class 1 for the first row, rounded to 2 decimals.
        predicted_proba = float(round(booster.predict_proba(df_features)[0][1], 2))

        shap_values = explainer(df_features)
        display_names = [russian_names.get(n, n) for n in shap_values.feature_names]
        first_row_values = [round(x, 2) for x in shap_values.values[0].tolist()]
        shap_feature_importance = dict(zip(display_names, first_row_values))

        return {
            # The 0.5 threshold is applied to the already rounded probability.
            'predict': predicted_proba > 0.5,
            'predicted_proba': predicted_proba,
            'shap_feature_importance': shap_feature_importance
        }


In [22]:
# Refit on the full data set (train + test) and serialize the predictor with dill.
# NOTE(review): this calls fit() on the same `model` object that produced
# `booster`, so `booster` presumably refers to the refitted model afterwards —
# re-running the evaluation cells after this one would then score a model that
# has already seen the test data. Confirm and consider a separate estimator.
booster_export = model.fit(create_features(X['host'].tolist(), ngrams), y)
predictor = SharedPredictor(booster_export, ngrams)
with open('data/model.bin', 'wb') as f:
    dill.dump(predictor, f)

In [23]:
# Smoke test of the exported predictor on a single host.
predictor.predict(['api.yandex.ru'])

{'predict': False,
 'predicted_proba': 0.04,
 'shap_feature_importance': {'Длина хоста': 0.07,
  'Количество поддоменов': -0.59,
  'Максимальная длина поддомена': 0.0,
  'Максимальный вес 3х буквенной n-граммы': 0.0,
  'Минимальный вес 3х буквенной n-граммы': -0.05,
  'Число цифр в хосте': 0.12}}

# P.S. 
дальше, что не успели :(

In [28]:
# def create_ngrams_5():
#     vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(4, 5))
#     X_train_tfidf = vectorizer.fit_transform(X_train['host'].str.replace('.', ''))
#     vectorizer_model = LogisticRegression(C=5e1, solver='liblinear', random_state=17, n_jobs=1)
#     vectorizer_model.fit(X_train_tfidf, [int(y_i) for y_i in y_train])

#     vectorizer_model_weights = eli5.formatters.as_dataframe.explain_weights_df(
#         estimator=vectorizer_model, 
#         feature_names= list(vectorizer.get_feature_names()),
#         top=(100, 10)
#     )
#     return vectorizer_model_weights.set_index('feature')['weight'].to_dict()

# ngrams_5 = create_ngrams_5()

In [156]:
# def create_features(hosts, ngram):
#     import pandas as pd
#     import re
#     import validators

#     df_features = pd.DataFrame()
#     df_features['host'] = hosts
#     df_features['url_len'] = df_features['host'].apply(lambda s: len(s))
#     df_features['max_domain_level'] = df_features['host'].apply(lambda s: len(s.split('.')))
#     df_features['max_domain_part_len'] = df_features['host'].apply(
#         lambda s: max((len(s_i) for s_i in s.split('.'))))

#     df_features['ngram_max'] = df_features['host'].apply(lambda s: max([
#         ngrams[ngram]
#         for ngram in ngrams.keys()
#         if ngram in s
#     ] + [0]))

#     df_features['ngram_max_5'] = df_features['host'].apply(lambda s: max([
#         ngrams_5[ngram]
#         for ngram in ngrams_5.keys()
#         if ngram in s
#     ] + [0]))

#     df_features['ngram_min'] = df_features['host'].apply(lambda s: min([
#         ngrams[ngram]
#         for ngram in ngrams.keys()
#         if ngram in s
#     ] + [0]))

#     df_features['ngram_min_5'] = df_features['host'].apply(lambda s: min([
#         ngrams_5[ngram]
#         for ngram in ngrams_5.keys()
#         if ngram in s
#     ] + [0]))

#     re_digit = re.compile('\D')
#     df_features['digits_count'] = df_features['host'].apply(lambda s: len(re_digit.sub('', s)))

#     del df_features['host']

#     return df_features

In [166]:
# check = pd.concat([pd.DataFrame(y_test), pd.DataFrame(predicts)], axis=1)
# check.columns = ['test', 'pred']

In [167]:
# check = pd.concat([X_test.reset_index(drop=True), check], axis=1)

In [24]:
# check[check['test'] != check['pred']]