In [12]:
import pandas as pd
import re
import validators
import json
from tqdm import tqdm_notebook
import re
from typing import List


import pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('data/host.zip', names=['host']).drop_duplicates()

In [3]:
with open('data/good_prefix.txt') as f:
    good_prefix = json.load(f)
    
with open('data/good_suffix.txt') as f:
    good_suffix = set(json.load(f))
    
with open('data/good_hosts.txt') as f:
    good_hosts = set(json.load(f))

with open('data/tlds-alpha-by-domain.txt') as f:
    domain_root_zones = f.readlines()
    domain_root_zones = [
        d[:-1].lower()
        for d in domain_root_zones
    ]

with open('data/all_english_words.json') as f:
    english_words = json.load(f)

def is_contains_english_words(s):
    s = s.lower()
    return any(
        w in s
        for w in english_words
    )


re_is_ip = re.compile('^((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$')
re_m = re.compile('^m[0-9]+')
re_2numbers = re.compile('\d{3}')
re_dynamic_prefix = re.compile('^[a-z]{1,2}[0-9]{1,}[\-\._]')
re_cache_prefix = re.compile('cache-?[0-9]{1,2}')
re_digits_only = re.compile('^[0-9]+$')
re_infra_prefix = re.compile('^infra-[0-9]+')

www_ignore_pattern = [
    'www.gstatic.com',
    'www.googleadservices.com',
    'www.googleapis.com',    
    'www.google-analytics.com',
    'www.googletagmanager.com',
    'www.tns-counter.ru',
    'www.googletagservices.com',
    'cdn'
]

tech_patterns_contains = [
    'api.',
    '.api',
    'cdn',
    'ad.',
    'ads.',
    'static.',
    's3.',
    'cache.',
    'stat.',
    'logs.',
    'log.',
    'stats.',
    'auth.',
    'sentry.',
    'script.',
    'storage.',
    '--',
    'an.yandex.ru',
    'app-measurement.com',    
    'tpc.googlesyndication.com',
    'tpc.googlesyndication.com',
    'favicon.yandex.net',
    'googlesyndication.com',
    'analytics.'
]
tech_pattern_starts = [
    'api',
    'proxy',
    'log',
    'static',
    'counter',
    'sync.',
    's.',
    'a.',
    'c.',
    'pixel.',
    'v1.',
    'ssp.',
    'img.',
    'rtb.',
    'code.',
    'cm.',
    't.',
    'app.',
    'grs.',
    'analytics.',
    'match.',
    'adservice.',
    'data.',
    'd.',
    'mc.',
    'track.',
    'assets',
    'st.',
    'js',
    'connect.',
    'media.',
    'pagead2.',
    'dl.',
    'ajax.',
    'content.',
    'i.',
    'tracking.',
    'graph.',
    'banners.',
    'widget.',
    'abtest.',
    'strm.yandex.ru',
    'yabs.yandex.ru',
    'push.yandex.ru',
    'bs.yandex.ru',
    'statistics.',
    'tags.',
    'cs',
    'adx',
    'img',
    'image',
    'ads',
    'ct.',
    'pics.',
    'clk.',
    'notify.',
    'data',
    'ocsp.',
    'files.',
    'dl-',
    'token.',
    'graphql.',
    'pushserver',
    'balancer.',
    'go.',
    'informer.',
    'clck.',
    'clicks.',
    'click.',
    'target.',
    'xray.',
    'tiles.',
    'gridserver.',
    'metrika.',
    'ntp.',
    'fronterr.',
    'lib.',
    'tracker',
    'appgateway',
    'frontend.',
    'mfa.',
    'gate.',
    'edge.',
    'chat.',
    'config.',
    'amp.',
    'widgets.',
    'dev.',
    'admin.',
    'health.',
    'callback.',
    'post.',
    'xxx-files',
    'cluster.',
    'ext.',
    'file.',
    'links.',
    'metrics',
]

tech_patterns_ends = [
    '.local',
    'googleapis.com',
    'googleusercontent.com',
    'vkuser.net',
    '.akamai',
    '.link',
    '.googleadservices',
    '.googleadserv',
]

non_tech_pattern_starts = set([
    'www.',
    'm.',
    "maps",
  "video",
  "online",
  "news",
  "forum",
  "berezniki",
  "mobile",
    "mail",
    'web.',
    'pda.',
    'wap.',
])


def predict_baseline(s):
    if re_is_ip.search(s) is not None:
        return True
    
    if not (validators.domain(s) is True):
        return True
    
    if not any(s.endswith(p) for p in domain_root_zones):
        return True
    
    if any(p in s for p in www_ignore_pattern):
        return True
    
    if any(s.endswith(p) for p in tech_patterns_ends):
        return True

    if any(s.startswith(p) for p in non_tech_pattern_starts):
        return False    
    
    if s in good_hosts:
        return True
    
    
    return any(p in s for p in tech_patterns_contains) or (
        len(s.split('.')) > 3 and not s.startswith('www.') and not s.startswith('m.')
    ) or any(s.startswith(p) for p in tech_pattern_starts) or (
        len(s.split('.')[0]) > 10 and len(s.split('.')) >= 3
    ) or re_m.search(s) is not None or (
        s.endswith('google.com') and s != 'www.google.com'
    ) or (
        len(s.split('.')) == 2 and len(s) > 25 and re_2numbers.search(s) is not None and '-' in s
    ) or (
        len(s.split('.')) > 2 and 'api' in s.split('.')[0]
    ) or (
        len(s.split('.')) > 2 and 'node' in s.split('.')[0]
    ) or (
        len(s.split('.')) > 2 and s.split('.')[0].startswith('s')
    ) or (
        len(s.split('.')) > 2 and re_dynamic_prefix.search(s) is not None
    ) or (
        len(s.split('.')) > 2 and re_cache_prefix.search(s) is not None
    ) or (
        len(s.split('.')) > 2 and re_digits_only.search(s.split('.')[0]) is not None
    ) or (
        len(s.split('.')) > 2 and len(s.split('.')[0]) == 1 and s.split('.')[0] != 'm'
    ) or (
        len(s.split('.')) > 2 and len(s.split('.')[0]) == 2 and s.split('.')[0] not in {
            'ru', 'en', 'de', 'us'
        }
    ) or (
        len(s.split('.')) > 2 and re_infra_prefix.search(s.split('.')[0]) is not None
    ) or (
        len(s.split('.')) > 2 and 'auth' in s.split('.')[0]
    ) or (
        len(s.split('.')) > 2 and s.split('.')[0] in {
            'us-east-2',
 'us-east-1',
 'us-west-1',
 'us-west-2',
 'af-south-1',
 'ap-east-1',
 'ap-south-1',
 'ap-northeast-3',
 'ap-northeast-2',
 'ap-southeast-1',
 'ap-southeast-2',
 'ap-northeast-1',
 'ca-central-1',
 'eu-central-1',
 'eu-west-1',
 'eu-west-2',
 'eu-south-1',
 'eu-west-3',
 'eu-north-1',
 'me-south-1',
 'sa-east-1',
 'us-gov-east-1',
 'us-gov-west-1'
        }
    ) or (
        len(s.split('.')) > 2 and 'counter' in s.split('.')[0]
    ) or (
        len(s.split('.')) > 2 and not is_contains_english_words(s.split('.')[0])
    ) or (
        not is_contains_english_words(
            ''.join(s.split('.')[:-1])
        )
    )


In [4]:
hosts = df['host'].values.tolist()
is_tech = [
    predict_baseline(host)
    for host in tqdm_notebook(hosts)
]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for host in tqdm_notebook(hosts)


  0%|          | 0/199944 [00:00<?, ?it/s]

In [5]:
df['is_tech'] = is_tech

In [6]:
df['is_tech'].sum() / len(df['is_tech']) # 0.8426744585074386

0.8432761173128476

In [7]:
df[~df['is_tech']].sample(300)

Unnamed: 0,host,is_tech
146161,www.film.ru,False
66892,film.lordfilm-online.net,False
548765,yaporn.mobi,False
546230,www.poravkino.ru,False
171119,can.concilio.ru,False
...,...,...
776845,refpaftjaaey.top,False
608575,ada.baidu.com,False
635527,ugonavto.net,False
254022,orangeimage.ru,False


In [8]:
df.to_csv('data/train.csv', index=False)

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [27]:
#corpus = df['host'].to_list()

In [25]:
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3,3))

In [82]:
X_vec = vectorizer.fit_transform(df['host'].apply(lambda x: x.replace('.', '')))

In [60]:
X_vec.shape

(199944, 48737)

In [61]:
from sklearn.linear_model import LogisticRegression
import eli5

In [83]:
logit = LogisticRegression(C=5e1, solver='liblinear', random_state=17, n_jobs=-1)

In [84]:
%%time
logit.fit(X_vec, df['is_tech'].astype('int'))



Wall time: 9.31 s


LogisticRegression(C=50.0, n_jobs=-1, random_state=17, solver='liblinear')

In [70]:
# feat_weight = eli5.show_weights(estimator=logit, 
#                   feature_names= list(vectorizer.get_feature_names()),
#                  top=(50, 5))

In [99]:
feat_weight = eli5.formatters.as_dataframe.explain_weights_df(estimator=logit, 
                  feature_names= list(vectorizer.get_feature_names()),
                 top=(100, 10))

In [108]:
import json
with open('ngrams.json', 'w') as f:
    ngrams = feat_weight.set_index('feature')['weight'].to_dict()
    json.dump(ngrams, f)

In [127]:
#ngrams.keys()

In [147]:
from collections import defaultdict

In [161]:
import numpy as np

TypeError: _amax_dispatcher() missing 1 required positional argument: 'a'

In [155]:
# d = {}
# for i in ngrams.keys():
#     for j in df['host'].values:
#         if i in j:
#             if j not in d.keys():
#                 d[j] = []
#             d[j].append(ngrams[i])

In [167]:
X, y = df[['host']], df['is_tech'].values.astype(int).tolist()
X_train, X_test, y_train, y_test = train_test_split(X[['host']], y, test_size=0.33, random_state=42)

In [168]:
def create_features(hosts: List[str]):
        import pandas as pd
        import re

        df_features = pd.DataFrame()
        df_features['host'] = hosts
        df_features['url_len'] = df_features['host'].apply(lambda s: len(s))
        df_features['max_domain_level'] = df_features['host'].apply(lambda s: len(s.split('.')))
        df_features['max_domain_part_len'] = df_features['host'].apply(
            lambda s: max((len(s_i) for s_i in s.split('.'))))
        
        df_features['ngram_max'] = df_features['host'].apply(lambda s: max([
            ngrams[ngram]
            for ngram in ngrams.keys()
            if ngram in s
        ] + [0]))

        df_features['ngram_min'] = df_features['host'].apply(lambda s: min([
            ngrams[ngram]
            for ngram in ngrams.keys()
            if ngram in s
        ] + [0]))

        re_digit = re.compile('\D')
        df_features['digits_count'] = df_features['host'].apply(lambda s: len(re_digit.sub('', s)))

        del df_features['host']
        return df_features

In [169]:
X = create_features(X['host'].tolist())
X_train = create_features(X_train['host'].tolist())
X_test = create_features(X_test['host'].tolist())

model = CatBoostClassifier(random_state=0, verbose=0)
booster = model.fit(X_train, y_train)
print(pd.Series(dict(zip(booster.feature_names_, booster.feature_importances_))).sort_values(ascending=False))
predicts = booster.predict(X_test)
print(f'precision={round(precision_score(y_test, predicts), 2)}')
print(f'recall={round(recall_score(y_test, predicts), 2)}')

ngram_max              26.068791
digits_count           21.281518
max_domain_level       18.110962
max_domain_part_len    17.346728
url_len                12.674312
ngram_min               4.517689
dtype: float64
precision=0.97
recall=0.97


In [195]:
golden_url = ['yandex.ru', 'api.yandex.ru', 'cdn.vtb.ru', 'no-cdn.vtb.ru', 'ssssexxx131231222221.com']

booster.predict_proba(create_features(golden_url))[:, 1]

array([0.05048001, 0.99499176, 0.99985821, 0.99966685, 0.79381322])