In [1]:
import pandas as pd
import re
import validators
import json
from tqdm import tqdm_notebook

In [2]:
df = pd.read_csv('data/', names=['host']).drop_duplicates()

In [3]:
with open('data/good_prefix.txt') as f:
    good_prefix = json.load(f)
    
with open('data/good_suffix.txt') as f:
    good_suffix = set(json.load(f))
    
with open('data/good_hosts.txt') as f:
    good_hosts = set(json.load(f))

with open('data/tlds-alpha-by-domain.txt') as f:
    domain_root_zones = f.readlines()
    domain_root_zones = [
        d[:-1].lower()
        for d in domain_root_zones
    ]

with open('data/all_english_words.json') as f:
    english_words = json.load(f)

def is_contains_english_words(s):
    s = s.lower()
    return any(
        w in s
        for w in english_words
    )


re_is_ip = re.compile('^((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$')
re_m = re.compile('^m[0-9]+')
re_2numbers = re.compile('\d{3}')
re_dynamic_prefix = re.compile('^[a-z]{1,2}[0-9]{1,}[\-\._]')
re_cache_prefix = re.compile('cache-?[0-9]{1,2}')
re_digits_only = re.compile('^[0-9]+$')
re_infra_prefix = re.compile('^infra-[0-9]+')

www_ignore_pattern = [
    'www.gstatic.com',
    'www.googleadservices.com',
    'www.googleapis.com',    
    'www.google-analytics.com',
    'www.googletagmanager.com',
    'www.tns-counter.ru',
    'www.googletagservices.com',
    'cdn'
]

tech_patterns_contains = [
    'api.',
    '.api',
    'cdn',
    'ad.',
    'ads.',
    'static.',
    's3.',
    'cache.',
    'stat.',
    'logs.',
    'log.',
    'stats.',
    'auth.',
    'sentry.',
    'script.',
    'storage.',
    '--',
    'an.yandex.ru',
    'app-measurement.com',    
    'tpc.googlesyndication.com',
    'tpc.googlesyndication.com',
    'favicon.yandex.net',
    'googlesyndication.com',
]
tech_pattern_starts = [
    'api',
    'proxy',
    'log',
    'static',
    'counter',
    'sync.',
    's.',
    'a.',
    'c.',
    'pixel.',
    'v1.',
    'ssp.',
    'img.',
    'rtb.',
    'code.',
    'cm.',
    't.',
    'app.',
    'grs.',
    'analytics.',
    'match.',
    'adservice.',
    'data.',
    'd.',
    'mc.',
    'track.',
    'assets',
    'st.',
    'js',
    'connect.',
    'media.',
    'pagead2.',
    'dl.',
    'ajax.',
    'content.',
    'i.',
    'tracking.',
    'graph.',
    'banners.',
    'widget.',
    'abtest.',
    'strm.yandex.ru',
    'yabs.yandex.ru',
    'push.yandex.ru',
    'bs.yandex.ru',
    'statistics.',
    'tags.',
    'cs',
    'adx',
    'img',
    'image',
    'ads',
    'ct.',
    'pics.',
    'clk.',
    'notify.',
    'data',
    'ocsp.',
    'files.',
    'dl-',
    'token.',
    'graphql.',
    'pushserver',
    'balancer.',
    'go.',
    'informer.',
    'clck.',
    'clicks.',
    'click.',
    'target.',
    'xray.',
    'tiles.',
    'gridserver.',
    'metrika.',
    'ntp.',
    'fronterr.',
    'lib.',
    'tracker',
    'appgateway',
    'frontend.',
    'mfa.',
    'gate.',
    'edge.',
    'chat.',
    'config.',
    'amp.',
    'widgets.',
    'dev.',
    'admin.',
    'health.',
    'callback.',
    'post.',
    'xxx-files',
    'cluster.',
    'ext.',
    'file.',
    'links.',
    'metrics',
]

tech_patterns_ends = [
    '.local',
    'googleapis.com',
    'googleusercontent.com',
    'vkuser.net',
    '.akamai',
    '.link',
    '.googleadservices',
    '.googleadserv',
]

non_tech_pattern_starts = set([
    'www.',
    'm.',
    "maps",
  "video",
  "online",
  "news",
  "forum",
  "berezniki",
  "mobile",
    "mail",
    'web.',
    'pda.',
    'wap.',
])


def predict_baseline(s):
    if re_is_ip.search(s) is not None:
        return True
    
    if not (validators.domain(s) is True):
        return True
    
    if not any(s.endswith(p) for p in domain_root_zones):
        return True
    
    if any(p in s for p in www_ignore_pattern):
        return True
    
    if any(s.endswith(p) for p in tech_patterns_ends):
        return True

    if any(s.startswith(p) for p in non_tech_pattern_starts):
        return False    
    
    if s in good_hosts:
        return True
    
    
    return any(p in s for p in tech_patterns_contains) or (
        len(s.split('.')) > 3 and not s.startswith('www.') and not s.startswith('m.')
    ) or any(s.startswith(p) for p in tech_pattern_starts) or (
        len(s.split('.')[0]) > 10 and len(s.split('.')) >= 3
    ) or re_m.search(s) is not None or (
        s.endswith('google.com') and s != 'www.google.com'
    ) or (
        len(s.split('.')) == 2 and len(s) > 25 and re_2numbers.search(s) is not None and '-' in s
    ) or (
        len(s.split('.')) > 2 and 'api' in s.split('.')[0]
    ) or (
        len(s.split('.')) > 2 and 'node' in s.split('.')[0]
    ) or (
        len(s.split('.')) > 2 and s.split('.')[0].startswith('s')
    ) or (
        len(s.split('.')) > 2 and re_dynamic_prefix.search(s) is not None
    ) or (
        len(s.split('.')) > 2 and re_cache_prefix.search(s) is not None
    ) or (
        len(s.split('.')) > 2 and re_digits_only.search(s.split('.')[0]) is not None
    ) or (
        len(s.split('.')) > 2 and len(s.split('.')[0]) == 1 and s.split('.')[0] != 'm'
    ) or (
        len(s.split('.')) > 2 and len(s.split('.')[0]) == 2 and s.split('.')[0] not in {
            'ru', 'en', 'de', 'us'
        }
    ) or (
        len(s.split('.')) > 2 and re_infra_prefix.search(s.split('.')[0]) is not None
    ) or (
        len(s.split('.')) > 2 and 'auth' in s.split('.')[0]
    ) or (
        len(s.split('.')) > 2 and s.split('.')[0] in {
            'us-east-2',
 'us-east-1',
 'us-west-1',
 'us-west-2',
 'af-south-1',
 'ap-east-1',
 'ap-south-1',
 'ap-northeast-3',
 'ap-northeast-2',
 'ap-southeast-1',
 'ap-southeast-2',
 'ap-northeast-1',
 'ca-central-1',
 'eu-central-1',
 'eu-west-1',
 'eu-west-2',
 'eu-south-1',
 'eu-west-3',
 'eu-north-1',
 'me-south-1',
 'sa-east-1',
 'us-gov-east-1',
 'us-gov-west-1'
        }
    ) or (
        len(s.split('.')) > 2 and 'counter' in s.split('.')[0]
    ) or (
        len(s.split('.')) > 2 and not is_contains_english_words(s.split('.')[0])
    ) or (
        not is_contains_english_words(
            ''.join(s.split('.')[:-1])
        )
    )


In [4]:
hosts = df['host'].values.tolist()
is_tech = [
    predict_baseline(host)
    for host in tqdm_notebook(hosts)
]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


  0%|          | 0/199944 [00:00<?, ?it/s]

In [6]:
df['is_tech'] = is_tech

In [7]:
df['is_tech'].sum() / len(df['is_tech']) # 0.8426744585074386

0.8432161005081423

In [8]:
df[~df['is_tech']].sample(300)

Unnamed: 0,host,is_tech
127066,www.github.com,False
144963,moefermerstvo.ru,False
441378,corona.lmao.ninja,False
187028,massage-cosmetology.ru,False
12457,maps.google.cf,False
...,...,...
47841,oxy.cloud,False
615747,24tule.ru,False
741962,art.ngfiles.com,False
187156,minzdrav.gov.by,False


In [16]:
df.to_csv('data/train.csv', index=False)