In [1]:
import pandas as pd
import numpy as np
import requests
from tqdm import tqdm
tqdm.pandas()

In [2]:
# Load the raw host list. The file has no header row, so the column is
# named explicitly after reading.
hosts = pd.read_csv("data/host.csv", header=None)  # add nrows=100 for a quick trial run
hosts.columns = ["url"]

In [3]:
hosts.head()

Unnamed: 0,url
0,api.youla.io
1,favicon.yandex.net
2,w-74721.fp.kaspersky-labs.com
3,questtime.net
4,passport-authproxy.taxi.yandex.net


In [4]:
# Flat accumulator for every dot-separated label seen across all hosts.
words = []


def collect_words(url):
    """Append the dot-separated labels of ``url`` to the module-level ``words`` list."""
    words.extend(url.split("."))


# The apply is run purely for its side effect on ``words``.
hosts["url"].progress_apply(collect_words)
words = pd.Series(words)

# Thirty most frequent labels.
words.value_counts().head(30)

100%|████████████████████████████████████████████████████████████████████| 1000000/1000000 [00:01<00:00, 864609.92it/s]


com                  520391
ru                   172788
net                  153952
yandex                63034
userapi               46256
me                    45263
mycdn                 37291
googlevideo           32175
www                   24163
cdn                   23947
googleapis            23779
tiktokcdn             23406
api                   21447
googlesyndication     21416
apple                 20303
safeframe             19802
google                18399
img                   17551
st                    17017
strm                  16765
org                   16732
io                    16224
avito                 16071
mts                   14195
0                     11399
fbcdn                 10349
mail                   9982
push                   9884
match                  9682
akadns                 9592
dtype: int64

In [5]:
# 0/1 indicator: the host name begins with the literal prefix "api".
start_with_api = hosts["url"].str.startswith("api").astype(int)
start_with_api.sum()

25037

In [6]:
# 0/1 indicator: the literal substring "userapi" occurs anywhere in the host name.
has_userapi = hosts["url"].str.contains("userapi", regex=False).astype(int)
has_userapi.sum()

46268

In [7]:
# 0/1 indicator: the literal substring "googleapis" occurs anywhere in the host name.
has_googleapis = hosts["url"].str.contains("googleapis", regex=False).astype(int)
has_googleapis.sum()

23784

In [8]:
# Attach the three substring indicators to the host table as new columns.
hosts = hosts.assign(
    start_with_api=start_with_api,
    has_userapi=has_userapi,
    has_googleapis=has_googleapis,
)

In [9]:
# Character length of each host name, computed with the vectorised string
# accessor instead of a per-row Python lambda.
hosts["size_of_url"] = hosts["url"].str.len()

100%|████████████████████████████████████████████████████████████████████| 1000000/1000000 [00:01<00:00, 771132.96it/s]


In [10]:
# Number of dot-separated labels in each host name. Splitting on "." always
# yields (number of dots + 1) parts, so counting the dots is equivalent and
# avoids building a list per row. The dot is escaped because str.count takes
# a regular expression.
hosts["size_of_url_split"] = hosts["url"].str.count(r"\.") + 1

100%|████████████████████████████████████████████████████████████████████| 1000000/1000000 [00:01<00:00, 774071.77it/s]


In [11]:
hosts.head()

Unnamed: 0,url,start_with_api,has_userapi,has_googleapis,size_of_url,size_of_url_split
0,api.youla.io,1,0,0,12,3
1,favicon.yandex.net,0,0,0,18,3
2,w-74721.fp.kaspersky-labs.com,0,0,0,29,4
3,questtime.net,0,0,0,13,2
4,passport-authproxy.taxi.yandex.net,0,0,0,34,4


In [12]:
# Keep only the first 100 hosts, as an independent copy, before the per-host
# HTTP requests that follow. NOTE: this replaces the full table under the
# same name.
hosts = hosts.iloc[:100].copy()

In [13]:
# One independent, initially empty accumulator per response feature;
# process_url appends one entry to each of them per host.
(status_codes, content_types, is_json,
 is_redirect, size_of_cookies, encodings) = ([] for _ in range(6))

In [14]:
def _safe_feature(extract, fallback):
    """Return ``extract()``, or ``fallback`` if it raises an ordinary exception."""
    try:
        return extract()
    except Exception:
        return fallback


def _json_flag(response):
    """Return 1 if ``response.json()`` succeeds, otherwise 0."""
    try:
        response.json()
    except Exception:
        return 0
    return 1


def process_url(url, timeout=0.5):
    """Request ``http://<url>`` and record one row of response features.

    Exactly one value is appended to each of the module-level lists
    ``status_codes``, ``content_types``, ``is_json``, ``is_redirect``,
    ``size_of_cookies`` and ``encodings``, so the lists stay index-aligned
    with the order in which hosts are processed.

    Parameters
    ----------
    url : str
        Host name without a scheme; ``"http://"`` is prepended.
    timeout : float, optional
        Value passed as ``timeout`` to ``requests.get``. Defaults to 0.5,
        the value that was previously hard-coded.

    Returns
    -------
    None

    Notes
    -----
    If the request itself fails, the sentinel row
    ``(-1, "none", -1, -1, -1, "none")`` is recorded. If the request
    succeeds but a single feature cannot be read, only that feature falls
    back to its sentinel (``"none"`` for strings, ``-1`` for numbers, ``0``
    for the JSON flag).

    Only ``Exception`` subclasses are treated as failures.
    ``KeyboardInterrupt`` and ``SystemExit`` propagate, so a long crawl can
    be interrupted from the notebook.
    """
    try:
        response = requests.get("http://" + url, timeout=timeout)
    except Exception:
        # No usable response at all: record the all-sentinel row.
        row = (-1, "none", -1, -1, -1, "none")
    else:
        row = (
            _safe_feature(lambda: response.status_code, -1),
            # Keep only the media type, dropping parameters such as "; charset=...".
            _safe_feature(
                lambda: response.headers["Content-Type"].split(";")[0], "none"
            ),
            _json_flag(response),
            # NOTE(review): requests.get follows redirects by default, so this
            # describes the final response and is expected to be 0 almost
            # always; `response.history` may be the intended signal - confirm.
            _safe_feature(lambda: int(response.is_redirect), -1),
            _safe_feature(lambda: len(response.cookies), -1),
            # A missing (None) or empty encoding is stored as "none".
            _safe_feature(lambda: response.encoding or "none", "none"),
        )

    # Single append per list, in one place, so the lists cannot drift apart.
    status_codes.append(row[0])
    content_types.append(row[1])
    is_json.append(row[2])
    is_redirect.append(row[3])
    size_of_cookies.append(row[4])
    encodings.append(row[5])

In [15]:
# Start from empty accumulators so that re-running this cell cannot leave the
# feature lists longer than `hosts`, which would make the later column
# assignment fail with a length mismatch.
_feature_lists = (status_codes, content_types, is_json,
                  is_redirect, size_of_cookies, encodings)
for _feature_list in _feature_lists:
    _feature_list.clear()

# The apply is run purely for process_url's side effect on the lists above.
hosts["url"].progress_apply(process_url)

# Every host must have contributed exactly one entry to every list.
assert all(len(_feature_list) == len(hosts) for _feature_list in _feature_lists)
print("Done")

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:35<00:00,  2.82it/s]

Done





In [16]:
content_types[:10]

['text/html',
 'none',
 'none',
 'text/html',
 'application/json',
 'none',
 'none',
 'text/html',
 'text/plain',
 'none']

In [17]:
status_codes[:10]

[200, 404, -1, 200, 404, -1, -1, 404, 204, -1]

In [18]:
is_json[:10]

[0, 0, -1, 0, 1, -1, -1, 0, 0, -1]

In [19]:
is_redirect[:10]

[0, 0, -1, 0, 0, -1, -1, 0, 0, -1]

In [20]:
encodings[:10]

['UTF-8',
 'none',
 'none',
 'UTF-8',
 'utf-8',
 'none',
 'none',
 'UTF-8',
 'ISO-8859-1',
 'none']

In [21]:
size_of_cookies[:10]

[0, 0, -1, 1, 0, -1, -1, 0, 0, -1]

In [22]:
# Attach the crawled response features as new columns. Each list holds one
# entry per host, in row order.
hosts = hosts.assign(
    content_type=content_types,
    status_code=status_codes,
    is_json=is_json,
    is_redirect=is_redirect,
    encoding=encodings,
    size_of_cookies=size_of_cookies,
)

In [23]:
hosts.head()

Unnamed: 0,url,start_with_api,has_userapi,has_googleapis,size_of_url,size_of_url_split,content_type,status_code,is_json,is_redirect,encoding,size_of_cookies
0,api.youla.io,1,0,0,12,3,text/html,200,0,0,UTF-8,0
1,favicon.yandex.net,0,0,0,18,3,none,404,0,0,none,0
2,w-74721.fp.kaspersky-labs.com,0,0,0,29,4,none,-1,-1,-1,none,-1
3,questtime.net,0,0,0,13,2,text/html,200,0,0,UTF-8,1
4,passport-authproxy.taxi.yandex.net,0,0,0,34,4,application/json,404,1,0,utf-8,0
