sudo scp access.log.1 litinsky@192.168.10.221:/home/litinsky/dev/log_analysis/access.log.1

`pd.set_option('display.max_colwidth', 200)` widens the text shown per column (pandas default: 50).

**INIT**

In [164]:
import re
from pathlib import Path
from datetime import datetime

import pandas as pd

In [180]:
# Notebook configuration: every path and pattern a reader might want to tune.
LOG_DIR_PATH = '../nginx'
HASHES_PATH = 'hashes.txt'
PICKLE_PATH = 'logs_df.pkl'
REPORTS_DIR_PATH = 'reports'

# File-name patterns: names starting with "access.log", and names ending in ".gz".
LOG_FILE_NAME_PATTERN = r'^access\.log.*'
LOG_ARC_FILE_NAME_PATTERN = r'.+\.gz$'
# Raw string so that \S, \s and \[ reach the regex engine untouched instead of
# being (deprecated) invalid string escapes. Ten capture groups, one per field.
LOG_ENTRY_PATTERN = r'^(\S+?)\s(\S+?)\s(\S+?)\s(\[.+?\])\s(".+?")\s(.+?)\s(.+?)\s(".+?")\s(".+?")\s(".+?")$'

# NOTE(review): LOG_DIR_PATH is '../nginx' but this cell has always resolved
# 'nginx' relative to the working directory. The runtime value is kept as-is;
# confirm which location is correct and make the constant the single source.
log_dir = Path('nginx').resolve()
if not log_dir.is_dir():
    # Fail early: nothing downstream can work without the source logs.
    raise FileNotFoundError(f'No source log dir found {log_dir}')

# Derived absolute paths, built from the constants above rather than from
# duplicated string literals.
hashes_file = Path(HASHES_PATH).resolve()
pickle_file = Path(PICKLE_PATH).resolve()
reports_dir = Path(REPORTS_DIR_PATH).resolve()

In [178]:

    
# Absolute path of the hashes file, built from the HASHES_PATH constant.
hashes_file = Path(HASHES_PATH).resolve()

In [3]:
def load_log_raw(log_path=None) -> pd.DataFrame:
    """Load an access log as unparsed lines.

    Parameters
    ----------
    log_path : str or path-like, optional
        File to read. When omitted, the notebook-level ``LOG_PATH`` is used,
        which keeps existing zero-argument calls working.

    Returns
    -------
    pd.DataFrame
        One row per line of the file in a single ``raw`` column. Trailing
        newlines are kept, exactly as ``readlines()`` returns them.
    """
    if log_path is None:
        log_path = LOG_PATH

    # Explicit encoding so the result does not depend on the platform locale;
    # undecodable bytes are replaced so one bad request cannot abort the load.
    with open(log_path, 'r', encoding='utf-8', errors='replace') as f:
        lines = f.readlines()

    return pd.DataFrame(data=lines, columns=['raw'])

In [25]:
def load_log_parsed(log_path=None) -> pd.DataFrame:
    """Parse an access log into one column per field.

    Each line is matched on its own against the entry pattern. Lines that do
    not match are skipped without error, so the returned frame can have fewer
    rows than the file has lines.

    Parameters
    ----------
    log_path : str or path-like, optional
        File to read. When omitted, the notebook-level ``LOG_PATH`` is used,
        which keeps existing zero-argument calls working.

    Returns
    -------
    pd.DataFrame
        One row per matching line; every value is the captured text (str),
        including the surrounding brackets/quotes captured by the pattern.
    """
    if log_path is None:
        log_path = LOG_PATH

    # Raw string: \S, \s and \[ are regex escapes, not string escapes.
    entry_re = re.compile(
        r'^(\S+?)\s(\S+?)\s(\S+?)\s(\[.+?\])\s(".+?")\s(.+?)\s(.+?)\s(".+?")\s(".+?")\s(".+?")$'
    )
    columns = ['ip_from', 'domain', 'x1', 'timestamp', 'request', 'response_code', 'time', 'from', 'app', 'x4']

    rows = []
    with open(log_path, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            # Match line by line: over the whole text, the \s separators can
            # match '\n' and glue a truncated line onto the following one,
            # producing a bogus row.
            m = entry_re.match(line.rstrip('\n'))
            if m is not None:
                rows.append(m.groups())

    return pd.DataFrame(data=rows, columns=columns)

**GET DATA**

**raw**

In [26]:
# Read the log as unparsed lines (single 'raw' column).
log_raw = load_log_raw()

In [27]:
# Summary of the raw lines: total count, distinct lines, most frequent line.
log_raw.describe()

Unnamed: 0,raw
count,781102
unique,175319
top,"10.11.1.9 2242.lnsigo.mipt.ru - [18/Dec/2018:09:38:59 +0300] ""GET /botad558a10-8a7e-48ac-95a2-3cc9aa318dbd/getUpdates HTTP/1.1"" 200 26 ""-"" ""python-requests/2.16.5"" ""-""\n"
freq,9


**parsed**

In [157]:
# Read the same log again, this time split into one column per field.
log_parsed = load_log_parsed()

In [158]:
# Widen column display so long request strings are shown in full.
# set_option is global: it stays in effect for every later cell as well.
pd.set_option('display.max_colwidth', 200)
# Per-column summary: count, distinct values, most frequent value and its frequency.
log_parsed.describe()

Unnamed: 0,ip_from,domain,x1,timestamp,request,response_code,time,from,app,x4
count,781101,781101,781101,781101,781101,781101,781101,781101,781101,781101
unique,260,214,1,86210,620,8,341,16,106,7
top,10.11.1.9,2242.lnsigo.mipt.ru,-,[18/Dec/2018:12:05:06 +0300],"""GET /botad558a10-8a7e-48ac-95a2-3cc9aa318dbd/getUpdates HTTP/1.1""",200,26,"""-""","""python-requests/2.16.5""","""-"""
freq,775511,775522,781101,206,691953,776031,775513,780648,691953,781089


In [159]:
# Peek at the first five parsed rows.
log_parsed.head(5)

Unnamed: 0,ip_from,domain,x1,timestamp,request,response_code,time,from,app,x4
0,10.11.1.9,2242.lnsigo.mipt.ru,-,[18/Dec/2018:06:25:14 +0300],"""GET /botad558a10-8a7e-48ac-95a2-3cc9aa318dbd/getUpdates HTTP/1.1""",200,26,"""-""","""python-requests/2.16.5""","""-"""
1,10.11.1.9,2242.lnsigo.mipt.ru,-,[18/Dec/2018:06:25:14 +0300],"""GET /botad558a10-8a7e-48ac-95a2-3cc9aa318dbd/getUpdates HTTP/1.1""",200,26,"""-""","""python-requests/2.16.5""","""-"""
2,10.11.1.9,2242.lnsigo.mipt.ru,-,[18/Dec/2018:06:25:14 +0300],"""GET /bot105561dd-4850-45b4-94be-e767ad48c97a/getUpdates HTTP/1.1""",200,26,"""-""","""python-requests/2.18.4""","""-"""
3,10.11.1.9,2242.lnsigo.mipt.ru,-,[18/Dec/2018:06:25:14 +0300],"""GET /botad558a10-8a7e-48ac-95a2-3cc9aa318dbd/getUpdates HTTP/1.1""",200,26,"""-""","""python-requests/2.16.5""","""-"""
4,10.11.1.9,2242.lnsigo.mipt.ru,-,[18/Dec/2018:06:25:14 +0300],"""GET /botad558a10-8a7e-48ac-95a2-3cc9aa318dbd/getUpdates HTTP/1.1""",200,26,"""-""","""python-requests/2.16.5""","""-"""


**select download requests (files.deeppavlov.ai only; md5, root and robots.txt requests excluded)**

In [160]:
# Keep requests to files.deeppavlov.ai, minus md5 requests, hits on the site
# root and robots.txt lookups. Each condition is a named mask.

# Exact host comparison: the previous unanchored regex with unescaped dots
# also accepted any domain that merely started with a look-alike of this name.
is_files_host = log_parsed.domain == 'files.deeppavlov.ai'

# na=True marks rows with a missing request as excluded, the same outcome the
# former `== False` comparisons produced for them.
is_md5_request = log_parsed.request.str.match(r'^"GET /.+md5 HTTP/1\.1"$', na=True)
is_root_request = log_parsed.request.str.match(r'^"GET /\s+HTTP/1\.1"$', na=True)
is_robots_request = log_parsed.request.str.match(r'^"GET /robots\.txt HTTP/1\.1"$', na=True)

log_filtered = log_parsed[is_files_host & ~(is_md5_request | is_root_request | is_robots_request)]
log_filtered.describe()

Unnamed: 0,ip_from,domain,x1,timestamp,request,response_code,time,from,app,x4
count,735,735,735,735,735,735,735,735,735,735
unique,67,1,1,666,127,2,148,8,32,6
top,10.11.1.102,files.deeppavlov.ai,-,[18/Dec/2018:18:42:52 +0300],"""GET /faq/school/faq_school.csv HTTP/1.1""",404,29257,"""-""","""python-requests/2.19.1""","""-"""
freq,274,735,735,4,36,732,36,724,613,724


In [139]:
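# TODO: timestamp parsing is disabled. The format below does not match the log's
# timestamps (e.g. '[18/Dec/2018:06:25:14 +0300]'), which would need
# '[%d/%b/%Y:%H:%M:%S %z]' instead.
#log_filtered = log_filtered.transform({'timestamp': (lambda x: datetime.strptime(x, '%Y-%m-%dT%H:%M:%SZ'))})
#log_filtered.loc[:, 'timestamp'] = log_filtered.timestamp.apply(lambda x: 'a')
#log_filtered.describe()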

In [161]:
# Group the filtered requests by request line and ip_from.
log_group = log_filtered.groupby(['request', 'ip_from'])

In [162]:
# Non-null value count of every remaining column per (request, ip_from) pair.
log_group.count()

Unnamed: 0_level_0,Unnamed: 1_level_0,domain,x1,timestamp,response_code,time,from,app,x4
request,ip_from,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
"""GET /datasets/UD2.0_source/en.tar.gz HTTP/1.1""",10.11.1.102,3,3,3,3,3,3,3,3
"""GET /datasets/UD2.0_source/en.tar.gz HTTP/1.1""",10.11.1.2,1,1,1,1,1,1,1,1
"""GET /datasets/UD2.0_source/ru_syntagrus.tar.gz HTTP/1.1""",10.11.1.102,3,3,3,3,3,3,3,3
"""GET /datasets/UD2.0_source/ru_syntagrus.tar.gz HTTP/1.1""",10.11.1.2,1,1,1,1,1,1,1,1
"""GET /datasets/UD2.0_source/ru_syntagrus.tar.gz HTTP/1.1""",35.184.32.206,1,1,1,1,1,1,1,1
"""GET /datasets/ag_news_data.tar.gz HTTP/1.1""",10.11.1.101,1,1,1,1,1,1,1,1
"""GET /datasets/ag_news_data.tar.gz HTTP/1.1""",10.11.1.102,3,3,3,3,3,3,3,3
"""GET /datasets/ag_news_data.tar.gz HTTP/1.1""",10.11.1.2,1,1,1,1,1,1,1,1
"""GET /datasets/ag_news_data.tar.gz HTTP/1.1""",10.11.1.56,2,2,2,2,2,2,2,2
"""GET /datasets/dstc2_v2.tar.gz HTTP/1.1""",10.11.1.101,1,1,1,1,1,1,1,1
