In [24]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

Use basic probability to identify anomalous request methods. You will want to make sure the text is normalized in order to reduce the noise.

In [25]:
# Names for the seven log fields that are loaded (see usecols below).
colnames = ['ip', 'timestamp', 'request_method', 'status', 'size',
            'destination', 'request_agent']

# Read the access log. The separator matches a whitespace character that is
# outside double quotes and outside square brackets, so each quoted or
# bracketed field stays in a single column. Fields 1 and 2 are not loaded,
# and the quoted placeholder "-" is read as NaN.
df_orig = pd.read_csv('http://python.zach.lol/access.log',
                      engine='python',
                      header=None,
                      index_col=False,
                      names=colnames,
                      sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
                      na_values='"-"',
                      usecols=[0, 3, 4, 5, 6, 7, 8])

# Four hand-written rows that are added to the downloaded log.
# NOTE(review): presumably injected as known anomalies (unusual IPs, status
# codes and sizes) -- confirm intent.
new = pd.DataFrame([["95.31.18.119", "[21/Apr/2019:10:02:41+0000]",
                     "GET /api/v1/items/HTTP/1.1", 200, 1153005, np.nan,
                     "python-requests/2.21.0"],
                    ["95.31.16.121", "[17/Apr/2019:19:36:41+0000]",
                     "GET /api/v1/sales?page=79/HTTP/1.1", 301, 1005, np.nan,
                     "python-requests/2.21.0"],
                    ["97.105.15.120", "[18/Apr/2019:19:42:41+0000]",
                     "GET /api/v1/sales?page=79/HTTP/1.1", 301, 2560, np.nan,
                     "python-requests/2.21.0"],
                    ["97.105.19.58", "[19/Apr/2019:19:42:41+0000]",
                     "GET /api/v1/sales?page=79/HTTP/1.1", 200, 2056327, np.nan,
                     "python-requests/2.21.0"]], columns=colnames)

# DataFrame.append was removed in pandas 2.0; pd.concat produces the same
# frame (original row labels are kept, exactly as append did).
df = pd.concat([df_orig, new])

In [26]:
# Parse the timestamp strings and make them the index.
# The guard keeps the cell re-runnable: once 'timestamp' has been moved to
# the index it is no longer a column and there is nothing left to parse.
if 'timestamp' in df.columns:
    # Remove the surrounding square brackets.
    stamps = df['timestamp'].str.replace(r'(\[|\])', '', regex=True)
    # Turn only the first ':' (the one between the date and the time) into a
    # space so the string can be parsed. regex=False is explicit because the
    # default for `regex` changed from True to False in pandas 2.0.
    stamps = stamps.str.replace(':', ' ', n=1, regex=False)
    df['timestamp'] = pd.to_datetime(stamps)
    df = df.set_index('timestamp')

In [32]:
# Clean up text: remove the literal double-quote characters from the
# string columns.
quoted_columns = ('request_method', 'request_agent', 'destination')
for name in quoted_columns:
    unquoted = df[name].str.replace('"', '')
    df[name] = unquoted

# Remove any "?page=<digits>" query string from the request text so that
# requests differing only by page number read the same.
page_query = r'\?page=[0-9]+'
df['request_method'] = df['request_method'].str.replace(page_query, '', regex=True)

df.head()

Unnamed: 0_level_0,ip,request_method,status,size,destination,request_agent,size_mb
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-04-16 19:34:42+00:00,97.105.19.58,GET /api/v1/sales HTTP/1.1,200,512495,,python-requests/2.21.0,0.488753
2019-04-16 19:34:42+00:00,97.105.19.58,GET /api/v1/items HTTP/1.1,200,3561,,python-requests/2.21.0,0.003396
2019-04-16 19:34:44+00:00,97.105.19.58,GET /api/v1/sales HTTP/1.1,200,510103,,python-requests/2.21.0,0.486472
2019-04-16 19:34:46+00:00,97.105.19.58,GET /api/v1/sales HTTP/1.1,200,510003,,python-requests/2.21.0,0.486377
2019-04-16 19:34:48+00:00,97.105.19.58,GET /api/v1/sales HTTP/1.1,200,511963,,python-requests/2.21.0,0.488246


In [33]:
# Scale `size` down by 1024 twice into `size_mb`, using vectorised column
# arithmetic instead of a Python-level loop over the rows.
# NOTE(review): assumes `size` is a byte count -- confirm.
df['size_mb'] = df['size'] / 1024 / 1024

In [34]:
# Preview the first rows, now including the size_mb column.
df.head()

Unnamed: 0_level_0,ip,request_method,status,size,destination,request_agent,size_mb
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-04-16 19:34:42+00:00,97.105.19.58,GET /api/v1/sales HTTP/1.1,200,512495,,python-requests/2.21.0,0.488753
2019-04-16 19:34:42+00:00,97.105.19.58,GET /api/v1/items HTTP/1.1,200,3561,,python-requests/2.21.0,0.003396
2019-04-16 19:34:44+00:00,97.105.19.58,GET /api/v1/sales HTTP/1.1,200,510103,,python-requests/2.21.0,0.486472
2019-04-16 19:34:46+00:00,97.105.19.58,GET /api/v1/sales HTTP/1.1,200,510003,,python-requests/2.21.0,0.486377
2019-04-16 19:34:48+00:00,97.105.19.58,GET /api/v1/sales HTTP/1.1,200,511963,,python-requests/2.21.0,0.488246


In [35]:
# NOTE(review): `standard` is not referenced in the cells visible here.
standard = "GET"

# One-column frame holding only the request strings, on the same index as df.
request = pd.DataFrame(df['request_method'])

In [36]:
# Preview the one-column request frame.
request.head()

Unnamed: 0_level_0,request_method
timestamp,Unnamed: 1_level_1
2019-04-16 19:34:42+00:00,GET /api/v1/sales HTTP/1.1
2019-04-16 19:34:42+00:00,GET /api/v1/items HTTP/1.1
2019-04-16 19:34:44+00:00,GET /api/v1/sales HTTP/1.1
2019-04-16 19:34:46+00:00,GET /api/v1/sales HTTP/1.1
2019-04-16 19:34:48+00:00,GET /api/v1/sales HTTP/1.1


In [37]:
# Distinct request strings, in order of first appearance, as a plain list.
distinct_requests = request['request_method'].unique()
all_request = [*distinct_requests]

In [38]:
# Look at the first distinct request string.
all_request[0]

'GET /api/v1/sales HTTP/1.1'

In [39]:
# Show every distinct request string in df, in order of first appearance.
[*df['request_method'].unique()]

['GET /api/v1/sales HTTP/1.1',
 'GET /api/v1/items HTTP/1.1',
 'GET /api/v1/stores HTTP/1.1',
 'GET / HTTP/1.1',
 'GET /documentation HTTP/1.1',
 'GET /api/V1/HiZach! HTTP/1.1',
 'GET /favicon.ico HTTP/1.1',
 'GET /api/v1/items/next_page HTTP/1.1',
 'GET /api/v1/ HTTP/1.1',
 'GET /api/v1//api/v1/items HTTP/1.1',
 'GET /api/v1//api/v1/items/next_page HTTP/1.1',
 'GET /api/v1items HTTP/1.1',
 'GET /api/v1 HTTP/1.1',
 'GET /api/v1/items/api/v1/items HTTP/1.1',
 'GET /api/v1/helloclass! HTTP/1.1',
 'GET /api/v1/I_DIDNT_DO_IT!!!! HTTP/1.1',
 'GET /api/v1/itemsitems HTTP/1.1',
 'GET /api/v1/items&page=0 HTTP/1.1',
 'GET /api/v1/sales/ HTTP/1.1',
 'GET /api/v1/store HTTP/1.1',
 'GET /api/v1/items/HTTP/1.1',
 'GET /api/v1/sales/HTTP/1.1']

In [40]:
# Preview the request frame again.
# NOTE(review): nothing has modified `request` since it was last displayed,
# so this cell repeats the earlier preview.
request.head()

Unnamed: 0_level_0,request_method
timestamp,Unnamed: 1_level_1
2019-04-16 19:34:42+00:00,GET /api/v1/sales HTTP/1.1
2019-04-16 19:34:42+00:00,GET /api/v1/items HTTP/1.1
2019-04-16 19:34:44+00:00,GET /api/v1/sales HTTP/1.1
2019-04-16 19:34:46+00:00,GET /api/v1/sales HTTP/1.1
2019-04-16 19:34:48+00:00,GET /api/v1/sales HTTP/1.1


In [41]:
# NOTE(review): whitespace stripping is left disabled; remove this cell if it is not needed.
# request.request_method = request.request_method.str.strip()

In [42]:
# Preview the request frame; the strip step in the preceding cell is
# commented out, so the contents are unchanged.
request.head()

Unnamed: 0_level_0,request_method
timestamp,Unnamed: 1_level_1
2019-04-16 19:34:42+00:00,GET /api/v1/sales HTTP/1.1
2019-04-16 19:34:42+00:00,GET /api/v1/items HTTP/1.1
2019-04-16 19:34:44+00:00,GET /api/v1/sales HTTP/1.1
2019-04-16 19:34:46+00:00,GET /api/v1/sales HTTP/1.1
2019-04-16 19:34:48+00:00,GET /api/v1/sales HTTP/1.1


In [43]:
# Split each request string on single spaces; the column then holds a list
# of tokens per row instead of a string.
# NOTE(review): this overwrites the column, so run it only once on a freshly
# built `request`.
tokens = request['request_method'].str.split(" ")
request['request_method'] = tokens

In [44]:
# The URL is the second token (index 1) of each split request.
# .str.get(1) gives NaN for a missing request or one with fewer than two
# tokens, where apply(lambda x: x[1]) would raise TypeError / IndexError.
request["url"] = request["request_method"].str.get(1)

In [51]:
# Count how often each URL was requested, most frequent first.
request['url'].value_counts()

/api/v1/sales                      12403
/api/v1/items                       1065
/api/v1/stores                       229
/                                    107
/documentation                       100
/favicon.ico                          26
/api/v1//api/v1/items                 11
/api/v1/items/api/v1/items             7
/api/v1/items/next_page                5
/api/v1/                               4
/api/v1/sales/                         3
/api/v1/sales/HTTP/1.1                 3
/api/v1/itemsitems                     3
/api/v1/store                          3
/api/v1items                           2
/api/v1/helloclass!                    1
/api/v1/items&page=0                   1
/api/v1                                1
/api/V1/HiZach!                        1
/api/v1/items/HTTP/1.1                 1
/api/v1//api/v1/items/next_page        1
/api/v1/I_DIDNT_DO_IT!!!!              1
Name: url, dtype: int64