# Discrete Anomaly Detection

In [1]:
from __future__ import division
import itertools
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import pandas as pd
import math
from sklearn import metrics
from random import randint
from matplotlib import style
import seaborn as sns
%matplotlib inline

In [2]:
# Names for the seven fields kept from each access-log line (see `usecols`).
colnames = ['ip', 'timestamp', 'request_method', 'status', 'size',
            'destination', 'request_agent']

# The separator is a regex: split on whitespace that is neither inside a
# double-quoted string nor inside a [...] group, so the quoted request /
# user-agent fields and the bracketed timestamp each stay in one column.
df_orig = pd.read_csv(
    'http://python.zach.lol/access.log',
    engine='python',   # regex separators need the python parsing engine
    header=None,
    index_col=False,
    names=colnames,
    sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
    na_values='"-"',   # a quoted dash is read as a missing value
    usecols=[0, 3, 4, 5, 6, 7, 8],
)

# Hand-built records added to the real log (presumably planted as anomalies
# for the detection exercise -- TODO confirm intent).
new = pd.DataFrame(
    [
        ["95.31.18.119", "[21/Apr/2019:10:02:41+0000]",
         "GET /api/v1/items/HTTP/1.1", 200, 1153005, np.nan,
         "python-requests/2.21.0"],
        ["95.31.16.121", "[17/Apr/2019:19:36:41+0000]",
         "GET /api/v1/sales?page=79/HTTP/1.1", 301, 1005, np.nan,
         "python-requests/2.21.0"],
        ["97.105.15.120", "[18/Apr/2019:19:42:41+0000]",
         "GET /api/v1/sales?page=79/HTTP/1.1", 301, 2560, np.nan,
         "python-requests/2.21.0"],
        ["97.105.19.58", "[19/Apr/2019:19:42:41+0000]",
         "GET /api/v1/sales?page=79/HTTP/1.1", 200, 2056327, np.nan,
         "python-requests/2.21.0"],
    ],
    columns=colnames,
)

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
# Like append, it keeps each frame's own index, so the synthetic rows are
# labelled 0..3 again at the end of the combined frame.
df = pd.concat([df_orig, new])


In [3]:
# Preview the combined frame: the parsed log rows followed by the hand-built records.
df

Unnamed: 0,ip,timestamp,request_method,status,size,destination,request_agent
0,97.105.19.58,[16/Apr/2019:19:34:42 +0000],"""GET /api/v1/sales?page=81 HTTP/1.1""",200,512495,,"""python-requests/2.21.0"""
1,97.105.19.58,[16/Apr/2019:19:34:42 +0000],"""GET /api/v1/items HTTP/1.1""",200,3561,,"""python-requests/2.21.0"""
2,97.105.19.58,[16/Apr/2019:19:34:44 +0000],"""GET /api/v1/sales?page=82 HTTP/1.1""",200,510103,,"""python-requests/2.21.0"""
3,97.105.19.58,[16/Apr/2019:19:34:46 +0000],"""GET /api/v1/sales?page=83 HTTP/1.1""",200,510003,,"""python-requests/2.21.0"""
4,97.105.19.58,[16/Apr/2019:19:34:48 +0000],"""GET /api/v1/sales?page=84 HTTP/1.1""",200,511963,,"""python-requests/2.21.0"""
...,...,...,...,...,...,...,...
13973,97.105.19.58,[17/Apr/2019:12:55:14 +0000],"""GET /api/v1/sales?page=71 HTTP/1.1""",200,510166,,"""python-requests/2.21.0"""
0,95.31.18.119,[21/Apr/2019:10:02:41+0000],GET /api/v1/items/HTTP/1.1,200,1153005,,python-requests/2.21.0
1,95.31.16.121,[17/Apr/2019:19:36:41+0000],GET /api/v1/sales?page=79/HTTP/1.1,301,1005,,python-requests/2.21.0
2,97.105.15.120,[18/Apr/2019:19:42:41+0000],GET /api/v1/sales?page=79/HTTP/1.1,301,2560,,python-requests/2.21.0


In [4]:
# Strip the square brackets that wrap the raw log timestamps.
df.timestamp = df.timestamp.str.replace(r'(\[|\])', '', regex=True)
# Replace only the first ':' (the one between the date and the hour) with a
# space so pd.to_datetime can parse the value. `n` and `regex` are spelled out
# because the default for `regex` differs between pandas versions and a bare
# positional 1 is easy to misread.
df.timestamp = pd.to_datetime(
    df.timestamp.str.replace(':', ' ', n=1, regex=False)
)
# From here on the frame is indexed by request time.
df = df.set_index('timestamp')

In [5]:
# Remove the literal double quotes left around the quoted log fields.
# regex=False is stated explicitly (the pattern is a plain character) so the
# call does not depend on the pandas-version-specific default.
for col in ['request_method', 'request_agent', 'destination']:
    df[col] = df[col].str.replace('"', '', regex=False)

# Drop the "?page=<n>" query string so every page of an endpoint is counted
# as the same request.
df['request_method'] = df.request_method.str.replace(r'\?page=[0-9]+', '', regex=True)

df.head()

Unnamed: 0_level_0,ip,request_method,status,size,destination,request_agent
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-04-16 19:34:42+00:00,97.105.19.58,GET /api/v1/sales HTTP/1.1,200,512495,,python-requests/2.21.0
2019-04-16 19:34:42+00:00,97.105.19.58,GET /api/v1/items HTTP/1.1,200,3561,,python-requests/2.21.0
2019-04-16 19:34:44+00:00,97.105.19.58,GET /api/v1/sales HTTP/1.1,200,510103,,python-requests/2.21.0
2019-04-16 19:34:46+00:00,97.105.19.58,GET /api/v1/sales HTTP/1.1,200,510003,,python-requests/2.21.0
2019-04-16 19:34:48+00:00,97.105.19.58,GET /api/v1/sales HTTP/1.1,200,511963,,python-requests/2.21.0


In [6]:
# Response size converted from bytes to mebibytes. Vectorised column
# arithmetic replaces the per-row Python loop; the values are the same.
df['size_mb'] = df['size'] / 1024 / 1024

In [10]:
# Frequency table of source IPs: number of rows per IP and the share of all
# rows that each IP accounts for.
ip_counts = df.ip.value_counts(dropna=False)

# Name the columns explicitly with rename_axis / reset_index(name=...):
# the labels produced by value_counts().reset_index() are not the same in
# every pandas version, so renaming {'index': ..., 'ip': ...} is fragile.
ip_df = ip_counts.rename_axis('ip').reset_index(name='ip_count')

# Divide by the total the counts themselves add up to (missing IPs included,
# matching dropna=False above) so the probabilities always sum to 1.
ip_df2 = (ip_counts / ip_counts.sum()).rename_axis('ip').reset_index(name='ip_proba')

ip_df = ip_df.merge(ip_df2, on='ip')


# Look for the IPs whose rate is below 1%.
ip_df

Unnamed: 0,ip,ip_count,ip_proba
0,97.105.19.58,11999,0.85842
1,173.173.113.51,1059,0.075762
2,72.181.113.170,613,0.043855
3,72.181.105.81,246,0.017599
4,68.201.219.223,21,0.001502
5,24.26.242.9,21,0.001502
6,52.87.230.102,2,0.000143
7,35.175.171.137,2,0.000143
8,70.121.214.34,2,0.000143
9,54.145.52.184,1,7.2e-05


In [78]:
# Second data set: the curriculum access log. Note that this rebinds both
# `colnames` and `df_orig`, replacing the web-server log loaded earlier.
colnames = ['date', 'time', 'destination', 'unknown_1', 'unknown_2', 'ip']

# Parser options, unchanged from the access-log loader: whitespace-separated
# fields with quoted strings and [...] groups kept intact.
curriculum_read_options = dict(
    engine='python',
    header=None,
    index_col=False,
    names=colnames,
    sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
    na_values='"-"',
)

df_orig = pd.read_csv("curriculum-access.txt", **curriculum_read_options)

In [79]:
# Join the separate date and time text columns and parse them as one datetime.
stamp_text = df_orig["date"] + " " + df_orig["time"]
df_orig["time_stamp"] = pd.to_datetime(stamp_text)

In [80]:
# The raw text columns are redundant once `time_stamp` exists. Rebind instead
# of using inplace=True so the step reads as an explicit transformation.
df_orig = df_orig.drop(columns=["time", "date"])

In [81]:
# Working copy of the curriculum log in chronological order.
df = df_orig.sort_values("time_stamp", ascending=True)

In [83]:
# Store unknown_2 as float.
df = df.astype({"unknown_2": float})

In [84]:
# Inspect the time-sorted curriculum frame after the unknown_2 cast.
df

Unnamed: 0,destination,unknown_1,unknown_2,ip,time_stamp
0,/,1,8.0,98.106.20.62,2018-01-26 09:55:03
1,java-ii,1,8.0,98.106.20.62,2018-01-26 09:56:02
2,java-ii/object-oriented-programming,1,8.0,98.106.20.62,2018-01-26 09:56:05
3,slides/object_oriented_programming,1,8.0,98.106.20.62,2018-01-26 09:56:06
4,javascript-i/conditionals,2,22.0,98.106.20.62,2018-01-26 09:56:24
...,...,...,...,...,...
543591,java-ii/file-io,602,56.0,48.188.241.68,2020-05-15 15:01:40
543592,9-timeseries/3-prep,582,55.0,48.135.163.38,2020-05-15 15:01:54
543593,jquery/ajax/requests-and-responses,637,57.0,71.121.18.17,2020-05-15 15:03:37
543594,10-anomaly-detection/3-discrete-probabilistic-...,11,28.0,77.186.132.227,2020-05-15 15:05:14


In [86]:
# What are the two unknowns?

# unknown_1 is likely a device id: if I access the curriculum from both my phone and my computer,
# that would generate different ids.

In [113]:
# Number of missing values in each column.
df.isna().sum()

destination        1
unknown_1          0
unknown_2      27856
ip                 0
time_stamp         0
dtype: int64

In [131]:
# Count the non-null entries of every other column for each
# (unknown_1, unknown_2) pair, then preview the first 50 pairs.
pair_counts = df.groupby(["unknown_1", "unknown_2"]).count()
pair_counts.head(50)

Unnamed: 0_level_0,Unnamed: 1_level_0,destination,ip,time_stamp,ip_count,ip_proba
unknown_1,unknown_2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,8.0,1617,1617,1617,1617,1617
1,28.0,3820,3820,3820,3820,3820
2,22.0,1541,1541,1541,1541,1541
3,22.0,1562,1562,1562,1562,1562
4,22.0,692,692,692,692,692
5,22.0,1701,1701,1701,1701,1701
6,22.0,1408,1408,1408,1408,1408
7,22.0,1803,1803,1803,1803,1803
8,22.0,483,483,483,483,483
9,22.0,585,585,585,585,585


In [112]:
# Rows whose unknown_2 value is exactly 5.
is_five = df["unknown_2"] == 5
df.loc[is_five]

Unnamed: 0,destination,unknown_1,unknown_2,ip,time_stamp
535518,/,649,5.0,71.126.151.42,2020-05-06 19:06:32


In [62]:
# Frequency table of source IPs for the curriculum log: number of rows per IP
# and the share of all rows that each IP accounts for.
ip_counts = df.ip.value_counts(dropna=False)

# Name the columns explicitly with rename_axis / reset_index(name=...):
# the labels produced by value_counts().reset_index() are not the same in
# every pandas version, so renaming {'index': ..., 'ip': ...} is fragile.
ip_df = ip_counts.rename_axis('ip').reset_index(name='ip_count')

# Divide by the total the counts themselves add up to (missing IPs included,
# matching dropna=False above) so the probabilities always sum to 1.
ip_df2 = (ip_counts / ip_counts.sum()).rename_axis('ip').reset_index(name='ip_proba')

ip_df = ip_df.merge(ip_df2, on='ip')

In [63]:
# Per-IP row counts and their share of all rows.
ip_df

Unnamed: 0,ip,ip_count,ip_proba
0,98.106.20.59,284579,0.523512
1,98.106.20.62,61662,0.113434
2,193.172.118.211,9029,0.016610
3,13.107.209.195,4219,0.007761
4,173.125.68.94,2980,0.005482
...,...,...,...
3490,32.15.73.20,1,0.000002
3491,108.78.222.73,1,0.000002
3492,175.208.8.216,1,0.000002
3493,13.98.188.17,1,0.000002


In [126]:
# Attach each row's per-IP count and probability. No `on=` is given, so the
# join uses every column the two frames have in common.
df = pd.merge(df, ip_df)

In [None]:
# Our cohort spans ids 575 to 591 (presumably `unknown_1` values -- TODO confirm).

In [165]:
# Earliest 50 events for unknown_1 == 248. Sort first and truncate second, so
# the preview holds the chronologically first rows rather than whichever 50
# happen to come first in the frame.
df[df.unknown_1 == 248].sort_values(by="time_stamp").head(50)

Unnamed: 0,destination,unknown_1,unknown_2,ip,time_stamp,ip_count,ip_proba
67255,4.4_functions,248,28.0,193.172.118.211,2019-03-01 08:50:28,9029,0.01661
67256,4.3_control_structures,248,28.0,193.172.118.211,2019-03-01 08:50:30,9029,0.01661
67257,4.2_data_types_and_variables,248,28.0,193.172.118.211,2019-03-01 08:50:34,9029,0.01661
67258,4.3_control_structures,248,28.0,193.172.118.211,2019-03-01 08:50:37,9029,0.01661
67259,Intro_to_Regression_Module,248,28.0,193.172.118.211,2019-03-01 08:51:59,9029,0.01661
67262,3.0-mysql-overview,248,28.0,193.172.118.211,2019-03-01 10:51:23,9029,0.01661
67263,3.1-mysql-introduction,248,28.0,193.172.118.211,2019-03-01 10:52:27,9029,0.01661
67264,4.0_overview,248,28.0,193.172.118.211,2019-03-01 10:52:29,9029,0.01661
67265,4.1_introduction,248,28.0,193.172.118.211,2019-03-01 10:52:31,9029,0.01661
67266,4.2_data_types_and_variables,248,28.0,193.172.118.211,2019-03-01 10:52:33,9029,0.01661


In [164]:
# Earliest 50 events for unknown_1 == 575. Sort first and truncate second, so
# the preview holds the chronologically first rows rather than whichever 50
# happen to come first in the frame.
df[df.unknown_1 == 575].sort_values(by="time_stamp").head(50)

Unnamed: 0,destination,unknown_1,unknown_2,ip,time_stamp,ip_count,ip_proba
369343,login,575,55.0,98.106.20.59,2020-02-03 15:39:34,284579,0.523512
369348,login,575,55.0,98.106.20.59,2020-02-03 15:39:37,284579,0.523512
369350,/,575,55.0,98.106.20.59,2020-02-03 15:40:05,284579,0.523512
370042,1-fundamentals/1.1-intro-to-data-science,575,55.0,98.106.20.59,2020-02-04 10:56:45,284579,0.523512
370044,1-fundamentals/AI-ML-DL-timeline.jpg,575,55.0,98.106.20.59,2020-02-04 10:56:46,284579,0.523512
370043,1-fundamentals/modern-data-scientist.jpg,575,55.0,98.106.20.59,2020-02-04 10:56:46,284579,0.523512
370091,1-fundamentals/AI-ML-DL-timeline.jpg,575,55.0,98.106.20.59,2020-02-04 11:05:26,284579,0.523512
370089,1-fundamentals/1.1-intro-to-data-science,575,55.0,98.106.20.59,2020-02-04 11:05:26,284579,0.523512
370090,1-fundamentals/modern-data-scientist.jpg,575,55.0,98.106.20.59,2020-02-04 11:05:26,284579,0.523512
372008,/,575,55.0,98.106.20.59,2020-02-06 08:21:36,284579,0.523512


In [162]:
# Order the rows from the rarest source IP to the most common one.
df.sort_values("ip_proba", ascending=True)

Unnamed: 0,destination,unknown_1,unknown_2,ip,time_stamp,ip_count,ip_proba
439385,java-i/syntax-types-and-variables,337,29.0,100.204.27.98,2019-03-20 07:52:54,1,0.000002
430908,java-ii/object-oriented-programming,274,26.0,108.78.222.64,2019-02-10 14:27:04,1,0.000002
432073,spring/fundamentals/controllers,262,26.0,72.41.248.92,2019-02-12 11:19:11,1,0.000002
433415,toc,339,29.0,173.57.7.32,2019-02-13 15:33:03,1,0.000002
489492,toc,274,26.0,108.78.223.93,2019-12-20 18:57:07,1,0.000002
...,...,...,...,...,...,...,...
312766,html-css/css-ii/bootstrap-introduction,498,51.0,98.106.20.59,2019-10-08 13:25:04,284579,0.523512
312765,html-css,498,51.0,98.106.20.59,2019-10-08 13:25:00,284579,0.523512
312764,appendix/code-standards/java,453,33.0,98.106.20.59,2019-10-08 13:22:30,284579,0.523512
312772,5-stats/4.4-compare-group-membership,484,34.0,98.106.20.59,2019-10-08 13:34:06,284579,0.523512
