# Discrete Anomaly Detection

In [1]:
from __future__ import division
import itertools
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import pandas as pd
import math
from sklearn import metrics
from random import randint
from matplotlib import style
import seaborn as sns
%matplotlib inline

In [2]:
# Column labels for the seven fields kept from each access-log line.
colnames = ['ip', 'timestamp', 'request_method', 'status', 'size',
            'destination', 'request_agent']

# The separator matches whitespace that is neither inside double quotes nor
# inside square brackets, so the quoted request/agent strings and the bracketed
# timestamp each stay in one field. A regex separator requires engine='python'.
df_orig = pd.read_csv(
    'http://python.zach.lol/access.log',
    engine='python',
    header=None,
    index_col=False,
    names=colnames,
    sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
    na_values='"-"',
    usecols=[0, 3, 4, 5, 6, 7, 8],
)

# Four hand-written rows added to the log.
# NOTE(review): presumably these are planted records for the anomaly search to
# find (unseen IPs, 301 statuses, very large sizes) -- confirm.
new = pd.DataFrame([["95.31.18.119", "[21/Apr/2019:10:02:41+0000]",
                     "GET /api/v1/items/HTTP/1.1", 200, 1153005, np.nan,
                     "python-requests/2.21.0"],
                    ["95.31.16.121", "[17/Apr/2019:19:36:41+0000]",
                     "GET /api/v1/sales?page=79/HTTP/1.1", 301, 1005, np.nan,
                     "python-requests/2.21.0"],
                    ["97.105.15.120", "[18/Apr/2019:19:42:41+0000]",
                     "GET /api/v1/sales?page=79/HTTP/1.1", 301, 2560, np.nan,
                     "python-requests/2.21.0"],
                    ["97.105.19.58", "[19/Apr/2019:19:42:41+0000]",
                     "GET /api/v1/sales?page=79/HTTP/1.1", 200, 2056327, np.nan,
                     "python-requests/2.21.0"]], columns=colnames)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0. pd.concat
# gives the same result: rows stacked in order, each frame keeping its own
# row labels (so the labels 0-3 repeat).
df = pd.concat([df_orig, new])


In [3]:
# Show the combined frame: the downloaded log rows followed by the hand-written rows.
df

Unnamed: 0,ip,timestamp,request_method,status,size,destination,request_agent
0,97.105.19.58,[16/Apr/2019:19:34:42 +0000],"""GET /api/v1/sales?page=81 HTTP/1.1""",200,512495,,"""python-requests/2.21.0"""
1,97.105.19.58,[16/Apr/2019:19:34:42 +0000],"""GET /api/v1/items HTTP/1.1""",200,3561,,"""python-requests/2.21.0"""
2,97.105.19.58,[16/Apr/2019:19:34:44 +0000],"""GET /api/v1/sales?page=82 HTTP/1.1""",200,510103,,"""python-requests/2.21.0"""
3,97.105.19.58,[16/Apr/2019:19:34:46 +0000],"""GET /api/v1/sales?page=83 HTTP/1.1""",200,510003,,"""python-requests/2.21.0"""
4,97.105.19.58,[16/Apr/2019:19:34:48 +0000],"""GET /api/v1/sales?page=84 HTTP/1.1""",200,511963,,"""python-requests/2.21.0"""
...,...,...,...,...,...,...,...
13973,97.105.19.58,[17/Apr/2019:12:55:14 +0000],"""GET /api/v1/sales?page=71 HTTP/1.1""",200,510166,,"""python-requests/2.21.0"""
0,95.31.18.119,[21/Apr/2019:10:02:41+0000],GET /api/v1/items/HTTP/1.1,200,1153005,,python-requests/2.21.0
1,95.31.16.121,[17/Apr/2019:19:36:41+0000],GET /api/v1/sales?page=79/HTTP/1.1,301,1005,,python-requests/2.21.0
2,97.105.15.120,[18/Apr/2019:19:42:41+0000],GET /api/v1/sales?page=79/HTTP/1.1,301,2560,,python-requests/2.21.0


In [4]:
# The log writes timestamps as "[16/Apr/2019:19:34:42 +0000]", while the
# hand-written rows omit the space before the offset. Strip the brackets and
# all whitespace so every value has one layout, then parse with an explicit
# format instead of leaving pandas to infer it across two layouts.
df = (
    df.assign(
        timestamp=pd.to_datetime(
            df['timestamp']
            .str.replace(r'[\[\]]', '', regex=True)
            .str.replace(r'\s+', '', regex=True),
            format='%d/%b/%Y:%H:%M:%S%z',
        )
    )
    .set_index('timestamp')
)

In [5]:
# Remove the literal double quotes that wrap these three fields in the log.
for quoted_col in ('request_method', 'request_agent', 'destination'):
    df[quoted_col] = df[quoted_col].str.replace('"', '')

# Drop the "?page=N" suffix so all pages of an endpoint share one request string.
df['request_method'] = df['request_method'].str.replace(r'\?page=[0-9]+', '', regex=True)

df.head()

Unnamed: 0_level_0,ip,request_method,status,size,destination,request_agent
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-04-16 19:34:42+00:00,97.105.19.58,GET /api/v1/sales HTTP/1.1,200,512495,,python-requests/2.21.0
2019-04-16 19:34:42+00:00,97.105.19.58,GET /api/v1/items HTTP/1.1,200,3561,,python-requests/2.21.0
2019-04-16 19:34:44+00:00,97.105.19.58,GET /api/v1/sales HTTP/1.1,200,510103,,python-requests/2.21.0
2019-04-16 19:34:46+00:00,97.105.19.58,GET /api/v1/sales HTTP/1.1,200,510003,,python-requests/2.21.0
2019-04-16 19:34:48+00:00,97.105.19.58,GET /api/v1/sales HTTP/1.1,200,511963,,python-requests/2.21.0


In [6]:
# Response size converted from bytes to mebibytes (two divisions by 1024).
df['size_mb'] = df['size'] / 1024 / 1024

In [7]:
# Requests per IP address, missing addresses included.
ip_counts = df['ip'].value_counts(dropna=False)

# Name the columns explicitly rather than renaming whatever reset_index()
# produces: pandas 2.0 changed the default names of a value_counts() result,
# so a rename of 'index' / 'ip' no longer yields these columns.
ip_df = ip_counts.rename_axis('ip').reset_index(name='ip_count')

# Share of all requests per IP. Dividing by the sum of the counts keeps the
# denominator consistent with dropna=False, so the shares always add up to 1.
ip_df2 = (ip_counts / ip_counts.sum()).rename_axis('ip').reset_index(name='ip_proba')
ip_df = ip_df.merge(ip_df2, on='ip')


# Full table; the addresses of interest are the rows with ip_proba < 0.01 (rate < 1%).
ip_df

Unnamed: 0,ip,ip_count,ip_proba
0,97.105.19.58,11999,0.85842
1,173.173.113.51,1059,0.075762
2,72.181.113.170,613,0.043855
3,72.181.105.81,246,0.017599
4,24.26.242.9,21,0.001502
5,68.201.219.223,21,0.001502
6,70.121.214.34,2,0.000143
7,35.175.171.137,2,0.000143
8,52.87.230.102,2,0.000143
9,35.174.209.2,1,7.2e-05


In [8]:
# Field labels for the curriculum log; the two fields whose meaning is not yet
# known get placeholder names.
colnames = ['date', 'time', 'destination', 'unknown_1', 'unknown_2', 'ip']

# Same quote- and bracket-aware whitespace separator as the earlier log read;
# a regex separator requires the python parsing engine.
df_orig = pd.read_csv(
    "curriculum-access.txt",
    sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
    names=colnames,
    header=None,
    index_col=False,
    na_values='"-"',
    engine='python',
)

In [9]:
# Join the date and time fields with a space and parse them into one datetime column.
df_orig["time_stamp"] = pd.to_datetime(df_orig["date"].str.cat(df_orig["time"], sep=" "))

In [10]:
# The separate date/time fields are redundant once time_stamp exists.
df_orig = df_orig.drop(columns=["date", "time"])

In [11]:
# Working copy of the log in chronological order.
df = df_orig.sort_values("time_stamp", ascending=True)

In [12]:
# Store unknown_2 as float.
df = df.astype({"unknown_2": float})

In [13]:
# Show the time-sorted curriculum log.
df

Unnamed: 0,destination,unknown_1,unknown_2,ip,time_stamp
0,/,1,8.0,98.106.20.62,2018-01-26 09:55:03
1,java-ii,1,8.0,98.106.20.62,2018-01-26 09:56:02
2,java-ii/object-oriented-programming,1,8.0,98.106.20.62,2018-01-26 09:56:05
3,slides/object_oriented_programming,1,8.0,98.106.20.62,2018-01-26 09:56:06
4,javascript-i/conditionals,2,22.0,98.106.20.62,2018-01-26 09:56:24
...,...,...,...,...,...
543591,java-ii/file-io,602,56.0,48.188.241.68,2020-05-15 15:01:40
543592,9-timeseries/3-prep,582,55.0,48.135.163.38,2020-05-15 15:01:54
543593,jquery/ajax/requests-and-responses,637,57.0,71.121.18.17,2020-05-15 15:03:37
543594,10-anomaly-detection/3-discrete-probabilistic-...,11,28.0,77.186.132.227,2020-05-15 15:05:14


In [14]:
# What are the two unknowns?

# unknown_1 is likely a device id, so if I access the curriculum from my phone and my computer, that would
# generate different ids

In [15]:
# Number of missing values in each column.
df.isna().sum()

destination        1
unknown_1          0
unknown_2      27856
ip                 0
time_stamp         0
dtype: int64

In [16]:
# Non-null row counts for every (unknown_1, unknown_2) pair; first 50 pairs only.
(
    df.groupby(["unknown_1", "unknown_2"])
      .count()
      .head(50)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,destination,ip,time_stamp
unknown_1,unknown_2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,8.0,1617,1617,1617
1,28.0,3820,3820,3820
2,22.0,1541,1541,1541
3,22.0,1562,1562,1562
4,22.0,692,692,692
5,22.0,1701,1701,1701
6,22.0,1408,1408,1408
7,22.0,1803,1803,1803
8,22.0,483,483,483
9,22.0,585,585,585


In [17]:
# Rows whose unknown_2 value is 5.
df.loc[df["unknown_2"] == 5]

Unnamed: 0,destination,unknown_1,unknown_2,ip,time_stamp
535518,/,649,5.0,71.126.151.42,2020-05-06 19:06:32


In [18]:
# Requests per IP address, missing addresses included.
ip_counts = df["ip"].value_counts(dropna=False)

# Name the columns explicitly rather than renaming whatever reset_index()
# produces: pandas 2.0 changed the default names of a value_counts() result,
# so a rename of 'index' / 'ip' no longer yields these columns.
ip_df = ip_counts.rename_axis("ip").reset_index(name="ip_count")

# Share of all requests per IP. Dividing by the sum of the counts keeps the
# denominator consistent with dropna=False, so the shares always add up to 1.
ip_df2 = (ip_counts / ip_counts.sum()).rename_axis("ip").reset_index(name="ip_proba")
ip_df = ip_df.merge(ip_df2, on="ip")

In [19]:
# Per-IP request count and share of all requests.
ip_df

Unnamed: 0,ip,ip_count,ip_proba
0,98.106.20.59,284579,0.523512
1,98.106.20.62,61662,0.113434
2,193.172.118.211,9029,0.016610
3,13.107.209.195,4219,0.007761
4,173.125.68.94,2980,0.005482
...,...,...,...
3490,73.180.167.182,1,0.000002
3491,132.195.173.31,1,0.000002
3492,175.208.26.209,1,0.000002
3493,166.92.13.129,1,0.000002


In [20]:
# Attach the per-IP statistics to every log row; with no key given, the merge
# joins on the column names the two frames have in common.
df = pd.merge(df, ip_df)

In [21]:
# our cohort appears to be the unknown_1 values 575 to 591

In [22]:
# All rows for unknown_1 == 248, in chronological order.
df.loc[df["unknown_1"] == 248].sort_values("time_stamp")

Unnamed: 0,destination,unknown_1,unknown_2,ip,time_stamp,ip_count,ip_proba
133446,/,248,1.0,98.106.20.59,2018-09-05 14:28:23,284579,0.523512
134485,/,248,1.0,98.106.20.59,2018-09-07 14:53:23,284579,0.523512
134486,html-css,248,1.0,98.106.20.59,2018-09-07 14:54:13,284579,0.523512
134487,html-css/introduction,248,1.0,98.106.20.59,2018-09-07 14:56:07,284579,0.523512
134489,html-css/elements,248,1.0,98.106.20.59,2018-09-07 14:57:00,284579,0.523512
...,...,...,...,...,...,...,...
509287,9-timeseries/5.1-modeling-lesson1,248,28.0,174.174.116.202,2020-05-14 09:21:04,360,0.000662
509288,9-timeseries/4-explore,248,28.0,174.174.116.202,2020-05-14 16:01:54,360,0.000662
509289,9-timeseries/5.1-modeling-lesson1,248,28.0,174.174.116.202,2020-05-14 16:05:13,360,0.000662
509290,9-timeseries/4-explore,248,28.0,174.174.116.202,2020-05-14 16:35:11,360,0.000662


In [23]:
# First 50 rows for unknown_1 == 575, then ordered by time.
df.loc[df["unknown_1"] == 575].head(50).sort_values("time_stamp")

Unnamed: 0,destination,unknown_1,unknown_2,ip,time_stamp,ip_count,ip_proba
369343,login,575,55.0,98.106.20.59,2020-02-03 15:39:34,284579,0.523512
369348,login,575,55.0,98.106.20.59,2020-02-03 15:39:37,284579,0.523512
369350,/,575,55.0,98.106.20.59,2020-02-03 15:40:05,284579,0.523512
370042,1-fundamentals/1.1-intro-to-data-science,575,55.0,98.106.20.59,2020-02-04 10:56:45,284579,0.523512
370044,1-fundamentals/AI-ML-DL-timeline.jpg,575,55.0,98.106.20.59,2020-02-04 10:56:46,284579,0.523512
370043,1-fundamentals/modern-data-scientist.jpg,575,55.0,98.106.20.59,2020-02-04 10:56:46,284579,0.523512
370091,1-fundamentals/AI-ML-DL-timeline.jpg,575,55.0,98.106.20.59,2020-02-04 11:05:26,284579,0.523512
370089,1-fundamentals/1.1-intro-to-data-science,575,55.0,98.106.20.59,2020-02-04 11:05:26,284579,0.523512
370090,1-fundamentals/modern-data-scientist.jpg,575,55.0,98.106.20.59,2020-02-04 11:05:26,284579,0.523512
372008,/,575,55.0,98.106.20.59,2020-02-06 08:21:36,284579,0.523512


In [24]:
# Rarest IP addresses first.
df.sort_values("ip_proba")

Unnamed: 0,destination,unknown_1,unknown_2,ip,time_stamp,ip_count,ip_proba
439385,java-i/syntax-types-and-variables,337,29.0,100.204.27.98,2019-03-20 07:52:54,1,0.000002
430908,java-ii/object-oriented-programming,274,26.0,108.78.222.64,2019-02-10 14:27:04,1,0.000002
432073,spring/fundamentals/controllers,262,26.0,72.41.248.92,2019-02-12 11:19:11,1,0.000002
433415,toc,339,29.0,173.57.7.32,2019-02-13 15:33:03,1,0.000002
489492,toc,274,26.0,108.78.223.93,2019-12-20 18:57:07,1,0.000002
...,...,...,...,...,...,...,...
312766,html-css/css-ii/bootstrap-introduction,498,51.0,98.106.20.59,2019-10-08 13:25:04,284579,0.523512
312765,html-css,498,51.0,98.106.20.59,2019-10-08 13:25:00,284579,0.523512
312764,appendix/code-standards/java,453,33.0,98.106.20.59,2019-10-08 13:22:30,284579,0.523512
312772,5-stats/4.4-compare-group-membership,484,34.0,98.106.20.59,2019-10-08 13:34:06,284579,0.523512


In [25]:
# Give the two placeholder columns the names they are referred to by from here on.
df = df.rename(columns=dict(unknown_1="user_id", unknown_2="cohort"))

In [26]:
# Which users have a missing cohort?

df.loc[df["cohort"].isna(), "user_id"].unique()

array([ 48,  58,  78,  61,  79, 111, 354, 365, 363, 353, 366, 357, 369,
       359, 355, 372, 351, 362, 364, 367, 403, 352, 406, 544,  88, 349,
        54,  59,  62,  63,  73,  74,  86,  89, 213, 350, 100, 103, 137,
       166, 176, 356, 358, 360, 361, 368, 375, 429, 247, 317,  64])

In [27]:
# Why is there a missing destination?

df.loc[df["destination"].isna()]

Unnamed: 0,destination,user_id,cohort,ip,time_stamp,ip_count,ip_proba
506973,,586,55.0,73.178.241.52,2020-04-08 09:25:18,202,0.000372


In [28]:
# Are there any access logs that seem like an infiltration?

# Index the rows by time_stamp so the log can be selected by date later on.
df = df.set_index("time_stamp")

In [29]:
# Accessing at weird times? 

# Can we find the students versus the instructors?

# Can we use classification models to classify ds vs. web dev users?

In [30]:
# Index of the user ids that requested the "java-ii" page at least once.
# Only the group keys are kept; the summed values themselves are not used.
web_dev = (
    df.loc[df["destination"] == "java-ii"]
      .groupby("user_id")["user_id"]
      .sum()
      .index
)

In [31]:
# Start with an all-missing is_wd column; it is filled in a later cell.
df = df.assign(is_wd=np.nan)

In [32]:
# Scratch check: `in` asks whether the tuple as a whole is a label of the index,
# not whether each of its numbers is, hence the False result.
(1,2,3) in web_dev

False

In [33]:
def is_wd(frame=None, marker="java-ii"):
    """Fill missing ``is_wd`` flags: 1 for web-development users, 0 otherwise.

    A user counts as web development when at least one of their rows has
    ``destination == marker``. The previous body tested ``df.user_id in
    web_dev`` (a whole Series against an Index, which raises TypeError) and
    discarded the results of ``fillna``, so it never changed the frame.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame with ``destination`` and ``user_id`` columns. Defaults to the
        notebook-level ``df``. It is modified in place.
    marker : str, default "java-ii"
        Destination value that identifies a web-development user.

    Returns
    -------
    None
        The ``is_wd`` column of ``frame`` is created or completed. Values
        that are already present (not missing) are left unchanged.
    """
    if frame is None:
        frame = df

    wd_users = frame.loc[frame["destination"] == marker, "user_id"].unique()

    # Work on plain arrays so that duplicate row labels (the frame may be
    # indexed by a repeating time stamp) cannot interfere with alignment.
    flags = frame["user_id"].isin(wd_users).to_numpy().astype(int)
    if "is_wd" in frame.columns:
        current = frame["is_wd"].to_numpy()
        flags = np.where(pd.isna(current), flags, current)
    frame["is_wd"] = flags

In [34]:
# 1 when the row's user is in web_dev, otherwise 0.
df["is_wd"] = df["user_id"].isin(web_dev).astype(int)

In [35]:
# Index of the user ids that requested the data-science intro page at least once.
# Only the group keys are kept; the summed values themselves are not used.
ds = (
    df.loc[df["destination"] == "1-fundamentals/1.1-intro-to-data-science"]
      .groupby("user_id")["user_id"]
      .sum()
      .index
)

In [36]:
# 1 when the row's user is in ds, otherwise 0.
df["is_ds"] = df["user_id"].isin(ds).astype(int)

In [37]:
# It seems that about 25 users have accessed both the web_dev and the ds curriculum.

df.loc[(df["is_ds"] == 1) & (df["is_wd"] == 1), "user_id"].unique().shape

(25,)

In [47]:
# Are there different times when ds and web dev users access the curriculum?

# Web-dev-only rows counted per exact time stamp, busiest time stamps first.
(
    df.loc[(df["is_wd"] == 1) & (df["is_ds"] == 0)]
      .groupby("time_stamp")
      .count()
      .sort_values("destination", ascending=False)
)

Unnamed: 0_level_0,destination,user_id,cohort,ip,ip_count,ip_proba,is_wd,is_ds
time_stamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-03-03 22:52:09,30,30,30,30,30,30,30,30
2019-03-03 22:52:07,30,30,30,30,30,30,30,30
2019-03-03 22:52:11,25,25,25,25,25,25,25,25
2019-03-03 22:52:10,25,25,25,25,25,25,25,25
2019-03-03 22:52:08,25,25,25,25,25,25,25,25
...,...,...,...,...,...,...,...,...
2018-11-29 09:18:43,1,1,1,1,1,1,1,1
2018-11-29 09:18:41,1,1,1,1,1,1,1,1
2018-11-29 09:18:35,1,1,1,1,1,1,1,1
2018-11-29 09:18:34,1,1,1,1,1,1,1,1


In [48]:
# ds-only rows counted per exact time stamp, busiest time stamps first.
(
    df.loc[(df["is_wd"] == 0) & (df["is_ds"] == 1)]
      .groupby("time_stamp")
      .count()
      .sort_values("destination", ascending=False)
)

Unnamed: 0_level_0,destination,user_id,cohort,ip,ip_count,ip_proba,is_wd,is_ds
time_stamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-08-26 16:29:17,7,7,7,7,7,7,7,7
2020-02-06 09:26:58,6,6,6,6,6,6,6,6
2019-08-21 13:45:48,5,5,5,5,5,5,5,5
2020-02-06 10:02:13,5,5,5,5,5,5,5,5
2020-02-05 19:58:07,5,5,5,5,5,5,5,5
...,...,...,...,...,...,...,...,...
2019-09-04 21:49:31,1,1,1,1,1,1,1,1
2019-09-04 21:53:46,1,1,1,1,1,1,1,1
2019-09-04 21:56:16,1,1,1,1,1,1,1,1
2019-09-04 22:06:51,1,1,1,1,1,1,1,1


In [62]:
# What is the most popular destination for ds students?

# Select the 2020 rows by partial date string. This must go through .loc:
# df["2020"] as a row selection was deprecated and then removed in pandas 2.0,
# where plain brackets with a string only look up a column.
df_2020 = df.loc["2020"]

# Ten most frequent (destination, ip) pairs among ds-only rows.
df_2020[(df_2020.is_wd == 0) & (df_2020.is_ds == 1)].groupby(["destination", "ip"]).count().nlargest(10, columns="user_id")

Unnamed: 0_level_0,Unnamed: 1_level_0,user_id,cohort,ip_count,ip_proba,is_wd,is_ds
destination,ip,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
/,98.106.20.59,404,404,404,404,404,404
1-fundamentals/AI-ML-DL-timeline.jpg,98.106.20.59,302,302,302,302,302,302
1-fundamentals/modern-data-scientist.jpg,98.106.20.59,301,301,301,301,301,301
1-fundamentals/1.1-intro-to-data-science,98.106.20.59,300,300,300,300,300,300
search/search_index.json,98.106.20.59,166,166,166,166,166,166
3-sql/1-mysql-overview,98.106.20.59,159,159,159,159,159,159
4-python/3-data-types-and-variables,98.106.20.59,159,159,159,159,159,159
4-python/8.4.3-dataframes,98.106.20.59,157,157,157,157,157,157
appendix/cli-git-overview,98.106.20.59,152,152,152,152,152,152
4-python/8.4.4-advanced-dataframes,98.106.20.59,133,133,133,133,133,133


In [61]:
# Which ds user has the most logged requests?

# Ten largest (user_id, ip) pairs among ds-only rows in 2020.
(
    df_2020.loc[(df_2020["is_wd"] == 0) & (df_2020["is_ds"] == 1)]
           .groupby(["user_id", "ip"])
           .count()
           .nlargest(10, "cohort")
)

Unnamed: 0_level_0,Unnamed: 1_level_0,destination,cohort,ip_count,ip_proba,is_wd,is_ds
user_id,ip,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
581,73.191.188.174,1319,1319,1319,1319,1319,1319
576,100.127.114.141,894,894,894,894,894,894
580,75.193.168.160,637,637,637,637,637,637
248,98.106.20.59,518,518,518,518,518,518
582,70.232.93.36,517,517,517,517,517,517
584,71.124.237.192,452,452,452,452,452,452
584,98.106.20.59,452,452,452,452,452,452
581,98.106.20.59,433,433,433,433,433,433
587,98.106.20.59,417,417,417,417,417,417
576,98.106.20.59,395,395,395,395,395,395


In [60]:
# User 581 has the most logged requests, interestingly.

df.loc[df["user_id"] == 581]

Unnamed: 0_level_0,destination,user_id,cohort,ip,ip_count,ip_proba,is_wd,is_ds
time_stamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-02-03 15:40:35,/,581,55.0,98.106.20.59,284579,0.523512,0,1
2020-02-03 15:42:00,/,581,55.0,98.106.20.59,284579,0.523512,0,1
2020-02-03 15:43:47,1-fundamentals/1.1-intro-to-data-science,581,55.0,98.106.20.59,284579,0.523512,0,1
2020-02-03 15:43:48,1-fundamentals/modern-data-scientist.jpg,581,55.0,98.106.20.59,284579,0.523512,0,1
2020-02-03 15:43:48,1-fundamentals/AI-ML-DL-timeline.jpg,581,55.0,98.106.20.59,284579,0.523512,0,1
...,...,...,...,...,...,...,...,...
2020-05-15 13:54:12,10-anomaly-detection/AnomalyDetectionCartoon.jpeg,581,55.0,73.191.188.174,1319,0.002426,0,1
2020-05-15 14:17:06,6-regression/1-overview,581,55.0,73.191.188.174,1319,0.002426,0,1
2020-05-15 14:17:13,9-timeseries/5.1-modeling-lesson1,581,55.0,73.191.188.174,1319,0.002426,0,1
2020-05-15 14:17:18,9-timeseries/5.2-modeling-lesson2,581,55.0,73.191.188.174,1319,0.002426,0,1


In [87]:
# Can we tell which ids are the instructors?

# Number of distinct users seen in each cohort.
df.groupby(["cohort"]).agg({"user_id": "nunique"})

Unnamed: 0_level_0,user_id
cohort,Unnamed: 1_level_1
1.0,18
2.0,3
4.0,1
5.0,1
6.0,2
7.0,6
8.0,5
9.0,1
11.0,5
12.0,4


In [105]:
# Distinct values per column for every (user_id, cohort) pair, e.g. how many
# destinations and IP addresses each user touched while in a given cohort.
df.groupby(["user_id", "cohort"]).nunique()

Unnamed: 0_level_0,Unnamed: 1_level_0,destination,user_id,cohort,ip,ip_count,ip_proba,is_wd,is_ds
user_id,cohort,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,8.0,248,1,1,10,8,8,1,1
1,28.0,653,1,1,19,16,16,1,1
2,22.0,151,1,1,12,11,11,1,1
3,22.0,169,1,1,25,22,22,1,1
4,22.0,143,1,1,11,8,8,1,1
...,...,...,...,...,...,...,...,...,...
646,8.0,23,1,1,1,1,1,1,1
647,14.0,6,1,1,3,3,3,1,1
648,56.0,130,1,1,1,1,1,1,1
649,5.0,1,1,1,1,1,1,1,1


It seems that our cohort is cohort #55. I believe the other ds cohorts were #34 and #19. Do remember that there are no values between 34 and 51.

Some of the odd groups:

* cohorts 4, 5 and 9 have only one user
* I think cohort 28 is the instructors, as many of them have different ids and it's usually the second cohort they receive

* User 248 is a bit of an anomaly in the ds group, as the first login took place in March 2019 (a month after the Ada class started). The user has had consistent logins into the website through May 2020.

In [116]:
# All rows for user 248. (The stray trailing "." was a syntax error.)
df[df.user_id == 248]

Unnamed: 0_level_0,destination,user_id,cohort,ip,ip_count,ip_proba,is_wd,is_ds
time_stamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-03-01 08:50:28,4.4_functions,248,28.0,193.172.118.211,9029,0.016610,0,1
2019-03-01 08:50:30,4.3_control_structures,248,28.0,193.172.118.211,9029,0.016610,0,1
2019-03-01 08:50:34,4.2_data_types_and_variables,248,28.0,193.172.118.211,9029,0.016610,0,1
2019-03-01 08:50:37,4.3_control_structures,248,28.0,193.172.118.211,9029,0.016610,0,1
2019-03-01 08:51:59,Intro_to_Regression_Module,248,28.0,193.172.118.211,9029,0.016610,0,1
...,...,...,...,...,...,...,...,...
2020-05-14 09:21:04,9-timeseries/5.1-modeling-lesson1,248,28.0,174.174.116.202,360,0.000662,0,1
2020-05-14 16:01:54,9-timeseries/4-explore,248,28.0,174.174.116.202,360,0.000662,0,1
2020-05-14 16:05:13,9-timeseries/5.1-modeling-lesson1,248,28.0,174.174.116.202,360,0.000662,0,1
2020-05-14 16:35:11,9-timeseries/4-explore,248,28.0,174.174.116.202,360,0.000662,0,1


In [120]:
# All rows for user 616.
df.loc[df["user_id"] == 616]

Unnamed: 0_level_0,destination,user_id,cohort,ip,ip_count,ip_proba,is_wd,is_ds
time_stamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-03-16 19:16:11,2-storytelling/2.1-understand,616,55.0,71.115.29.45,14,0.000026,0,1
2020-03-16 19:16:16,2-storytelling/2.2-create,616,55.0,71.115.29.45,14,0.000026,0,1
2020-03-16 19:16:26,2-storytelling/1-overview,616,55.0,71.115.29.45,14,0.000026,0,1
2020-03-16 19:16:40,2-storytelling/2.1-understand,616,55.0,71.115.29.45,14,0.000026,0,1
2020-03-16 19:16:47,2-storytelling/2.2-create,616,55.0,71.115.29.45,14,0.000026,0,1
...,...,...,...,...,...,...,...,...
2020-05-15 09:06:11,9-timeseries/5.2-modeling-lesson2,616,55.0,71.115.10.242,382,0.000703,0,1
2020-05-15 11:20:58,9-timeseries/5.2-modeling-lesson2,616,55.0,71.115.10.242,382,0.000703,0,1
2020-05-15 11:32:34,9-timeseries/5.2-modeling-lesson2,616,55.0,71.115.10.242,382,0.000703,0,1
2020-05-15 11:49:03,/,616,55.0,71.115.10.242,382,0.000703,0,1
