# Anomaly Detection Project

In [1]:
from prepare import wrangle_curriculum

import numpy as np
import pandas as pd
import math
from sklearn import metrics

from scipy.stats import entropy

import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
pd.plotting.register_matplotlib_converters()
import seaborn as sns

In [2]:
df = wrangle_curriculum()

In [3]:
df

Unnamed: 0,page_viewed,user_id,cohort_id,ip,datetime,year,month,day,hour,weekday
0,/,1,8,97.105.19.61,2018-01-26 09:55:03,2018,1,26,9,Friday
1,java-ii,1,8,97.105.19.61,2018-01-26 09:56:02,2018,1,26,9,Friday
2,java-ii/object-oriented-programming,1,8,97.105.19.61,2018-01-26 09:56:05,2018,1,26,9,Friday
3,slides/object_oriented_programming,1,8,97.105.19.61,2018-01-26 09:56:06,2018,1,26,9,Friday
4,javascript-i/conditionals,2,22,97.105.19.61,2018-01-26 09:56:24,2018,1,26,9,Friday
...,...,...,...,...,...,...,...,...,...,...
719454,javascript-i/coffee-project,763,62,107.192.148.199,2020-11-02 16:48:13,2020,11,2,16,Monday
719455,javascript-i/mapbox-api,771,62,172.125.226.175,2020-11-02 16:48:17,2020,11,2,16,Monday
719456,javascript-i/coffee-project,771,62,172.125.226.175,2020-11-02 16:48:18,2020,11,2,16,Monday
719457,javascript-i/bom-and-dom/bom,771,62,172.125.226.175,2020-11-02 16:48:28,2020,11,2,16,Monday


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 719459 entries, 0 to 719458
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   page_viewed  719458 non-null  object
 1   user_id      719459 non-null  object
 2   cohort_id    674619 non-null  object
 3   ip           719459 non-null  object
 4   datetime     719459 non-null  object
 5   year         719459 non-null  object
 6   month        719459 non-null  object
 7   day          719459 non-null  object
 8   hour         719459 non-null  object
 9   weekday      719459 non-null  object
dtypes: object(10)
memory usage: 54.9+ MB


### Splitting off Nan values in cohort_id

In [5]:
cohort_0 = df[df.cohort_id.isnull()]

In [6]:
cohort_0

Unnamed: 0,page_viewed,user_id,cohort_id,ip,datetime,year,month,day,hour,weekday
411,/,48,,97.105.19.61,2018-01-26 16:46:16,2018,1,26,16,Friday
412,spring/extra-features/form-validation,48,,97.105.19.61,2018-01-26 16:46:24,2018,1,26,16,Friday
425,/,48,,97.105.19.61,2018-01-26 17:54:24,2018,1,26,17,Friday
435,/,48,,97.105.19.61,2018-01-26 18:32:03,2018,1,26,18,Friday
436,mysql/relationships/joins,48,,97.105.19.61,2018-01-26 18:32:17,2018,1,26,18,Friday
...,...,...,...,...,...,...,...,...,...,...
719411,java-iii,717,,136.50.18.157,2020-11-02 16:02:22,2020,11,2,16,Monday
719412,java-iii/servlets,717,,136.50.18.157,2020-11-02 16:02:26,2020,11,2,16,Monday
719414,appendix/further-reading/java/intellij-tomcat-...,727,,70.94.165.107,2020-11-02 16:03:50,2020,11,2,16,Monday
719416,java-iii/servlets,727,,70.94.165.107,2020-11-02 16:04:18,2020,11,2,16,Monday


In [7]:
df = df[~df.cohort_id.isnull()]

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 674619 entries, 0 to 719458
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   page_viewed  674618 non-null  object
 1   user_id      674619 non-null  object
 2   cohort_id    674619 non-null  object
 3   ip           674619 non-null  object
 4   datetime     674619 non-null  object
 5   year         674619 non-null  object
 6   month        674619 non-null  object
 7   day          674619 non-null  object
 8   hour         674619 non-null  object
 9   weekday      674619 non-null  object
dtypes: object(10)
memory usage: 56.6+ MB


In [9]:
cohort = pd.read_csv('cohorts.csv')

In [10]:
cohort

Unnamed: 0,cohort_id,name,start_date,end_date,program_id
0,1,Arches,2014-02-04,2014-04-22,1
1,2,Badlands,2014-06-04,2014-08-22,1
2,3,Carlsbad,2014-09-04,2014-11-05,1
3,4,Denali,2014-10-20,2015-01-18,1
4,5,Everglades,2014-11-18,2015-02-24,1
5,6,Franklin,2015-02-03,2015-05-26,1
6,7,Glacier,2015-06-05,2015-10-06,1
7,8,Hampton,2015-09-22,2016-02-06,1
8,9,Apollo,2015-03-30,2015-07-29,4
9,10,Balboa,2015-11-03,2016-03-11,4


In [11]:
df = df.merge(cohort, how = 'left', left_on = "cohort_id", right_on = "cohort_id", left_index=True)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 674619 entries, 7 to 44
Data columns (total 14 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   page_viewed  674618 non-null  object
 1   user_id      674619 non-null  object
 2   cohort_id    674619 non-null  object
 3   ip           674619 non-null  object
 4   datetime     674619 non-null  object
 5   year         674619 non-null  object
 6   month        674619 non-null  object
 7   day          674619 non-null  object
 8   hour         674619 non-null  object
 9   weekday      674619 non-null  object
 10  name         674619 non-null  object
 11  start_date   674619 non-null  object
 12  end_date     674619 non-null  object
 13  program_id   674619 non-null  int64 
dtypes: int64(1), object(13)
memory usage: 77.2+ MB


In [13]:
df = df.set_index('datetime')

In [14]:
df

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,year,month,day,hour,weekday,name,start_date,end_date,program_id
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2018-01-26 09:55:03,/,1,8,97.105.19.61,2018,1,26,9,Friday,Hampton,2015-09-22,2016-02-06,1
2018-01-26 09:56:02,java-ii,1,8,97.105.19.61,2018,1,26,9,Friday,Hampton,2015-09-22,2016-02-06,1
2018-01-26 09:56:05,java-ii/object-oriented-programming,1,8,97.105.19.61,2018,1,26,9,Friday,Hampton,2015-09-22,2016-02-06,1
2018-01-26 09:56:06,slides/object_oriented_programming,1,8,97.105.19.61,2018,1,26,9,Friday,Hampton,2015-09-22,2016-02-06,1
2018-01-26 09:56:24,javascript-i/conditionals,2,22,97.105.19.61,2018,1,26,9,Friday,Teddy,2018-01-08,2018-05-17,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-11-02 16:48:13,javascript-i/coffee-project,763,62,107.192.148.199,2020,11,2,16,Monday,Jupiter,2020-09-21,2021-03-30,2
2020-11-02 16:48:17,javascript-i/mapbox-api,771,62,172.125.226.175,2020,11,2,16,Monday,Jupiter,2020-09-21,2021-03-30,2
2020-11-02 16:48:18,javascript-i/coffee-project,771,62,172.125.226.175,2020,11,2,16,Monday,Jupiter,2020-09-21,2021-03-30,2
2020-11-02 16:48:28,javascript-i/bom-and-dom/bom,771,62,172.125.226.175,2020,11,2,16,Monday,Jupiter,2020-09-21,2021-03-30,2


## Splitting Data into Seperate Cohorts

### Hampton

In [15]:
hampton = df[df.name == 'Hampton']

### Teddy

In [18]:
teddy = df[df.name == 'Teddy']

### Sequoia

In [21]:
sequoia = df[df.name == 'Sequoia']

### Arches

In [24]:
arches = df[df.name == 'Arches']

### Niagara

In [27]:
niagara = df[df.name == 'Niagara']

### Pinnacles

In [30]:
pinnacles = df[df.name == 'Pinnacles']

### Quincy

In [33]:
quincy = df[df.name == 'Quincy']

### Kings

In [36]:
kings = df[df.name == 'Kings']

### Lassen

In [39]:
lassen = df[df.name == 'Lassen']

### Mammoth

In [42]:
mammoth = df[df.name == 'Mammoth']

### Glacier

In [45]:
glacier = df[df.name == 'Glacier']

### Denali

In [48]:
denali = df[df.name == 'Denali']

### Joshua

In [51]:
joshua = df[df.name == 'Joshua']

### Olympic

In [54]:
olympic = df[df.name == 'Olympic']

### Ulysses

In [57]:
ulysses = df[df.name == 'Ulysses']

### Badlands

In [60]:
badlands = df[df.name == 'Badlands']

### Apollo

In [63]:
apollo = df[df.name == 'Apollo']

### Ike

In [66]:
ike = df[df.name == 'Ike']

### Voyageurs

In [69]:
voyageurs = df[df.name == 'Voyageurs']

### Wrangell

In [72]:
wrangell = df[df.name == 'Wrangell']

### Xanadu

In [75]:
xanadu = df[df.name == 'Xanadu']

### Franklin

In [78]:
franklin = df[df.name == 'Franklin']

### Yosemite

In [81]:
yosemite = df[df.name == 'Yosemite']

### Staff 

In [84]:
staff = df[df.name == 'Staff']

### Zion

In [87]:
zion = df[df.name == 'Zion']

### Andromeda

In [90]:
andromeda = df[df.name == 'Andromeda']

### Betelgeuse

In [93]:
betelgeuse = df[df.name == 'Betelgeuse']

### Ceres

In [96]:
ceres = df[df.name == 'Ceres']

### Bayes

In [99]:
bayes = df[df.name == 'Bayes']

### Deimos

In [102]:
deimos = df[df.name == 'Deimos']

### Europa

In [105]:
europa = df[df.name == 'Europa']

### Fortuna 

In [108]:
fortuna = df[df.name == 'Fortuna']

### Curie 

In [111]:
curie = df[df.name == 'Curie']

### Apex

In [114]:
apex = df[df.name == 'Apex']

### Ganymede

In [117]:
ganymede = df[df.name == 'Ganymede']

### Everglades

In [120]:
everglades = df[df.name == 'Everglades']

### Hyperion

In [123]:
hyperion = df[df.name == 'Hyperion']

### Darden 

In [126]:
darden = df[df.name == 'Darden']

### Bash 

In [129]:
bash = df[df.name == 'Bash']

### Jupiter


In [132]:
jupiter = df[df.name == 'Jupiter']