### Questions for Board Meeting
### MVP
- 1. Which lesson appears to attract the most traffic consistently across cohorts (per program)?
- 2. Which lessons are least accessed?
- 3. Is there a cohort that referred to a lesson significantly more than other cohorts seemed to gloss over?
- 4. What topics are grads continuing to reference after graduation and into their jobs (for each program)?
- 5. At some point in 2019, the ability for students and alumni to access both curriculums (web dev to ds, ds to web dev) should have been shut off. Do you see any evidence of that happening? Did it happen before?

### Follow through

- 1. Is there any suspicious activity, such as users/machines/etc accessing the curriculum who shouldn’t be? Does it appear that any web-scraping is happening? Are there any suspicious IP addresses?
- 2. Are there students who, when active, hardly access the curriculum? If so, what information do you have about these students

In [1]:
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import seaborn as sns
import env
import os

from env import host, user, password

# DBSCAN import
from sklearn.cluster import DBSCAN

# Scaler import
from sklearn.preprocessing import MinMaxScaler

import acquire

In [2]:
#acquire function built
df = acquire.acquire()

In [3]:
df

Unnamed: 0,date,time,path,user_id,cohort_id,program_id,ip,name,slack,start_date,end_date,created_at,updated_at
0,2018-01-26,09:55:03,/,1,8.0,1,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26
1,2018-01-26,09:56:02,java-ii,1,8.0,1,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,1,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,1,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,2,97.105.19.61,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10
...,...,...,...,...,...,...,...,...,...,...,...,...,...
847325,2021-04-21,16:41:51,jquery/personal-site,64,28.0,2,71.150.217.33,Staff,#,2014-02-04,2014-02-04,2018-12-06 17:04:19,2018-12-06 17:04:19
847326,2021-04-21,16:42:02,jquery/mapbox-api,64,28.0,2,71.150.217.33,Staff,#,2014-02-04,2014-02-04,2018-12-06 17:04:19,2018-12-06 17:04:19
847327,2021-04-21,16:42:09,jquery/ajax/weather-map,64,28.0,2,71.150.217.33,Staff,#,2014-02-04,2014-02-04,2018-12-06 17:04:19,2018-12-06 17:04:19
847328,2021-04-21,16:44:37,anomaly-detection/discrete-probabilistic-methods,744,28.0,2,24.160.137.86,Staff,#,2014-02-04,2014-02-04,2018-12-06 17:04:19,2018-12-06 17:04:19


In [4]:
# preliminary

conditions = [df.program_id == 1, df.program_id == 2, df.program_id == 3, df.program_id == 4]
result = ['web_dev','web_dev','data_science','web_dev']
df['program'] = np.select(conditions, result)
df.head()

Unnamed: 0,date,time,path,user_id,cohort_id,program_id,ip,name,slack,start_date,end_date,created_at,updated_at,program
0,2018-01-26,09:55:03,/,1,8.0,1,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,web_dev
1,2018-01-26,09:56:02,java-ii,1,8.0,1,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,web_dev
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,1,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,web_dev
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,1,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,web_dev
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,2,97.105.19.61,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,web_dev


In [5]:
df.name

0         Hampton
1         Hampton
2         Hampton
3         Hampton
4           Teddy
           ...   
847325      Staff
847326      Staff
847327      Staff
847328      Staff
847329      Staff
Name: name, Length: 847330, dtype: object

In [6]:
df_wd = df[df.program == 'web_dev']
df_wd.head()

Unnamed: 0,date,time,path,user_id,cohort_id,program_id,ip,name,slack,start_date,end_date,created_at,updated_at,program
0,2018-01-26,09:55:03,/,1,8.0,1,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,web_dev
1,2018-01-26,09:56:02,java-ii,1,8.0,1,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,web_dev
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,1,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,web_dev
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,1,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,web_dev
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,2,97.105.19.61,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,web_dev


In [7]:
df_wd.shape

(743918, 14)

In [8]:
#pulled from kan - good code for framing my head
df_wd_lesson_count = df_wd.groupby(by=['path'])[['user_id']].agg('count').sort_values('user_id', ascending=False)[1:60].reset_index()
df_wd_lesson_count.head(60)

Unnamed: 0,path,user_id
0,javascript-i,18193
1,toc,17580
2,search/search_index.json,15331
3,java-iii,13162
4,html-css,13111
5,java-ii,12173
6,spring,11877
7,jquery,11037
8,mysql,10602
9,java-i,10460


In [9]:
# What do we need to prepare???
    # define by lesson (web_dev or DS) - that is offered now * 
    # split path id 
    # how many class mates per cohort (index) - might be useful
    # split df into two * 
    # drop staff *
    # nulls? *
    # drop update_at column

In [10]:
# 1 null
df.isnull().sum()

date          0
time          0
path          1
user_id       0
cohort_id     0
program_id    0
ip            0
name          0
slack         0
start_date    0
end_date      0
created_at    0
updated_at    0
program       0
dtype: int64

In [11]:
df.shape

(847330, 14)

In [12]:
df.dropna(inplace = True)
df.shape

(847329, 14)

In [13]:
df.updated_at.sort_values

<bound method Series.sort_values of 0         2016-06-14 19:52:26
1         2016-06-14 19:52:26
2         2016-06-14 19:52:26
3         2016-06-14 19:52:26
4         2018-01-08 13:59:10
                 ...         
847325    2018-12-06 17:04:19
847326    2018-12-06 17:04:19
847327    2018-12-06 17:04:19
847328    2018-12-06 17:04:19
847329    2018-12-06 17:04:19
Name: updated_at, Length: 847329, dtype: object>

In [14]:
df['url'] = df['path'].str.split('/').str[0]
df['url'].value_counts().head(50)

#lets see this after dropping staff - there might be certain pages only staff sees

javascript-i                  113838
html-css                       81490
mysql                          79416
jquery                         58317
spring                         55481
java-iii                       54564
java-ii                        52925
                               45854
java-i                         39644
javascript-ii                  37477
appendix                       35599
toc                            17591
search                         17539
examples                       14169
classification                 11453
fundamentals                   11127
1-fundamentals                  9970
content                         9102
sql                             9035
3-sql                           7408
slides                          7319
python                          6718
6-regression                    5845
4-python                        5788
5-stats                         3984
stats                           3936
regression                      3828
7

In [15]:
df.shape
# 847329

(847329, 15)

In [16]:
df = df[df['name'] != 'Staff']
df.shape
#763298

(763298, 15)

In [17]:
df['url'] = df['path'].str.split('/').str[0]
df['url'].value_counts()

javascript-i                   107537
html-css                        76168
mysql                           73741
jquery                          54084
spring                          49915
                                ...  
console-io                          1
6.01.03_Summarize                   1
4.2-compare-means                   1
curie-statistics-assessment         1
6.03_Explore                        1
Name: url, Length: 471, dtype: int64

In [18]:
#split program
df_wd = df[df.program == 'web_dev']
df_ds = df[df.program == 'data_science']


In [19]:
df_wd.shape, df_ds.shape


((659887, 15), (103411, 15))

In [20]:
# look at path split column by datascience as I know this and it is smaller 
df_ds['url'].value_counts()
#153

fundamentals              8746
classification            8620
                          8358
1-fundamentals            7945
sql                       7505
                          ... 
misleading3_deaths.jpg       1
.gitignore                   1
Index.html                   1
index.html                   1
7.4.1-pandas-overview        1
Name: url, Length: 153, dtype: int64

In [21]:
df_ds['url'].value_counts(normalize = True)
#we can look at percent - 
# this is for question 1 

fundamentals              0.084575
classification            0.083357
                          0.080823
1-fundamentals            0.076829
sql                       0.072574
                            ...   
misleading3_deaths.jpg    0.000010
.gitignore                0.000010
Index.html                0.000010
index.html                0.000010
7.4.1-pandas-overview     0.000010
Name: url, Length: 153, dtype: float64

In [22]:
df_ds['url'].value_counts().loc[lambda x : x>100] 
# 29 values above 100 hits

#### WE CAN GROUP THESE BY SIMILAR WORDS - Mangeable and scratches 124 values that are likely outliers


fundamentals            8746
classification          8620
                        8358
1-fundamentals          7945
sql                     7505
3-sql                   6165
python                  5599
4-python                4856
6-regression            4562
appendix                3944
5-stats                 3361
stats                   3299
7-classification        3220
regression              2945
search                  2206
clustering              2191
10-anomaly-detection    2185
anomaly-detection       2147
8-clustering            2081
11-nlp                  1865
9-timeseries            1723
storytelling            1685
2-storytelling          1627
timeseries              1592
nlp                     1343
13-advanced-topics      1011
12-distributed-ml        996
distributed-ml           659
advanced-topics          267
Name: url, dtype: int64

In [23]:
#make variable
ds_hits = df_ds['url'].value_counts().loc[lambda x : x>100] 
ds_hits.head(30)

fundamentals            8746
classification          8620
                        8358
1-fundamentals          7945
sql                     7505
3-sql                   6165
python                  5599
4-python                4856
6-regression            4562
appendix                3944
5-stats                 3361
stats                   3299
7-classification        3220
regression              2945
search                  2206
clustering              2191
10-anomaly-detection    2185
anomaly-detection       2147
8-clustering            2081
11-nlp                  1865
9-timeseries            1723
storytelling            1685
2-storytelling          1627
timeseries              1592
nlp                     1343
13-advanced-topics      1011
12-distributed-ml        996
distributed-ml           659
advanced-topics          267
Name: url, dtype: int64

In [24]:
ds_hits = pd.DataFrame(ds_hits)
ds_hits

Unnamed: 0,url
fundamentals,8746
classification,8620
,8358
1-fundamentals,7945
sql,7505
3-sql,6165
python,5599
4-python,4856
6-regression,4562
appendix,3944


In [25]:
#sets limit to 30 so i can see all
pd.set_option('display.max_rows', 30)
ds_hits

Unnamed: 0,url
fundamentals,8746
classification,8620
,8358
1-fundamentals,7945
sql,7505
3-sql,6165
python,5599
4-python,4856
6-regression,4562
appendix,3944


In [26]:
ds_hits['url'].astype(int)

fundamentals            8746
classification          8620
                        8358
1-fundamentals          7945
sql                     7505
3-sql                   6165
python                  5599
4-python                4856
6-regression            4562
appendix                3944
5-stats                 3361
stats                   3299
7-classification        3220
regression              2945
search                  2206
clustering              2191
10-anomaly-detection    2185
anomaly-detection       2147
8-clustering            2081
11-nlp                  1865
9-timeseries            1723
storytelling            1685
2-storytelling          1627
timeseries              1592
nlp                     1343
13-advanced-topics      1011
12-distributed-ml        996
distributed-ml           659
advanced-topics          267
Name: url, dtype: int64

In [27]:
# .replace isn't working for merging these rows, need to think of an alternative but moving on for now

ds_hits['url'] = ds_hits['url'].replace(['fundamentals', '1-fundamentals'], 'fundamentals')
ds_hits

Unnamed: 0,url
fundamentals,8746
classification,8620
,8358
1-fundamentals,7945
sql,7505
3-sql,6165
python,5599
4-python,4856
6-regression,4562
appendix,3944


In [28]:
ds_hits

Unnamed: 0,url
fundamentals,8746
classification,8620
,8358
1-fundamentals,7945
sql,7505
3-sql,6165
python,5599
4-python,4856
6-regression,4562
appendix,3944


In [29]:
ds_hits.dtypes

url    int64
dtype: object

In [30]:
ds_hits = pd.DataFrame(ds_hits)
ds_hits

Unnamed: 0,url
fundamentals,8746
classification,8620
,8358
1-fundamentals,7945
sql,7505
3-sql,6165
python,5599
4-python,4856
6-regression,4562
appendix,3944


In [31]:
######## lets take a look at Web Dev

In [32]:
df_wd.url

0                     
1              java-ii
2              java-ii
3               slides
4         javascript-i
              ...     
847317          jquery
847318        html-css
847319        java-iii
847320        java-iii
847324    javascript-i
Name: url, Length: 659887, dtype: object

In [33]:
df_wd['url'].value_counts().loc[lambda x : x>100]
#why isn't it give me a count

javascript-i      107525
html-css           76142
mysql              73732
jquery             54080
spring             49907
java-iii           49810
java-ii            48698
java-i             37067
javascript-ii      34823
                   31156
appendix           26787
toc                16669
search             13984
examples           13640
content             8683
slides              7153
capstone            2296
web-design          1891
mkdocs              1865
index.html          1113
prework              855
1-fundamentals       175
assets               118
Name: url, dtype: int64

In [34]:
df_wd['url'].value_counts().loc[lambda x : x>100].count()
# so we have 23 values over 100 for web dev - I can likely scratch under 1000 to bring to 20

23

In [35]:
# this is what we need
wd_hits = df_wd['url'].value_counts().loc[lambda x : x>1000]
wd_hits = pd.DataFrame(wd_hits)
wd_hits
#change into a dataframe
#wd_hits = pd.DataFrame(wd_hits)
#see full list
pd.set_option('display.max_rows', 30)


In [36]:
wd_hits

Unnamed: 0,url
javascript-i,107525
html-css,76142
mysql,73732
jquery,54080
spring,49907
java-iii,49810
java-ii,48698
java-i,37067
javascript-ii,34823
,31156


In [None]:
#prepare
#
conditions = [df.program_id == 1, df.program_id == 2, df.program_id == 3, df.program_id == 4]
result = ['web_dev','web_dev','data_science','web_dev']
df['program'] = np.select(conditions, result)
#drop na
df.dropna(inplace = True)
# eliminated staff from dataset
df = df[df['name'] != 'Staff']

#define main path for url
df['url'] = df['path'].str.split('/').str[0]

# creating subpath for page hits
df['subpath'] = df.path.apply(lambda x: x.split('/'))

# split programs into two datasets - this one last in prepare
df_wd = df[df.program == 'web_dev']
df_ds = df[df.program == 'data_science']


In [38]:
df.path

0                                             /
1                                       java-ii
2           java-ii/object-oriented-programming
3            slides/object_oriented_programming
4                     javascript-i/conditionals
                          ...                  
847317                     jquery/personal-site
847318    html-css/css-ii/bootstrap-grid-system
847319                                 java-iii
847320                        java-iii/servlets
847324             javascript-i/bom-and-dom/dom
Name: path, Length: 763298, dtype: object

In [39]:
df.head()

Unnamed: 0,date,time,path,user_id,cohort_id,program_id,ip,name,slack,start_date,end_date,created_at,updated_at,program,url
0,2018-01-26,09:55:03,/,1,8.0,1,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,web_dev,
1,2018-01-26,09:56:02,java-ii,1,8.0,1,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,web_dev,java-ii
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,1,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,web_dev,java-ii
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,1,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,web_dev,slides
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,2,97.105.19.61,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,web_dev,javascript-i


## 1. Which lesson appears to attract the most traffic consistently across cohorts (per program)?



In [52]:
df['subpath'] = df.path.apply(lambda x: x.split('/'))


In [54]:
df.head()

Unnamed: 0,date,time,path,user_id,cohort_id,program_id,ip,name,slack,start_date,end_date,created_at,updated_at,program,url,subpath
0,2018-01-26,09:55:03,/,1,8.0,1,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,web_dev,,"[, ]"
1,2018-01-26,09:56:02,java-ii,1,8.0,1,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,web_dev,java-ii,[java-ii]
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,1,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,web_dev,java-ii,"[java-ii, object-oriented-programming]"
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,1,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,web_dev,slides,"[slides, object_oriented_programming]"
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,2,97.105.19.61,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,web_dev,javascript-i,"[javascript-i, conditionals]"


In [40]:
df_ds.groupby('name')['url','user_id'].nunique()


Unnamed: 0_level_0,url,user_id
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Bayes,86,23
Curie,55,21
Darden,74,29
Easley,38,17
Florence,42,21


In [50]:
pd.set_option('display.max_rows', 41)

df_wd.groupby('name')['url','user_id'].nunique()


Unnamed: 0_level_0,url,user_id
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Andromeda,41,28
Apex,28,24
Apollo,2,1
Arches,35,18
Badlands,8,3
Bash,39,23
Betelgeuse,36,22
Ceres,40,29
Deimos,38,27
Denali,3,1


In [48]:
df_wd.groupby('name')['path'].count()

name
Andromeda    25359
Apex         33568
Apollo           5
Arches        8890
Badlands        93
             ...  
Voyageurs    35636
Wrangell     25586
Xanadu       27749
Yosemite     20743
Zion         38096
Name: path, Length: 41, dtype: int64

In [45]:
df.path.value_counts()

/                             39514
toc                           16680
javascript-i                  16386
search/search_index.json      16185
html-css                      11843
                              ...  
About_NLP                         1
8.0_Intro_Module                  1
introduction-to-matplotlib        1
2.0_Intro_Stats                   1
13.5_Tableau                      1
Name: path, Length: 1844, dtype: int64

In [46]:
df.url.value_counts()

javascript-i                   107537
html-css                        76168
mysql                           73741
jquery                          54084
spring                          49915
                                ...  
console-io                          1
6.01.03_Summarize                   1
4.2-compare-means                   1
curie-statistics-assessment         1
6.03_Explore                        1
Name: url, Length: 471, dtype: int64