## Aims
- Look at the data in `processed.json`
- Construct a DataFrame so the data can be sliced

## Task
Read from processed.json

In [1]:
import json
import pandas as pd

# Load every record from the processed dump. JSON text is UTF-8 by
# specification, so the encoding is pinned rather than left to the platform
# default. The file is only held open for the duration of the parse.
with open("./data/processed.json", encoding="utf-8") as data_file:
    json_data = json.load(data_file)

# Fields pulled out of each record, listed in the column order of the frame.
column_names = [
    "title",
    "affiliates",
    "key",
    "level",
    "num_of_projects",
    "tags",
    "average_rating",
    "count",
    "stats",
]

# One list per field. .get() yields None when a record lacks the key, so every
# list has exactly one entry per record in json_data.
data = {name: [d.get(name) for d in json_data] for name in column_names}

# Keep the per-field lists bound to their original top-level names in case a
# later cell reads them directly.
(
    title,
    affiliates,
    key,
    level,
    num_of_projects,
    tags,
    average_rating,
    count,
    stats,
) = (data[name] for name in column_names)

df = pd.DataFrame(data, columns=column_names)

In [2]:
# Preview the first five rows of the freshly built frame.
df.head(n=5)

Unnamed: 0,title,affiliates,key,level,num_of_projects,tags,average_rating,count,stats
0,Product Manager,[],nd036,beginner,0,"[Product Marketing, Pricing Strategy, Market R...",4.568627,51,"[{'rating': 5, 'count': 32, 'percentage': 62.7..."
1,AI for Business Leaders,[],nd054,intermediate,0,"[Deep Learning, Unsupervised Learning, Compute...",5.0,2,"[{'rating': 5, 'count': 2, 'percentage': 100, ..."
2,Intro to Machine Learning with TensorFlow,[{'image': '/assets/iridium/images/shared/part...,nd230,beginner,0,"[Machine Learning, Google]",4.847826,46,"[{'rating': 5, 'count': 39, 'percentage': 84.7..."
3,UX Designer,[],nd578,beginner,0,"[UX, UI, Wireframing, Product Design, UX/UI, U...",4.724324,185,"[{'rating': 5, 'count': 145, 'percentage': 78...."
4,Data Streaming,[],nd029,advanced,0,"[faust, python, ksql, sparksql, apache avro, a...",3.884058,68,"[{'rating': 5, 'count': 28, 'percentage': 40.5..."


---

## Look at the average ratings and their spread

In [5]:
# Pull out the rating column (a Series, despite the name) and summarise its
# distribution: count, mean, std, min, quartiles and max.
average_ratings_df = df.loc[:, "average_rating"]
average_ratings_df.describe()

count    63.000000
mean      4.416854
std       0.877063
min       0.000000
25%       4.500000
50%       4.588435
75%       4.711656
max       5.000000
Name: average_rating, dtype: float64

## Let's remove some of the chaff, like the affiliates data

In [7]:
# Columns to keep, in display order; "affiliates" is deliberately left out.
kept_columns = [
    "title",
    "average_rating",
    "key",
    "count",
    "level",
    "stats",
    "num_of_projects",
    "tags",
]

# Select those columns, then discard any row that has a missing value in one of them.
df = df.loc[:, kept_columns].dropna()

df.head(n=5)

Unnamed: 0,title,average_rating,key,count,level,stats,num_of_projects,tags
0,Product Manager,4.568627,nd036,51,beginner,"[{'rating': 5, 'count': 32, 'percentage': 62.7...",0,"[Product Marketing, Pricing Strategy, Market R..."
1,AI for Business Leaders,5.0,nd054,2,intermediate,"[{'rating': 5, 'count': 2, 'percentage': 100, ...",0,"[Deep Learning, Unsupervised Learning, Compute..."
2,Intro to Machine Learning with TensorFlow,4.847826,nd230,46,beginner,"[{'rating': 5, 'count': 39, 'percentage': 84.7...",0,"[Machine Learning, Google]"
3,UX Designer,4.724324,nd578,185,beginner,"[{'rating': 5, 'count': 145, 'percentage': 78....",0,"[UX, UI, Wireframing, Product Design, UX/UI, U..."
4,Data Streaming,3.884058,nd029,68,advanced,"[{'rating': 5, 'count': 28, 'percentage': 40.5...",0,"[faust, python, ksql, sparksql, apache avro, a..."


## Let's group the courses by level

In [13]:
# Average the numeric columns per level. numeric_only=True is needed on
# pandas >= 2.0, where mean() no longer silently drops the text/list columns
# (title, key, stats, tags) and raises TypeError instead.
df_levels = df.groupby(["level"]).mean(numeric_only=True)
# head() shows at most the first five groups.
df_levels.head()

Unnamed: 0_level_0,average_rating,count,num_of_projects
level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Advanced,4.72093,82.0,4.0
Beginner,4.563096,129.666667,3.666667
Intermediate,3.973497,144.0,4.333333
Intermediate,4.425632,178.0,5.0
advanced,4.515061,179.272727,1.909091


Looks like the `level` values are not normalized: `Advanced` and `advanced` both appear, and `Intermediate` shows up twice (so we should clean that up).


## Let's clean the table for now

In [14]:
# Collapse every spelling of a level onto its lower-case canonical label.
# The match is a case-insensitive literal substring test (regex=False), so
# variants such as "Advanced" are caught whatever their casing. na=False keeps
# missing values out of the boolean mask.
for canonical_level in ("advanced", "intermediate", "beginner"):
    needs_fix = df["level"].str.contains(
        canonical_level, case=False, regex=False, na=False
    )
    df.loc[needs_fix, "level"] = canonical_level

In [15]:
# Preview the first five rows after the level labels were rewritten.
df.head(n=5)

Unnamed: 0,title,average_rating,key,count,level,stats,num_of_projects,tags
0,Product Manager,4.568627,nd036,51,beginner,"[{'rating': 5, 'count': 32, 'percentage': 62.7...",0,"[Product Marketing, Pricing Strategy, Market R..."
1,AI for Business Leaders,5.0,nd054,2,intermediate,"[{'rating': 5, 'count': 2, 'percentage': 100, ...",0,"[Deep Learning, Unsupervised Learning, Compute..."
2,Intro to Machine Learning with TensorFlow,4.847826,nd230,46,beginner,"[{'rating': 5, 'count': 39, 'percentage': 84.7...",0,"[Machine Learning, Google]"
3,UX Designer,4.724324,nd578,185,beginner,"[{'rating': 5, 'count': 145, 'percentage': 78....",0,"[UX, UI, Wireframing, Product Design, UX/UI, U..."
4,Data Streaming,3.884058,nd029,68,advanced,"[{'rating': 5, 'count': 28, 'percentage': 40.5...",0,"[faust, python, ksql, sparksql, apache avro, a..."


## Let's check whether the cleaning above worked

In [17]:
# Re-compute the per-level averages on the cleaned labels. numeric_only=True
# is needed on pandas >= 2.0, where mean() raises TypeError on the text/list
# columns (title, key, stats, tags) instead of silently dropping them.
groupby_df = df.groupby(["level"]).mean(numeric_only=True)
groupby_df.head()

Unnamed: 0_level_0,average_rating,count,num_of_projects
level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
advanced,4.532217,171.166667,2.083333
beginner,4.669868,248.666667,3.714286
intermediate,4.142525,266.851852,4.703704


OK, cool - looks like the cleaning worked: only three levels remain.

In [28]:
# Order rows by level first, then average rating, count and number of
# projects, every key descending.
sort_keys = ["level", "average_rating", "count", "num_of_projects"]
sorted_df = df.sort_values(by=sort_keys, ascending=False)
sorted_df.head(n=5)

Unnamed: 0,title,average_rating,key,count,level,stats,num_of_projects,tags
1,AI for Business Leaders,5.0,nd054,2,intermediate,"[{'rating': 5, 'count': 2, 'percentage': 100, ...",0,"[Deep Learning, Unsupervised Learning, Compute..."
23,iOS Developer,5.0,nd003-br,0,intermediate,"[{'rating': 5, 'count': 4, 'percentage': 100, ...",0,[]
43,Learn Unreal VR Foundations,4.857143,nd117,4,intermediate,"[{'rating': 5, 'count': 6, 'percentage': 85.71...",0,[]
50,Android Developer,4.734921,nd801,901,intermediate,"[{'rating': 5, 'count': 1616, 'percentage': 81...",14,"[android, App Development, Android App Develop..."
21,Become a Data Analyst,4.722488,nd002-ent,161,intermediate,"[{'rating': 5, 'count': 170, 'percentage': 81....",22,"[r, hadoop, python, d3]"


---

## Beginner courses sorted by average-rating, count and number of projects

In [32]:
# Keep only the beginner rows; the ordering of sorted_df is preserved.
is_beginner = sorted_df["level"].eq("beginner")
beginner_df = sorted_df.loc[is_beginner]
beginner_df.head(n=10)

Unnamed: 0,title,average_rating,key,count,level,stats,num_of_projects,tags
39,VR Foundations,5.0,nd105,0,beginner,"[{'rating': 5, 'count': 1, 'percentage': 100, ...",0,[Virtual Reality]
53,Android Basics,4.884615,nd803-br,5,beginner,"[{'rating': 5, 'count': 23, 'percentage': 88.4...",0,[]
2,Intro to Machine Learning with TensorFlow,4.847826,nd230,46,beginner,"[{'rating': 5, 'count': 39, 'percentage': 84.7...",0,"[Machine Learning, Google]"
52,Android Basics,4.846098,nd803,1399,beginner,"[{'rating': 5, 'count': 2415, 'percentage': 88...",12,"[mobile, apps, development, android, App Devel..."
16,Programming for Data Science with R,4.833333,nd118,12,beginner,"[{'rating': 5, 'count': 10, 'percentage': 83.3...",3,"[R, nd118, SQL, Advance SQl, Basic Data Scienc..."
13,Intro to Machine Learning with PyTorch,4.748148,nd229,129,beginner,"[{'rating': 5, 'count': 105, 'percentage': 77....",4,"[pyhton, python, sql, predictive, supervised, ..."
3,UX Designer,4.724324,nd578,185,beginner,"[{'rating': 5, 'count': 145, 'percentage': 78....",0,"[UX, UI, Wireframing, Product Design, UX/UI, U..."
38,Programming for Data Science with Python,4.702381,nd104,239,beginner,"[{'rating': 5, 'count': 195, 'percentage': 77....",3,"[Data Analytics, SQL, SQL join, Python, Numpy,..."
18,Marketing Analytics,4.696203,nd028,76,beginner,"[{'rating': 5, 'count': 63, 'percentage': 79.7...",8,"[digital marketing, google, Data Analytics, Da..."
10,Data Visualization,4.688889,nd197,45,beginner,"[{'rating': 5, 'count': 35, 'percentage': 77.7...",4,"[data visualization, Build Dashboard, Dashboar..."


## Intermediate courses sorted by average-rating, count and number of projects

In [39]:
# Keep only the intermediate rows; the ordering of sorted_df is preserved.
is_intermediate = sorted_df["level"].eq("intermediate")
intermediate_df = sorted_df.loc[is_intermediate]
intermediate_df.head(n=15)

Unnamed: 0,title,average_rating,key,count,level,stats,num_of_projects,tags
1,AI for Business Leaders,5.0,nd054,2,intermediate,"[{'rating': 5, 'count': 2, 'percentage': 100, ...",0,"[Deep Learning, Unsupervised Learning, Compute..."
23,iOS Developer,5.0,nd003-br,0,intermediate,"[{'rating': 5, 'count': 4, 'percentage': 100, ...",0,[]
43,Learn Unreal VR Foundations,4.857143,nd117,4,intermediate,"[{'rating': 5, 'count': 6, 'percentage': 85.71...",0,[]
50,Android Developer,4.734921,nd801,901,intermediate,"[{'rating': 5, 'count': 1616, 'percentage': 81...",14,"[android, App Development, Android App Develop..."
21,Become a Data Analyst,4.722488,nd002-ent,161,intermediate,"[{'rating': 5, 'count': 170, 'percentage': 81....",22,"[r, hadoop, python, d3]"
42,Intro to Self-Driving Cars,4.680217,nd113,224,intermediate,"[{'rating': 5, 'count': 288, 'percentage': 78....",8,"[Self Driving Car, Automation, Bayesian Thinki..."
59,Front End Web Developer,4.663966,nd001,1301,intermediate,"[{'rating': 5, 'count': 2192, 'percentage': 77...",0,"[css, html, javascript, UX Design, User interf..."
51,Android Developer,4.653846,nd801-br,6,intermediate,"[{'rating': 5, 'count': 23, 'percentage': 88.4...",0,[]
24,Predictive Analytics for Business,4.641667,nd008t,106,intermediate,"[{'rating': 5, 'count': 91, 'percentage': 75.8...",8,"[Data Analytics, No Coding, Alyterx, Tableau, ..."
37,Deep Learning,4.639004,nd101,1043,intermediate,"[{'rating': 5, 'count': 1661, 'percentage': 76...",5,"[Neural Network, AI, Pytorch, Numpy, CNN, Mach..."


### Advanced courses as well

In [38]:
# Keep only the advanced rows; the ordering of sorted_df is preserved.
is_advanced = sorted_df["level"].eq("advanced")
advanced_df = sorted_df.loc[is_advanced]
advanced_df.head(n=5)

Unnamed: 0,title,average_rating,key,count,level,stats,num_of_projects,tags
32,Mobile Web Specialist,4.807229,nd024,163,advanced,"[{'rating': 5, 'count': 287, 'percentage': 86....",3,[]
28,Self Driving Car Engineer,4.746765,nd013,520,advanced,"[{'rating': 5, 'count': 897, 'percentage': 82....",14,"[Computer Vision, Sensor Fusion, Localization,..."
9,Sensor Fusion Engineer,4.72093,nd313,82,advanced,"[{'rating': 5, 'count': 67, 'percentage': 77.9...",4,"[Lidar, Radar, Kalman Filters, Radar Calibrati..."
57,Deep Reinforcement Learning,4.676259,nd893,117,advanced,"[{'rating': 5, 'count': 105, 'percentage': 75....",0,"[Viedeo Games, neural network, neural networks..."
55,Computer Vision,4.59144,nd891,214,advanced,"[{'rating': 5, 'count': 184, 'percentage': 71....",0,"[CNN, Recognise Faces, Image Captioning, RNN, ..."
