In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)

In [2]:
# Load the edX courses CSV into a DataFrame and print its column / dtype / non-null summary.
data = pd.read_csv('data/updated_edx_courses.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3082 entries, 0 to 3081
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          3082 non-null   int64  
 1   course_name         3082 non-null   object 
 2   course_url          3082 non-null   object 
 3   course_photo        3082 non-null   object 
 4   course_level        3082 non-null   object 
 5   course_effort       3082 non-null   object 
 6   course_length       3082 non-null   object 
 7   course_language     3082 non-null   object 
 8   course_subtitle     3082 non-null   object 
 9   course_price        3082 non-null   object 
 10  course_partner      3082 non-null   object 
 11  university_link     3082 non-null   object 
 12  course_enrollments  3082 non-null   object 
 13  subject             2536 non-null   object 
 14  title               2529 non-null   object 
 15  rating              628 non-null    float64
 16  number

In [3]:
# Frequency table of the target column; dropna=False keeps NaN as its own row.
data['number_of_students'].value_counts(dropna=False)

number_of_students
Unknown    1345
NaN         546
10946         2
17718         2
5050          2
           ... 
11884         1
19077         1
14603         1
27037         1
9509          1
Name: count, Length: 1174, dtype: int64

In [6]:
# The planned model targets number_of_students, so rows where that value is
# missing (NaN) are removed here; the 'Unknown' placeholder is handled separately.
data = data.dropna(subset=['number_of_students'])
data.shape

(2536, 19)

In [7]:
# Remove the rows whose number_of_students is the literal placeholder 'Unknown'.
data = data.drop(index=data.index[data['number_of_students'] == 'Unknown'])
data.shape

(1191, 19)

In [8]:
# Remove the 'Unnamed: 0' column, then display the frame.
data = data.drop(columns=['Unnamed: 0'])
data

Unnamed: 0,course_name,course_url,course_photo,course_level,course_effort,course_length,course_language,course_subtitle,course_price,course_partner,university_link,course_enrollments,subject,title,rating,number_of_ratings,number_of_students,price
0,The ArchitecturalImagination…,https://www.edx.org/course/the-architectural-i...,https://prod-discovery.edx-cdn.org/media/cours...,['Introductory'],['3–5 hours per week'],['10 Weeks'],['English'],['English'],['$199 USD'],['Harvard University'],['https://www.edx.org/school/harvardx'],415051,Architecture,HarvardX: The Architectural Imagination,4.7,51.0,565890,249
1,The Path to Happiness:What Chinese PhilosophyT...,https://www.edx.org/course/the-path-to-happiness,https://prod-discovery.edx-cdn.org/media/cours...,['Introductory'],['1–2 hours per week'],['13 Weeks'],['English'],['English'],['$99 USD'],['Harvard University'],['https://www.edx.org/school/harvardx'],29623,Philosophy & Ethics,HarvardX: The Path to Happiness: What Chinese ...,4.7,30.0,130686,149
2,Pyramids of Giza: AncientEgyptian Art andArcha...,https://www.edx.org/course/pyramids-of-giza-an...,https://prod-discovery.edx-cdn.org/media/cours...,['Introductory'],['2–4 hours per week'],['8 Weeks'],['English'],['English'],['$149 USD'],['Harvard University'],['https://www.edx.org/school/harvardx'],127414,Art & Culture,HarvardX: Pyramids of Giza: Ancient Egyptian A...,,,178825,219
3,"U.S. Public Policy: Social,Economic, and Forei...",https://www.edx.org/course/us-public-policy-so...,https://prod-discovery.edx-cdn.org/media/cours...,['Introductory'],['2–4 hours per week'],['4 Weeks'],['English'],['English'],['$99 USD'],['Harvard University'],['https://www.edx.org/school/harvardx'],46472,Social Sciences,"HarvardX: U.S. Public Policy: Social, Economic...",4.9,36.0,81922,149
4,"Women Making History:Ten Objects, ManyStories…",https://www.edx.org/course/women-making-histor...,https://prod-discovery.edx-cdn.org/media/cours...,['Introductory'],['2–3 hours per week'],['8 Weeks'],['English'],['English'],['$139 USD'],['Harvard University'],['https://www.edx.org/school/harvardx'],40788,History,"HarvardX: Women Making History: Ten Objects, M...",5.0,5.0,68103,209
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3069,Estadística Aplicada a losNegocios…,https://www.edx.org/course/estadistica-aplicad...,https://prod-discovery.edx-cdn.org/media/cours...,['Introductory'],['5–6 hours per week'],['5 Weeks'],['Español'],['Español'],['$49 USD'],['Universidad Galileo'],['https://www.edx.org/school/galileox'],25537,Data Analysis & Statistics,GalileoX: Estadística Aplicada a los Negocios,,,30848,39
3072,Data LiteracyFoundations…,https://www.edx.org/course/data-literacy-found...,https://prod-discovery.edx-cdn.org/media/cours...,['Intermediate'],['6–8 hours per week'],['4 Weeks'],['English'],['English'],['$249 USD'],['Rochester Institute of Technology'],['https://www.edx.org/school/ritx'],not-mentioned,Data Analysis & Statistics,RITx: Data Literacy Foundations,,,7158,249
3075,Data Processing andAnalysis with Excel…,https://www.edx.org/course/data-processing-and...,https://prod-discovery.edx-cdn.org/media/cours...,['Intermediate'],['6–8 hours per week'],['4 Weeks'],['English'],['English'],['$249 USD'],['Rochester Institute of Technology'],['https://www.edx.org/school/ritx'],not-mentioned,Data Analysis & Statistics,RITx: Data Processing and Analysis with Excel,3.1,9.0,36528,249
3076,Data Representation andVisualization in Tableau…,https://www.edx.org/course/data-representation...,https://prod-discovery.edx-cdn.org/media/cours...,['Intermediate'],['6–8 hours per week'],['4 Weeks'],['English'],['English'],['$249 USD'],['Rochester Institute of Technology'],['https://www.edx.org/school/ritx'],not-mentioned,Data Analysis & Statistics,RITx: Data Representation and Visualization in...,,,15559,249


In [9]:
# Drop the columns that are not needed further on: course_name (the scraped
# `title` column is kept instead), course_photo, course_language,
# course_subtitle, course_price, course_partner and university_link.
unused_columns = [
    'course_name',
    'course_photo',
    'course_language',
    'course_subtitle',
    'course_price',
    'course_partner',
    'university_link',
]
data = data.drop(columns=unused_columns)
data.shape

(1191, 11)

In [10]:
# Number of missing (NaN) values per remaining column.
data.isna().sum()

course_url              0
course_level            0
course_effort           0
course_length           0
course_enrollments      0
subject                 0
title                   0
rating                667
number_of_ratings     667
number_of_students      0
price                   0
dtype: int64

In [11]:
# Show the frequency table (NaN included) of every column, one after another.
for column in data.columns:
    column_counts = data[column].value_counts(dropna=False)
    display(column_counts)

course_url
https://www.edx.org/course/the-architectural-imagination                          1
https://www.edx.org/course/molecular-biology-part-1-dna-replication-and-repair    1
https://www.edx.org/course/from-the-ground-up-managing-and-preserving-our-ter     1
https://www.edx.org/course/human-reproduction                                     1
https://www.edx.org/course/como-vivir-una-vida-saludable-y-activa                 1
                                                                                 ..
https://www.edx.org/course/introduction-to-hyperledger-blockchain-technologie     1
https://www.edx.org/course/music-technology-foundations                           1
https://www.edx.org/course/introduccion-a-la-robotica-y-sistemas-para-la-indu     1
https://www.edx.org/course/visualizacion-de-datos-y-storytelling                  1
https://www.edx.org/course/essentials-of-data-literacy                            1
Name: count, Length: 1191, dtype: int64

course_level
['Introductory']    687
['Intermediate']    370
['Advanced']        133
[]                    1
Name: count, dtype: int64

course_effort
['2–3 hours per week']      129
['4–6 hours per week']      119
['2–4 hours per week']      116
['3–5 hours per week']      108
['3–4 hours per week']       99
['1–2 hours per week']       73
['8–10 hours per week']      73
['6–8 hours per week']       64
['4–5 hours per week']       64
['5–8 hours per week']       32
['3–6 hours per week']       31
['5–10 hours per week']      28
['5–6 hours per week']       28
['1–3 hours per week']       28
['5–7 hours per week']       26
['8–12 hours per week']      22
['2–5 hours per week']       17
['10–12 hours per week']     14
['4–8 hours per week']       12
['10–20 hours per week']      9
['10–14 hours per week']      8
['6–10 hours per week']       7
['4–10 hours per week']       7
['6–9 hours per week']        7
[]                            7
['9–10 hours per week']       5
['7–10 hours per week']       5
['10–15 hours per week']      4
['10–11 hours per week']      4
['2–6 hours per week']        4
['7–8 hours per week']    

course_length
['6 Weeks']     231
['4 Weeks']     222
['5 Weeks']     171
['7 Weeks']     123
['8 Weeks']     112
['10 Weeks']     92
['3 Weeks']      51
['12 Weeks']     42
['9 Weeks']      30
['1 Weeks']      27
['15 Weeks']     22
['2 Weeks']      19
['14 Weeks']     11
['16 Weeks']     11
['13 Weeks']     10
['11 Weeks']      8
['18 Weeks']      4
[]                3
['20 Weeks']      1
['24 Weeks']      1
Name: count, dtype: int64

course_enrollments
not-mentioned    320
5,334              3
24,404             2
12,904             2
13,153             2
                ... 
99,502             1
28,189             1
5,767              1
38,330             1
25,537             1
Name: count, Length: 860, dtype: int64

subject
Business & Management                                                     219
Computer Science                                                          184
Engineering                                                                97
Data Analysis & Statistics                                                 73
Social Sciences                                                            63
Humanities                                                                 62
Economics & Finance                                                        58
Medicine                                                                   45
Communication                                                              41
Math                                                                       31
Biology & Life Sciences                                                    29
Environmental Studies                                                      29
Education & Teacher Training                            

title
HKUx: Epidemics I                                                                      2
HarvardX: The Architectural Imagination                                                1
MITx: Molecular Biology - Part 1: DNA Replication and Repair                           1
SDGAcademyX: From the Ground Up: Managing and Preserving Our Terrestrial Ecosystems    1
AdelaideX: Human Reproduction                                                          1
                                                                                      ..
AdelaideX: Music Technology Foundations                                                1
AnahuacX: Introducción a la robótica e industria 4.0                                   1
TecdeMonterreyX: Visualización de Datos y Storytelling                                 1
TecdeMonterreyX: Herramientas para el Análisis de Big Data                             1
DavidsonX: The Essentials of Data Literacy Online Course                               1
Name: count, Le

rating
NaN    667
4.6     82
4.8     79
4.4     67
4.7     65
4.5     62
4.3     46
4.9     26
4.2     22
4.0     19
5.0     16
4.1     14
3.9      9
3.6      4
3.8      4
3.4      3
3.5      3
3.7      1
2.6      1
3.1      1
Name: count, dtype: int64

number_of_ratings
NaN      667
5.0       70
7.0       48
6.0       40
10.0      38
        ... 
41.0       1
289.0      1
93.0       1
101.0      1
60.0       1
Name: count, Length: 76, dtype: int64

number_of_students
11498    2
16608    2
17718    2
10946    2
8982     2
        ..
11884    1
19077    1
14603    1
27037    1
9509     1
Name: count, Length: 1172, dtype: int64

price
249        111
59         101
149         91
189         80
Unknown     77
39          63
99          58
199         55
25          42
179         41
49          41
69          36
19          31
139         29
209         28
29          27
119         26
129         25
219         25
299         24
150         21
169         20
50          20
45          14
55           7
309          6
159          6
549          5
124          5
175          5
125          5
89           4
278          4
337          4
5            4
399          4
369          4
300          4
250          4
350          4
825          3
449          3
225          3
185          2
409          2
100          2
79           2
54.98        1
229          1
49.99        1
17           1
64           1
259          1
833.33       1
60           1
14           1
77           1
62           1
48.98        1
439          1
Name: count, dtype: int64

In [12]:
#cleaning course_level (based on the same values as udemy courses)

def clean_level(x):
    """Translate a raw edX course_level value into a Udemy-style level name.

    The raw values are stringified lists such as "['Introductory']".
    Any value that is not one of the three known labels (for example "[]")
    is mapped to 'Beginner Level', the mode of the column.
    """
    relabel = (
        ("['Introductory']", 'Beginner Level'),
        ("['Intermediate']", 'Intermediate Level'),
        ("['Advanced']", 'Expert Level'),
    )
    for raw_label, level_name in relabel:
        if x == raw_label:
            return level_name
    # Unrecognised value: fall back to the most frequent level.
    return 'Beginner Level'
    
# Relabel every row with clean_level, then tally the resulting categories.
data['course_level'] = [clean_level(raw_level) for raw_level in data['course_level']]
data['course_level'].value_counts(dropna=False)

course_level
Beginner Level        688
Intermediate Level    370
Expert Level          133
Name: count, dtype: int64

In [27]:
import re

#cleaning course_effort
def clean_effort(x):
    """Extract the upper bound of the weekly-effort range from a raw value.

    Raw values are stringified lists such as "['3–5 hours per week']"; the
    result for that input is the string '5'. The value is returned as text
    (callers cast the column to float afterwards).

    Returns float('nan') when the entry is empty ("[]"), contains no text
    after stripping brackets/quotes, or is not a string at all (e.g. NaN).
    When the effort is a single number rather than a range
    ("['5 hours per week']"), that number is returned.
    """
    if not isinstance(x, str):
        return float('nan')
    # Raw string: "\[" in a normal literal is an invalid escape sequence.
    tokens = re.sub(r"\[|\]|'", '', x).split()
    if not tokens:
        return float('nan')
    # First token holds the range, e.g. "3–5"; accept an en dash or a plain hyphen.
    bounds = re.split(r"[–-]", tokens[0])
    # Keep the second element for a range; fall back to the only number otherwise.
    return bounds[1] if len(bounds) > 1 else bounds[0]
    
# Reduce every raw effort string with clean_effort, then tally the results.
data['course_effort'] = [clean_effort(raw_effort) for raw_effort in data['course_effort']]
data['course_effort'].value_counts(dropna=False)

course_effort
4      215
5      191
6      182
3      157
10     125
8      112
2       73
12      43
7       33
20      14
9       11
14       8
NaN      7
16       5
11       4
15       4
18       3
28       2
30       1
24       1
Name: count, dtype: int64

In [29]:
# Cast course_effort to float and replace the missing values with the column mean.
# The filled Series is assigned back to the frame: calling fillna(inplace=True)
# on data['course_effort'] is chained assignment, which raises a FutureWarning
# on recent pandas 2.x and leaves `data` unchanged under Copy-on-Write.
data['course_effort'] = data['course_effort'].astype(float)
data['course_effort'] = data['course_effort'].fillna(data['course_effort'].mean())
data['course_effort'].value_counts(dropna=False)

course_effort
4.000000     215
5.000000     191
6.000000     182
3.000000     157
10.000000    125
8.000000     112
2.000000      73
12.000000     43
7.000000      33
20.000000     14
9.000000      11
14.000000      8
6.128378       7
16.000000      5
11.000000      4
15.000000      4
18.000000      3
28.000000      2
30.000000      1
24.000000      1
Name: count, dtype: int64

In [31]:
#cleaning course_length
def clean_length(x):
    """Extract the number of weeks from a raw course_length value.

    Raw values are stringified lists such as "['10 Weeks']"; the result for
    that input is the string '10'. The value is returned as text (callers
    cast the column to float afterwards).

    Returns float('nan') when the entry is empty ("[]"), contains no text
    after stripping brackets/quotes, or is not a string at all (e.g. NaN).
    """
    if not isinstance(x, str):
        return float('nan')
    # Raw string: "\[" in a normal literal is an invalid escape sequence.
    tokens = re.sub(r"\[|\]|'", '', x).split()
    if not tokens:
        return float('nan')
    # The first whitespace-separated token is the week count.
    return tokens[0]
    
# Reduce every raw length string with clean_length, then tally the results.
data['course_length'] = [clean_length(raw_length) for raw_length in data['course_length']]
data['course_length'].value_counts(dropna=False)

course_length
6      231
4      222
5      171
7      123
8      112
10      92
3       51
12      42
9       30
1       27
15      22
2       19
14      11
16      11
13      10
11       8
18       4
NaN      3
20       1
24       1
Name: count, dtype: int64

In [32]:
# Cast course_length to float and replace the missing values with the column mean.
# The filled Series is assigned back to the frame: calling fillna(inplace=True)
# on data['course_length'] is chained assignment, which raises a FutureWarning
# on recent pandas 2.x and leaves `data` unchanged under Copy-on-Write.
data['course_length'] = data['course_length'].astype(float)
data['course_length'] = data['course_length'].fillna(data['course_length'].mean())
data['course_length'].isna().sum()

0

In [33]:
#cleaning course_enrollments

def clean_enrollments(x):
    """Normalise a raw course_enrollments value.

    The placeholder 'not-mentioned' becomes float('nan'); any other value has
    its thousands separators (commas) removed and is returned as a string,
    e.g. '5,334' -> '5334'.
    """
    return float('nan') if x == 'not-mentioned' else x.replace(',', '')
    
# Normalise every enrollment entry with clean_enrollments, then tally the results.
data['course_enrollments'] = [clean_enrollments(raw_count) for raw_count in data['course_enrollments']]
data['course_enrollments'].value_counts(dropna=False)

course_enrollments
NaN      320
5334       3
24404      2
12904      2
13153      2
        ... 
99502      1
28189      1
5767       1
38330      1
25537      1
Name: count, Length: 860, dtype: int64

In [34]:
# Cast course_enrollments to float and replace the missing values with the column mean.
# The filled Series is assigned back to the frame: calling fillna(inplace=True)
# on data['course_enrollments'] is chained assignment, which raises a FutureWarning
# on recent pandas 2.x and leaves `data` unchanged under Copy-on-Write.
data['course_enrollments'] = data['course_enrollments'].astype(float)
data['course_enrollments'] = data['course_enrollments'].fillna(data['course_enrollments'].mean())
data['course_enrollments'].isna().sum()

0

In [36]:
#wrangling subject to have similar categories as udemy
# Raw edX subject values grouped by the broader category they are folded into.
business_list = [
    "Business & Management",
    "From Fossil Resources to Biomass: A Business and Economics Perspective",
    "Communication",
]
dev_list = [
    "Computer Science",
    "Data Analysis & Statistics",
    "Engineering",
    "Electronics",
]
health_list = [
    "Medicine",
    "Health & Safety",
    "Food & Nutrition",
]
science_list = [
    "Math",
    "Biology & Life Sciences",
    "Environmental Studies",
    "Physics",
    "Energy & Earth Sciences",
    "Science",
    'Chemistry',
    "PH525.2x: Introduction to Linear Models and Matrix Algebra",
    "From Fossil Resources to Biomass: a Chemistry Perspective",
    "Human Microbiome",
    "Catalytic Conversions for Biobased Chemicals and Products",
]
finance_list = [
    "Economics & Finance",
    "Landscape Finance: Investing in Innovation for Sustainable Landscapes",
    "Circular Economy: An Interdisciplinary Approach",
]
social_list = [
    "Social Sciences",
    "Humanities",
    "Philosophy & Ethics",
    "Ethics",
]


def clean_subject(x):
    """Fold a raw subject value into its broader category.

    The groups are checked in a fixed order (business, development, health,
    sciences, finance, social sciences, education); the first group that
    contains `x` decides the label. A value that belongs to no group is
    returned unchanged.
    """
    subject_groups = (
        (business_list, 'Business'),
        (dev_list, 'Development'),
        (health_list, "Health"),
        (science_list, 'Sciences'),
        (finance_list, 'Finance & Accounting'),
        (social_list, 'Social Sciences'),
        (['Education & Teacher Training'], 'Teaching & Academics'),
    )
    for members, category in subject_groups:
        if x in members:
            return category
    return x
    
# Fold every subject into its broader category with clean_subject, then tally the results.
data['subject'] = [clean_subject(raw_subject) for raw_subject in data['subject']]
data['subject'].value_counts(dropna=False)

subject
Development             365
Business                262
Sciences                160
Social Sciences         135
Health                   70
Finance & Accounting     60
Teaching & Academics     28
Language                 26
History                  19
Architecture             17
Law                      15
Art & Culture            12
Music                     8
Design                    8
Literature                6
Name: count, dtype: int64

In [38]:
# Count the missing (NaN) values in the subject column.
data['subject'].isna().sum()

0

In [40]:
# Replace missing rating and number_of_ratings values with the respective column mean.
# Each filled Series is assigned back to the frame: calling fillna(inplace=True)
# on a column selection is chained assignment, which raises a FutureWarning
# on recent pandas 2.x and leaves `data` unchanged under Copy-on-Write.
data['rating'] = data['rating'].fillna(data['rating'].mean())
data['number_of_ratings'] = data['number_of_ratings'].fillna(data['number_of_ratings'].mean())
data.isna().sum()

course_url            0
course_level          0
course_effort         0
course_length         0
course_enrollments    0
subject               0
title                 0
rating                0
number_of_ratings     0
number_of_students    0
price                 0
dtype: int64

In [44]:
# Cleaning price: the 'Unknown' placeholder becomes NaN.

def clean_price(x):
    """Return float('nan') for the placeholder 'Unknown'; pass any other value through unchanged."""
    return float('nan') if x == 'Unknown' else x
    
# Turn the 'Unknown' prices into NaN with clean_price, then count how many there are.
data['price'] = [clean_price(raw_price) for raw_price in data['price']]
data['price'].isna().sum()

77

In [46]:
# Cast price to float and replace the missing values with the column mean.
# The filled Series is assigned back to the frame: calling fillna(inplace=True)
# on data['price'] is chained assignment, which raises a FutureWarning
# on recent pandas 2.x and leaves `data` unchanged under Copy-on-Write.
data['price'] = data['price'].astype(float)
data['price'] = data['price'].fillna(data['price'].mean())
data['price'].isna().sum()

0

In [47]:
# Create total_length as the product of the course_effort and course_length columns.
data['total_length'] = data['course_effort'].mul(data['course_length'])
data

Unnamed: 0,course_url,course_level,course_effort,course_length,course_enrollments,subject,title,rating,number_of_ratings,number_of_students,price,total_length
0,https://www.edx.org/course/the-architectural-i...,Beginner Level,5.0,10.0,415051.000000,Architecture,HarvardX: The Architectural Imagination,4.700000,51.000000,565890,249.0,50.0
1,https://www.edx.org/course/the-path-to-happiness,Beginner Level,2.0,13.0,29623.000000,Social Sciences,HarvardX: The Path to Happiness: What Chinese ...,4.700000,30.000000,130686,149.0,26.0
2,https://www.edx.org/course/pyramids-of-giza-an...,Beginner Level,4.0,8.0,127414.000000,Art & Culture,HarvardX: Pyramids of Giza: Ancient Egyptian A...,4.506679,21.236641,178825,219.0,32.0
3,https://www.edx.org/course/us-public-policy-so...,Beginner Level,4.0,4.0,46472.000000,Social Sciences,"HarvardX: U.S. Public Policy: Social, Economic...",4.900000,36.000000,81922,149.0,16.0
4,https://www.edx.org/course/women-making-histor...,Beginner Level,3.0,8.0,40788.000000,History,"HarvardX: Women Making History: Ten Objects, M...",5.000000,5.000000,68103,209.0,24.0
...,...,...,...,...,...,...,...,...,...,...,...,...
3069,https://www.edx.org/course/estadistica-aplicad...,Beginner Level,6.0,5.0,25537.000000,Development,GalileoX: Estadística Aplicada a los Negocios,4.506679,21.236641,30848,39.0,30.0
3072,https://www.edx.org/course/data-literacy-found...,Intermediate Level,8.0,4.0,51943.173364,Development,RITx: Data Literacy Foundations,4.506679,21.236641,7158,249.0,32.0
3075,https://www.edx.org/course/data-processing-and...,Intermediate Level,8.0,4.0,51943.173364,Development,RITx: Data Processing and Analysis with Excel,3.100000,9.000000,36528,249.0,32.0
3076,https://www.edx.org/course/data-representation...,Intermediate Level,8.0,4.0,51943.173364,Development,RITx: Data Representation and Visualization in...,4.506679,21.236641,15559,249.0,32.0


In [48]:
# Remove course_effort and course_length from the frame, then display it.
data = data.drop(columns=['course_effort', 'course_length'])
data

Unnamed: 0,course_url,course_level,course_enrollments,subject,title,rating,number_of_ratings,number_of_students,price,total_length
0,https://www.edx.org/course/the-architectural-i...,Beginner Level,415051.000000,Architecture,HarvardX: The Architectural Imagination,4.700000,51.000000,565890,249.0,50.0
1,https://www.edx.org/course/the-path-to-happiness,Beginner Level,29623.000000,Social Sciences,HarvardX: The Path to Happiness: What Chinese ...,4.700000,30.000000,130686,149.0,26.0
2,https://www.edx.org/course/pyramids-of-giza-an...,Beginner Level,127414.000000,Art & Culture,HarvardX: Pyramids of Giza: Ancient Egyptian A...,4.506679,21.236641,178825,219.0,32.0
3,https://www.edx.org/course/us-public-policy-so...,Beginner Level,46472.000000,Social Sciences,"HarvardX: U.S. Public Policy: Social, Economic...",4.900000,36.000000,81922,149.0,16.0
4,https://www.edx.org/course/women-making-histor...,Beginner Level,40788.000000,History,"HarvardX: Women Making History: Ten Objects, M...",5.000000,5.000000,68103,209.0,24.0
...,...,...,...,...,...,...,...,...,...,...
3069,https://www.edx.org/course/estadistica-aplicad...,Beginner Level,25537.000000,Development,GalileoX: Estadística Aplicada a los Negocios,4.506679,21.236641,30848,39.0,30.0
3072,https://www.edx.org/course/data-literacy-found...,Intermediate Level,51943.173364,Development,RITx: Data Literacy Foundations,4.506679,21.236641,7158,249.0,32.0
3075,https://www.edx.org/course/data-processing-and...,Intermediate Level,51943.173364,Development,RITx: Data Processing and Analysis with Excel,3.100000,9.000000,36528,249.0,32.0
3076,https://www.edx.org/course/data-representation...,Intermediate Level,51943.173364,Development,RITx: Data Representation and Visualization in...,4.506679,21.236641,15559,249.0,32.0


In [49]:
# Write the cleaned frame to 'cleaned_edx.csv'; index=False leaves the row index out of the file.
data.to_csv('cleaned_edx.csv', index=False)