In [2]:
import os
import sys

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# YOUR CODE HERE (OPTION) 
# If you need other libraries, you can import them here
import seaborn as sns

In [3]:
# Load the raw table from the comma-separated, UTF-8 encoded CSV file.
raw_df = pd.read_csv(
    '../Data/raw_data.csv',
    encoding='utf-8',
    engine='python',
    sep=',',
)

In [3]:
# Preview the first five rows to see what each column looks like.
raw_df.head(n=5)

Unnamed: 0,name,enrollment,language,rating,level,duration,description,skill,instructor,offered by
0,Integrated and Strategic Communication Campaigns,,Taught in English,,Beginner level,6 hours (approximately),Understand what integrated communication campa...,"Strategic Management, Marketing Communications...",None {'Erin Willis'},University of Colorado Boulder
1,The Science of Success: What Researchers Know ...,"173,770 already enrolled",Taught in English,4.8,Beginner level,Approx. 11 hours to complete,Understand how to get better results at work a...,"Personal Branding, Planning, Social Capital, S...",4.8 {'Paula Caproni'},University of Michigan
2,Microsoft Azure Databricks for Data Engineering,"12,942 already enrolled",Taught in English,4.4,Intermediate level,21 hours (approximately),How to work with large amounts of data from mu...,"Microsoft Azure, Information Engineering, Data...",4.2 {' Microsoft'},Microsoft
3,Thinking About Care,,Taught in English,,Beginner level,7 hours (approximately),Develop a deeper understanding of careCritical...,,None {'Don Grant'},University of Colorado Boulder
4,Finance for Everyone: Debt,"15,586 already enrolled",Taught in English,4.8,,11 hours (approximately),,,4.7 {'Arshad Ahmad'},McMaster University


In [4]:
# (rows, columns) of the frame as loaded, before any filtering.
raw_df.shape

(2984, 10)

Drop all rows with a missing rating

In [5]:
# Remove, in place, every row whose `rating` is missing,
# then display the shape that remains.
raw_df.dropna(axis=0, how='any', subset=['rating'], inplace=True)
raw_df.shape

(2248, 10)

In [6]:
# Index labels of the rows whose `language` text does not mention English.
# na=False makes a missing language count as "not English"; without it
# str.contains yields NaN for such rows and the `~` negation raises TypeError.
is_english = raw_df['language'].str.contains('English', na=False)
drop_index = raw_df.loc[~is_english].index

In [7]:
# Remove the rows labelled by drop_index, modifying raw_df in place.
raw_df.drop(labels=drop_index, axis='index', inplace=True)

In [8]:
# Display the frame after the filtering steps (pandas renders a truncated preview).
raw_df

Unnamed: 0,name,enrollment,language,rating,level,duration,description,skill,instructor,offered by
1,The Science of Success: What Researchers Know ...,"173,770 already enrolled",Taught in English,4.8,Beginner level,Approx. 11 hours to complete,Understand how to get better results at work a...,"Personal Branding, Planning, Social Capital, S...",4.8 {'Paula Caproni'},University of Michigan
2,Microsoft Azure Databricks for Data Engineering,"12,942 already enrolled",Taught in English,4.4,Intermediate level,21 hours (approximately),How to work with large amounts of data from mu...,"Microsoft Azure, Information Engineering, Data...",4.2 {' Microsoft'},Microsoft
4,Finance for Everyone: Debt,"15,586 already enrolled",Taught in English,4.8,,11 hours (approximately),,,4.7 {'Arshad Ahmad'},McMaster University
7,Basics of Statutory Compliance and Taxation,"3,123 already enrolled",Taught in English,4.8,Intermediate level,46 hours (approximately),,"Financial Accounting, TDS, Export and Import, ...",None {'Tally Education Pvt. Ltd'},Tally Education
9,Preventing Chronic Pain: A Human Systems Approach,"33,519 already enrolled",Taught in English,4.7,Beginner level,Approx. 44 hours to complete,,"Chronic Pain Management, Plan, Pain Management...","4.8 {'Dr. James Fricton, DDS, MS'}",University of Minnesota
...,...,...,...,...,...,...,...,...,...,...
2976,Introduction to Participatory Approaches in Pu...,"1,651 already enrolled",Taught in English,4.4,Beginner level,18 hours (approximately),,"Public Participation, Participatory Action Res...",4.7 {'Helen Ward '},Imperial College London
2977,Introducing Security: Aligning Asset and Risk ...,"7,815 already enrolled",Taught in English,4.6,Beginner level,7 hours (approximately),,"Asset, Risk Management",4.7 {'(ISC)Â² Education & Training'},ISC2
2978,Academic Information Seeking,"28,386 already enrolled",Taught in English,4.7,,Approx. 6 hours to complete,,,"4.7 {'Thomas Skov Jensen ', 'Birgitte Munk', ...",Technical University of Denmark (DTU)
2979,A Blueprint for Success â Your Video Pre-Pro...,"5,438 already enrolled",Taught in English,4.5,Beginner level,11 hours (approximately),Identify equipment and software needs to creat...,"Video Editing, Linear Editing, Video Productio...","4.7 {'Emilie Johnson', 'Paul Daugherty'}",University of Colorado Boulder


Check for duplicate rows

In [9]:
# Flag every row that repeats an earlier row in all columns, then report
# whether at least one such row exists.
# NOTE(review): this only reports duplicates; no visible cell calls
# drop_duplicates, so repeated rows appear to reach the exported CSV. Confirm
# whether that is intended.
duplicate_mask = raw_df.duplicated()
is_duplicate = duplicate_mask.any()
is_duplicate

True

In [10]:
# Column dtypes and non-null counts before any type conversion.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1762 entries, 1 to 2983
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         1762 non-null   object
 1   enrollment   1688 non-null   object
 2   language     1762 non-null   object
 3   rating       1762 non-null   object
 4   level        1423 non-null   object
 5   duration     1762 non-null   object
 6   description  756 non-null    object
 7   skill        1003 non-null   object
 8   instructor   1759 non-null   object
 9   offered by   1759 non-null   object
dtypes: object(10)
memory usage: 151.4+ KB


In [11]:
# Parse strings such as "173,770 already enrolled" into a nullable integer.
#
# The pattern takes the first run of digits with optional thousands separators,
# so "532 ..." and "1,234,567 ..." are captured in full. A pattern that requires
# exactly one comma would drop the former and truncate the latter to 1,234.
# Rows with no digits (including missing values) become <NA>.
enrollment_text = raw_df['enrollment'].str.extract(r'(\d[\d,]*)', expand=False)
raw_df['enrollment'] = pd.to_numeric(
    enrollment_text.str.replace(',', '', regex=False),
    errors='coerce',  # anything unparsable becomes missing instead of raising
).astype('Int64')

In [12]:
# `rating` was read as text; store it as 64-bit floating point numbers.
raw_df['rating'] = raw_df['rating'].astype('float64')

In [13]:
# Overwrite every value in `language` with the single label 'English'
# (earlier cells kept only rows whose language text contains 'English').
raw_df['language'] = 'English'

In [14]:
# List the distinct values of `level`, including NaN for missing entries.
raw_df['level'].unique()

array(['Beginner level', 'Intermediate level', nan, 'Advanced level'],
      dtype=object)

- The `level` column currently has three levels; we will fill missing values with `Other`.
- Remove the word 'level' from the values.
- Then check for missing values in this column.

In [15]:
# Normalise `level`: missing values become 'Other', the literal word 'level'
# is removed, and the whitespace it leaves behind is stripped so the labels
# read 'Beginner' rather than 'Beginner '.
raw_df['level'] = (
    raw_df['level']
    .fillna('Other')
    .str.replace('level', '', regex=False)
    .str.strip()
)
# Count of remaining missing values in `level`.
raw_df['level'].isnull().sum()

0

Extract the numeric value of `duration`

In [16]:
# Keep the first run of digits in each duration string
# (e.g. "Approx. 11 hours to complete" -> 11) and store it as an integer.
# The pattern is a raw string so that `\d` is not an invalid escape sequence;
# expand=False returns a Series instead of a one-column DataFrame.
# astype(int) raises if any row contains no digits at all.
duration_digits = raw_df['duration'].str.extract(r'(\d+)', expand=False)
raw_df['duration'] = duration_digits.astype(int)

In [17]:
# Re-check dtypes and non-null counts after converting enrollment, rating, level and duration.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1762 entries, 1 to 2983
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   name         1762 non-null   object 
 1   enrollment   1688 non-null   Int64  
 2   language     1762 non-null   object 
 3   rating       1762 non-null   float64
 4   level        1762 non-null   object 
 5   duration     1762 non-null   int64  
 6   description  756 non-null    object 
 7   skill        1003 non-null   object 
 8   instructor   1759 non-null   object 
 9   offered by   1759 non-null   object 
dtypes: Int64(1), float64(1), int64(1), object(7)
memory usage: 153.1+ KB


Drop the `description` and `skill` columns

In [18]:
# Discard the two free-text columns in place.
raw_df.drop(columns=['description', 'skill'], inplace=True)

Create the `instructor_rate` column

In [19]:
# The `instructor` text looks like "4.8 {'Paula Caproni'}": a rating followed
# by a brace-wrapped list of names. Split it into two columns.
instructor_raw = raw_df['instructor']
braces_pattern = r'\{(.+)\}'

# Rating part: whatever is left once the braces and their content are removed.
rate_text = instructor_raw.str.replace(braces_pattern, '', regex=True)
raw_df['instructor_rate'] = rate_text.str.strip()

# Name part: the text inside the braces, with the single quotes removed.
names_text = instructor_raw.str.extract(braces_pattern, expand=False)
raw_df['instructor'] = names_text.str.replace("'", '')

Convert `instructor_rate` to float

In [20]:
# Convert the rating text to numbers; values that cannot be parsed
# (such as the text 'None') become NaN. The result is stored as float.
rate_numeric = pd.to_numeric(raw_df['instructor_rate'], errors='coerce')
raw_df['instructor_rate'] = rate_numeric.astype(float)


In [21]:
# raw_df.to_csv('../Data/pr_data_2.csv', index=False)

In [22]:
# Re-check the frame after dropping description/skill and adding instructor_rate.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1762 entries, 1 to 2983
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   name             1762 non-null   object 
 1   enrollment       1688 non-null   Int64  
 2   language         1762 non-null   object 
 3   rating           1762 non-null   float64
 4   level            1762 non-null   object 
 5   duration         1762 non-null   int64  
 6   instructor       1759 non-null   object 
 7   offered by       1759 non-null   object 
 8   instructor_rate  1610 non-null   float64
dtypes: Int64(1), float64(2), int64(1), object(5)
memory usage: 139.4+ KB


Fill missing values in `enrollment` and `instructor_rate` with the column median. The `enrollment` median should be cast to `int`.

In [23]:
# Impute missing values with each column's median.
# The enrollment median is truncated to an int before it is used as the fill value.
enrollment_median = int(raw_df['enrollment'].median())
rate_median = raw_df['instructor_rate'].median()

raw_df['enrollment'] = raw_df['enrollment'].fillna(enrollment_median)
raw_df['instructor_rate'] = raw_df['instructor_rate'].fillna(rate_median)


In [24]:
# Confirm that enrollment and instructor_rate have no missing values after the median fill.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1762 entries, 1 to 2983
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   name             1762 non-null   object 
 1   enrollment       1762 non-null   Int64  
 2   language         1762 non-null   object 
 3   rating           1762 non-null   float64
 4   level            1762 non-null   object 
 5   duration         1762 non-null   int64  
 6   instructor       1759 non-null   object 
 7   offered by       1759 non-null   object 
 8   instructor_rate  1762 non-null   float64
dtypes: Int64(1), float64(2), int64(1), object(5)
memory usage: 139.4+ KB


In [25]:
# Write the cleaned frame to disk without the index column.
raw_df.to_csv(path_or_buf='../Data/processed_data.csv', index=False)