## Import libraries

In [1]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import types as sql_types
from emagapi.connect import Courses

## Database connection

In [2]:
def connection():
    db_user = 'bc0e0e4f733dda'
    db_password = '00f61efe'
    db_name = 'heroku_8149febc614deb5'
    db_host = 'eu-cdbr-west-02.cleardb.net'

    return create_engine('mysql+pymysql://{}:{}@{}/{}'.format(db_user,
                                                              db_password,
                                                              db_host,
                                                              db_name))

## Retrieving leads
I will get a sample of leads from Emagister database gererated in the last year. The sample table `leads` contains 25000 leads.

In [3]:
leads_query = '''SELECT user_id,
    course_id,
    course_title,
    center,
    created_on
FROM leads
ORDER BY created_on DESC
'''

leads_df = pd.read_sql_query(leads_query, con=connection())

In [4]:
leads_df.head()

Unnamed: 0,user_id,course_id,course_title,center,created_on
0,faf61312a2da38d95051130f9a9162f2,170641938,Level 3 Diploma in Counselling and Psychotherapy,One Education,2019-12-08 18:50:22
1,0fea90bc02208acef1ed8d68ff87fa64,170660881,MBA - Finance & Accounting,IUBH Online,2019-12-08 18:47:12
2,08515f84f44d7a52fa053ee8918952f6,170663641,Master of computer science,IUBH Online,2019-12-08 18:30:52
3,fdc6ae1b37c9a3b5855f981cbd5aca84,170439892,Carpentry Apprenticeship,Birmingham Metropolitan College,2019-12-08 18:24:45
4,3125ddd787800a6b45f71f722089421c,170641213,Personal Leadership and Success,Columbia Business School Executive Education,2019-12-08 18:22:52


In [5]:
leads_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 5 columns):
user_id         25000 non-null object
course_id       25000 non-null int64
course_title    25000 non-null object
center          25000 non-null object
created_on      25000 non-null datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 976.6+ KB


### Search for missing values

In [6]:
leads_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 5 columns):
user_id         25000 non-null object
course_id       25000 non-null int64
course_title    25000 non-null object
center          25000 non-null object
created_on      25000 non-null datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 976.6+ KB


There are no missing values in this dataframe

### Search for duplicated
Duplicated leads, in this case is when a user generates more than one lead into the same course

In [7]:
leads_df.duplicated(['user_id', 'course_id']).sum()

92

I will remove the duplicates keeping the first lead generated by the user, that's the one that was created earlier.

In [8]:
leads_df.drop_duplicates(['user_id', 'course_id'], inplace=True, keep='last')

In [9]:
leads_df.duplicated(['user_id', 'course_id']).sum()

0

### Save clean leads

In [10]:
types = {'user_id': sql_types.CHAR(length=36),
         'course_id': sql_types.INT(),
         'course_title': sql_types.TEXT(),
         'center': sql_types.VARCHAR(length=100),
         'created_on': sql_types.DateTime()}

leads_df.to_sql('clean_leads', con=connection(), index=False, if_exists='replace', dtype=types)

## Retrieving reviews
The reviews come from the Emagister database and are a sample of those that have been made in the last 18 months.

In [11]:
reviews_query = '''SELECT user_id,
    course_id,
    course_title,
    center,
    rating,
    created_on
FROM reviews
ORDER BY created_on DESC
'''

reviews_df = pd.read_sql_query(reviews_query, con=connection())

In [12]:
reviews_df.head()

Unnamed: 0,user_id,course_id,course_title,center,rating,created_on
0,427c2efbe90b6cbd333629a349b12cee,170628943,Bachelor in Aviation Management,IUBH University of Applied Sciences,10,2019-12-08 12:34:57
1,d64790132660cf9f5c1269782f5edd1e,170663414,International relations – Master’s Degree Prog...,Vistula University,6,2019-12-07 22:39:20
2,36bc70187533012825175d3d1950ff47,170568616,Teaching English to Speakers of Other Language...,Bath Spa University,8,2019-12-07 16:54:15
3,2ad535742bf54cec3b3ddb4a8a87ce9b,170230799,Central Heating Wiring & Controls e-Course,CTS Consultants Ltd,2,2019-12-07 15:06:27
4,90979704ad6d54ba451d0387a5eb8547,170663653,Managing Projects Effectively,Frankfurt School of Finance & Management,10,2019-12-06 08:05:05


### Search for missing values

In [13]:
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19489 entries, 0 to 19488
Data columns (total 6 columns):
user_id         19489 non-null object
course_id       19489 non-null int64
course_title    19489 non-null object
center          19489 non-null object
rating          19489 non-null int64
created_on      19489 non-null datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(3)
memory usage: 913.6+ KB


There are no missing values in this dataframe

### Search for outliers in `rating` column

In [14]:
reviews_df['rating'].describe()

count    19489.000000
mean         9.296731
std          1.141179
min          2.000000
25%          8.000000
50%         10.000000
75%         10.000000
max         10.000000
Name: rating, dtype: float64

There seems to be no outliers

### Search for duplicated rows

In [15]:
reviews_df.duplicated(['user_id', 'course_id']).sum()

184

There are 184 duplicated reviews. I will remove them keeping the first review made by the user.

In [16]:
reviews_df.drop_duplicates(['user_id', 'course_id'], inplace=True, keep='last')

In [17]:
reviews_df.duplicated(['user_id', 'course_id']).sum()

0

In [18]:
types = {'user_id': sql_types.CHAR(length=36),
         'course_id': sql_types.INT(),
         'course_title': sql_types.TEXT(),
         'center': sql_types.VARCHAR(length=100),
         'rating': sql_types.INT(),
         'created_on': sql_types.DateTime()}

reviews_df.to_sql('clean_reviews', con=connection(), index=False, if_exists='replace', dtype=types)

## Retrieving courses
I will retrieve a sample of 25000 courses from [Emagister API](https://github.com/fdelgados/EmagisterAPI).

In [19]:
courses_api = Courses(country='uk', page_size=100)

subset = {'course_id': 'id',
          'title': 'name',
          'description': 'description',
          'center': 'center_name',
          'skills': 'skills',
          'type': 'type',
          'price': 'price',
          'start_date': 'start_date',
          'flexible_start': 'flexible_start_date',
          'avg_rating': 'rating'}

courses = courses_api.get(subset, max_records=25000)

courses_df = pd.DataFrame(courses, columns=subset.keys())

GET: https://www.emagister.co.uk/api/1.0/courses?page=1&size=100 [200]
Records: 100/25000
GET: https://www.emagister.co.uk/api/1.0/courses?page=2&size=100 [200]
Records: 200/25000
GET: https://www.emagister.co.uk/api/1.0/courses?page=3&size=100 [200]
Records: 300/25000
GET: https://www.emagister.co.uk/api/1.0/courses?page=4&size=100 [200]
Records: 400/25000
GET: https://www.emagister.co.uk/api/1.0/courses?page=5&size=100 [200]
Records: 500/25000
GET: https://www.emagister.co.uk/api/1.0/courses?page=6&size=100 [200]
Records: 600/25000
GET: https://www.emagister.co.uk/api/1.0/courses?page=7&size=100 [200]
Records: 700/25000
GET: https://www.emagister.co.uk/api/1.0/courses?page=8&size=100 [200]
Records: 800/25000
GET: https://www.emagister.co.uk/api/1.0/courses?page=9&size=100 [200]
Records: 900/25000
GET: https://www.emagister.co.uk/api/1.0/courses?page=10&size=100 [200]
Records: 1000/25000
GET: https://www.emagister.co.uk/api/1.0/courses?page=11&size=100 [200]
Records: 1100/25000
GET: h

GET: https://www.emagister.co.uk/api/1.0/courses?page=91&size=100 [200]
Records: 9100/25000
GET: https://www.emagister.co.uk/api/1.0/courses?page=92&size=100 [200]
Records: 9200/25000
GET: https://www.emagister.co.uk/api/1.0/courses?page=93&size=100 [200]
Records: 9300/25000
GET: https://www.emagister.co.uk/api/1.0/courses?page=94&size=100 [200]
Records: 9400/25000
GET: https://www.emagister.co.uk/api/1.0/courses?page=95&size=100 [200]
Records: 9500/25000
GET: https://www.emagister.co.uk/api/1.0/courses?page=96&size=100 [200]
Records: 9600/25000
GET: https://www.emagister.co.uk/api/1.0/courses?page=97&size=100 [200]
Records: 9700/25000
GET: https://www.emagister.co.uk/api/1.0/courses?page=98&size=100 [200]
Records: 9800/25000
GET: https://www.emagister.co.uk/api/1.0/courses?page=99&size=100 [200]
Records: 9900/25000
GET: https://www.emagister.co.uk/api/1.0/courses?page=100&size=100 [200]
Records: 10000/25000
GET: https://www.emagister.co.uk/api/1.0/courses?page=101&size=100 [200]
Recor

GET: https://www.emagister.co.uk/api/1.0/courses?page=179&size=100 [200]
Records: 17900/25000
GET: https://www.emagister.co.uk/api/1.0/courses?page=180&size=100 [200]
Records: 18000/25000
GET: https://www.emagister.co.uk/api/1.0/courses?page=181&size=100 [200]
Records: 18100/25000
GET: https://www.emagister.co.uk/api/1.0/courses?page=182&size=100 [200]
Records: 18200/25000
GET: https://www.emagister.co.uk/api/1.0/courses?page=183&size=100 [200]
Records: 18300/25000
GET: https://www.emagister.co.uk/api/1.0/courses?page=184&size=100 [200]
Records: 18400/25000
GET: https://www.emagister.co.uk/api/1.0/courses?page=185&size=100 [200]
Records: 18500/25000
GET: https://www.emagister.co.uk/api/1.0/courses?page=186&size=100 [200]
Records: 18600/25000
GET: https://www.emagister.co.uk/api/1.0/courses?page=187&size=100 [200]
Records: 18700/25000
GET: https://www.emagister.co.uk/api/1.0/courses?page=188&size=100 [200]
Records: 18800/25000
GET: https://www.emagister.co.uk/api/1.0/courses?page=189&si

In [22]:
courses_df.head()

Unnamed: 0,course_id,title,description,center,skills,type,price,start_date,flexible_start,avg_rating
0,170648149,Advanced Sales Management,Discover new tools for turning the art of mana...,Salessense,"[Sales Manager, Sales Management, Sales Manage...",Course,"£5,995",,True,10.0
1,170646268,Storytelling Skills (2 day course),This two-day Storytelling course has been desi...,PTP Training & Marketing Ltd,"[Industry, Education, Structure, Preparation, ...",Short course,"£9,000",,False,10.0
2,170040246,"International Commercial Contracts School, Dubai",This specialist five-day seminar running in Du...,Falconbury Ltd,"[Contract Law, International Relations, Busine...",Course,"£2,599",2020-09-01,False,10.0
3,170385873,Massage Therapy,Do you want to learn more about massage therap...,International Career Institute,"[Massage, Reflexology, Massage Therapy, Indust...",Course,£798,,True,9.0
4,170601656,Java Programming Beginner's Course,Are you ready to introduce yourself in the pro...,PCWorkshops,"[Decision Making, Object oriented training, Ja...",Short course,£750,,True,10.0


### Search for missing values

In [23]:
courses_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 10 columns):
course_id         25000 non-null object
title             25000 non-null object
description       25000 non-null object
center            25000 non-null object
skills            25000 non-null object
type              25000 non-null object
price             25000 non-null object
start_date        8352 non-null object
flexible_start    25000 non-null bool
avg_rating        25000 non-null float64
dtypes: bool(1), float64(1), object(8)
memory usage: 1.7+ MB


Only `start_date` column has missing values. When a course has no start date, it is for one of the following reasons:
* It is a self-paced course and starts at the time the registration is formalized (`flexible_start = False`).
* There are several registration dates per year (`flexible_start = True`).

I will use this column to sort the courses, so I will convert the `start_date` column to datetime format.

In [24]:
courses_df['start_date'] = pd.to_datetime(courses_df.start_date)

In [25]:
courses_df.head()

Unnamed: 0,course_id,title,description,center,skills,type,price,start_date,flexible_start,avg_rating
0,170648149,Advanced Sales Management,Discover new tools for turning the art of mana...,Salessense,"[Sales Manager, Sales Management, Sales Manage...",Course,"£5,995",NaT,True,10.0
1,170646268,Storytelling Skills (2 day course),This two-day Storytelling course has been desi...,PTP Training & Marketing Ltd,"[Industry, Education, Structure, Preparation, ...",Short course,"£9,000",NaT,False,10.0
2,170040246,"International Commercial Contracts School, Dubai",This specialist five-day seminar running in Du...,Falconbury Ltd,"[Contract Law, International Relations, Busine...",Course,"£2,599",2020-09-01,False,10.0
3,170385873,Massage Therapy,Do you want to learn more about massage therap...,International Career Institute,"[Massage, Reflexology, Massage Therapy, Indust...",Course,£798,NaT,True,9.0
4,170601656,Java Programming Beginner's Course,Are you ready to introduce yourself in the pro...,PCWorkshops,"[Decision Making, Object oriented training, Ja...",Short course,£750,NaT,True,10.0


### Search for duplicated rows

In [26]:
courses_df.duplicated('course_id').sum()

0

There are no duplicated courses

### Search for outliers
Search for outliers in the `avg_rating` column

In [27]:
courses_df['avg_rating'].describe()

count    25000.000000
mean         2.064209
std          3.878249
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max         10.000000
Name: avg_rating, dtype: float64

The minimum rating is 1, so 0-rated courses have not actually been rated. This column will be used to order the courses, so I will not change the value.

### Categorize price
There are too many unique values, so the best option seems to convert these values to a range to make them more manageable. Also, values in `price` column are a mix of categorical and continuous data.

In [28]:
courses_df.groupby('price')['price'].unique()

price
&pound; 1001-2000                  [&pound; 1001-2000]
&pound; 101-200                      [&pound; 101-200]
&pound; 2001-3000                  [&pound; 2001-3000]
&pound; 201-500                      [&pound; 201-500]
&pound; 3001-4000                  [&pound; 3001-4000]
&pound; 4001-5000                  [&pound; 4001-5000]
&pound; 5001-6000                  [&pound; 5001-6000]
&pound; 501-1000                    [&pound; 501-1000]
&pound; 6001-7000                  [&pound; 6001-7000]
&pound; 7001-8000                  [&pound; 7001-8000]
&pound; 8001-9000                  [&pound; 8001-9000]
Free                                            [Free]
Higher than &pound; 9000    [Higher than &pound; 9000]
Price on request                    [Price on request]
Up to &pound; 100                  [Up to &pound; 100]
£1                                                [£1]
£1,000                                        [£1,000]
£1,009                                        [£1,009]
£1,0

There are price ranges as value, so I will take them as a reference to group the prices into those ranges.
* Free
* Up to &pound; 100
* &pound; 101-200
* &pound; 201-500
* &pound; 501-1000
* &pound; 1001-2000
* &pound; 2001-3000
* &pound; 3001-4000
* &pound; 4001-5000
* &pound; 5001-6000
* &pound; 6001-7000
* &pound; 7001-8000
* &pound; 8001-9000
* Higher than &pound; 9000
* Price on request

First of all, this column needs some cleaning operations

In [29]:
def clean_price(price):
    price = price.replace('&pound; ', '')
    price = price.replace('Higher than ', '>')
    price = price.replace('Up to ', '<=')
    price = price.replace('£', '')
    
    return price

In [30]:
courses_df['price'] = courses_df['price'].apply(clean_price)

In [31]:
courses_df.groupby('price')['price'].unique()

price
1                                  [1]
1,000                          [1,000]
1,009                          [1,009]
1,013                          [1,013]
1,020                          [1,020]
1,025                          [1,025]
1,030                          [1,030]
1,034                          [1,034]
1,040                          [1,040]
1,048                          [1,048]
1,050                          [1,050]
1,056                          [1,056]
1,060                          [1,060]
1,068                          [1,068]
1,070                          [1,070]
1,071                          [1,071]
1,074                          [1,074]
1,080                          [1,080]
1,083                          [1,083]
1,090                          [1,090]
1,095                          [1,095]
1,096                          [1,096]
1,098                          [1,098]
1,099                          [1,099]
1,100                          [1,100]
1,120              

There seems to be some outlier, specifically £1 and £99,999

In [32]:
courses_df[courses_df['price'] == '1']

Unnamed: 0,course_id,title,description,center,skills,type,price,start_date,flexible_start,avg_rating
23000,170658682,Design Management MA,Our course will help you to successfully manag...,Northumbria University,[Design],Master,1,2019-09-01,False,0.0


![Design Management MA](../img/pound1.png)

In [33]:
courses_df[courses_df['price'] == '99,999']

Unnamed: 0,course_id,title,description,center,skills,type,price,start_date,flexible_start,avg_rating
15762,170405488,Master's Degree in Railway Systems,"As a fast and economic mean of transportation,...",Universidad Pontificia Comillas,"[Market, Civil Engineering, Economics, Transpo...",Master,99999,NaT,False,0.0
15763,170405489,Master's Degree in the Electric Power Industry...,"Not only in Spain, but globally, the Electric ...",Universidad Pontificia Comillas,"[Retail, Market, Renewable Energy, Engineering...",Master,99999,NaT,False,0.0


![Master's Degree in Railway Systems](../img/pound99999_1.png)
![Master's Degree in the Electric Power Industry (MEPI)](../img/pound99999_2.png)
After all, they are real prices.

Now I will group prices into the price ranges

In [34]:
def categorize_price(price):
    non_numeric = ['<=100', '>9000', 'Free', 'Price on request']
    price_ranges = ['101-200',
                    '201-500',
                    '501-1000',
                    '1001-2000',
                    '2001-3000',
                    '3001-4000',
                    '4001-5000',
                    '5001-6000',
                    '6001-7000',
                    '7001-8000',
                    '8001-9000']
    
    if price in non_numeric:
        return price
    
    if price in price_ranges:
        return price
    
    formatted_price = float(price.replace(',', ''))
    
    if formatted_price <= 100:
        return '<=100'
    
    if formatted_price > 9000:
        return '>9000'
    
    for price_range in price_ranges:
        bounds = price_range.split('-')
        min_price = float(bounds[0])
        max_price = float(bounds[1])
        
        if formatted_price >= min_price and formatted_price <= max_price:
            return price_range
        

In [35]:
courses_df['price_range'] = courses_df['price'].apply(categorize_price)

In [36]:
courses_df.head()

Unnamed: 0,course_id,title,description,center,skills,type,price,start_date,flexible_start,avg_rating,price_range
0,170648149,Advanced Sales Management,Discover new tools for turning the art of mana...,Salessense,"[Sales Manager, Sales Management, Sales Manage...",Course,5995,NaT,True,10.0,5001-6000
1,170646268,Storytelling Skills (2 day course),This two-day Storytelling course has been desi...,PTP Training & Marketing Ltd,"[Industry, Education, Structure, Preparation, ...",Short course,9000,NaT,False,10.0,8001-9000
2,170040246,"International Commercial Contracts School, Dubai",This specialist five-day seminar running in Du...,Falconbury Ltd,"[Contract Law, International Relations, Busine...",Course,2599,2020-09-01,False,10.0,2001-3000
3,170385873,Massage Therapy,Do you want to learn more about massage therap...,International Career Institute,"[Massage, Reflexology, Massage Therapy, Indust...",Course,798,NaT,True,9.0,501-1000
4,170601656,Java Programming Beginner's Course,Are you ready to introduce yourself in the pro...,PCWorkshops,"[Decision Making, Object oriented training, Ja...",Short course,750,NaT,True,10.0,501-1000


In [37]:
courses_df['price_range'].value_counts()

Price on request    11220
<=100                4116
201-500              2444
>9000                2077
501-1000             1442
101-200              1114
1001-2000             526
3001-4000             501
2001-3000             375
4001-5000             290
6001-7000             232
8001-9000             183
5001-6000             172
7001-8000             156
Free                  152
Name: price_range, dtype: int64

In [38]:
# Remove price column
courses_df.drop('price', axis=1, inplace=True)

### Remove square brackets in `skills` column

In [39]:
courses_df['skills'] = courses_df['skills'].astype(str).str.strip('[]')

In [40]:
courses_df.head()

Unnamed: 0,course_id,title,description,center,skills,type,start_date,flexible_start,avg_rating,price_range
0,170648149,Advanced Sales Management,Discover new tools for turning the art of mana...,Salessense,"'Sales Manager', 'Sales Management', 'Sales Ma...",Course,NaT,True,10.0,5001-6000
1,170646268,Storytelling Skills (2 day course),This two-day Storytelling course has been desi...,PTP Training & Marketing Ltd,"'Industry', 'Education', 'Structure', 'Prepara...",Short course,NaT,False,10.0,8001-9000
2,170040246,"International Commercial Contracts School, Dubai",This specialist five-day seminar running in Du...,Falconbury Ltd,"'Contract Law', 'International Relations', 'Bu...",Course,2020-09-01,False,10.0,2001-3000
3,170385873,Massage Therapy,Do you want to learn more about massage therap...,International Career Institute,"'Massage', 'Reflexology', 'Massage Therapy', '...",Course,NaT,True,9.0,501-1000
4,170601656,Java Programming Beginner's Course,Are you ready to introduce yourself in the pro...,PCWorkshops,"'Decision Making', 'Object oriented training',...",Short course,NaT,True,10.0,501-1000


In [41]:
courses_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 10 columns):
course_id         25000 non-null object
title             25000 non-null object
description       25000 non-null object
center            25000 non-null object
skills            25000 non-null object
type              25000 non-null object
start_date        8352 non-null datetime64[ns]
flexible_start    25000 non-null bool
avg_rating        25000 non-null float64
price_range       25000 non-null object
dtypes: bool(1), datetime64[ns](1), float64(1), object(7)
memory usage: 1.7+ MB


### Save courses

In [42]:
types={'course_id': sql_types.INT(), 
       'title':  sql_types.TEXT(),
       'description': sql_types.TEXT(),
       'center': sql_types.VARCHAR(length=100),
       'skills': sql_types.TEXT(),
       'type': sql_types.VARCHAR(length=40),
       'start_date': sql_types.DateTime(),
       'flexible_start': sql_types.BOOLEAN(),
       'price_range': sql_types.VARCHAR(length=20)}

courses_df.to_sql('courses', con=connection(), index=False, if_exists='replace', dtype=types)

## Create a courses dataframe

I will create a dataframe with all courses. Columns will be: `course_id`, `course_title` and `center`.

In [43]:
courses_df[['course_id', 'title', 'center']]

Unnamed: 0,course_id,title,center
0,170648149,Advanced Sales Management,Salessense
1,170646268,Storytelling Skills (2 day course),PTP Training & Marketing Ltd
2,170040246,"International Commercial Contracts School, Dubai",Falconbury Ltd
3,170385873,Massage Therapy,International Career Institute
4,170601656,Java Programming Beginner's Course,PCWorkshops
5,170628400,MCSA: SQL Server 2016 - Database Administration,Training Square
6,170630042,Short Term General English,Twin
7,170628762,Master of Advanced Studies in Interaction Design,Master Of Advanced Studies in Interaction Desi...
8,170345323,NLP+® Multi Certification Practitioner Training,The Taylored Life Company Limited
9,170633067,Postgraduate Pathway in Environmental Science ...,London Brunel International College


In [46]:
reviews_courses = reviews_df[~reviews_df['course_id'].isin(courses_df['course_id'])].drop_duplicates('course_id')[['course_id', 'course_title', 'center']]

In [47]:
leads_courses = leads_df[~leads_df['course_id'].isin(courses_df['course_id'])].drop_duplicates('course_id')[['course_id', 'course_title', 'center']]

In [48]:
missing_courses = pd.concat([reviews_courses, leads_courses]).drop_duplicates().reset_index(drop=True)

In [51]:
missing_courses.rename(columns={'course_title': 'title'}, inplace=True)

In [52]:
all_courses = pd.concat([courses_df[['course_id', 'title', 'center']], missing_courses]).drop_duplicates().reset_index(drop=True)

In [53]:
all_courses.shape

(33107, 3)

In [54]:
all_courses.to_csv('../data/clean/all_courses.csv', index=False)