### Purpose: To find similar courses between classcentral and skillup by matching keywords of skillup with the courses title of classcentral, so that we can publish popular courses on classcentral from skillup to generate leads.

In [1]:
# Import modules
import pandas as pd
import numpy as np

In [2]:
# Read in class central data
class_central_df = pd.read_excel("classCentral_26Aug.xlsx")
class_central_df.head()

Unnamed: 0,courseTitle,courseLink,courseProvider,cat,unitSold,certificatePrice,freeOrAudit,review
0,Elements of AI,https://www.classcentral.com/course/independen...,University of Helsinki and Reaktor Education v...,Computer Science\n\n\n\n\nArtificial Intelligence,31.2k,Certificate Available,Free Online Course,574 reviews
1,Machine Learning,https://www.classcentral.com/course/machine-le...,Stanford University via Coursera,Computer Science\n\n\n\n\nMachine Learning,62.3k,Paid Certificate Available,Free Online Course (Audit),369 reviews
2,Unlocking Information Security: Part Ⅰ,https://www.classcentral.com/course/edx-unlock...,Tel Aviv University via edX,Computer Science\n\n\n\n\nCybersecurity,3.5k,"5 weeks long, 4-6 hours a week",Free Online Course (Audit),151 reviews
3,Introduction to Computer Science and Programmi...,https://www.classcentral.com/course/edx-introd...,Massachusetts Institute of Technology via edX,Computer Science,16.4k,$75 Certificate Available,Free Online Course (Audit),126 reviews
4,CS50's Introduction to Computer Science,https://www.classcentral.com/course/edx-cs50-s...,Harvard University via edX,Computer Science,26.6k,$90 Certificate Available,Free Online Course (Audit),96 reviews


#### Processing, cleaning, and feature engineering:

In [3]:
# Extract subcategory and broadcategory from cat columns
class_central_df["subCat"] = class_central_df.cat.str.split("\n").str[-1]
class_central_df["broadCat"] = class_central_df.cat.str.split("\n").str[0]

# Extract digits from review
class_central_df["review"] = class_central_df.review.str.extract(r"(\d+,?)")


# Multiply by 1000 where unit sale contains "k", otherwise keep that as it is.
class_central_df["unitSold"] = np.where(class_central_df.unitSold.str.contains("k", na=False), 
        class_central_df.unitSold.str.replace("k", "").fillna(0).astype(float).multiply(1000).astype(int),
        class_central_df.unitSold).astype(int)

# Extract colaborating university from course provider column
class_central_df["colaboratingUniv"] = class_central_df.courseProvider.str.split("via").str[0].str.strip()


# Extract only course provider without colaborating university
class_central_df.courseProvider = class_central_df.courseProvider.str.split("via").str[-1].str.strip()

# Is the course is free, paid or auditable?
class_central_df["freeAuditOrPaid"] = np.where(class_central_df.freeOrAudit.str.contains("Audit"), "audit",
        np.where(class_central_df.freeOrAudit.str.contains("Free"), "free",
                np.where(class_central_df.freeOrAudit.str.contains("Paid"), "paid", class_central_df.freeOrAudit)))

# Is certificate free, paid or have some price?
class_central_df["certificatePrice"] = np.where(class_central_df.certificatePrice.str.contains("Paid"), "paid",
       np.where(class_central_df.certificatePrice.str.contains("\$"), class_central_df.certificatePrice,
                 np.where(class_central_df.certificatePrice.str.contains("Certificate Available"), "free", "na")))

# Split by certificate price, otherwise keep it as it is.
class_central_df["certificatePrice"] = np.where(class_central_df.certificatePrice.str.contains("\$"), 
         class_central_df.certificatePrice.str.split("Certificate Available").str[0], class_central_df.certificatePrice)

# Drop category column
class_central_df.drop("cat", axis=1, inplace=True)

# Sort by unit sale
class_central_df = class_central_df.sort_values("unitSold", ascending=False)

In [4]:
# Read in skillup data
skillup_df = pd.read_excel("skillUp.xlsx")
skillup_df.head()

Unnamed: 0,date,courseId,courseTitle,keyword,subTitle,courseLink,courseProvider,soldOrEnq,category,broadCategory1,...,cpdAccreditedBy,othersAsCpd,awrBodyName,awrBodyQualName,courseLevel,savings,newOfferPrice,unitSold,offerPrice,savingsPercent
0,18_Aug,277314,Animal Care and Pet First Aid - 5 Courses Bundle,animal care,Special Bundle Offer | Accredited by CPD | 13 ...,https://www.reed.co.uk/courses/animal-care-and...,Skill Up,3 students purchased this course,"Animal care, Veterinary, Animal care, Dog care",Animal care,...,,,,,,,,3,39,96
1,18_Aug,273817,Dog Trainer - 8 Courses Complete Bundle,dog trainer|dog training|dog|dog walking|raw d...,Special Bundle Offer | Accredited by CPD | 40 ...,https://www.reed.co.uk/courses/dog-trainer-8-c...,Skill Up,11 students purchased this course,"Animal care, Dog training, Animal care, Dog tr...",Animal care,...,,,,,,,,11,49,94
2,18_Aug,273884,Photography Bundle for Professional Photographer,photography|wedding photography,Special Bundle Offer | Accredited by CPD | 45 ...,https://www.reed.co.uk/courses/photography-bun...,Skill Up,5 students purchased this course,"Media and art, Photography, Recreational, Phot...",Media and art,...,,,,,,,,5,49,94
3,18_Aug,277102,BARF - Feed Your Dog A Raw Diet,dog trainer|dog training|dog|dog walking|raw d...,Accredited by CPD | 3 CPD Points | Video train...,https://www.reed.co.uk/courses/barf-feed-your-...,Skill Up,5 students purchased this course,"Animal care, Dog agility, Animal care, Dog wal...",Animal care,...,,,,,,,,5,10,95
4,18_Aug,277218,Reiki Diploma - Level 1 to Master Level Certif...,reiki,Accredited by CPD | 4 CPD Points | Video train...,https://www.reed.co.uk/courses/reiki-diploma-l...,Skill Up,10 students purchased this course,"Health & care, Alternative medicine, Reiki",Health & care,...,,,,,,,,10,10,95


In [5]:
# This function match keywords of skillup courses with the title of class central courses
def matchByKeyword(keyword):
    """Keyword: keyword in the skill up dataset."""
    
    # Search classcentral course title
    in_course_cenreal = class_central_df[class_central_df["courseTitle"].str.contains(fr"\b({keyword})\b", case=False, regex=True)]
    
    # Insert keyword
    in_course_cenreal.insert(loc=0, value=keyword, column="keyword")
    
    # Match keyword with the skill up keyword column
    match_with_skillup = skillup_df[skillup_df.keyword==keyword]
    
    # Concat if a keyword is found on both of the dataframes
    if in_course_cenreal.shape[0]>0 and match_with_skillup.shape[0]>0:
        return pd.concat([match_with_skillup, in_course_cenreal.head(10)])

In [6]:
# Call the function on array of keywords
matched = pd.concat(list(map(matchByKeyword, skillup_df.keyword.unique()))).reset_index(drop=True)

# Sort by keywords
matched = matched.sort_values(["keyword"])
matched.head(10)

  return func(self, *args, **kwargs)


Unnamed: 0,date,courseId,courseTitle,keyword,subTitle,courseLink,courseProvider,soldOrEnq,category,broadCategory1,...,unitSold,offerPrice,savingsPercent,certificatePrice,freeOrAudit,review,subCat,broadCat,colaboratingUniv,freeAuditOrPaid
390,18_Aug,277471.0,Acrylic Painting for Beginners,Acrylic Painting|painting,Accredited by CPD | 4 CPD Points | Video train...,https://www.reed.co.uk/courses/acrylic-paintin...,Skill Up,Tutor is available to students,"Construction, Painting and Decorating, Media a...",Construction,...,0,10.0,95.0,,,,,,,
392,,,The Art of Drawing and Painting,Acrylic Painting|painting,,https://www.classcentral.com/course/open2study...,Open2Study,,,,...,2000,,,na,Free Online Course,6.0,Visual Arts,Art & Design,,free
393,,,In the Studio: Postwar Abstract Painting,Acrylic Painting|painting,,https://www.classcentral.com/course/painting-8349,Coursera,,,,...,715,,,paid,Free Online Course (Audit),2.0,Art & Design,Art & Design,The Museum of Modern Art,audit
394,,,Secondary : Painting (225),Acrylic Painting|painting,,https://www.classcentral.com/course/swayam-sec...,Swayam,,,,...,76,,,paid,Free Online Course,0.0,Visual Arts,Art & Design,NIOS,free
395,,,Sr.Secondary : Painting (332),Acrylic Painting|painting,,https://www.classcentral.com/course/swayam-sr-...,Swayam,,,,...,67,,,paid,Free Online Course,0.0,Visual Arts,Art & Design,NIOS,free
391,18_Aug,277480.0,Modern Acrylic Painting Course,Acrylic Painting|painting,Accredited by CPD | 3 CPD Points | Video train...,https://www.reed.co.uk/courses/modern-acrylic-...,Skill Up,Tutor is available to students,"Construction, Painting and Decorating, Media a...",Construction,...,0,10.0,95.0,,,,,,,
385,18_Aug,277470.0,Adobe Illustrator CC Beginner,Adobe Illustrator,Accredited by CPD | 4 CPD Points | Video train...,https://www.reed.co.uk/courses/adobe-illustrat...,Skill Up,Tutor is available to students,"Design, Media and art, Illustration",Design,...,0,10.0,95.0,,,,,,,
386,18_Aug,278250.0,Complete Adobe Illustrator CC 2018,Adobe Illustrator,Accredited by CPD | 12 CPD Points | Video trai...,https://www.reed.co.uk/courses/complete-adobe-...,Skill Up,Tutor is available to students,"Media and art, Photography, Recreational, Phot...",Media and art,...,0,10.0,95.0,,,,,,,
387,18_Aug,278308.0,Adobe Illustrator CC Advanced,Adobe Illustrator,Accredited by CPD | 2 CPD Points | Video train...,https://www.reed.co.uk/courses/adobe-illustrat...,Skill Up,Tutor is available to students,"Media and art, Photography, Recreational, Phot...",Media and art,...,0,10.0,95.0,,,,,,,
388,,,Adobe Illustrator: aprende a crear presentacio...,Adobe Illustrator,,https://www.classcentral.com/course/edx-adobe-...,edX,,,,...,165,,,$15,Free Online Course (Audit),1.0,Digital Media,Art & Design,The Pontificia Universidad Javeriana,audit


#### We can see for keyword "Acrylic Painting|painting", we have 2 courses for skill up that match 4 courses from course central from different providers.

### Which are the most popular 10 courses from skill up that are in course central too?

In [7]:
top10 = matched.keyword.value_counts().head(10).reset_index().rename(columns={"index":"keyword", "keyword":"count"})
top10

Unnamed: 0,keyword,count
0,dog trainer|dog training|dog|dog walking|raw d...,21
1,Office 365,14
2,digital marketing|digital marketer,13
3,leadership|management,13
4,nutrition|diet,13
5,branding,12
6,project management,12
7,depression|counselling,12
8,Negotiation,12
9,accounting|finance,12


### How many courses does skill up have for those keywords?

In [8]:
def count_by_keyword(keyword):
    
    # Count no of skill up courses for the keyword
    skillup_count = matched[matched.keyword==keyword].query("courseProvider=='Skill Up'").shape[0]
    return pd.DataFrame({
        "keyword":keyword,
        "skillup_count":skillup_count
    }, index=[0])

In [9]:
# Apply the function to find no of courses for skill up for top 10 keywords
skilup_top10_count = pd.concat(list(map(count_by_keyword, top10.keyword))).reset_index(drop=True)

# Merge with the main data
final_df = pd.merge(top10, skilup_top10_count, on="keyword")

# Calculate competitors count
final_df["competitor_count"] = final_df["count"].sub(final_df.skillup_count)

# Rename column
final_df = final_df.rename(columns={"count":"total_count"})
final_df

Unnamed: 0,keyword,total_count,skillup_count,competitor_count
0,dog trainer|dog training|dog|dog walking|raw d...,21,13,8
1,Office 365,14,4,10
2,digital marketing|digital marketer,13,3,10
3,leadership|management,13,3,10
4,nutrition|diet,13,3,10
5,branding,12,2,10
6,project management,12,2,10
7,depression|counselling,12,2,10
8,Negotiation,12,2,10
9,accounting|finance,12,2,10


### Takeaway: Out of 21 courses for keyword "dog trainer|dog training|dog|dog walking|raw diet|pet", 13 are from skill up and 8 are from the competitors. Out of 14 "Office 365" courses, 10 are from the competitors while 4 are from the skill up.