<a href="https://colab.research.google.com/github/chidinma-godwin/course-recommendation-nlp/blob/main/course_reccomendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sentence_transformers

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Suppress the TqdmExperimentalWarning
from tqdm import TqdmExperimentalWarning
import warnings
warnings.filterwarnings('ignore', category=TqdmExperimentalWarning)

In [37]:
import re
import string
import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers import SentenceTransformer

## Data Exploration

In [67]:
base_dir = "/content/drive/MyDrive/project"

df = pd.read_csv(f"{base_dir}/courseslist.csv")
df.head()

Unnamed: 0,additional_course_detail,course_avg_rating,course_certificate,course_institution,course_is_classroom,course_is_university,course_level,course_link,course_name,course_num_rating,course_provider,course_subject,course_type,description,duration,is_all_time_best,pricing,start_date,teacher
0,Welcome to the world of programming! Learn the...,4.84906,True,,False,False,beginner,https://www.classcentral.com/course/intro-to-p...,Introduction to Programming,53,Udacity,Programming,microcredential,Master coding basics with this 17-week Udacity...,4 months 3 weeks,False,Paid Course,On-Demand,"Karl Krueger, Kelly Howard, Julia Van Cleve, J..."
1,",This course bridges the gap between introduct...",3.66667,True,Harvard University,False,True,intermediate,https://www.classcentral.com/course/python-har...,Using Python for Research,12,edX,Programming,course,Master coding basics with this 17-week Udacity...,4 months 3 weeks,False,Paid Course,On-Demand,Jukka-Pekka JP Onnela
2,This course takes Java beginners to the next l...,4.45,True,University of Alberta,False,True,intermediate,https://www.classcentral.com/course/object-ori...,Object-Oriented Design,40,Coursera,Programming,course,Master coding basics with this 17-week Udacity...,4 months 3 weeks,False,Paid Course,On-Demand,Kenny Wong
3,This Specialization builds on the success of t...,4.88462,True,University of Michigan,False,True,beginner,https://www.classcentral.com/course/python-186...,Python for Everybody,416,Coursera,Python,microcredential,Master coding basics with this 17-week Udacity...,4 months 3 weeks,False,Paid Course,On-Demand,Charles Russell Severance
4,This course will introduce the core data struc...,4.89328,True,University of Michigan,False,True,beginner,https://www.classcentral.com/course/python-dat...,Python Data Structures,16426,Coursera,Python,course,Master coding basics with this 17-week Udacity...,4 months 3 weeks,False,Paid Course,On-Demand,Charles Severance


In [68]:
df.duplicated().sum()

0

In [69]:
df = df.replace(np.nan, None)
df.isna().sum()

additional_course_detail        0
course_avg_rating               0
course_certificate              0
course_institution          20813
course_is_classroom             0
course_is_university            0
course_level                16715
course_link                     0
course_name                     0
course_num_rating               0
course_provider                 0
course_subject                  0
course_type                     0
description                     0
duration                       65
is_all_time_best                0
pricing                         0
start_date                      0
teacher                     10840
dtype: int64

In [70]:
course_subjects_df = pd.DataFrame(df.groupby('course_subject').size(), columns=["count"])
course_subjects_df = course_subjects_df.sort_values(by='count', ascending=False)
course_subjects_df

Unnamed: 0_level_0,count
course_subject,Unnamed: 1_level_1
Python,1332
Microsoft Azure,1198
Google Cloud Platform (GCP),1111
Uncategorized,1008
Javascript,812
...,...
AWS Artifact,1
Set Theory,1
Tricentis Tosca,1
ServiceNow Certified System Administrator,1


In [71]:
course_subjects_df = pd.DataFrame(df.groupby('course_level', dropna=False).size(), columns=["count"])
course_subjects_df = course_subjects_df.sort_values(by='count', ascending=False)
course_subjects_df

Unnamed: 0_level_0,count
course_level,Unnamed: 1_level_1
,16715
beginner,11844
intermediate,5625
advanced,979


In [72]:
course_subjects_df = pd.DataFrame(df.groupby('pricing').size(), columns=["count"])
course_subjects_df = course_subjects_df.sort_values(by='count', ascending=False)
course_subjects_df

Unnamed: 0_level_0,count
pricing,Unnamed: 1_level_1
Free Online Course,11498
Free Trial Available,9615
Paid Course,9502
Free Online Course (Audit),2987
Conference Talk,747
Free Certificate,525
$14.00,28
$518.00,15
$79.00,15
"$7,500.00",15


In [73]:
course_subjects_df = pd.DataFrame(df.groupby('course_is_university').size(), columns=["count"])
course_subjects_df = course_subjects_df.sort_values(by='count', ascending=False)
course_subjects_df

Unnamed: 0_level_0,count
course_is_university,Unnamed: 1_level_1
False,32392
True,2771


In [74]:
course_subjects_df = pd.DataFrame(df.groupby('course_provider').size(), columns=["count"])
course_subjects_df = course_subjects_df.sort_values(by='count', ascending=False)
course_subjects_df

Unnamed: 0_level_0,count
course_provider,Unnamed: 1_level_1
YouTube,8452
Udemy,8248
Pluralsight,4389
Coursera,3711
LinkedIn Learning,3620
...,...
Federica,1
Stepik,1
Semrush Academy,1
EMMA,1


In [75]:
df[df['course_subject'] == 'Uncategorized'].head()

Unnamed: 0,additional_course_detail,course_avg_rating,course_certificate,course_institution,course_is_classroom,course_is_university,course_level,course_link,course_name,course_num_rating,course_provider,course_subject,course_type,description,duration,is_all_time_best,pricing,start_date,teacher
478,"Introduction,Project Overview,Case Scenario,Al...",0.0,False,Stanford University,True,True,,https://www.classcentral.com/classroom/youtube...,Picking on the Same Person - Does Algorithmic ...,0,YouTube,Uncategorized,course,Stanford University offers a brief seminar exp...,35 minutes,False,Free Online Course,On-Demand,
479,"Introduction,Linear approach,landscape changes...",0.0,False,Stanford University,True,True,,https://www.classcentral.com/classroom/youtube...,"AI, Archaeology, and Archives - How Data Scien...",0,YouTube,Uncategorized,course,Stanford University offers a brief seminar exp...,35 minutes,False,Free Online Course,On-Demand,
480,"Introduction,Our Story Begins,The History of t...",0.0,False,Stanford University,True,True,,https://www.classcentral.com/classroom/youtube...,"Voices in the Code - A Story About People, The...",0,YouTube,Uncategorized,course,Stanford University offers a brief seminar exp...,35 minutes,False,Free Online Course,On-Demand,
497,Stanford Seminar: PyWren - Pushing Microservic...,0.0,False,Stanford University,True,True,,https://www.classcentral.com/classroom/youtube...,Stanford Seminar - PyWren - Pushing Microservi...,0,YouTube,Uncategorized,course,Stanford University offers a brief seminar on ...,45 minutes,False,Free Online Course,On-Demand,
498,"Stanford Seminar: Data For The People, Andreas...",0.0,False,Stanford University,True,True,,https://www.classcentral.com/classroom/youtube...,Data for the People - Andreas Weigend of Socia...,0,YouTube,Uncategorized,course,Stanford University offers a brief seminar on ...,45 minutes,False,Free Online Course,On-Demand,


## Data Preprocessing

In [76]:
# Function to create a meaningful combined course details
def create_combined_details(row):
    teacher_and_duration = ""
    if row['teacher'] and row['duration']:
        teacher_and_duration = f"The course was taught by {row['teacher']} and has a duration of {row['duration']}"
    elif row['teacher']:
        teacher_and_duration = f"The course was taught by {row['teacher']}"
    elif row['duration']:
        teacher_and_duration = f"The course has a duration of {row['duration']}"

    certificate = " You can get a certificate after completing this course" if row['course_certificate'] else ""

    course_institution = f" by {row['course_institution']}" if row['course_institution'] else ""

    course_level = f" and is classified as {row['course_level']} level" if row['course_level'] else ""

    combined_details = (
        f"""{row['course_name']}: This {re.sub(r' Course', '', row['pricing'])} course was rated by \
{row['course_num_rating']} people with an average rating of {round(row['course_avg_rating'], 1)}. {teacher_and_duration}. \
This {row['course_subject']} course is offered on {row['course_provider']}{course_institution}{course_level}.{certificate}. {row['additional_course_detail']}. {row['description']}"""
    )
    return combined_details

In [77]:
# Apply the function to each row to create the combined text
df["combined_details"] = df.apply(create_combined_details, axis=1)
df["combined_details"][0]

'Introduction to Programming: This Paid course was rated by 53 people with an average rating of 4.8. The course was taught by Karl Krueger, Kelly Howard, Julia Van Cleve, James Parkes, Richard Kalehoff, Greg C., Yodit F., Matthew R., John M., Vincenzo A. and James L. and has a duration of 4 months 3 weeks. This Programming course is offered on Udacity and is classified as beginner level. You can get a certificate after completing this course. Welcome to the world of programming! Learn the skills that all programmers use, whether they build apps, web pages, or analyze data.,,. Master coding basics with this 17-week Udacity course. Learn HTML, CSS, Python, and JavaScript, with no prior programming skills required. Ideal for beginners eyeing careers in tech.'

In [78]:
print("Min: ", min(df["combined_details"].str.split().apply(len)))
print("Max: ", max(df["combined_details"].str.split().apply(len)))

Min:  50
Max:  5448


In [36]:
# Download NLTK data
nltk.download('stopwords')
nltk.download('punkt')

# Initialize stopwords and stemmer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [79]:
def preprocess_text(text):
    text = text.lower()

    # Remove punctuations but preserve "." between numbers
    text = re.sub(r'(?<=\d)[.](?=\d)', '√√√DOT√√√', text)  # Temporarily replace "." between numbers
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)  # Remove all punctuations
    text = re.sub(r'√√√DOT√√√', '.', text)  # Restore the preserved "."

    words = word_tokenize(text)
    filtered_words = [stemmer.stem(word) for word in words if word not in stop_words]
    preprocessed_text = ' '.join(filtered_words)
    return preprocessed_text

df['preprocessed_details'] = df['combined_details'].apply(preprocess_text)
df["preprocessed_details"][0]

'introduct program paid cours rate 53 peopl averag rate 4.8 cours taught karl krueger kelli howard julia van cleve jame park richard kalehoff greg c yodit f matthew r john vincenzo jame l durat 4 month 3 week program cours offer udac classifi beginn level get certif complet cours welcom world program learn skill programm use whether build app web page analyz data master code basic 17week udac cours learn html css python javascript prior program skill requir ideal beginn eye career tech'

In [80]:
print("Min: ", min(df["preprocessed_details"].str.split().apply(len)))
print("Max: ", max(df["preprocessed_details"].str.split().apply(len)))

Min:  30
Max:  2846
