In [None]:
!pip install nltk
!pip install gensim
!pip install scipy==1.10
!pip install pandas
!pip install matplotlib
!pip install seaborn



In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import gensim
import pandas as pd
import nltk as nltk

from scipy.spatial.distance import cosine
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import ngrams
from gensim import corpora
from scipy.spatial.distance import euclidean


%matplotlib inline

In [None]:
# NOTE(review): presumably a random-state seed; none of the visible cells read it — confirm before relying on or removing it.
rs = 123

Calculate the cosine similarity between two example courses

In [None]:
# First example course title; later cells split it on whitespace to get its tokens.
course1 = "machine learning for everyone"

In [None]:
# Second example course title; it differs from course1 only in its last word.
course2 = "machine learning for beginners"

In [None]:
# Union of the whitespace-separated words of both titles (duplicates collapse in the set).
tokens = set([*course1.split(), *course2.split()])

In [None]:
# Materialise the token set as a list so it can be indexed and displayed.
# Set iteration order for strings is arbitrary, so the order shown below may
# differ from one kernel session to the next.
tokens = list(tokens)
tokens

['everyone', 'beginners', 'for', 'machine', 'learning']

Generate BoW features for the two courses, then apply the Euclidean distance to measure how far apart the two vectors are (a smaller distance means more similar courses)


In [None]:
def generate_bow_with_vocabulary(course, vocabulary):
    """
    Build a binary bag-of-words vector for a course text over a fixed vocabulary.

    Parameters:
    course (str): Course text; it is split on whitespace into words.
    vocabulary (list): Ordered tokens; position i of the result corresponds to vocabulary[i].

    Returns:
    list: A list of len(vocabulary) integers holding 1 where the token occurs in
    `course` and 0 elsewhere. Repeated words are not counted more than once, and
    words that are not in the vocabulary are ignored.
    """
    # Map every token to its first position once, instead of scanning the
    # vocabulary list for each word (`in` and `.index()` are both linear scans).
    positions = {}
    for position, token in enumerate(vocabulary):
        positions.setdefault(token, position)

    bow_vector = [0] * len(vocabulary)
    for word in course.split():
        position = positions.get(word)
        if position is not None:
            bow_vector[position] = 1
    return bow_vector

# get all unique tokens from all courses
all_tokens = []
for course_text in [course1, course2]:
    all_tokens.extend(course_text.split())
# Sort the unique tokens so the vector layout is the same on every run; the
# iteration order of a plain set of strings can change between sessions.
vocabulary = sorted(set(all_tokens))

# generate BoW vectors with the shared vocabulary
bow1 = generate_bow_with_vocabulary(course1, vocabulary)
bow2 = generate_bow_with_vocabulary(course2, vocabulary)

# calculate Euclidean distance (independent of the vocabulary order)
distance = euclidean(bow1, bow2)
print(f"The Euclidean distance between course `{course1}` and course `{course2}` is {round(distance, 2)}")

The Euclidean distance between course `machine learning for everyone` and course `machine learning for beginners` is 1.41


In [None]:
# scipy's `cosine` returns the cosine *distance*, so subtracting it from 1
# gives the cosine similarity of the two BoW vectors.
cos_sim = 1 - cosine(bow1, bow2)

In [None]:
# Scale to a percentage before rounding: rounding first and multiplying
# afterwards can surface binary floating-point noise in the printed value
# (for example round(0.57, 2) * 100 displays 56.99999999999999).
print(f"The cosine similarity between course `{course1}` and course `{course2}` is {round(cos_sim * 100, 2)}%")

The cosine similarity between course `machine learning for everyone` and course `machine learning for beginners` is 75.0%


BoW feature vectors

In [None]:
# Read the BoW features CSV and keep only the document id, token and count columns
bows_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-ML321EN-SkillsNetwork/labs/datasets/courses_bows.csv"
bows_df = pd.read_csv(bows_url).loc[:, ['doc_id', 'token', 'bow']]
bows_df.head(10)

Unnamed: 0,doc_id,token,bow
0,ML0201EN,ai,2
1,ML0201EN,apps,2
2,ML0201EN,build,2
3,ML0201EN,cloud,1
4,ML0201EN,coming,1
5,ML0201EN,create,1
6,ML0201EN,data,1
7,ML0201EN,developer,1
8,ML0201EN,found,1
9,ML0201EN,fun,1


Load the course metadata dataset (course IDs, titles and descriptions)

In [None]:
# Load the course metadata (COURSE_ID, TITLE, DESCRIPTION) as a pandas DataFrame
# and preview the first ten rows.
course_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-ML321EN-SkillsNetwork/labs/datasets/course_processed.csv"
course_df = pd.read_csv(course_url)
course_df.head(10)

Unnamed: 0,COURSE_ID,TITLE,DESCRIPTION
0,ML0201EN,robots are coming build iot apps with watson ...,have fun with iot and learn along the way if ...
1,ML0122EN,accelerating deep learning with gpu,training complex deep learning models with lar...
2,GPXX0ZG0EN,consuming restful services using the reactive ...,learn how to use a reactive jax rs client to a...
3,RP0105EN,analyzing big data in r using apache spark,apache spark is a popular cluster computing fr...
4,GPXX0Z2PEN,containerizing packaging and running a sprin...,learn how to containerize package and run a ...
5,CNSC02EN,cloud native security conference data security,introduction to data security on cloud
6,DX0106EN,data science bootcamp with r for university pr...,a multi day intensive in person data science ...
7,GPXX0FTCEN,learn how to use docker containers for iterati...,learn how to use docker containers for iterati...
8,RAVSCTEST1,scorm test 1,scron test course
9,GPXX06RFEN,create your first mongodb database,in this guided project you will get started w...


In [None]:
# Show the metadata row of the course with id ML0101ENv3
course_df.loc[course_df['COURSE_ID'] == 'ML0101ENv3']

Unnamed: 0,COURSE_ID,TITLE,DESCRIPTION
158,ML0101ENv3,machine learning with python,machine learning can be an incredibly benefici...


In [None]:
# Token counts (long format: one row per token) of course ML0101ENv3
ml_course = bows_df.loc[bows_df['doc_id'] == 'ML0101ENv3']
ml_course

Unnamed: 0,doc_id,token,bow
2747,ML0101ENv3,course,1
2748,ML0101ENv3,learning,4
2749,ML0101ENv3,machine,3
2750,ML0101ENv3,need,1
2751,ML0101ENv3,get,1
2752,ML0101ENv3,started,1
2753,ML0101ENv3,python,2
2754,ML0101ENv3,tool,1
2755,ML0101ENv3,tools,1
2756,ML0101ENv3,predict,1


In [None]:
# Pivot the long (doc_id, token, bow) rows into one wide row per document with
# one column per token; reset_index turns doc_id back into a regular column.
ml_courseT = ml_course.pivot(index=['doc_id'], columns='token').reset_index(level=[0])
ml_courseT

Unnamed: 0_level_0,doc_id,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow
token,Unnamed: 1_level_1,beneficial,course,free,future,get,give,hidden,insights,learning,machine,need,predict,python,started,supervised,tool,tools,trends,unsupervised
0,ML0101ENv3,1,1,1,1,1,1,1,1,4,3,1,1,2,1,1,1,1,1,1


To compare the BoWs of any two courses, which normally have different sets of tokens, we need to create a union token set and then pivot (transpose) both BoWs onto it

In [None]:
def pivot_two_bows(basedoc, comparedoc):
    """
    Pivot two bag-of-words (BoW) representations onto a shared token set.

    Parameters:
    basedoc (DataFrame): Long-format BoW of the base document with columns
        'doc_id', 'token' and 'bow'. Extra columns are ignored.
    comparedoc (DataFrame): Long-format BoW of the document to compare, same layout.

    Returns:
    DataFrame: Two rows (one per document) with columns 'doc_id', 'type'
    ('base' or 'compare') and one column per token in the union of both
    documents. Tokens missing from a document are filled with 0. Rows follow
    the sorted (doc_id, type) pivot index, so the base document is not
    guaranteed to be the first row; use the 'type' column to tell them apart.

    Raises:
    ValueError: Propagated from pandas when a document lists the same token twice.
    """

    # Work on tagged copies so the callers' DataFrames are left untouched
    base = basedoc.copy()
    base['type'] = 'base'
    compare = comparedoc.copy()
    compare['type'] = 'compare'

    # Stack both documents vertically
    join = pd.concat([base, compare])

    # Pivot only the 'bow' counts. Naming `values` explicitly keeps the column
    # index flat (plain token names) and stops any extra input column from
    # being pivoted into a second, duplicated set of token columns.
    joinT = (
        join.pivot(index=['doc_id', 'type'], columns='token', values='bow')
        .fillna(0)
        .reset_index(level=[0, 1])
    )

    # Replace the column index with a plain list of names (drops the 'token' axis name)
    joinT.columns = ['doc_id', 'type'] + list(joinT.columns[2:])

    return joinT


In [None]:
# NOTE: course1/course2 held plain title strings in the earlier toy example;
# from here on they are BoW DataFrames, so the earlier cells that call
# .split() on them will fail if re-run after this cell.
course1 = bows_df.loc[bows_df['doc_id'] == 'ML0151EN']
course2 = bows_df.loc[bows_df['doc_id'] == 'ML0101ENv3']

In [None]:
# Align both courses on their union of tokens: one row per course, one column per token.
bow_vectors = pivot_two_bows(course1, course2)
bow_vectors

Unnamed: 0,doc_id,type,approachable,basics,beneficial,comparison,course,dives,free,future,...,relates,started,statistical,supervised,tool,tools,trends,unsupervised,using,vs
0,ML0101ENv3,compare,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
1,ML0151EN,base,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0


Use the cosine method to calculate their similarity:

In [None]:
# Columns 0 and 1 are 'doc_id' and 'type', so the token counts start at column 2.
# `cosine` returns a distance; 1 - distance is the cosine similarity.
similarity = 1 - cosine(bow_vectors.iloc[0, 2:], bow_vectors.iloc[1, 2:])
similarity

0.6626221399549089

In [None]:
# Metadata row of the base course (ML0101ENv3) used in the search below
course_df.loc[course_df['COURSE_ID'] == 'ML0101ENv3']

Unnamed: 0,COURSE_ID,TITLE,DESCRIPTION
158,ML0101ENv3,machine learning with python,machine learning can be an incredibly benefici...


Find the courses that are similar to the course Machine Learning with Python (ML0101ENv3), and show the title and description of each of them.

In [None]:
# Minimum cosine similarity for a course to be reported as similar
thresh = 0.5
similar_courses = []

# Course every other course is compared against
base_course_id = 'ML0101ENv3'

# get the bow for the base course
base_course_bow = bows_df[bows_df['doc_id'] == base_course_id]

# Index title/description by course id once. Keeping the first row per id
# matches what a boolean filter followed by .iloc[0] would select.
course_lookup = (
    course_df.drop_duplicates(subset='COURSE_ID')
    .set_index('COURSE_ID')[['TITLE', 'DESCRIPTION']]
)

# Walk the courses in order of first appearance. groupby hands over each
# course's rows directly instead of re-filtering the whole frame per course.
for course_id, compare_course_bow in bows_df.groupby('doc_id', sort=False):
    # skip base course itself
    if course_id == base_course_id:
        continue

    # pivot the bows for comparison
    bow_vectors = pivot_two_bows(base_course_bow, compare_course_bow)

    # Calc cosine similarity (columns 0-1 are doc_id/type; counts start at column 2)
    similarity = 1 - cosine(bow_vectors.iloc[0, 2:], bow_vectors.iloc[1, 2:])

    # If similarity is above the threshold, add to similar courses
    if similarity >= thresh:
        # get course title and description; a course that has BoW rows but no
        # metadata row is still reported, with empty title/description
        if course_id in course_lookup.index:
            course_info = course_lookup.loc[course_id]
            title, description = course_info['TITLE'], course_info['DESCRIPTION']
        else:
            title = description = None
        similar_courses.append({
            'course_id': course_id,
            'title': title,
            'description': description,
            'similarity': similarity
        })

# display  courses
for course in similar_courses:
    print(f"Course ID: {course['course_id']}")
    print(f"Title: {course['title']}")
    print(f"Description: {course['description']}")
    print(f"Similarity: {course['similarity']}\n")

Course ID: ML0109EN
Title: machine learning   dimensionality reduction
Description: machine learning   dimensionality reduction
Similarity: 0.521749194749951

Course ID: ML0151EN
Title: machine learning with r
Description: this machine learning with r course dives into the basics of machine learning using an approachable  and well known  programming language  you ll learn about supervised vs unsupervised learning  look into how statistical modeling relates to machine learning  and do a comparison of each 
Similarity: 0.6626221399549089

Course ID: excourse46
Title: machine learning
Description: machine learning is the science of getting computers to act without being explicitly programmed  in the past decade  machine learning has given us self driving cars  practical speech recognition  effective web search  and a vastly improved understanding of the human genome  machine learning is so pervasive today that you probably use it dozens of times a day without knowing it  many researchers 