In [1]:
!pip install nltk==3.6.7
!pip install gensim==4.1.2

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.spatial.distance as ssd

import gensim
from gensim import corpora
import nltk as nltk
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize



In [2]:
# set sample courses
course1 = 'machine learning for everyone'
course2 = 'machine learning for beginners'

tokens = list(set(course1.split() + course2.split()))
tokens

['beginners', 'machine', 'for', 'learning', 'everyone']

In [3]:
# generate BoW features
def generate_sparse_bow(course):
    bow_vector = []
    words = course.split()
    
    for token in tokens:
        if token in words:
            bow_vector.append(1)
        else:
            bow_vector.append(0)
            
    return bow_vector

In [4]:
bow1 = generate_sparse_bow(course1)
bow1

[0, 1, 1, 1, 1]

In [5]:
bow2 = generate_sparse_bow(course2)
bow2

[1, 1, 1, 1, 0]

In [6]:
inter = sum((min(bow1, bow2) for bow1, bow2 in zip(bow1, bow2)))
union = sum((max(bow1, bow2) for bow1, bow2 in zip(bow1, bow2)))

sim = inter / union if union != 0 else 0
sim

0.6

In [7]:
cos_sim = 1 - ssd.cosine(bow1, bow2)  # find cosine similarity
print(f'Cosine similarity between course {course1} and course {course2}: {round(cos_sim, 2) * 100}%')

Cosine similarity between course machine learning for everyone and course machine learning for beginners: 75.0%


In [8]:
euc_dis = ssd.euclidean(bow1, bow2)  # find Euclidean distance
print(f"Euclidean distance between course '{course1}' and course '{course2}': {euc_dis}")

Euclidean distance between course 'machine learning for everyone' and course 'machine learning for beginners': 1.4142135623730951


In [9]:
# find Jaccard index
def jac_ind(bow1, bow2):
    inter = len(list(set(bow1).intersection(bow2)))
    union = (len(set(bow1)) + len(set(bow2))) - inter

    return float(inter) / union

print(f'Jaccard index between course {course1} and course {course2}: {jac_ind(bow1, bow2)}')  # ?

Jaccard index between course machine learning for everyone and course machine learning for beginners: 1.0


In [10]:
# BoW FEATURES DATASET
bows_url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-ML321EN-SkillsNetwork/labs/datasets/courses_bows.csv'
bows_df = pd.read_csv(bows_url)
bows_df = bows_df[['doc_id', 'token', 'bow']]
bows_df.head(10)

Unnamed: 0,doc_id,token,bow
0,ML0201EN,ai,2
1,ML0201EN,apps,2
2,ML0201EN,build,2
3,ML0201EN,cloud,1
4,ML0201EN,coming,1
5,ML0201EN,create,1
6,ML0201EN,data,1
7,ML0201EN,developer,1
8,ML0201EN,found,1
9,ML0201EN,fun,1


In [11]:
# COURSE DATASET
course_url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-ML321EN-SkillsNetwork/labs/datasets/course_processed.csv'
course_df = pd.read_csv(course_url)
course_df.head(10)

Unnamed: 0,COURSE_ID,TITLE,DESCRIPTION
0,ML0201EN,robots are coming build iot apps with watson ...,have fun with iot and learn along the way if ...
1,ML0122EN,accelerating deep learning with gpu,training complex deep learning models with lar...
2,GPXX0ZG0EN,consuming restful services using the reactive ...,learn how to use a reactive jax rs client to a...
3,RP0105EN,analyzing big data in r using apache spark,apache spark is a popular cluster computing fr...
4,GPXX0Z2PEN,containerizing packaging and running a sprin...,learn how to containerize package and run a ...
5,CNSC02EN,cloud native security conference data security,introduction to data security on cloud
6,DX0106EN,data science bootcamp with r for university pr...,a multi day intensive in person data science ...
7,GPXX0FTCEN,learn how to use docker containers for iterati...,learn how to use docker containers for iterati...
8,RAVSCTEST1,scorm test 1,scron test course
9,GPXX06RFEN,create your first mongodb database,in this guided project you will get started w...


In [12]:
course_df[course_df['COURSE_ID'] == 'ML0101ENv3']  # get info for course with ID ML0101ENv3

Unnamed: 0,COURSE_ID,TITLE,DESCRIPTION
158,ML0101ENv3,machine learning with python,machine learning can be an incredibly benefici...


In [13]:
ml_course = bows_df[bows_df['doc_id'] == 'ML0101ENv3']  # get its associated BoW features
ml_course

Unnamed: 0,doc_id,token,bow
2747,ML0101ENv3,course,1
2748,ML0101ENv3,learning,4
2749,ML0101ENv3,machine,3
2750,ML0101ENv3,need,1
2751,ML0101ENv3,get,1
2752,ML0101ENv3,started,1
2753,ML0101ENv3,python,2
2754,ML0101ENv3,tool,1
2755,ML0101ENv3,tools,1
2756,ML0101ENv3,predict,1


In [14]:
ml_courseT = ml_course.pivot(index=['doc_id'], columns='token').reset_index(level=[0])  # convert BoW feature vector to horizontal format
ml_courseT

Unnamed: 0_level_0,doc_id,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow,bow
token,Unnamed: 1_level_1,beneficial,course,free,future,get,give,hidden,insights,learning,machine,need,predict,python,started,supervised,tool,tools,trends,unsupervised
0,ML0101ENv3,1,1,1,1,1,1,1,1,4,3,1,1,2,1,1,1,1,1,1


In [15]:
def pivot_two_bows(basedoc, comparedoc):
    base = basedoc.copy()
    base['type'] = 'base'
    compare = comparedoc.copy()
    compare['type'] = 'compare'

    join = pd.concat([base, compare], ignore_index=True)  # concatenate two token sets vertically
    joinT = join.pivot(index=['doc_id', 'type'], columns='token').fillna(0).reset_index(level=[0, 1])  # pivot joined courses
    joinT.columns = ['doc_id', 'type'] + [t[1] for t in joinT.columns][2:]  # assign columns

    return joinT

In [16]:
course1 = bows_df[bows_df['doc_id'] == 'ML0151EN']
course2 = bows_df[bows_df['doc_id'] == 'ML0101ENv3']
course2

Unnamed: 0,doc_id,token,bow
2747,ML0101ENv3,course,1
2748,ML0101ENv3,learning,4
2749,ML0101ENv3,machine,3
2750,ML0101ENv3,need,1
2751,ML0101ENv3,get,1
2752,ML0101ENv3,started,1
2753,ML0101ENv3,python,2
2754,ML0101ENv3,tool,1
2755,ML0101ENv3,tools,1
2756,ML0101ENv3,predict,1


In [17]:
bow_vectors = pivot_two_bows(course1, course2)
bow_vectors

Unnamed: 0,doc_id,type,approachable,basics,beneficial,comparison,course,dives,free,future,...,relates,started,statistical,supervised,tool,tools,trends,unsupervised,using,vs
0,ML0101ENv3,compare,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
1,ML0151EN,base,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0


In [18]:
sim = 1 - ssd.cosine(bow_vectors.iloc[0, 2:], bow_vectors.iloc[1, 2:])
print(f'Cosine similarity between courses ML0151EN and ML0101ENv3: {sim}')

Cosine similarity between courses ML0151EN and ML0101ENv3: 0.6626221399549089


In [19]:
# find courses similar to 'Machine Learning with Python'
sims = []
mlwp = bows_df[bows_df['doc_id'] == 'ML0101ENv3']

for index, course in course_df[course_df['COURSE_ID'] != 'ML0101ENv3'].iterrows():
    title = course.TITLE
    course_id = course.COURSE_ID
    course = bows_df[bows_df['doc_id'] == course_id]
    bow_vectors = pivot_two_bows(mlwp, course)
    sim = 1 - ssd.cosine(bow_vectors.iloc[0, 2:], bow_vectors.iloc[1, 2:])
    sims.append([title, sim])

course_sims = pd.DataFrame(sims, columns=['Title', 'Similarity'])
sim_courses = course_sims[course_sims['Similarity'] > 0.5].sort_values(by='Similarity', ascending=False)
pd.set_option('max_colwidth', None)
sim_courses

Unnamed: 0,Title,Similarity
199,machine learning with r,0.662622
259,machine learning for all,0.634755
258,machine learning,0.612054
272,introduction to tensorflow for artificial intelligence machine learning and deep learning,0.54904
157,machine learning dimensionality reduction,0.521749
