E.g. "Users who liked this course also liked these other courses."

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from scipy.stats import pearsonr
from scipy.spatial.distance import squareform, pdist

In [2]:
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 1000)

In [3]:
df_courses = pd.read_csv('../data/Coursera_courses.csv').drop_duplicates()
df_reviews = pd.read_csv('../data/Coursera_reviews.csv').drop_duplicates()

In [4]:
df_reviews.columns

Index(['reviews', 'reviewers', 'date_reviews', 'rating', 'course_id'], dtype='object')

In [5]:
df_courses.columns

Index(['name', 'institution', 'course_url', 'course_id'], dtype='object')

In [6]:
df_courses.head()

Unnamed: 0,name,institution,course_url,course_id
0,Machine Learning,Stanford University,https://www.coursera.org/learn/machine-learning,machine-learning
1,Indigenous Canada,University of Alberta,https://www.coursera.org/learn/indigenous-canada,indigenous-canada
2,The Science of Well-Being,Yale University,https://www.coursera.org/learn/the-science-of-well-being,the-science-of-well-being
3,Technical Support Fundamentals,Google,https://www.coursera.org/learn/technical-support-fundamentals,technical-support-fundamentals
4,Become a CBRS Certified Professional Installer by Google,Google - Spectrum Sharing,https://www.coursera.org/learn/google-cbrs-cpi-training,google-cbrs-cpi-training


In [7]:
# Pivot the reviews dataframe to create a user-item interaction matrix
interaction_matrix = pd.pivot_table(df_reviews, index='reviewers', columns='course_id', values='rating')
interaction_matrix_filled = interaction_matrix.fillna(0)

In [8]:
interaction_matrix_filled.head()

course_id,aboriginal-education,access-control-sscp,accounting-analytics,accounting-data-analytics-python,actualizacion-manejo-diabetes-tipo-2,addiction-and-the-brain,addiction-treatment,advanced-valuation-and-strategy,agile-atlassian-jira,ai-for-everyone,...,what-is-compliance,what-is-datascience,what-is-social,wind-energy,wine,womens-health-human-rights,write-a-feature-length-screenplay-for-film-or-television,write-your-first-novel,writing-editing-words,writing-for-business
reviewers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
By \t M N H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
By \t M R I,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
By \t M S R,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
By \t N A F B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
By \t N H P,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Calculate similarities using different metrics
def calculate_similarities(metric):
    if metric == 'cosine':
        similarity = 1 - pairwise_distances(interaction_matrix_filled.T, metric='cosine')
    elif metric == 'euclidean':
        similarity = 1 / (1 + pairwise_distances(interaction_matrix_filled.T, metric='euclidean'))
    elif metric == 'pearson':
        similarity = np.array([[pearsonr(a, b)[0] for a in interaction_matrix_filled.T.values] for b in interaction_matrix_filled.T.values])
    return pd.DataFrame(similarity, index=interaction_matrix.columns, columns=interaction_matrix.columns)

In [10]:

# Recommender function
def recommend_courses(course_id, num_courses=5, metric='cosine'):
    item_similarity_df = calculate_similarities(metric)
    
    similar_score = item_similarity_df[course_id]
    
    # Sort in descending order
    recommended_courses = similar_score.sort_values(ascending=False)
    
    # Remove the course itself from the recommendation
    recommended_courses = recommended_courses.drop(course_id)
    recommended_courses_details = df_courses[df_courses['course_id'].isin(recommended_courses.index)].copy()
    recommended_courses_details['similarity_score'] = recommended_courses
    recommended_courses_details = recommended_courses_details.sort_values('similarity_score', ascending=False)
    
    return recommended_courses_details.head(num_courses)


course_id = "machine-learning"

# Example using different metrics
print("Cosine Similarity Recommendations:")
print(recommend_courses(course_id, 5, metric='cosine'))
print("\nEuclidean Distance Recommendations:")
print(recommend_courses(course_id, 5, metric='euclidean'))
# print("\nPearson Correlation Recommendations:")
# print(recommend_courses(course_id, 5, metric='pearson'))

Cosine Similarity Recommendations:
                                                       name                institution                                                     course_url                       course_id  similarity_score
1                                         Indigenous Canada      University of Alberta               https://www.coursera.org/learn/indigenous-canada               indigenous-canada               NaN
2                                 The Science of Well-Being            Yale University       https://www.coursera.org/learn/the-science-of-well-being       the-science-of-well-being               NaN
3                            Technical Support Fundamentals                     Google  https://www.coursera.org/learn/technical-support-fundamentals  technical-support-fundamentals               NaN
4  Become a CBRS Certified Professional Installer by Google  Google - Spectrum Sharing        https://www.coursera.org/learn/google-cbrs-cpi-training        google-c

: 