# Course Recommender System

In [1]:
from collections import defaultdict
from itertools import combinations
import math

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import train_test_split

from coursemate.dataset import Dataset

In [2]:
# Raise both the total display width and the per-column width limit to 1000
# characters so wide frames and long text cells are printed without being
# wrapped or truncated.
for option_name in ('display.width', 'display.max_colwidth'):
    pd.set_option(option_name, 1000)

In [3]:
# Load IPython's autoreload extension and switch it to mode 2, which reloads
# imported modules before each cell runs, so edits to local packages are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## 0. Dataset preparation

In [4]:
# Build the dataset from the three Coursera CSV exports.
dataset = Dataset(
    '../data/Coursera_courses.csv',
    '../data/Coursera.csv',
    '../data/Coursera_reviews.csv',
)

# Keep only students whose number of reviews lies within these bounds, then
# print the resulting size/sparsity summary.
min_reviews, max_reviews = 3, 50
dataset.set_interaction_counts(min_reviews, max_reviews)
dataset.show_dataset_details()

Loading Coursera courses...
Loading Coursera reviews...
Segmenting out students with less than 3 or more than 50 reviews...
30719 students, 468 courses, 174219 reviews
Sparsity: 1.21%
Duplicates: 4.54%


In [13]:
# Assign users to the training or the test group.
# NOTE(review): the split getters below presumably rely on this call having
# run first — confirm in coursemate.dataset.
dataset.set_train_test_split_by_user()

# For the content-based and CF recommender
# Each call yields two rating matrices plus the two matching DataFrames:
# X holds the ratings shown to the recommender, y the ratings to predict.
# NOTE(review): `ratio` presumably controls how much of each user's history
# goes into X versus y — confirm against the Dataset implementation.
train_Xmatrix, train_ymatrix, df_train_X, df_train_y = dataset.get_train_matrix_split(ratio=0.8)
test_Xmatrix, test_ymatrix, df_test_X, df_test_y = dataset.get_test_matrix_split(ratio=0.5)

# For the sequential/KB recommender
# Note the return order differs from the matrix calls above: both X parts
# (train, test) come first, followed by both y parts (train, test).
train_Xseq, test_Xseq, train_yseq, test_yseq = dataset.get_train_test_next_course_predictions()

Setting the train-test split by user...
Computing the training and test rating matrix...


131100it [00:07, 17764.63it/s]


Computing the test rating matrix split...
Computing the training and test rating matrix...


131100it [00:07, 17505.19it/s]
43119it [00:02, 17086.81it/s]


Computing the training and test list of sequences...


131100it [00:06, 20275.63it/s]
43119it [00:02, 18337.97it/s]


In [14]:
# Summarise the matrix split: number of input ratings, number of held-out
# ratings, and number of distinct users that have held-out ratings.
for split_label, ratings_seen, ratings_held_out in (
    ("Training", df_train_X, df_train_y),
    ("Testing", df_test_X, df_test_y),
):
    print(f"{split_label}: From {len(ratings_seen)} ratings, predict the next {len(ratings_held_out)} for {ratings_held_out['reviewers'].nunique()} users")

Training: From 119949 ratings, predict the next 11151 for 6631 users
Testing: From 157922 ratings, predict the next 16297 for 7680 users


In [15]:
# Summarise the sequential split: number of input sequences per split.
# NOTE(review): the user counts are taken from the matrix-split frames
# (df_train_y / df_test_y), not from the sequence data — confirm that they
# also describe the sequential split.
for split_label, input_sequences, held_out_frame in (
    ("Training", train_Xseq, df_train_y),
    ("Testing", test_Xseq, df_test_y),
):
    print(f"{split_label} (Sequential): From {len(input_sequences)} sequences, predict the next course taken for {held_out_frame['reviewers'].nunique()} users")

Training (Sequential): From 108061 sequences, predict the next course taken for 6631 users
Testing (Sequential): From 7680 sequences, predict the next course taken for 7680 users


## 1. Content-Based Recommendation

In [224]:
from coursemate.model import ContentBasedModel
from sklearn.feature_extraction.text import TfidfVectorizer,  CountVectorizer
from coursemate.metrics import calculate_hit_rate_from_next_course_sequences

# Content-based recommender built from the course set, using TF-IDF features.
model = ContentBasedModel(course_set=dataset.course_set, Vectorizer=TfidfVectorizer, n_features=1000)

# Number of recommendations requested per user (hit rate @ TOP_K).
TOP_K = 5

unique_reviewers_X = df_test_X['reviewers'].unique()
unique_reviewers_y = df_test_y['reviewers'].unique()

# Group each frame once instead of re-filtering the whole DataFrame for every
# user. `sort=False` keeps first-appearance order, and rows inside a group keep
# their original order, so every profile tuple matches what a boolean-mask
# filter on the frame would produce.
profile_by_user = {
    user: tuple(courses)
    for user, courses in df_test_X.groupby('reviewers', sort=False)['course_id']
}
targets_by_user = {
    user: set(courses)
    for user, courses in df_test_y.groupby('reviewers', sort=False)['course_id']
}

hits = 0
count = 0

for user in tqdm(unique_reviewers_X, desc="Recommending"):
    # Only users that also have held-out courses can be evaluated.
    target = targets_by_user.get(user)
    if target is None:
        continue

    count += 1
    user_profile = profile_by_user[user]

    recommendations = model.recommend(user_profile, TOP_K)
    # The first element of each recommendation is the course id.
    recommendation_ids = [recommendation[0] for recommendation in recommendations]

    # A hit means at least one held-out course appears among the recommendations.
    if not target.isdisjoint(recommendation_ids):
        hits += 1

# Report NaN instead of raising ZeroDivisionError when no user could be evaluated.
print("Hitrate", hits / count if count else float('nan'))

Recommending: 100%|██████████| 30719/30719 [1:28:48<00:00,  5.77it/s]   


Hitrate 0.17408854166666668


## 2. Collaborative Filtering Recommendation

## 3. Sequential/Knowledge-Based Recommendation

In [22]:
from coursemate.model import AssociationMiningModel
from coursemate.metrics import (
    calculate_hit_rate_from_next_course_sequences,
    calculate_precision_from_next_course_sequences,
    calculate_f1score_from_next_course_sequences
)

# Support cutoff determined empirically, testing the 90th and 99th percentile.
# NOTE: this rebinds `model`; from here on the name no longer refers to the
# content-based model created in section 1.
model = AssociationMiningModel(cutoff=3, course_set=dataset.course_set)
# Combine the training sequences with their next-course targets into the
# object the model is fitted on (presumably a DataFrame, given the name).
df_seq = model.get_sequences_data(train_Xseq, train_yseq)
model.fit(df_seq)

100%|█████████████████████████████████████████████████████████████████████████| 23039/23039 [00:00<00:00, 583272.89it/s]
100%|█████████████████████████████████████████████████████████████████████████| 23039/23039 [00:00<00:00, 182101.58it/s]
100%|██████████████████████████████████████████████████████████████████████████| 23039/23039 [00:01<00:00, 18914.89it/s]


1-subsequences: 468
2-subsequences: 90472
3-subsequences: 1835231
(1926171, 2)


In [41]:
# Evaluate the current `model` on the held-out next-course sequences.
# Every metric is computed twice, passing 5 and then 10 as the final argument;
# the calls run in the same order as if written out one per line.
hr5, hr10 = (
    calculate_hit_rate_from_next_course_sequences(test_Xseq, test_yseq, model.recommend, k)
    for k in (5, 10)
)
prec5, prec10 = (
    calculate_precision_from_next_course_sequences(test_Xseq, test_yseq, model.recommend, k)
    for k in (5, 10)
)
f1score5, f1score10 = (
    calculate_f1score_from_next_course_sequences(test_Xseq, test_yseq, model.recommend, k)
    for k in (5, 10)
)

100%|██████████████████████████████████████████████████████████████████████████████| 7680/7680 [01:06<00:00, 114.87it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 7680/7680 [01:06<00:00, 115.21it/s]


In [42]:
# Print the metrics as a small fixed-width table, one row per cut-off.
print("    | Hit rate | Precision | F1 Score")
for row_label, hit_rate, precision, f1_score in (
    ("@5 ", hr5, prec5, f1score5),
    ("@10", hr10, prec10, f1score10),
):
    print(f"{row_label} | {hit_rate:.3f}    | {precision:.3f}     | {f1_score:.3f}   ")

    | Hit rate | Precision | F1 Score
@5  | 0.374    | 0.086     | 0.118   
@10 | 0.490    | 0.062     | 0.097   
