In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from coursemate.dataset import Dataset
from coursemate.model import UserBasedCF

pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 1000)


In [2]:
dataset = Dataset('data/Coursera_courses.csv', 'data/Coursera.csv', 'data/Coursera_reviews.csv')
dataset.set_interaction_counts(3, 50)
dataset.show_dataset_details()

Loading Coursera courses...


Loading Coursera reviews...
Segmenting out students with less than 3 or more than 50 reviews...
30719 students, 468 courses, 174219 reviews
Sparsity: 1.21%
Duplicates: 4.54%


In [3]:
dataset.set_train_test_split_by_user()

# Instantiate the UserBasedCF model
user_cf_model = UserBasedCF()

# Fit the model with the training data
# Ensure that dataset.train_ratings is available and properly set up before this step
user_cf_model.fit(dataset.train_ratings)

Setting the train-test split by user...


In [4]:
# Get recommendations for a particular user
user_id = 'By Kelvin k'
recommendations = user_cf_model.recommend(user_id, k=5)

In [5]:
recommendations

['aws-fundamentals-going-cloud-native',
 'information-security-data',
 'sql-for-data-science',
 'python-basics',
 'intro-sql']

In [3]:
dataset.set_train_test_split_random()
# Instantiate the UserBasedCF model
user_cf_model = UserBasedCF()

# Fit the model with the training data
# Ensure that dataset.train_ratings is available and properly set up before this step
user_cf_model.fit(dataset.train_ratings)

Setting the train-test split randomly...


### Evaluation

In [4]:
def hit_rate(model, test_set, k=5):
    hits = 0
    total = 0

    for user_id in test_set['reviewers'].unique():
        # Check if the user_id exists in the user similarity matrix
        if user_id not in model.user_similarity_matrix.index:
            continue  # Skip this user if not in the training set

        user_actual = test_set[test_set['reviewers'] == user_id]
        user_actual_top_k = user_actual.nlargest(k, 'rating')['course_id']
        user_predicted_top_k = model.recommend(user_id, k)

        hits += 1 if len(set(user_predicted_top_k) & set(user_actual_top_k)) > 0 else 0
        total += 1
        
        print(f"Hit Rate: {hits / total if total > 0 else 0}")
        
        if total == 200: # Stop after 200 users
            break

    return hits / total if total > 0 else 0

In [5]:
cf_model = UserBasedCF()
cf_model.fit(dataset.train_ratings)

hit_rate(cf_model, dataset.test_ratings, k=10)

Hit Rate: 0.0
Hit Rate: 0.0
Hit Rate: 0.0
Hit Rate: 0.25
Hit Rate: 0.4
Hit Rate: 0.3333333333333333
Hit Rate: 0.2857142857142857
Hit Rate: 0.25
Hit Rate: 0.2222222222222222
Hit Rate: 0.3
Hit Rate: 0.36363636363636365
Hit Rate: 0.3333333333333333
Hit Rate: 0.3076923076923077
Hit Rate: 0.35714285714285715
Hit Rate: 0.3333333333333333
Hit Rate: 0.375
Hit Rate: 0.35294117647058826
Hit Rate: 0.3333333333333333
Hit Rate: 0.3684210526315789
Hit Rate: 0.35
Hit Rate: 0.3333333333333333
Hit Rate: 0.36363636363636365
Hit Rate: 0.34782608695652173
Hit Rate: 0.375
Hit Rate: 0.4
Hit Rate: 0.38461538461538464
Hit Rate: 0.37037037037037035
Hit Rate: 0.35714285714285715
Hit Rate: 0.3448275862068966
Hit Rate: 0.3333333333333333
Hit Rate: 0.3225806451612903
Hit Rate: 0.3125
Hit Rate: 0.3333333333333333
Hit Rate: 0.3235294117647059
Hit Rate: 0.3142857142857143
Hit Rate: 0.3055555555555556
Hit Rate: 0.32432432432432434
Hit Rate: 0.3157894736842105
Hit Rate: 0.3333333333333333
Hit Rate: 0.35
Hit Rate: 0.341

0.28

In [6]:
# Work in progress

def precision_at_k(model, test_set, k=5):
    hits = 0
    total = 0
    skipped_users = 0

    for user_id in test_set['reviewers'].unique():
        if user_id not in model.user_similarity_matrix.index:
            print(f"Skipping usr not found in training set: {user_id}")
            skipped_users += 1
            continue

        user_actual = test_set[test_set['reviewers'] == user_id]
        user_actual_top_k = user_actual.nlargest(k, 'rating')['course_id']
        user_predicted_top_k = model.recommend(user_id, k)

        hits += len(set(user_predicted_top_k) & set(user_actual_top_k))
        total += k

    print(f"Processed {len(test_set['reviewers'].unique()) - skipped_users} users, Skipped {skipped_users} users")
    return hits / total if total > 0 else 0

# Test the model
precision = precision_at_k(user_cf_model, dataset.test_ratings, k=5)
print(f"Precision@5: {precision}")


Skipping usr not found in training set: By Maggie P
Skipping usr not found in training set: By Ricardo I
Skipping usr not found in training set: By Rithvika L
Skipping usr not found in training set: By VIJAY N


KeyboardInterrupt: 