# Item-Based Collaborative Filtering

Item-based collaborative filtering recommends items by identifying relationships between items, inferred from user ratings.

This approach calculates the similarity between items using user ratings. Here, we assume that if users liked a particular item, they are more likely to enjoy items that are similar to it.

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from coursemate.dataset import Dataset
from coursemate.model import ItemBasedCF

# Widen pandas' text display so long values (e.g. course ids) are not wrapped or truncated.
for _option in ('display.width', 'display.max_colwidth'):
    pd.set_option(_option, 1000)

In [11]:
# Input CSV files, passed positionally to Dataset in this order.
DATA_FILES = (
    'data/Coursera_courses.csv',
    'data/Coursera.csv',
    'data/Coursera_reviews.csv',
)
# Review-count bounds per student; the cell output reports this as segmenting out
# students with less than 3 or more than 50 reviews.
MIN_REVIEWS, MAX_REVIEWS = 3, 50

dataset = Dataset(*DATA_FILES)
dataset.set_interaction_counts(MIN_REVIEWS, MAX_REVIEWS)
dataset.show_dataset_details()

Loading Coursera courses...
Loading Coursera reviews...
Segmenting out students with less than 3 or more than 50 reviews...
30719 students, 468 courses, 174219 reviews
Sparsity: 1.21%
Duplicates: 4.54%


In [12]:
# Partition the data into train and test sets by user (uses the method's default arguments).
dataset.set_train_test_split_by_user()

Setting the train-test split by user...


In [5]:
# Split the train and the test partition each into an X part and a y part.
# Each call returns four values: X/y objects named `*matrix` and X/y DataFrames (`df_*`);
# only the DataFrames are used in the cells shown in this notebook.
# NOTE(review): `ratio` presumably sets the share of each user's interactions that
# goes to the X side (0.8 for train, 0.5 for test) — confirm in coursemate.dataset.
train_Xmatrix, train_ymatrix, df_train_X, df_train_y = dataset.get_train_matrix_split(ratio=0.8)
test_Xmatrix, test_ymatrix, df_test_X, df_test_y = dataset.get_test_matrix_split(ratio=0.5)

In [None]:
# Create the item-based CF model from the dataset's course set.
item_based_cf_model = ItemBasedCF(dataset.course_set)

In [None]:
# Stack the X and y halves of the training split into a single DataFrame.
all_train = pd.concat([df_train_X, df_train_y])

# fit with both training X and training y
item_based_cf_model.fit(all_train)

In [6]:
# Sanity check: ask the model for 5 recommendations for a single reviewer.
# Note that the reviewer's course history is read from `dataset.train_ratings`.
user_previous_courses = tuple(dataset.train_ratings[dataset.train_ratings['reviewers'] == 'By Kelvin k']['course_id'])
recommendations = item_based_cf_model.recommend(user_previous_courses, k=5)

recommendations

['cybersecurity-roles-processes-operating-system-security',
 'python',
 'python-functions-files-dictionaries',
 'python-data',
 'technical-support-fundamentals']

### Evaluation

In [14]:
def calculate_hit_rate(model, test_X, test_y, k=5):
    """Return the fraction of evaluated users with at least one hit in their top-k.

    For every user in the notebook-level ``dataset.test_students``, the user's
    courses in ``test_X`` are passed to ``model.recommend`` as history, and the
    user's courses in ``test_y`` are the ground truth. A user counts as a hit
    when any ground-truth course appears among the recommendations. Users with
    no ground-truth courses are skipped and do not count toward the total.

    Args:
        model: Object exposing ``recommend(history, k=...)``.
        test_X: DataFrame with ``reviewers`` and ``course_id`` columns holding
            each user's known course history.
        test_y: DataFrame with the same columns holding the courses to predict.
        k: Number of recommendations requested per user.

    Returns:
        ``hit_count / total`` as a float, or ``0.0`` when no user had any
        ground-truth course (previously this raised ``ZeroDivisionError``).
    """
    hit_count = 0
    total = 0

    # NOTE: the user list comes from the global `dataset`, not from the arguments.
    for user_id in tqdm(dataset.test_students):
        user_history = test_X[test_X['reviewers'] == user_id]['course_id'].values
        actual_next_courses = test_y[test_y['reviewers'] == user_id]['course_id'].values

        # Nothing to predict for this user, so they cannot be scored.
        if len(actual_next_courses) == 0:
            continue

        recommended_courses = model.recommend(user_history, k=k)

        if any(course in recommended_courses for course in actual_next_courses):
            hit_count += 1
        total += 1

    # Guard the empty case, as the sibling precision/recall helpers do.
    if total == 0:
        return 0.0
    return hit_count / total


def calculate_precision(model, test_X, test_y, k=5):
    """Precision of the top-k recommendations, pooled over all evaluated users.

    Iterates over the notebook-level ``dataset.test_students``. For each user
    with at least one course in ``test_y``, the user's ``test_X`` courses are
    given to ``model.recommend`` and every recommended course that also appears
    in the user's ``test_y`` courses counts as a true positive.

    Args:
        model: Object exposing ``recommend(history, k=...)``.
        test_X: DataFrame with ``reviewers`` and ``course_id`` columns (history).
        test_y: DataFrame with the same columns (courses to predict).
        k: Number of recommendations requested per user.

    Returns:
        Total true positives divided by the total number of recommendations
        made, or ``0`` when no recommendation was made.
    """
    n_hits = 0
    n_recommended = 0

    for student in tqdm(dataset.test_students):
        seen = test_X.loc[test_X['reviewers'] == student, 'course_id'].values
        targets = test_y.loc[test_y['reviewers'] == student, 'course_id'].values

        # Users without ground truth are left out of the computation entirely.
        if not len(targets):
            continue

        top_k = model.recommend(seen, k=k)
        n_recommended += len(top_k)

        for course in top_k:
            if course in targets:
                n_hits += 1

    return n_hits / n_recommended if n_recommended else 0

def calculate_recall_precision(model, test_X, test_y, k=5):
    """Compute pooled recall and precision of the top-k recommendations.

    Iterates over the notebook-level ``dataset.test_students``. For each user
    with at least one course in ``test_y``, recommendations are generated from
    the user's ``test_X`` courses; a recommended course found among the user's
    ``test_y`` courses is a true positive. The number of relevant courses is
    the number of ``test_y`` rows for the user.

    Args:
        model: Object exposing ``recommend(history, k=...)``.
        test_X: DataFrame with ``reviewers`` and ``course_id`` columns (history).
        test_y: DataFrame with the same columns (courses to predict).
        k: Number of recommendations requested per user.

    Returns:
        A ``(recall, precision)`` tuple — note the order. Each value is ``0``
        when its denominator is zero.
    """
    matched = 0
    recommended_total = 0
    relevant_total = 0

    for reviewer in tqdm(dataset.test_students):
        history = test_X.loc[test_X['reviewers'] == reviewer, 'course_id'].values
        relevant = test_y.loc[test_y['reviewers'] == reviewer, 'course_id'].values

        # A user with no relevant courses contributes nothing to any counter.
        if len(relevant) == 0:
            continue
        relevant_total += len(relevant)

        suggestions = model.recommend(history, k=k)
        recommended_total += len(suggestions)
        matched += sum(1 for course in suggestions if course in relevant)

    if recommended_total:
        precision = matched / recommended_total
    else:
        precision = 0

    if relevant_total:
        recall = matched / relevant_total
    else:
        recall = 0

    return recall, precision

def calculate_f1_score(recall, precision):
    """Return the F1 score (harmonic mean) of ``recall`` and ``precision``.

    Returns ``0`` when ``recall + precision`` is zero, avoiding a division by zero.
    """
    denominator = recall + precision
    return 2 * (recall * precision) / denominator if denominator != 0 else 0

In [15]:
# Hit rate@10 on the test split.
calculate_hit_rate(item_based_cf_model, df_test_X, df_test_y, k=10)

100%|██████████| 7680/7680 [03:32<00:00, 36.17it/s]


0.49296875

In [11]:
# Hit rate@20 on the test split.
calculate_hit_rate(item_based_cf_model, df_test_X, df_test_y, k=20)

100%|██████████| 7680/7680 [04:18<00:00, 29.72it/s]


0.6087239583333334

In [17]:
# Hit rate@5 on the test split.
calculate_hit_rate(item_based_cf_model, df_test_X, df_test_y, k=5)

100%|██████████| 7680/7680 [04:02<00:00, 31.72it/s]


0.378515625

In [19]:
# Precision@5 on the test split.
calculate_precision(item_based_cf_model, df_test_X, df_test_y, k=5)

100%|██████████| 7680/7680 [03:43<00:00, 34.41it/s]


0.090859375

In [23]:
# Precision@10 on the test split.
calculate_precision(item_based_cf_model, df_test_X, df_test_y, k=10)

  0%|          | 2/7680 [00:00<06:26, 19.86it/s]

100%|██████████| 7680/7680 [04:23<00:00, 29.15it/s]


0.06274739583333333

In [22]:
# F1@5 on the test split; recall and precision are kept in variables, only the F1 value is displayed.
recall, precision = calculate_recall_precision(item_based_cf_model, df_test_X, df_test_y, k=5)
calculate_f1_score(recall, precision)

100%|██████████| 7680/7680 [04:14<00:00, 30.13it/s]


0.12757555258972156

In [24]:
# F1@10 on the test split; recall and precision are kept in variables, only the F1 value is displayed.
recall, precision = calculate_recall_precision(item_based_cf_model, df_test_X, df_test_y, k=10)
calculate_f1_score(recall, precision)

100%|██████████| 7680/7680 [04:01<00:00, 31.84it/s]


0.10352642942307484