# Item-Based Collaborative Filtering

Item-based collaborative filtering suggests items by identifying relationships between different items, inferred from user ratings.

This approach calculates the similarity between items using user ratings. Here, we assume that if users liked a particular item, they are more likely to enjoy items that are similar to it.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from coursemate.dataset import Dataset
from coursemate.model import ItemBasedCF

# Raise pandas' display limits to 1000 characters, both for the overall
# output width and for the width of a single column's text.
pd.options.display.width = 1000
pd.options.display.max_colwidth = 1000


In [2]:
# Students with fewer than MIN_REVIEWS or more than MAX_REVIEWS reviews are
# segmented out of the dataset (this matches the log line the call prints).
MIN_REVIEWS, MAX_REVIEWS = 3, 50

dataset = Dataset(
    'data/Coursera_courses.csv',
    'data/Coursera.csv',
    'data/Coursera_reviews.csv',
)
dataset.set_interaction_counts(MIN_REVIEWS, MAX_REVIEWS)

# Print the summary of what is left (students, courses, reviews, sparsity).
dataset.show_dataset_details()

Loading Coursera courses...
Loading Coursera reviews...
Segmenting out students with less than 3 or more than 50 reviews...
30719 students, 468 courses, 174219 reviews
Sparsity: 1.21%
Duplicates: 4.54%


In [25]:
# Partition the ratings into train/test sets by user
# (the exact policy lives in Dataset.set_train_test_split_by_user).
dataset.set_train_test_split_by_user()

# Build the item-based CF model over the dataset's course set, then train it
# on the training portion of the split created above.
item_cf_model = ItemBasedCF(dataset.course_set)
item_cf_model.fit(dataset.train_ratings)

Setting the train-test split by user...


In [26]:
# Look up one example reviewer's training-set courses and ask the fitted
# model for five recommendations based on them.
train_ratings = dataset.train_ratings
is_example_user = train_ratings['reviewers'] == 'By Kelvin k'
user_previous_courses = tuple(train_ratings.loc[is_example_user, 'course_id'])

recommendations = item_cf_model.recommend(user_previous_courses, k=5)

# Bare last expression so the notebook displays the result.
recommendations

['cybersecurity-roles-processes-operating-system-security',
 'python',
 'python-functions-files-dictionaries',
 'python-data',
 'technical-support-fundamentals']

### Evaluation

In [3]:


def hit_rate(model, test_set, k=5, train_set=None):
    """Compute the hit rate@k of a recommender over the reviewers in ``test_set``.

    For every distinct reviewer in ``test_set`` the model is asked for ``k``
    recommendations based on the courses that reviewer has in the training
    data. The reviewer counts as a hit when at least one of their ``k``
    highest-rated test courses appears among those recommendations.

    Parameters
    ----------
    model : object
        Recommender exposing ``recommend(prev_courses, k)``. The returned
        collection must support ``in`` membership tests on course ids.
    test_set : pandas.DataFrame
        Held-out ratings with ``reviewers``, ``course_id`` and ``rating``
        columns.
    k : int, default 5
        Number of recommendations requested, and number of top-rated test
        courses considered per reviewer.
    train_set : pandas.DataFrame, optional
        Training ratings with ``reviewers`` and ``course_id`` columns. When
        omitted, the notebook-level ``dataset.train_ratings`` is used, which
        keeps existing ``hit_rate(model, test_set, k)`` calls working. Pass it
        explicitly to make sure the history matches the split the model was
        fitted on.

    Returns
    -------
    float or int
        ``hits / reviewers``; ``0`` when ``test_set`` contains no reviewers.
    """
    if train_set is None:
        # Backward-compatible fallback to the notebook's global dataset.
        train_set = dataset.train_ratings

    test_users = test_set['reviewers'].unique()

    # Build each test reviewer's training history once instead of masking the
    # whole training frame inside the loop. Row order within a reviewer is
    # preserved, so the tuples match what a per-user boolean mask would give.
    relevant_train = train_set[train_set['reviewers'].isin(test_users)]
    history_by_user = {
        user_id: tuple(course_ids)
        for user_id, course_ids in relevant_train.groupby('reviewers', sort=False)['course_id']
    }

    hits = 0
    total = 0
    for user_id in tqdm(test_users):
        # Reviewers with no training rows get an empty history.
        prev_courses = history_by_user.get(user_id, ())

        # Top-k recommended courses from the model.
        recommended_courses = model.recommend(prev_courses, k)

        # The reviewer's k highest-rated courses in the test data.
        user_test_rows = test_set[test_set['reviewers'] == user_id]
        actual_top_k = user_test_rows.nlargest(k, 'rating')['course_id']

        if any(course in recommended_courses for course in actual_top_k):
            hits += 1
        total += 1

    return hits / total if total > 0 else 0


In [7]:
# NOTE(review): 0.999 is presumably the share of reviews kept for training,
# leaving roughly 0.1% for the test split; confirm against
# Dataset.set_train_test_split_random.
split_ratio = 0.999
dataset.set_train_test_split_random(split_ratio)

Setting the train-test split randomly...


In [8]:
# Train a fresh item-based CF model on the training side of the dataset's
# current train/test split.
cf_model = ItemBasedCF(dataset.course_set)
cf_model.fit(dataset.train_ratings)

In [9]:
# Hit rate@10 of the refitted model on the held-out ratings.
hit_rate(model=cf_model, test_set=dataset.test_ratings, k=10)

100%|██████████| 174/174 [00:06<00:00, 28.34it/s]


0.3390804597701149