E.g. "Users who are similar to you also liked these courses."

In [27]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import f1_score


In [28]:
# Load datasets
courses_df = pd.read_csv('../data/Coursera_courses.csv').drop_duplicates()
reviews_df = pd.read_csv('../data/Coursera_reviews.csv').drop_duplicates()

In [29]:
reviews_df['date_reviews'] = pd.to_datetime(reviews_df['date_reviews'])

# Sort reviews by date
reviews_df = reviews_df.sort_values(by='date_reviews')

print(courses_df.isnull().sum())
print(reviews_df.isnull().sum())
courses_df = courses_df.drop_duplicates(subset='course_id')
reviews_df = reviews_df.drop_duplicates(subset=['reviewers', 'course_id'])

# Define a cutoff date for splitting the data (e.g., the last 20% of dates)
cutoff_date = reviews_df['date_reviews'].quantile(0.8)

# Split the data into training and testing sets
train_data = reviews_df[reviews_df['date_reviews'] < cutoff_date]
test_data = reviews_df[reviews_df['date_reviews'] >= cutoff_date]

name           0
institution    0
course_url     0
course_id      0
dtype: int64
reviews         56
reviewers        0
date_reviews     0
rating           0
course_id        0
dtype: int64


In [30]:
all_courses = pd.concat([train_data['course_id'], test_data['course_id']]).unique()
all_reviewers = pd.concat([train_data['reviewers'], test_data['reviewers']]).unique()

full_interaction_matrix = pd.DataFrame(0, index=all_courses, columns=all_reviewers)

for _, row in train_data.iterrows():
    full_interaction_matrix.at[row['course_id'], row['reviewers']] = row['rating']

test_interaction_matrix_structure = full_interaction_matrix.copy()

for _, row in test_data.iterrows():
    test_interaction_matrix_structure.at[row['course_id'], row['reviewers']] = row['rating']

train_interaction_matrix_sparse = csr_matrix(full_interaction_matrix.values)
test_interaction_matrix_sparse = csr_matrix(test_interaction_matrix_structure.values)

# Train
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
model_knn.fit(train_interaction_matrix_sparse)

# Prediction and evaluation
def get_user_predictions(user_id, train_interaction_matrix, model_knn, n_recommendations=5):
    user_index = list(train_interaction_matrix.columns).index(user_id)
    user_vector = train_interaction_matrix_sparse[:, user_index]
    distances, indices = model_knn.kneighbors(user_vector.T, n_neighbors=n_recommendations+1)
    
    # (exclude the user's own interaction)
    similar_users_indices = indices.flatten()[1:]
    similar_users = train_interaction_matrix.columns[similar_users_indices]
    
    # Aggregate the courses of similar users and count the frequency of courses
    course_frequencies = full_interaction_matrix[similar_users].sum(axis=1)
    recommended_courses = course_frequencies.sort_values(ascending=False).head(n_recommendations).index.tolist()
    
    return set(recommended_courses)

def evaluate_predictions(test_interaction_matrix, predicted_courses, threshold=4):
    actual_relevant_courses = set(test_interaction_matrix.index[test_interaction_matrix >= threshold])
    predicted_relevant_courses = predicted_courses

    tp = len(actual_relevant_courses & predicted_relevant_courses)
    fp = len(predicted_relevant_courses - actual_relevant_courses)
    fn = len(actual_relevant_courses - predicted_relevant_courses)

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return f1

# Iterate over users in the test set to get recommendations and evaluate the F1 score
f1_scores = []
for user_id in test_interaction_matrix_structure.columns:
    predicted_courses = get_user_predictions(user_id, full_interaction_matrix, model_knn)
    user_f1_score = evaluate_predictions(test_interaction_matrix_structure[user_id], predicted_courses)
    f1_scores.append(user_f1_score)

# Calculate the mean F1 score across all users
mean_f1_score = np.mean(f1_scores)
print(f"Mean F1 Score: {mean_f1_score}")

ValueError: X has 604 features, but NearestNeighbors is expecting 287808 features as input.

: 