E.g. "Users who are similar to you also liked these courses."

In [2]:
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [3]:
# Load datasets
courses_df = pd.read_csv('../data/Coursera_courses.csv').drop_duplicates()
reviews_df = pd.read_csv('../data/Coursera_reviews.csv').drop_duplicates()

In [4]:
reviews_df['date_reviews'] = pd.to_datetime(reviews_df['date_reviews'])

# Sort reviews by date
reviews_df = reviews_df.sort_values(by='date_reviews')

print(courses_df.isnull().sum())
print(reviews_df.isnull().sum())
courses_df = courses_df.drop_duplicates(subset='course_id')
reviews_df = reviews_df.drop_duplicates(subset=['reviewers', 'course_id'])

# Define a cutoff date for splitting the data (e.g., the last 20% of dates)
cutoff_date = reviews_df['date_reviews'].quantile(0.8)

# Split the data into training and testing sets
train_data = reviews_df[reviews_df['date_reviews'] < cutoff_date]
test_data = reviews_df[reviews_df['date_reviews'] >= cutoff_date]


name           0
institution    0
course_url     0
course_id      0
dtype: int64
reviews         56
reviewers        0
date_reviews     0
rating           0
course_id        0
dtype: int64


In [5]:
# create a course-user matrix instead of user-course
user_course_matrix = train_data.pivot(index='reviewers', columns='course_id', values='rating').fillna(0)

# get a user-course matrix where rows are users and columns are courses
user_course_matrix_transpose = user_course_matrix.T

user_course_matrix_sparse = csr_matrix(user_course_matrix_transpose.values)

user_knn_model = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
user_knn_model.fit(user_course_matrix_sparse)

def recommend_courses_for_user(user_id, user_course_matrix, user_knn_model, courses_df, k=5):
    if user_id not in user_course_matrix.index:
        raise ValueError("User ID not found")
    
    user_index = user_course_matrix.index.tolist().index(user_id)
    
    distances, indices = user_knn_model.kneighbors(
        user_course_matrix_transpose.iloc[:, user_index].values.reshape(1, -1), n_neighbors=k+1)
    
    user_indices = [user_course_matrix.index[i] for i in indices.flatten()][1:]  # exclude the user itself
    
    # identify courses that similar users liked
    similar_users_courses = user_course_matrix.iloc[user_indices]
    course_ratings = similar_users_courses.mean(axis=0)
    recommended_courses_ids = course_ratings.sort_values(ascending=False).head(k).index.tolist()
    
    # map indices to course names
    recommended_courses = courses_df[courses_df['course_id'].isin(recommended_courses_ids)]
    return recommended_courses


user_id = 'By Daniel F'
recommended_courses = recommend_courses_for_user(user_id, user_course_matrix, user_knn_model, courses_df)
print(recommended_courses)

ValueError: X has 587 features, but NearestNeighbors is expecting 233496 features as input.