In [145]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import os
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score, precision_score, recall_score
import time
wd = %pwd
if wd.split('\\')[-1] == 'notebooks':
    %cd ..

from coursemate.dataset import Dataset
nltk.download('wordnet', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

True

In [157]:
dataset = Dataset('data/Coursera_courses.csv', 'data/Coursera.csv', 'data/Coursera_reviews.csv')
dataset.set_interaction_counts(3, 50)
dataset.set_train_test_split_by_ratings()
training_matrix, test_matrix = dataset.get_train_test_matrices()

Loading Coursera courses...
Loading Coursera reviews...
Segmenting out students with less than 3 or more than 50 reviews...
Setting the train-test split by rating...


174219it [00:11, 15441.81it/s]


Computing the training and test rating matrix...


128771it [00:08, 14669.52it/s]
45448it [00:03, 14214.80it/s]


In [158]:
train_df = dataset.train_ratings.merge(dataset.df_courses, how='left',on='course_id')
test_df = dataset.test_ratings.merge(dataset.df_courses, how='left',on='course_id')
users = dataset.student_set.copy(deep=True)
courses = dataset.course_set.copy(deep=True)

In [159]:
print(len(train_df), len(test_df))
print(len(pd.unique(train_df['reviewers'])),len(pd.unique(test_df['reviewers'])))
print(len(users), len(courses))

128771 45448
30719 30719
30719 468


In [160]:
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def process_skills(skill_text):
    skills = set(skill_text.replace(')','').replace('(','').replace('-',' ').lower().split())
    return ' '.join(skills)

def process_description(description):
    description = description.lower()
    description = re.sub(r'[^\w\s]', '', description)
    tokens = word_tokenize(description)
    tokens = [word for word in tokens if word not in stop_words]
    
    #tokens = [ps.stem(word) for word in tokens]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

def process_reviewers(reviewer):
    reviewer = reviewer.lower()
    reviewer = re.sub(r'[^\w\s]', '', reviewer).replace('by ', '').strip()
    return reviewer

train_df['reviewers'] = train_df['reviewers'].apply(process_reviewers)
test_df['reviewers'] = test_df['reviewers'].apply(process_reviewers)

train_df['skills'] = train_df['skills'].apply(process_skills)
test_df['skills'] = test_df['skills'].apply(process_skills)

train_df['description'] = train_df['description'].apply(process_description)
test_df['description'] = test_df['description'].apply(process_description)

courses['description'] = courses['description'].apply(process_description)

In [None]:
def make_recommendations(user_id,n_recommendations,vectorizer,course_vectors,category):
    user_reviews = train_df[train_df['reviewers'] == user_id][category]
    user_vector = vectorizer.transform(user_reviews)

    most_similar_courses = find_most_similar_courses(user_vector,course_vectors,user_reviews,category)[:n_recommendations]
    
    recommended_courses = []
    for course_id, similarity in most_similar_courses:
        recommended_courses.append(course_id)

    return recommended_courses


def find_most_similar_courses(user_vector,course_vectors,user_reviewed_courses,category):   
    most_similar_courses = []
    for other_course_id in courses.index:
        if other_course_id in user_reviewed_courses:
            continue

        desc = courses[courses.index == other_course_id][category]
        if desc.shape[0] == 0:
            continue

        desc = desc.iloc[0]
        course_vector = course_vectors[other_course_id]
        normalized_user_vector = normalize(user_vector)
        normalized_course_vector = normalize(course_vector)
        similarity = cosine_similarity(normalized_user_vector, normalized_course_vector)
        #similarity = cosine_similarity(user_vector, course_vector)
        most_similar_courses.append((other_course_id, similarity.mean()))
        
    most_similar_courses.sort(key=lambda x: x[1], reverse=True)
    
    return most_similar_courses

def evaluate_model(n_users,n_recommendations, Vectorizer,n_features, category):
    grouped_df = train_df.groupby('reviewers')['course_id'].count()
    filtered_df = grouped_df[grouped_df > 3]
    users = filtered_df.index[:n_users]

    vectorizer = Vectorizer(max_features=n_features)
    vectorizer.fit(courses[category])
    
    course_vectors = {}
    for id,row in courses.iterrows():
        course_vectors[id] = vectorizer.transform([row[category]])
    
            
    hitrate,f1,recall,precision,count = 0,0,0,0,0
    for user_id in users:
        recommendations = make_recommendations(user_id,n_recommendations,vectorizer,course_vectors,category)

        # For metric calculating
        user_reviewed_courses = test_df[test_df['reviewers'] == user_id]['course_id'].unique()
        res = len(set(user_reviewed_courses) & set(recommendations))

        if res > 0:
            hitrate += 1

        all_courses = np.concatenate((recommendations, user_reviewed_courses))
        recommended_vector = [1 if course in recommendations else 0 for course in all_courses]
        taken_vector = [1 if course in user_reviewed_courses else 0 for course in all_courses]

        # Calculate Metrics
        f1 += f1_score(taken_vector, recommended_vector)
        precision += precision_score(taken_vector, recommended_vector)
        recall += recall_score(taken_vector, recommended_vector)

        count +=1
    #print(f"Vectorizer: {Vectorizer.__name__} Features: {n}, category: {category}")
    #print(f"Hit-rate: {(hitrate / count):.3f}, F1: {(f1 / count):.3f}, Precision: {(precision / count):.3f}, Recall: {(recall / count):.3f}")
    return hitrate / count, f1 / count

# Testable parameters
how_many_users_to_test = 30
vectorizers = [TfidfVectorizer]
n_recommendations_list = [5, 10,15]
n_features_list = [1000,10000,100000]
categories = ['skills', 'description']


best_params_hitrate = {}
best_params_f1 = {}

# Initialize dictionaries to keep track of the highest scores for each n_recommendations
highest_hitrate = {}
highest_f1 = {}



# Gridsearch
# Test all combinations of parameters
for n_recommendations in n_recommendations_list:
    # Initialize the highest scores for this n_recommendations
    highest_hitrate[n_recommendations] = 0
    highest_f1[n_recommendations] = 0

    for vectorizer in vectorizers:
        for n_features in n_features_list:
            for category in categories:
                hitrate, f1 = evaluate_model(how_many_users_to_test, n_recommendations, vectorizer, n_features, category)

                # Update the best parameters for hitrate
                if hitrate > highest_hitrate[n_recommendations]:
                    highest_hitrate[n_recommendations] = hitrate
                    best_params_hitrate[n_recommendations] = (vectorizer, n_features, category)

                # Update the best parameters for f1 score
                if f1 > highest_f1[n_recommendations]:
                    highest_f1[n_recommendations] = f1
                    best_params_f1[n_recommendations] = (vectorizer, n_features, category)

    # Print the best parameters for this n_recommendations
    print(f"For n_recommendations = {n_recommendations}:")
    print(f"Hitrate: {highest_hitrate[n_recommendations]}, f1-score: {highest_f1[n_recommendations]}")
    print(f"Best parameters for highest hitrate: Vectorizer = {best_params_hitrate[n_recommendations][0].__name__}, n_features = {best_params_hitrate[n_recommendations][1]}, category = {best_params_hitrate[n_recommendations][2]}")
    print(f"Best parameters for highest f1 score: Vectorizer = {best_params_f1[n_recommendations][0].__name__}, n_features = {best_params_f1[n_recommendations][1]}, category = {best_params_f1[n_recommendations][2]}")
    print('\n')

In [152]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

def make_recommendations(user_id, n_recommendations, model, course_vectors,category):
    user_reviews = train_df[train_df['reviewers'] == user_id][category]
    user_vector = model.infer_vector(user_reviews).reshape(1, -1)

    #print(train_df[train_df['reviewers'] == user_id]['course_id'].unique())
    most_similar_courses = find_most_similar_courses(user_vector, course_vectors, user_reviews,category)[:n_recommendations]
    
    recommended_courses = []
    for course_id, similarity in most_similar_courses:
        #print(course_id)
        recommended_courses.append(course_id)

    #print('\n\n')

    return recommended_courses

def find_most_similar_courses(user_vector, course_vectors, user_reviewed_courses,category):   
    most_similar_courses = []
    for other_course_id in courses.index:
        if other_course_id in user_reviewed_courses:
            continue

        desc = train_df[train_df['course_id'] == other_course_id][category]
        if desc.shape[0] == 0:
            continue

        desc = desc.iloc[0]
        course_vector = course_vectors[other_course_id].reshape(1, -1)

        normalized_user_vector = normalize(user_vector)
        normalized_course_vector = normalize(course_vector)
        similarity = cosine_similarity(normalized_user_vector, normalized_course_vector)
        #similarity = cosine_similarity(user_vector, course_vector)
        most_similar_courses.append((other_course_id, similarity[0]))
        
    most_similar_courses.sort(key=lambda x: x[1], reverse=True)
    
    return most_similar_courses

def evaluate_model(n_users, n_recommendations, vectorizer_model, n_features, category):
    grouped_df = train_df.groupby('reviewers')['course_id'].count()
    filtered_df = grouped_df[grouped_df > 3]
    users = filtered_df.index[:n_users]      
    start_time = time.time()
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(courses[category].tolist())]

    model = vectorizer_model(documents, dm=1, vector_size=n_features, epochs=250)    
    vectors = [model.infer_vector(word_tokenize(doc.lower())) for doc in courses['description'].tolist()]

    course_vectors = {}
    for i, row in enumerate(courses.iterrows()):
        course_vectors[row[0]] = vectors[i]

    hitrate, f1, recall, precision, count = 0, 0, 0, 0, 0
    for user_id in users:
        recommendations = make_recommendations(user_id, n_recommendations, model, course_vectors, category)

        # For metric calculating
        user_reviewed_courses = test_df[test_df['reviewers'] == user_id]['course_id'].unique()
        res = len(set(user_reviewed_courses) & set(recommendations))

        if res > 0:
            hitrate += 1

        all_courses = np.concatenate((recommendations, user_reviewed_courses))
        recommended_vector = [1 if course in recommendations else 0 for course in all_courses]
        taken_vector = [1 if course in user_reviewed_courses else 0 for course in all_courses]

        # Calculate Metrics
        f1 += f1_score(taken_vector, recommended_vector)
        precision += precision_score(taken_vector, recommended_vector)
        recall += recall_score(taken_vector, recommended_vector)

        count +=1
    print(f"Model: {model.__class__.__name__} Features: {n_features}, duration: {(time.time()-start_time):.3f}")
    print(f"Hit-rate: {(hitrate / count):.3f}, F1: {(f1 / count):.3f}, Precision: {(precision / count):.3f}, Recall: {(recall / count):.3f}")
evaluate_model(30, 10, Doc2Vec, 10000, 'description')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


KeyboardInterrupt: 

In [None]:
from sentence_transformers import SentenceTransformer

def make_recommendations(user_id, n_recommendations, model, course_vectors, category):
    user_reviews = train_df[train_df['reviewers'] == user_id][category]
    user_vector = model.encode(user_reviews.tolist())

    most_similar_courses = find_most_similar_courses(user_vector, course_vectors, user_reviews, category)[:n_recommendations]

    recommended_courses = [course_id for course_id, _ in most_similar_courses]

    return recommended_courses


def find_most_similar_courses(user_vector, course_vectors, user_reviewed_courses, category):
    most_similar_courses = []
    for other_course_id in courses.index:
        if other_course_id in user_reviewed_courses:
            continue

        desc = train_df[train_df['course_id'] == other_course_id][category]
        if desc.shape[0] == 0:
            continue

        desc = desc.iloc[0]
        course_vector = course_vectors[other_course_id]  # Use precomputed vectors
        similarity = cosine_similarity(user_vector, [course_vector])[0][0]  # Remove the extra list
        most_similar_courses.append((other_course_id, similarity))

    most_similar_courses.sort(key=lambda x: x[1], reverse=True)

    return most_similar_courses




def evaluate_model(n_users, n_recommendations, model_name, category):
    grouped_df = train_df.groupby('reviewers')['course_id'].count()
    filtered_df = grouped_df[grouped_df > 3]
    users = filtered_df.index[:n_users]

    model = SentenceTransformer(model_name)
    course_vectors = {}
    for id, row in courses.iterrows():
        course_vectors[id] = model.encode([row[category]])[0]

    hitrate, f1, recall, precision, count = 0, 0, 0, 0, 0
    for user_id in users:
        recommendations = make_recommendations(user_id, n_recommendations, model, course_vectors, category)

        # For metric calculating
        user_reviewed_courses = test_df[test_df['reviewers'] == user_id]['course_id'].unique()
        res = len(set(user_reviewed_courses) & set(recommendations))

        if res > 0:
            hitrate += 1

        all_courses = np.concatenate((recommendations, user_reviewed_courses))
        recommended_vector = [1 if course in recommendations else 0 for course in all_courses]
        taken_vector = [1 if course in user_reviewed_courses else 0 for course in all_courses]

        # Calculate Metrics
        f1 += f1_score(taken_vector, recommended_vector)
        precision += precision_score(taken_vector, recommended_vector)
        recall += recall_score(taken_vector, recommended_vector)

        count +=1
    print(f"Model: {model_name} Features: {n_features}")
    print(f"Hit-rate: {(hitrate / count):.3f}, F1: {(f1 / count):.3f}, Precision: {(precision / count):.3f}, Recall: {(recall / count):.3f}")


evaluate_model(30, 5, 'paraphrase-distilroberta-base-v1','description')
evaluate_model(30, 5, 'paraphrase-multilingual-MiniLM-L12-v2','description')
evaluate_model(30, 5, 'distiluse-base-multilingual-cased-v2','description')
evaluate_model(30, 5, 'all-mpnet-base-v2','description')

evaluate_model(30, 5, 'paraphrase-distilroberta-base-v1','skills')
evaluate_model(30, 5, 'paraphrase-multilingual-MiniLM-L12-v2','skills')
evaluate_model(30, 5, 'distiluse-base-multilingual-cased-v2','skills')
evaluate_model(30, 5, 'all-mpnet-base-v2','skills')


Model: paraphrase-distilroberta-base-v1 Features: 100000
Hit-rate: 0.100, F1: 0.041, Precision: 0.028, Recall: 0.077
Model: paraphrase-multilingual-MiniLM-L12-v2 Features: 100000
Hit-rate: 0.100, F1: 0.034, Precision: 0.023, Recall: 0.062
Model: distiluse-base-multilingual-cased-v2 Features: 100000
Hit-rate: 0.100, F1: 0.035, Precision: 0.023, Recall: 0.071
Model: all-mpnet-base-v2 Features: 100000
Hit-rate: 0.067, F1: 0.032, Precision: 0.022, Recall: 0.060


In [161]:
# mixing the 
def make_recommendations(user_id,n_recommendations,vectorizer,course_vectors):
    user_reviews_skills = train_df[train_df['reviewers'] == user_id]['skills']
    user_reviews_description = train_df[train_df['reviewers'] == user_id]['description']
    user_reviews_combined = user_reviews_skills + ' ' + user_reviews_description
    user_vector = vectorizer.transform(user_reviews_combined)

    most_similar_courses = find_most_similar_courses(user_vector,course_vectors,user_reviews_combined)[:n_recommendations]
    
    recommended_courses = []
    for course_id, similarity in most_similar_courses:
        recommended_courses.append(course_id)

    return recommended_courses


def find_most_similar_courses(user_vector,course_vectors,user_reviewed_courses):   
    most_similar_courses = []
    for other_course_id in courses.index:
        if other_course_id in user_reviewed_courses:
            continue

        course_vector = course_vectors[other_course_id]
        normalized_user_vector = normalize(user_vector)
        normalized_course_vector = normalize(course_vector)
        similarity = cosine_similarity(normalized_user_vector, normalized_course_vector)
        #similarity = cosine_similarity(user_vector, course_vector)
        most_similar_courses.append((other_course_id, similarity.mean()))
        
    most_similar_courses.sort(key=lambda x: x[1], reverse=True)
    
    return most_similar_courses

def evaluate_model(n_users,n_recommendations, Vectorizer,n_features, category):
    grouped_df = train_df.groupby('reviewers')['course_id'].count()
    filtered_df = grouped_df[grouped_df > 3]
    users = filtered_df.index[:n_users]

    vectorizer = Vectorizer(max_features=n_features)
    vectorizer.fit(courses['description']+courses['skills'])
    
    course_vectors = {}
    for id,row in courses.iterrows():
        course_vectors[id] = vectorizer.transform([row['description'] + row['skills']])
    

    hitrate,f1,recall,precision,count = 0,0,0,0,0
    for user_id in users:
        recommendations = make_recommendations(user_id,n_recommendations,vectorizer,course_vectors)

        # For metric calculating
        user_reviewed_courses = test_df[test_df['reviewers'] == user_id]['course_id'].unique()
        res = len(set(user_reviewed_courses) & set(recommendations))

        if res > 0:
            hitrate += 1

        all_courses = np.concatenate((recommendations, user_reviewed_courses))
        recommended_vector = [1 if course in recommendations else 0 for course in all_courses]
        taken_vector = [1 if course in user_reviewed_courses else 0 for course in all_courses]

        # Calculate Metrics
        f1 += f1_score(taken_vector, recommended_vector)
        precision += precision_score(taken_vector, recommended_vector)

        count +=1

    return hitrate / count, f1 / count

# Testable parameters
how_many_users_to_test = 50
vectorizers = [TfidfVectorizer]
n_recommendations_list = [5, 10,15]
n_features_list = [1000,10000,100000]
categories = ['skills', 'description']


best_params_hitrate = {}
best_params_f1 = {}

# Initialize dictionaries to keep track of the highest scores for each n_recommendations
highest_hitrate = {}
highest_f1 = {}



# Gridsearch
# Test all combinations of parameters
for n_recommendations in n_recommendations_list:
    # Initialize the highest scores for this n_recommendations
    highest_hitrate[n_recommendations] = 0
    highest_f1[n_recommendations] = 0

    for vectorizer in vectorizers:
        for n_features in n_features_list:
            for category in categories:
                hitrate, f1 = evaluate_model(how_many_users_to_test, n_recommendations, vectorizer, n_features, category)

                # Update the best parameters for hitrate
                if hitrate > highest_hitrate[n_recommendations]:
                    highest_hitrate[n_recommendations] = hitrate
                    best_params_hitrate[n_recommendations] = (vectorizer, n_features, category)

                # Update the best parameters for f1 score
                if f1 > highest_f1[n_recommendations]:
                    highest_f1[n_recommendations] = f1
                    best_params_f1[n_recommendations] = (vectorizer, n_features, category)

    # Print the best parameters for this n_recommendations
    print(f"For n_recommendations = {n_recommendations}:")
    print(f"Hitrate: {highest_hitrate[n_recommendations]}, f1-score: {highest_f1[n_recommendations]}")
    print(f"Best parameters for highest hitrate: Vectorizer = {best_params_hitrate[n_recommendations][0].__name__}, n_features = {best_params_hitrate[n_recommendations][1]}")
    print(f"Best parameters for highest f1 score: Vectorizer = {best_params_f1[n_recommendations][0].__name__}, n_features = {best_params_f1[n_recommendations][1]}")
    print('\n')

For n_recommendations = 5:
Hitrate: 0.26, f1-score: 0.11368919968919969
Best parameters for highest hitrate: Vectorizer = TfidfVectorizer, n_features = 10000
Best parameters for highest f1 score: Vectorizer = TfidfVectorizer, n_features = 10000


For n_recommendations = 10:
Hitrate: 0.42, f1-score: 0.12718724412842058
Best parameters for highest hitrate: Vectorizer = TfidfVectorizer, n_features = 10000
Best parameters for highest f1 score: Vectorizer = TfidfVectorizer, n_features = 10000


For n_recommendations = 15:
Hitrate: 0.5, f1-score: 0.12378236032972875
Best parameters for highest hitrate: Vectorizer = TfidfVectorizer, n_features = 10000
Best parameters for highest f1 score: Vectorizer = TfidfVectorizer, n_features = 10000


