In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score, precision_score, recall_score
import time
wd = %pwd
if wd.split('\\')[-1] == 'notebooks':
    %cd ..

from coursemate.dataset import Dataset
# Fetch the NLTK resources this notebook relies on.
nltk.download('wordnet', quiet=True)    # presumably the data behind WordNetLemmatizer
nltk.download('stopwords', quiet=True)  # corpus read by stopwords.words('english')
nltk.download('punkt', quiet=True)      # presumably the tokenizer models behind word_tokenize

True

In [2]:
# Load the Coursera course and review CSVs through the project's Dataset wrapper.
dataset = Dataset('data/Coursera_courses.csv', 'data/Coursera.csv', 'data/Coursera_reviews.csv')
# Per the run log, this drops students with fewer than 3 or more than 50 reviews.
dataset.set_interaction_counts(3, 50)
# Split "by rating" (per the run log); ratio=0.8 is presumably the training share — TODO confirm.
dataset.set_train_test_split_by_ratings(ratio=0.8)
# Rating matrices for both splits; they are not used again in the cells visible here.
training_matrix, test_matrix = dataset.get_train_test_matrices()

Loading Coursera courses...
Loading Coursera reviews...
Segmenting out students with less than 3 or more than 50 reviews...
Setting the train-test split by rating...


174219it [00:12, 14431.47it/s]


Computing the training and test rating matrix...


128771it [00:08, 14608.17it/s]
45448it [00:03, 13238.75it/s]


In [3]:
# Attach the course columns to every rating row. A left join keeps each rating
# even when its course_id has no match in the course table.
train_df = pd.merge(dataset.train_ratings, dataset.df_courses, on='course_id', how='left')
test_df = pd.merge(dataset.test_ratings, dataset.df_courses, on='course_id', how='left')

# Work on deep copies so edits made in this notebook do not touch the Dataset object.
users, courses = dataset.student_set.copy(deep=True), dataset.course_set.copy(deep=True)

In [4]:
# Sanity check on the split: row counts, distinct reviewers per split, catalogue sizes.
print(f"{len(train_df)} {len(test_df)}")
print(f"{len(train_df['reviewers'].unique())} {len(test_df['reviewers'].unique())}")
print(f"{len(users)} {len(courses)}")

128771 45448
30719 30719
30719 468


In [5]:
# NLP helpers created once and shared by the preprocessing functions below.
ps = PorterStemmer()  # not called by any active code in the cells visible here
lemmatizer = WordNetLemmatizer()
# Stored as a set so the per-token "is this a stop word?" test is a hash lookup.
stop_words = set(stopwords.words('english'))

def process_skills(skill_text):
    """Normalise a skills string into unique, lower-case, space-separated tokens.

    Parentheses are dropped, hyphens become spaces, the text is lower-cased and
    split on whitespace. Duplicate tokens are removed while keeping the order of
    first occurrence, so the result is the same on every run. (Joining a plain
    ``set`` yields an order that can change between interpreter sessions
    because string hashing is randomised.)

    Args:
        skill_text (str): Raw skills text.

    Returns:
        str: The de-duplicated tokens joined by single spaces; ``''`` for
        empty or whitespace-only input.
    """
    cleaned = skill_text.replace(')', '').replace('(', '').replace('-', ' ').lower()
    # dict.fromkeys de-duplicates while preserving insertion order.
    return ' '.join(dict.fromkeys(cleaned.split()))

def process_description(description):
    """Turn a course description into a cleaned, lemmatised token string.

    Steps, in order: lower-case the text, delete every character that is
    neither a word character nor whitespace, tokenise with NLTK's
    ``word_tokenize``, drop tokens found in the module-level ``stop_words``
    set, and lemmatise the remainder with the module-level ``lemmatizer``.

    Args:
        description (str): Raw description text.

    Returns:
        str: The surviving lemmas joined by single spaces.
    """
    text = re.sub(r'[^\w\s]', '', description.lower())
    lemmas = (
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token not in stop_words
    )
    return ' '.join(lemmas)

def process_reviewers(reviewer):
    """Normalise a reviewer label into a lower-case name without the "By " prefix.

    The text is lower-cased, punctuation (anything that is neither a word
    character nor whitespace) is removed, a single leading ``by`` followed by
    whitespace is stripped, and surrounding whitespace is trimmed.

    Only the leading prefix is removed. Replacing every ``'by '`` substring
    would also eat the end of names such as "Abby K" or "Kirby S", which can
    merge distinct reviewers into one identifier.

    Args:
        reviewer (str): Raw reviewer text, e.g. ``"By John D."``.

    Returns:
        str: The normalised reviewer name, e.g. ``"john d"``.
    """
    cleaned = re.sub(r'[^\w\s]', '', reviewer.lower())
    # Anchor at the start so a "by " inside the name is left alone.
    return re.sub(r'^\s*by\s+', '', cleaned).strip()

# Clean the text columns of both rating frames with the helpers defined above.
for _frame in (train_df, test_df):
    for _column, _cleaner in (
        ('reviewers', process_reviewers),
        ('skills', process_skills),
        ('description', process_description),
    ):
        _frame[_column] = _frame[_column].apply(_cleaner)

# The course catalogue only has its description column cleaned.
courses['description'] = courses['description'].apply(process_description)

In [112]:
def make_recommendations(user_id,vectorizer,course_vectors):
    """Recommend courses for one user from the descriptions of courses they reviewed.

    The user's profile is the vectorised description of every training row
    (``train_df``) that belongs to ``user_id``. Candidates are ranked by
    ``find_most_similar_courses``.

    The set of course ids the user already reviewed is passed to the ranking
    step so those courses are excluded. Passing the Series of descriptions
    instead makes the ``in`` test compare against the Series' index labels,
    so nothing is ever excluded.

    Args:
        user_id: Normalised reviewer identifier, as stored in ``train_df['reviewers']``.
        vectorizer: Fitted text vectorizer exposing ``transform``.
        course_vectors: Mapping of course id to that course's description vector.

    Returns:
        list: Recommended course ids, most similar first; empty when the user
        has no rows in ``train_df``.
    """
    user_rows = train_df[train_df['reviewers'] == user_id]
    if user_rows.empty:
        # No history means no profile vector to compare against.
        return []

    user_vector = vectorizer.transform(user_rows['description'])
    reviewed_course_ids = set(user_rows['course_id'])

    most_similar_courses = find_most_similar_courses(user_vector,course_vectors,reviewed_course_ids)
    return [course_id for course_id, _similarity in most_similar_courses]


def find_most_similar_courses(user_vector,course_vectors,user_reviewed_courses,top_n=5):
    """Rank catalogue courses by mean cosine similarity to a user's profile.

    Every course id in the global ``courses`` index is a candidate unless it
    is listed in ``user_reviewed_courses`` or has no rows in the global
    ``train_df``. A course's score is the mean of the cosine similarities
    between each row of ``user_vector`` and the course's vector.

    Args:
        user_vector: Matrix with one row per description the user reviewed.
        course_vectors: Mapping of course id to a single-row description vector.
        user_reviewed_courses: Iterable of course ids to exclude. It is turned
            into a set of its values, because ``in`` on a pandas Series tests
            the index labels rather than the values.
        top_n (int): Number of results to return. Defaults to 5.

    Returns:
        list[tuple]: Up to ``top_n`` ``(course_id, mean_similarity)`` pairs,
        highest similarity first.
    """
    excluded = set(user_reviewed_courses)
    # Compute once per call instead of filtering train_df for every candidate.
    courses_in_training = set(train_df['course_id'])

    most_similar_courses = []
    for other_course_id in courses.index:
        if other_course_id in excluded or other_course_id not in courses_in_training:
            continue

        similarity = cosine_similarity(user_vector, course_vectors[other_course_id])
        most_similar_courses.append((other_course_id, similarity.mean()))

    most_similar_courses.sort(key=lambda x: x[1], reverse=True)

    return most_similar_courses[:top_n]

def evaluate_model(n_users,Vectorizer,n_features):
    """Evaluate the content-based recommender and print averaged ranking metrics.

    For each vocabulary size the vectorizer is fitted on the catalogue
    descriptions (global ``courses``). Recommendations are produced for the
    first ``n_users`` reviewers with more than 3 rows in the global
    ``train_df`` and compared with the courses the same reviewer has in the
    global ``test_df``.

    Per-user metrics are computed on sets of course ids:
    precision = hits / recommended, recall = hits / test courses, and F1 is
    their harmonic mean (0 when undefined). Scoring a concatenation of both
    lists would count every hit twice and inflate all three numbers.

    Args:
        n_users (int): Maximum number of reviewers to evaluate.
        Vectorizer: Vectorizer class accepting ``max_features`` and providing
            ``fit_transform`` / ``transform`` (e.g. CountVectorizer).
        n_features: One vocabulary size (int or None) or an iterable of sizes.

    Returns:
        None. One summary is printed per vocabulary size.
    """
    reviews_per_user = train_df.groupby('reviewers')['course_id'].count()
    eval_users = reviews_per_user[reviews_per_user > 3].index[:n_users]
    if n_features is None or isinstance(n_features, int):
        n_features = [n_features]

    for n in n_features:
        vectorizer = Vectorizer(max_features=n)
        # Vectorise the catalogue in one call, then expose one single-row
        # matrix per course id (same mapping as transforming row by row).
        course_matrix = vectorizer.fit_transform(courses['description'])
        course_vectors = {
            course_id: course_matrix[position]
            for position, course_id in enumerate(courses.index)
        }

        hitrate,f1,recall,precision,count = 0,0,0,0,0
        start_time = time.time()
        for user_id in eval_users:
            recommended = set(make_recommendations(user_id,vectorizer,course_vectors))
            taken = set(test_df.loc[test_df['reviewers'] == user_id, 'course_id'])
            hits = len(recommended & taken)

            if hits > 0:
                hitrate += 1

            user_precision = hits / len(recommended) if recommended else 0.0
            user_recall = hits / len(taken) if taken else 0.0
            denominator = user_precision + user_recall
            user_f1 = 2 * user_precision * user_recall / denominator if denominator else 0.0

            f1 += user_f1
            precision += user_precision
            recall += user_recall

            count +=1

        if count == 0:
            # Avoid dividing by zero when no reviewer passes the activity filter.
            print(f"Vectorizer: {Vectorizer.__name__} Features: {n}, no users to evaluate")
            continue

        print(f"Vectorizer: {Vectorizer.__name__} Features: {n}, duration: {(time.time()-start_time):.3f}")
        print(f"Hit-rate: {(hitrate / count):.3f}, F1: {(f1 / count):.3f}, Precision: {(precision / count):.3f}, Recall: {(recall / count):.3f}")


In [113]:
# Evaluate CountVectorizer-based profiles on up to 30 reviewers with a
# vocabulary capped at 1000 terms.
# NOTE(review): the recorded output of this cell looks like it came from an
# earlier version of evaluate_model with debug prints enabled — re-run to refresh.
evaluate_model(30,CountVectorizer,[1000])
# Disabled alternative: TF-IDF swept over several vocabulary sizes.
#evaluate_model(30,TfidfVectorizer,[100,500,1000,5000])

Fitting 0.080
remote-team-management
social-media-marketing-introduction
attention-models-in-nlp
introduction-to-data-analytics
covid-19-contact-tracing-for-nursing-professionals
building-modern-python-applications-on-aws
the-business-of-product-management-one
False
Transform course vectors 27.638


In [77]:
# Timing comparison: sentence-transformer embeddings vs. TF-IDF vectors for
# the cleaned course descriptions. Each print shows elapsed seconds.
start_time = time.time()
model = SentenceTransformer('all-MiniLM-L6-v2')

# Descriptions to encode
desc = courses['description']

# encode() is given a plain list of strings, so it never has to index into a
# pandas Series whose labels are course ids.
embeddings = model.encode(desc.tolist())
print(time.time()-start_time)

start_time = time.time()
vectorizer = TfidfVectorizer()
vectorizer.fit(desc)
course_vectors = {}
# The loop variable is not called `id`: at module level that would shadow the
# builtin id() for every later cell.
for course_id, course_text in desc.items():
    course_vectors[course_id] = vectorizer.transform([course_text])
print(time.time()-start_time)

8.499151706695557
0.6956300735473633


In [78]:
# Compare the two representations: number of entries in each, then their container types.
print(f"{len(embeddings)} {len(course_vectors)}")
print(f"{type(embeddings)} {type(course_vectors)}")

468 468
<class 'numpy.ndarray'> <class 'dict'>
