In [132]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import os
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score, precision_score, recall_score
import time
wd = %pwd
if wd.split('\\')[-1] == 'notebooks':
    %cd ..

from coursemate.dataset import Dataset
nltk.download('wordnet', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

True

In [4]:
dataset = Dataset('data/Coursera_courses.csv', 'data/Coursera.csv', 'data/Coursera_reviews.csv')
dataset.set_interaction_counts(3, 50)
dataset.set_train_test_split_by_ratings(ratio=0.8)
training_matrix, test_matrix = dataset.get_train_test_matrices()

Loading Coursera courses...


Loading Coursera reviews...
Segmenting out students with less than 3 or more than 50 reviews...
Setting the train-test split by rating...


174219it [00:10, 16201.59it/s]


Computing the training and test rating matrix...


128771it [00:09, 13868.40it/s]
45448it [00:03, 13526.30it/s]


In [5]:
train_df = dataset.train_ratings.merge(dataset.df_courses, how='left',on='course_id')
test_df = dataset.test_ratings.merge(dataset.df_courses, how='left',on='course_id')
users = dataset.student_set.copy(deep=True)
courses = dataset.course_set.copy(deep=True)

In [6]:
print(len(train_df), len(test_df))
print(len(pd.unique(train_df['reviewers'])),len(pd.unique(test_df['reviewers'])))
print(len(users), len(courses))

128771 45448
30719 30719
30719 468


In [7]:
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def process_skills(skill_text):
    skills = set(skill_text.replace(')','').replace('(','').replace('-',' ').lower().split())
    return ' '.join(skills)

def process_description(description):
    description = description.lower()
    description = re.sub(r'[^\w\s]', '', description)
    tokens = word_tokenize(description)
    tokens = [word for word in tokens if word not in stop_words]
    
    #tokens = [ps.stem(word) for word in tokens]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

def process_reviewers(reviewer):
    reviewer = reviewer.lower()
    reviewer = re.sub(r'[^\w\s]', '', reviewer).replace('by ', '').strip()
    return reviewer

train_df['reviewers'] = train_df['reviewers'].apply(process_reviewers)
test_df['reviewers'] = test_df['reviewers'].apply(process_reviewers)

train_df['skills'] = train_df['skills'].apply(process_skills)
test_df['skills'] = test_df['skills'].apply(process_skills)

train_df['description'] = train_df['description'].apply(process_description)
test_df['description'] = test_df['description'].apply(process_description)

courses['description'] = courses['description'].apply(process_description)

In [98]:
def make_recommendations(user_id,n_recommendations,vectorizer,course_vectors,category):
    user_reviews = train_df[train_df['reviewers'] == user_id][category]
    user_vector = vectorizer.transform(user_reviews)

    most_similar_courses = find_most_similar_courses(user_vector,course_vectors,user_reviews,category)[:n_recommendations]
    
    recommended_courses = []
    for course_id, similarity in most_similar_courses:
        recommended_courses.append(course_id)

    return recommended_courses


def find_most_similar_courses(user_vector,course_vectors,user_reviewed_courses,category):   
    most_similar_courses = []
    for other_course_id in courses.index:
        if other_course_id in user_reviewed_courses:
            continue

        desc = train_df[train_df['course_id'] == other_course_id][category]
        if desc.shape[0] == 0:
            continue

        desc = desc.iloc[0]
        course_vector = course_vectors[other_course_id]
        normalized_user_vector = normalize(user_vector)
        normalized_course_vector = normalize(course_vector)
        similarity = cosine_similarity(normalized_user_vector, normalized_course_vector)
        #similarity = cosine_similarity(user_vector, course_vector)
        most_similar_courses.append((other_course_id, similarity.mean()))
        
    most_similar_courses.sort(key=lambda x: x[1], reverse=True)
    
    return most_similar_courses

def evaluate_model(n_users,n_recommendations, Vectorizer,n_features, category):
    grouped_df = train_df.groupby('reviewers')['course_id'].count()
    filtered_df = grouped_df[grouped_df > 3]
    users = filtered_df.index[:n_users]

    vectorizer = Vectorizer(max_features=n_features)
    vectorizer.fit(courses[category])
    
    course_vectors = {}
    for id,row in courses.iterrows():
        course_vectors[id] = vectorizer.transform([row[category]])
    
            
    hitrate,f1,recall,precision,count = 0,0,0,0,0
    for user_id in users:
        recommendations = make_recommendations(user_id,n_recommendations,vectorizer,course_vectors,category)

        # For metric calculating
        user_reviewed_courses = test_df[test_df['reviewers'] == user_id]['course_id'].unique()
        res = len(set(user_reviewed_courses) & set(recommendations))

        if res > 0:
            hitrate += 1

        all_courses = np.concatenate((recommendations, user_reviewed_courses))
        recommended_vector = [1 if course in recommendations else 0 for course in all_courses]
        taken_vector = [1 if course in user_reviewed_courses else 0 for course in all_courses]

        # Calculate Metrics
        f1 += f1_score(taken_vector, recommended_vector)
        precision += precision_score(taken_vector, recommended_vector)
        recall += recall_score(taken_vector, recommended_vector)

        count +=1
    #print(f"Vectorizer: {Vectorizer.__name__} Features: {n}, category: {category}")
    #print(f"Hit-rate: {(hitrate / count):.3f}, F1: {(f1 / count):.3f}, Precision: {(precision / count):.3f}, Recall: {(recall / count):.3f}")
    return hitrate / count, f1 / count

# Testable parameters
how_many_users_to_test = 30
vectorizers = [TfidfVectorizer]
n_recommendations_list = [5, 10,15]
n_features_list = [1000,10000,100000]
categories = ['skills', 'description']


best_params_hitrate = {}
best_params_f1 = {}

# Initialize dictionaries to keep track of the highest scores for each n_recommendations
highest_hitrate = {}
highest_f1 = {}



# Gridsearch
# Test all combinations of parameters
for n_recommendations in n_recommendations_list:
    # Initialize the highest scores for this n_recommendations
    highest_hitrate[n_recommendations] = 0
    highest_f1[n_recommendations] = 0

    for vectorizer in vectorizers:
        for n_features in n_features_list:
            for category in categories:
                hitrate, f1 = evaluate_model(how_many_users_to_test, n_recommendations, vectorizer, n_features, category)

                # Update the best parameters for hitrate
                if hitrate > highest_hitrate[n_recommendations]:
                    highest_hitrate[n_recommendations] = hitrate
                    best_params_hitrate[n_recommendations] = (vectorizer, n_features, category)

                # Update the best parameters for f1 score
                if f1 > highest_f1[n_recommendations]:
                    highest_f1[n_recommendations] = f1
                    best_params_f1[n_recommendations] = (vectorizer, n_features, category)

    # Print the best parameters for this n_recommendations
    print(f"For n_recommendations = {n_recommendations}:")
    print(f"Hitrate: {highest_hitrate[n_recommendations]}, f1-score: {highest_f1[n_recommendations]}")
    print(f"Best parameters for highest hitrate: Vectorizer = {best_params_hitrate[n_recommendations][0].__name__}, n_features = {best_params_hitrate[n_recommendations][1]}, category = {best_params_hitrate[n_recommendations][2]}")
    print(f"Best parameters for highest f1 score: Vectorizer = {best_params_f1[n_recommendations][0].__name__}, n_features = {best_params_f1[n_recommendations][1]}, category = {best_params_f1[n_recommendations][2]}")
    print('\n')

For n_recommendations = 5:
Hitrate: 0.2, f1-score: 0.08958300958300956
Best parameters for highest hitrate: Vectorizer = TfidfVectorizer, n_features = 1000, category = description
Best parameters for highest f1 score: Vectorizer = TfidfVectorizer, n_features = 1000, category = description


For n_recommendations = 10:
Hitrate: 0.36666666666666664, f1-score: 0.1148580286815581
Best parameters for highest hitrate: Vectorizer = TfidfVectorizer, n_features = 10000, category = description
Best parameters for highest f1 score: Vectorizer = TfidfVectorizer, n_features = 10000, category = description


For n_recommendations = 15:
Hitrate: 0.4, f1-score: 0.10369570390623023
Best parameters for highest hitrate: Vectorizer = TfidfVectorizer, n_features = 10000, category = description
Best parameters for highest f1 score: Vectorizer = TfidfVectorizer, n_features = 10000, category = description




In [99]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

def make_recommendations(user_id, n_recommendations, model, course_vectors,category):
    user_reviews = train_df[train_df['reviewers'] == user_id][category]
    user_vector = model.infer_vector(user_reviews).reshape(1, -1)

    #print(train_df[train_df['reviewers'] == user_id]['course_id'].unique())
    most_similar_courses = find_most_similar_courses(user_vector, course_vectors, user_reviews,category)[:n_recommendations]
    
    recommended_courses = []
    for course_id, similarity in most_similar_courses:
        #print(course_id)
        recommended_courses.append(course_id)

    #print('\n\n')

    return recommended_courses

def find_most_similar_courses(user_vector, course_vectors, user_reviewed_courses,category):   
    most_similar_courses = []
    for other_course_id in courses.index:
        if other_course_id in user_reviewed_courses:
            continue

        desc = train_df[train_df['course_id'] == other_course_id][category]
        if desc.shape[0] == 0:
            continue

        desc = desc.iloc[0]
        course_vector = course_vectors[other_course_id].reshape(1, -1)

        normalized_user_vector = normalize(user_vector)
        normalized_course_vector = normalize(course_vector)
        similarity = cosine_similarity(normalized_user_vector, normalized_course_vector)
        #similarity = cosine_similarity(user_vector, course_vector)
        most_similar_courses.append((other_course_id, similarity[0]))
        
    most_similar_courses.sort(key=lambda x: x[1], reverse=True)
    
    return most_similar_courses

def evaluate_model(n_users, n_recommendations, vectorizer_model, n_features, category):
    grouped_df = train_df.groupby('reviewers')['course_id'].count()
    filtered_df = grouped_df[grouped_df > 3]
    users = filtered_df.index[:n_users]      
    start_time = time.time()
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(courses[category].tolist())]

    model = vectorizer_model(documents, dm=1, vector_size=n_features, epochs=250)    
    vectors = [model.infer_vector(word_tokenize(doc.lower())) for doc in courses['description'].tolist()]

    course_vectors = {}
    for i, row in enumerate(courses.iterrows()):
        course_vectors[row[0]] = vectors[i]

    hitrate, f1, recall, precision, count = 0, 0, 0, 0, 0
    for user_id in users:
        recommendations = make_recommendations(user_id, n_recommendations, model, course_vectors, category)

        # For metric calculating
        user_reviewed_courses = test_df[test_df['reviewers'] == user_id]['course_id'].unique()
        res = len(set(user_reviewed_courses) & set(recommendations))

        if res > 0:
            hitrate += 1

        all_courses = np.concatenate((recommendations, user_reviewed_courses))
        recommended_vector = [1 if course in recommendations else 0 for course in all_courses]
        taken_vector = [1 if course in user_reviewed_courses else 0 for course in all_courses]

        # Calculate Metrics
        f1 += f1_score(taken_vector, recommended_vector)
        precision += precision_score(taken_vector, recommended_vector)
        recall += recall_score(taken_vector, recommended_vector)

        count +=1
    print(f"Model: {model.__class__.__name__} Features: {n_features}, duration: {(time.time()-start_time):.3f}")
    print(f"Hit-rate: {(hitrate / count):.3f}, F1: {(f1 / count):.3f}, Precision: {(precision / count):.3f}, Recall: {(recall / count):.3f}")
evaluate_model(30, 10, Doc2Vec, 0000, 'description')

Model: Doc2Vec Features: 10000, duration: 453.285
Hit-rate: 0.000, F1: 0.000, Precision: 0.000, Recall: 0.000


In [135]:
from sentence_transformers import SentenceTransformer

def make_recommendations(user_id, n_recommendations, model, course_vectors, category):
    user_reviews = train_df[train_df['reviewers'] == user_id][category]
    user_vector = model.encode(user_reviews.tolist())

    most_similar_courses = find_most_similar_courses(user_vector, course_vectors, user_reviews, category)[:n_recommendations]

    recommended_courses = [course_id for course_id, _ in most_similar_courses]

    return recommended_courses


def find_most_similar_courses(user_vector, course_vectors, user_reviewed_courses, category):
    most_similar_courses = []
    for other_course_id in courses.index:
        if other_course_id in user_reviewed_courses:
            continue

        desc = train_df[train_df['course_id'] == other_course_id][category]
        if desc.shape[0] == 0:
            continue

        desc = desc.iloc[0]
        course_vector = course_vectors[other_course_id]  # Use precomputed vectors
        similarity = cosine_similarity(user_vector, [course_vector])[0][0]  # Remove the extra list
        most_similar_courses.append((other_course_id, similarity))

    most_similar_courses.sort(key=lambda x: x[1], reverse=True)

    return most_similar_courses




def evaluate_model(n_users, n_recommendations, model_name, category):
    grouped_df = train_df.groupby('reviewers')['course_id'].count()
    filtered_df = grouped_df[grouped_df > 3]
    users = filtered_df.index[:n_users]

    model = SentenceTransformer(model_name)
    course_vectors = {}
    for id, row in courses.iterrows():
        course_vectors[id] = model.encode([row[category]])[0]

    hitrate, f1, recall, precision, count = 0, 0, 0, 0, 0
    for user_id in users:
        recommendations = make_recommendations(user_id, n_recommendations, model, course_vectors, category)

        # For metric calculating
        user_reviewed_courses = test_df[test_df['reviewers'] == user_id]['course_id'].unique()
        res = len(set(user_reviewed_courses) & set(recommendations))

        if res > 0:
            hitrate += 1

        all_courses = np.concatenate((recommendations, user_reviewed_courses))
        recommended_vector = [1 if course in recommendations else 0 for course in all_courses]
        taken_vector = [1 if course in user_reviewed_courses else 0 for course in all_courses]

        # Calculate Metrics
        f1 += f1_score(taken_vector, recommended_vector)
        precision += precision_score(taken_vector, recommended_vector)
        recall += recall_score(taken_vector, recommended_vector)

        count +=1
    print(f"Model: {model.__class__.__name__} Features: {n_features}")
    print(f"Hit-rate: {(hitrate / count):.3f}, F1: {(f1 / count):.3f}, Precision: {(precision / count):.3f}, Recall: {(recall / count):.3f}")


# evaluate_model(30, 10, SentenceTransformer('paraphrase-distilroberta-base-v1'),'description')
# evaluate_model(30, 10, SentenceTransformer('paraphrase-distilroberta-base-v1'),'description')
# evaluate_model(30, 10, SentenceTransformer('paraphrase-distilroberta-base-v1'),'description')
# evaluate_model(30, 10, SentenceTransformer('paraphrase-distilroberta-base-v1'),'description')
model = SentenceTransformer('paraphrase-distilroberta-base-v1')
#print(model._model_card_text)
print(model._model_card_text)
model_path = model.save_path
model_name = os.path.basename(model_path)
print("Pre-trained model name:", model_name)

---
pipeline_tag: sentence-similarity
license: apache-2.0
tags:
- sentence-transformers
- feature-extraction
- sentence-similarity
- transformers
---

# sentence-transformers/paraphrase-distilroberta-base-v1

This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 768 dimensional dense vector space and can be used for tasks like clustering or semantic search.



## Usage (Sentence-Transformers)

Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:

```
pip install -U sentence-transformers
```

Then you can use the model like this:

```python
from sentence_transformers import SentenceTransformer
sentences = ["This is an example sentence", "Each sentence is converted"]

model = SentenceTransformer('sentence-transformers/paraphrase-distilroberta-base-v1')
embeddings = model.encode(sentences)
print(embeddings)
```



## Usage (HuggingFace Transformers)
Without [sentence-transformers](https://www.SB

AttributeError: 'SentenceTransformer' object has no attribute 'save_path'