In [1]:
# Install the scikit-surprise library using pip

pip install scikit-surprise



In [2]:
# Import the required libraries and modules

import numpy as np
import pandas as pd
from surprise import Dataset,Reader

In [3]:
# Read the 'jokes-data (1).csv' file and load its contents into a pandas DataFrame (df)

In [4]:
df=pd.read_csv('/content/jokes-data (1).csv')

In [5]:
df.head()

Unnamed: 0,id,user_id,joke_id,Rating
0,31030_110,31030,110,2.75
1,16144_109,16144,109,5.094
2,23098_6,23098,6,-6.438
3,14273_86,14273,86,4.406
4,18419_134,18419,134,9.375


In [6]:
df.shape

(1092059, 4)

In [7]:
df.dtypes

id          object
user_id      int64
joke_id      int64
Rating     float64
dtype: object

In [8]:
df.isna().sum()

id         0
user_id    0
joke_id    0
Rating     0
dtype: int64

In [9]:
df['user_id'].nunique()

40863

In [10]:
df['joke_id'].nunique()

139

In [11]:
df['Rating'].nunique()

641

In [12]:
df.describe()

Unnamed: 0,user_id,joke_id,Rating
count,1092059.0,1092059.0,1092059.0
mean,20683.56,64.02266,1.758394
std,11830.32,44.11652,5.23086
min,1.0,1.0,-10.0
25%,10412.0,22.0,-1.719
50%,21308.0,62.0,2.344
75%,30784.0,104.0,5.781
max,40863.0,139.0,10.0


In [13]:
# Create a Reader object to specify the rating scale for the recommendation system

In [14]:
reader = Reader(rating_scale=(-10, 10))

In [15]:
# Load the data from the DataFrame into a Surprise dataset

In [16]:
data= Dataset.load_from_df(df[['user_id', 'joke_id', 'Rating']], reader)

In [17]:
from surprise.model_selection import train_test_split

In [18]:
# Split the Surprise dataset into training and testing sets

trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Collaborative Filtering.

## SVD




In [19]:
# Import the SVD recommendation algorithms from Surprise
from surprise import SVD

# Import the accuracy module from Surprise
from surprise import accuracy


In [20]:
# Create an SVD model
model = SVD()

In [21]:
# Train the model on the training data
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7aaac2883130>

### Single RMSE Evaluation:

In [22]:
# Make predictions on the test data
predictions = model.test(testset)

In [23]:
# Calculate RMSE
rmse = accuracy.rmse(predictions)

RMSE: 4.2997


## Cross-validation

In [25]:
from surprise.model_selection import cross_validate

In [30]:
cross_validate(model,data,measures=['RMSE'],cv=5)

{'test_rmse': array([4.29423571, 4.29113416, 4.29123398, 4.29167161, 4.28802784]),
 'fit_time': (20.390852689743042,
  20.65864372253418,
  20.259047985076904,
  19.16134524345398,
  19.18902349472046),
 'test_time': (2.516907215118408,
  2.1251354217529297,
  2.2079014778137207,
  2.3076066970825195,
  3.1245222091674805)}

In [34]:
rmse_values = [4.29423571, 4.29113416, 4.29123398, 4.29167161, 4.28802784]
average_rmse = np.mean(rmse_values)
print("Average RMSE:", average_rmse)

Average RMSE: 4.291260660000001


## Omitting K-Nearest Neighbors (KNN) Algorithm     


In [27]:
from surprise.prediction_algorithms.knns import KNNBasic
model2=KNNBasic()

In [None]:
cross_validate(model2,data,measures=['RMSE'],cv=5)

Computational Resource Limitations:  KNN is a memory-intensive algorithm, and it can require significant computational resources, especially for large datasets. During our experimentation in Google Colab, we encountered issues such as crashes and unmanageable memory consumption, which made it impractical to proceed with KNN.

In [None]:
# Generate recommendations for user:31030 by using the trained model

In [40]:
user_id = 31030
top_n = 5   # Number of recommendations to generate

In [37]:
# Get a list of joke IDs and their predicted ratings for the user
user_ratings = [(joke_id, model.predict(user_id, joke_id).est) for joke_id in df['joke_id'].unique()]

In [38]:
# Sort the jokes by predicted rating and get the top N

top_rated_jokes = sorted(user_ratings, key=lambda x: x[1], reverse=True)[:top_n]

In [39]:
# Print the top-rated jokes

for joke_id, predicted_rating in top_rated_jokes:
    print(f"Joke ID: {joke_id}, Predicted Rating: {predicted_rating:.2f}")

Joke ID: 13, Predicted Rating: 9.65
Joke ID: 57, Predicted Rating: 8.68
Joke ID: 47, Predicted Rating: 8.04
Joke ID: 40, Predicted Rating: 8.03
Joke ID: 58, Predicted Rating: 8.01
