In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the dataset
data = pd.read_csv("jokes-data.csv")
data = data.head(40000)

In [2]:
# Check for Missing Values
missing_values = data.isnull().sum()
print("Missing Values:\n", missing_values)

# Drop Unnecessary Columns
data.drop(columns=['id'], inplace=True)

Missing Values:
 id         0
user_id    0
joke_id    0
Rating     0
dtype: int64


In [3]:
# Check for Duplicates
duplicates = data.duplicated().sum()
print("Duplicate Rows:", duplicates)

Duplicate Rows: 0


In [4]:
print(data.head())

   user_id  joke_id  Rating
0    31030      110   2.750
1    16144      109   5.094
2    23098        6  -6.438
3    14273       86   4.406
4    18419      134   9.375


In [5]:
# Create User-Item Matrix
user_item_matrix = data.pivot_table(index='user_id', columns='joke_id', values='Rating')

In [6]:
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
 #Calculate Similarity
user_similarity = cosine_similarity(user_item_matrix.fillna(0))

In [8]:
# Recommendation Generation
def recommend_jokes(user_id, user_similarity, user_item_matrix):
    # Check if the user_id is in the user_item_matrix index
    if user_id not in user_item_matrix.index:
        raise KeyError(f"User ID {user_id} not found in the user-item matrix index")

    similar_users = user_similarity[user_id]
    rated_jokes = user_item_matrix.loc[user_id].dropna().index
    unrated_jokes = user_item_matrix.columns.difference(rated_jokes)
    recommendations = {}

    for joke_id in unrated_jokes:
        # Sort users by similarity and ensure they are in the index
        similar_users_sorted_indices = np.argsort(similar_users)[::-1]
        similar_users_filtered = [idx for idx in similar_users_sorted_indices if idx in user_item_matrix.index]

        if len(similar_users_filtered) == 0:
            continue

        # Get ratings from the top 10 similar users
        similar_users_ratings = user_item_matrix.loc[similar_users_filtered[:10], joke_id].dropna()

        if not similar_users_ratings.empty:
            # Use the indices of the filtered ratings to get the corresponding similarities
            relevant_similar_users = pd.Series(similar_users[similar_users_ratings.index], index=similar_users_ratings.index)
            if len(similar_users_ratings) != len(relevant_similar_users):
                # Ensure the lengths match for the dot product
                raise ValueError("Length mismatch between ratings and similarity scores")

            weighted_rating = similar_users_ratings.dot(relevant_similar_users)
            weighted_sum = relevant_similar_users.sum()
            recommendations[joke_id] = weighted_rating / weighted_sum

    return sorted(recommendations.items(), key=lambda x: x[1], reverse=True)

# Example: Generate recommendations for a user
user_id = 18
#print(user_item_matrix)
user_similarity = np.random.rand(20, 20)  # Example similarity matrix
user_item_matrix = pd.DataFrame(np.random.rand(20, 100))  # Example user-item matrix

recommendations = recommend_jokes(user_id, user_similarity, user_item_matrix)
#recommendations = recommend_jokes(user_id, user_similarity, user_item_matrix)
print("Recommended Jokes for User", user_id)
for joke_id, rating in recommendations[:5]:
    print("Joke ID:", joke_id, "- Predicted Rating:", round(rating, 2))

Recommended Jokes for User 18
