In [59]:
import pandas as pd
from surprise import SVD
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from keras.models import Sequential
from keras.layers import Dense
import numpy as np


In [60]:

# Load MovieLens dataset as an example
movies = pd.read_csv("movies.csv")  # Load movie data (movieId, title, genres)
ratings = pd.read_csv("ratings.csv")  # Load user ratings (userId, movieId, rating)

movies
ratings

Unnamed: 0,userId,movieId,rating
0,1,1,5.0
1,1,2,3.0
2,1,3,4.5
3,1,4,4.0
4,1,5,4.0
5,1,6,1.0
6,1,7,4.0
7,1,8,1.5
8,1,9,1.5
9,1,10,5.0


In [63]:
# Step 1: Data Preprocessing (if applicable, depending on the dataset)
# Data Processing: Convert genres into binary columns using one-hot encoding
# Split the 'genres' column containing pipe-separated genres into binary columns for each genre
genres_list = movies["genres"].str.get_dummies(sep="|")
# Concatenate the binary genre columns back to the 'movies' DataFrame
movies = pd.concat([movies, genres_list], axis=1)

movies


Unnamed: 0,movieId,title,genres,Action,Adventure,Animation,Children's,Comedy,Crime,Drama,...,Adventure.1,Animation.1,Children's.1,Comedy.1,Crime.1,Drama.1,Fantasy,Horror,Romance,Thriller
0,1,Toy Story (1995),Animation|Children's|Comedy,0,0,1,1,1,0,0,...,0,1,1,1,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children's|Fantasy,0,1,0,1,0,0,0,...,1,0,1,0,0,0,1,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,0
3,4,Waiting to Exhale (1995),Comedy|Drama,0,0,0,0,1,0,1,...,0,0,0,1,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
5,6,Heat (1995),Action|Crime|Thriller,1,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,1
6,7,Sabrina (1995),Comedy|Romance,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,0
7,8,Tom and Huck (1995),Adventure|Children's|Drama,0,1,0,1,0,0,1,...,1,0,1,0,0,1,0,0,0,0
8,9,Sudden Death (1995),Action,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,10,GoldenEye (1995),Action|Adventure|Thriller,1,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1


In [66]:
# Step 2: Handling Missing Values (if applicable, depending on the dataset)
# Handling Missing Values: Fill missing values in the 'rating' column with the mean rating
# Calculate the mean rating from the 'rating' column
mean_rating = ratings["rating"].mean()

# Fill the missing values in the 'rating' column with the calculated mean rating
ratings["rating"].fillna(mean_rating, inplace=True)

ratings

Unnamed: 0,userId,movieId,rating
0,1,1,5.0
1,1,2,3.0
2,1,3,4.5
3,1,4,4.0
4,1,5,4.0
5,1,6,1.0
6,1,7,4.0
7,1,8,1.5
8,1,9,1.5
9,1,10,5.0


In [68]:
# Collaborative Filtering with Surprise
# Step 3: Create a Surprise Reader with the rating scale (minimum and maximum ratings)
reader = Reader(rating_scale=(0.5, 5.0))

reader

<surprise.reader.Reader at 0x7fd64478fdf0>

In [70]:

# Step 4: Load the user ratings data into a Surprise Dataset using the Reader
data = Dataset.load_from_df(ratings[["userId", "movieId", "rating"]], reader)



Unnamed: 0,userId,movieId,rating
0,1,1,5.0
1,1,2,3.0
2,1,3,4.5
3,1,4,4.0
4,1,5,4.0
5,1,6,1.0
6,1,7,4.0
7,1,8,1.5
8,1,9,1.5
9,1,10,5.0


In [80]:

# Step 5: Split the dataset into training and testing sets for evaluation
trainset, testset = train_test_split(data, test_size=0.2)



In [82]:

# Step 6: Use SVD (Singular Value Decomposition) for collaborative filtering
svd = SVD()
svd.fit(trainset)



<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fd6447cd7e0>

In [84]:
# Content-Based Filtering with TF-IDF
# Step 7: Create a TF-IDF vectorizer for content-based filtering
tfidf_vectorizer = TfidfVectorizer(stop_words="english")


In [85]:
# Step 8: Preprocess the 'genres' column in the 'movies' DataFrame (remove '|' separator)
movies["genres"] = movies["genres"].str.replace("|", " ")

movies


Unnamed: 0,movieId,title,genres,Action,Adventure,Animation,Children's,Comedy,Crime,Drama,...,Adventure.1,Animation.1,Children's.1,Comedy.1,Crime.1,Drama.1,Fantasy,Horror,Romance,Thriller
0,1,Toy Story (1995),Animation Children's Comedy,0,0,1,1,1,0,0,...,0,1,1,1,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure Children's Fantasy,0,1,0,1,0,0,0,...,1,0,1,0,0,0,1,0,0,0
2,3,Grumpier Old Men (1995),Comedy Romance,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,0
3,4,Waiting to Exhale (1995),Comedy Drama,0,0,0,0,1,0,1,...,0,0,0,1,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
5,6,Heat (1995),Action Crime Thriller,1,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,1
6,7,Sabrina (1995),Comedy Romance,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,0
7,8,Tom and Huck (1995),Adventure Children's Drama,0,1,0,1,0,0,1,...,1,0,1,0,0,1,0,0,0,0
8,9,Sudden Death (1995),Action,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,10,GoldenEye (1995),Action Adventure Thriller,1,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1


In [94]:
# Step 9: Create the TF-IDF matrix for content-based filtering using the 'genres' column
tfidf_matrix = tfidf_vectorizer.fit_transform(movies["genres"])

tfidf_matrix

<20x11 sparse matrix of type '<class 'numpy.float64'>'
	with 41 stored elements in Compressed Sparse Row format>

In [96]:
# Step 10: Compute the cosine similarity between TF-IDF vectors for content-based filtering
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)


In [99]:
# Deep Learning Model for Sequence Modeling
# Step 11: Create a sequential model for deep learning
model = Sequential()


In [100]:
# Step 12: Add a dense layer with 128 units and ReLU activation function as the embedding layer
model.add(Dense(128, activation="relu", input_dim=len(movies), name="embedding_layer"))


In [16]:
# Step 13: Add a dense layer with 'len(movies)' units and softmax activation for output
model.add(Dense(len(movies), activation="softmax"))



In [17]:
# Step 14: Compile the model with 'adam' optimizer and mean squared error loss
model.compile(optimizer="adam", loss="mean_squared_error")



In [18]:
# Step 15: Train the model using random data (replace this with real data)
# Generate random data of the same shape as the 'movies' DataFrame for training
X_train = pd.DataFrame(
    data=np.random.random((len(movies), len(movies))), columns=movies["movieId"]
)
# Use the same random data for the target labels (unsupervised learning)
y_train = X_train

model.fit(X_train, y_train, epochs=10)


`

SyntaxError: invalid syntax (3745584345.py, line 12)

In [19]:
# Optimized Recommendation Function
def get_recommendations(user_id, num_recommendations=10):
    # Collaborative Filtering
    # Get the movies watched by the user with 'user_id' from the 'ratings' DataFrame
    user_movies = ratings[ratings["userId"] == user_id]["movieId"]
    # For each movie in the 'movies' DataFrame, predict the user's estimated rating using SVD
    # Exclude movies that the user has already watched
    collab_recommendations = [
        (movieId, svd.predict(user_id, movieId).est)
        for movieId in movies["movieId"]
        if movieId not in user_movies
    ]
    # Sort the collaborative filtering recommendations based on estimated ratings in descending order
    collab_recommendations.sort(key=lambda x: x[1], reverse=True)

    # Content-Based Filtering
    # Get the index of the last movie watched by the user from the 'movies' DataFrame
    movie_indices = [
        movies[movies["movieId"] == movie_id].index[0] for movie_id in user_movies
    ]
    # Calculate the cosine similarity between the last watched movie and all other movies
    similar_movies = list(enumerate(cosine_sim[movie_indices[-1]]))
    # Exclude movies that the user has already watched
    content_predictions = [
        (movies.iloc[i]["movieId"], score)
        for i, score in similar_movies
        if i not in movie_indices
    ]
    # Sort the content-based filtering predictions based on similarity scores in descending order
    content_predictions.sort(key=lambda x: x[1], reverse=True)

    # Combine Recommendations
    # Combine collaborative and content-based recommendations, giving priority to collaborative filtering
    hybrid_recommendations = (
        collab_recommendations[:num_recommendations]
        + content_predictions[:num_recommendations]
    )
    # Sort the hybrid recommendations based on estimated ratings (collaborative filtering) in descending order
    hybrid_recommendations.sort(key=lambda x: x[1], reverse=True)

    # Return the movieIds of the top 'num_recommendations' hybrid recommendations
    return [movie_id for movie_id, _ in hybrid_recommendations][:num_recommendations]




In [21]:
# Example Usage:
user_id = 1  # Replace with the user ID for whom you want to get recommendations
recommendations = get_recommendations(user_id)
print("Top Recommendations for User", user_id)
print(recommendations)


Top Recommendations for User 1
[10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
