In [1]:

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error

from surprise.model_selection import train_test_split as surprise_train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from surprise import SVD, NMF, Dataset, Reader
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise import accuracy

import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dot, Flatten, Dense, Concatenate


2025-02-15 14:16:07.909131: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
from surprise import accuracy
from sklearn.metrics import precision_score, recall_score
import numpy as np

In [3]:
movies = pd.read_csv('/Users/misscc/Desktop/Spring_2025/Movie_Recommdation/data/movies.csv')
ratings = pd.read_csv('/Users/misscc/Desktop/Spring_2025/Movie_Recommdation/data/ratings.csv')
tags = pd.read_csv('/Users/misscc/Desktop/Spring_2025/Movie_Recommdation/data/tags.csv')

In [4]:
# Data Preprocessing
movies['genres'] = movies['genres'].fillna('Unknown').str.split('|')
ratings = ratings.dropna()
tags = tags.dropna()

In [5]:
# One-hot encode genres
mlb = MultiLabelBinarizer()
genres_encoded = pd.DataFrame(mlb.fit_transform(movies['genres']), columns=mlb.classes_)
movies = pd.concat([movies, genres_encoded], axis=1)

In [None]:
sample_users = ratings['userId'].unique()[:1000]  
sample_movies = ratings['movieId'].unique()[:500]  

filtered_ratings = ratings[ratings['userId'].isin(sample_users) & ratings['movieId'].isin(sample_movies)]
user_movie_matrix = filtered_ratings.pivot_table(index='userId', columns='movieId', values='rating', fill_value=0)

print(user_movie_matrix.shape)


(998, 500)


In [7]:
# User feature engineering
user_avg_rating = ratings.groupby('userId')['rating'].mean().reset_index(name='avg_rating')
user_activity = ratings.groupby('userId').size().reset_index(name='rating_count')
user_features = pd.merge(user_avg_rating, user_activity, on='userId')

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

tags['tag'] = tags['tag'].fillna('')
vectorizer = TfidfVectorizer(max_features=100)
tag_matrix = vectorizer.fit_transform(tags['tag'])

In [9]:
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')
ratings['year'] = ratings['timestamp'].dt.year
ratings['month'] = ratings['timestamp'].dt.month
ratings['day_of_week'] = ratings['timestamp'].dt.dayofweek


In [10]:
# Tag feature engineering
top_tags = tags['tag'].value_counts().head(50).index.tolist()
tag_features = tags.groupby('movieId')['tag'].apply(list).reset_index()
tag_features['tag_vector'] = tag_features['tag'].apply(lambda x: [1 if tag in x else 0 for tag in top_tags])
tag_df = pd.DataFrame(tag_features['tag_vector'].tolist(), columns=top_tags)
tag_features = pd.concat([tag_features[['movieId']], tag_df], axis=1)
movies = pd.merge(movies, tag_features, on='movieId', how='left')

In [11]:
# User-User Collaborative Filtering
user_sim_matrix = cosine_similarity(user_movie_matrix)
def recommend_user_based(user_id, num_recommendations=5):
    similar_users = np.argsort(user_sim_matrix[user_id])[-6:-1]
    movie_scores = user_movie_matrix.iloc[similar_users].mean(axis=0)
    watched = user_movie_matrix.loc[user_id] > 0
    recommendations = movie_scores[~watched].sort_values(ascending=False).head(num_recommendations)
    return recommendations

# Item-Item Collaborative Filtering
item_sim_matrix = cosine_similarity(user_movie_matrix.T)
def recommend_item_based(movie_id, num_recommendations=5):
    similar_items = np.argsort(item_sim_matrix[movie_id])[-6:-1]
    recommendations = user_movie_matrix.columns[similar_items]
    return recommendations


In [12]:
def calculate_similarity(metric='cosine'):
    movie_features = movies.drop(['movieId', 'title', 'genres'], axis=1, errors='ignore')
    movie_features = movie_features.apply(pd.to_numeric, errors='coerce').fillna(0)

    if metric == 'cosine':
        return cosine_similarity(movie_features)
    elif metric == 'pearson':
        return np.corrcoef(movie_features.values)
    elif metric == 'jaccard':
        return 1 - np.corrcoef(movie_features.values)

sim_matrix = calculate_similarity('cosine')

def recommend_content_based(movie_id, num_recommendations=5):
    movie_index = movies.index[movies['movieId'] == movie_id].tolist()
    if not movie_index:
        return f"Movie ID {movie_id} 不存在"
    movie_index = movie_index[0]

    sim_scores = sim_matrix[movie_index]
    similar_movies = np.argsort(sim_scores)[-6:-1]
    recommended_movie_ids = movies.iloc[similar_movies]['movieId'].values

    return movies[movies['movieId'].isin(recommended_movie_ids)][['title', 'genres']]

print(recommend_content_based(movie_id=1))


                                 title  \
1                       Jumanji (1995)   
3021                Toy Story 2 (1999)   
4262  Atlantis: The Lost Empire (2001)   
4781             Monsters, Inc. (2001)   
4885  Jimmy Neutron: Boy Genius (2001)   

                                                 genres  
1                        [Adventure, Children, Fantasy]  
3021  [Adventure, Animation, Children, Comedy, Fantasy]  
4262          [Adventure, Animation, Children, Fantasy]  
4781  [Adventure, Animation, Children, Comedy, Fantasy]  
4885           [Adventure, Animation, Children, Comedy]  


In [13]:
def compare_similarity_measures(movie_id, metric='cosine'):
    sim_matrix = calculate_similarity(metric)
    sim_scores = sim_matrix[movie_id]
    similar_movies = np.argsort(sim_scores)[-6:-1]
    return movies.iloc[similar_movies][['title', 'genres']]

In [14]:

# Collaborative Filtering using SVD
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# train_test_split
trainset, testset = surprise_train_test_split(data, test_size=0.2)

In [None]:
# SVD training
algo = SVD(n_factors=50, n_epochs=20)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fc121196850>

In [12]:
# RMSE
predictions = algo.test(testset)
accuracy.rmse(predictions)

RMSE: 0.7732


0.7731832459557475

In [16]:
# Precision@K and Recall@K
def precision_recall_at_k(predictions, k=10):
    top_k = predictions[:k]
    true_positive = sum([1 for p in top_k if p[2] >= 4.0])  
    precision = true_positive / k
    relevant_items = sum([1 for p in predictions if p[3] >= 4.0])  
    recall = true_positive / relevant_items if relevant_items > 0 else 0
    return precision, recall

In [17]:
# NDCG
def ndcg_at_k(predictions, k=10):
    dcg = sum([p[3] / np.log2(i+2) for i, p in enumerate(predictions[:k])])
    ideal_dcg = sum([sorted(predictions, key=lambda x: x[3], reverse=True)[i][3] / np.log2(i+2) for i in range(k)])
    return dcg / ideal_dcg if ideal_dcg > 0 else 0

In [18]:
svd = SVD(n_factors=50, n_epochs=20)
svd.fit(trainset)
predictions = svd.test(testset)
accuracy.rmse(predictions)

RMSE: 0.7736


0.7735544118510616

In [14]:
# Model comparison
nmf = NMF()
cross_validate(nmf, data, cv=5, verbose=True)
cross_validate(svd, data, cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8637  0.8630  0.8634  0.8628  0.8636  0.8633  0.0004  
MAE (testset)     0.6570  0.6562  0.6567  0.6560  0.6570  0.6566  0.0004  
Fit time          765.96  760.16  746.69  740.46  751.41  752.94  9.15    
Test time         82.87   96.04   98.43   92.22   79.75   89.86   7.32    
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.7729  0.7729  0.7737  0.7740  0.7734  0.7734  0.0004  
MAE (testset)     0.5810  0.5810  0.5815  0.5816  0.5813  0.5813  0.0003  
Fit time          311.37  326.71  336.89  330.80  338.01  328.76  9.62    
Test time         95.81   149.03  118.60  143.35  155.71  132.50  22.22   


{'test_rmse': array([0.77288786, 0.77289283, 0.77366535, 0.77402361, 0.77342583]),
 'test_mae': array([0.58099677, 0.5810097 , 0.58150921, 0.58164906, 0.58128031]),
 'fit_time': (311.3661091327667,
  326.7127079963684,
  336.89248490333557,
  330.7967121601105,
  338.01125288009644),
 'test_time': (95.8121190071106,
  149.02960515022278,
  118.59962296485901,
  143.35208225250244,
  155.71375012397766)}

In [51]:
# NCF
n_users = ratings['userId'].max()
n_items = ratings['movieId'].max()
user_input = Input(shape=(1,))
item_input = Input(shape=(1,))

user_embedding = Embedding(n_users + 1, 50)(user_input)
item_embedding = Embedding(n_items + 1, 50)(item_input)

user_flat = Flatten()(user_embedding)
item_flat = Flatten()(item_embedding)
concat = Concatenate()([user_flat, item_flat])

x = Dense(64, activation='relu')(concat)
x = Dense(32, activation='relu')(x)
output = Dense(1)(x)

model = Model(inputs=[user_input, item_input], outputs=output)
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

X_users = ratings['userId'].values.astype(np.int32)
X_items = ratings['movieId'].values.astype(np.int32)
y = ratings['rating'].values

model.fit([X_users, X_items], y, epochs=5, batch_size=64, validation_split=0.2)

Epoch 1/5


InvalidArgumentError: Graph execution error:

Detected at node 'model_3/embedding_7/embedding_lookup' defined at (most recent call last):
    File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/runpy.py", line 194, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "/Users/misscc/Library/Python/3.8/lib/python/site-packages/ipykernel_launcher.py", line 18, in <module>
      app.launch_new_instance()
    File "/Users/misscc/Library/Python/3.8/lib/python/site-packages/traitlets/config/application.py", line 1075, in launch_instance
      app.start()
    File "/Users/misscc/Library/Python/3.8/lib/python/site-packages/ipykernel/kernelapp.py", line 739, in start
      self.io_loop.start()
    File "/Users/misscc/Library/Python/3.8/lib/python/site-packages/tornado/platform/asyncio.py", line 205, in start
      self.asyncio_loop.run_forever()
    File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/asyncio/base_events.py", line 570, in run_forever
      self._run_once()
    File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/asyncio/base_events.py", line 1859, in _run_once
      handle._run()
    File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/asyncio/events.py", line 81, in _run
      self._context.run(self._callback, *self._args)
    File "/Users/misscc/Library/Python/3.8/lib/python/site-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue
      await self.process_one()
    File "/Users/misscc/Library/Python/3.8/lib/python/site-packages/ipykernel/kernelbase.py", line 534, in process_one
      await dispatch(*args)
    File "/Users/misscc/Library/Python/3.8/lib/python/site-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell
      await result
    File "/Users/misscc/Library/Python/3.8/lib/python/site-packages/ipykernel/ipkernel.py", line 359, in execute_request
      await super().execute_request(stream, ident, parent)
    File "/Users/misscc/Library/Python/3.8/lib/python/site-packages/ipykernel/kernelbase.py", line 778, in execute_request
      reply_content = await reply_content
    File "/Users/misscc/Library/Python/3.8/lib/python/site-packages/ipykernel/ipkernel.py", line 446, in do_execute
      res = shell.run_cell(
    File "/Users/misscc/Library/Python/3.8/lib/python/site-packages/ipykernel/zmqshell.py", line 549, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/Users/misscc/Library/Python/3.8/lib/python/site-packages/IPython/core/interactiveshell.py", line 3009, in run_cell
      result = self._run_cell(
    File "/Users/misscc/Library/Python/3.8/lib/python/site-packages/IPython/core/interactiveshell.py", line 3064, in _run_cell
      result = runner(coro)
    File "/Users/misscc/Library/Python/3.8/lib/python/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/Users/misscc/Library/Python/3.8/lib/python/site-packages/IPython/core/interactiveshell.py", line 3269, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/Users/misscc/Library/Python/3.8/lib/python/site-packages/IPython/core/interactiveshell.py", line 3448, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/Users/misscc/Library/Python/3.8/lib/python/site-packages/IPython/core/interactiveshell.py", line 3508, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/var/folders/z1/g6zn_s853dzgm3gkdbyp5qw80000gn/T/ipykernel_50809/2810575343.py", line 29, in <module>
      model.fit([X_users, X_items], y, epochs=5, batch_size=64, validation_split=0.2)
    File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/keras/src/engine/training.py", line 1742, in fit
      tmp_logs = self.train_function(iterator)
    File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/keras/src/engine/training.py", line 1338, in train_function
      return step_function(self, iterator)
    File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/keras/src/engine/training.py", line 1322, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/keras/src/engine/training.py", line 1303, in run_step
      outputs = model.train_step(data)
    File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/keras/src/engine/training.py", line 1080, in train_step
      y_pred = self(x, training=True)
    File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/keras/src/engine/training.py", line 569, in __call__
      return super().__call__(*args, **kwargs)
    File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/keras/src/engine/base_layer.py", line 1150, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/keras/src/engine/functional.py", line 512, in call
      return self._run_internal_graph(inputs, training=training, mask=mask)
    File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/keras/src/engine/functional.py", line 669, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/keras/src/engine/base_layer.py", line 1150, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/keras/src/layers/core/embedding.py", line 272, in call
      out = tf.nn.embedding_lookup(self.embeddings, inputs)
Node: 'model_3/embedding_7/embedding_lookup'
indices[2,0] = 102903 is not in [0, 84433)
	 [[{{node model_3/embedding_7/embedding_lookup}}]] [Op:__inference_train_function_5413]

In [None]:
svd_predictions = svd.test(testset)
svd_rmse = accuracy.rmse(svd_predictions)

nmf_predictions = nmf.test(testset)
nmf_rmse = accuracy.rmse(nmf_predictions)

y_pred = model.predict([X_users, X_items])
ncf_predictions = [(u, i, r, p) for u, i, r, p in zip(X_users, X_items, y, y_pred.flatten())]
assert len(X_users) == len(X_items) == len(y) == len(y_pred.flatten()),

ncf_rmse = mean_squared_error(y, y_pred, squared=False)

svd_precision, svd_recall = precision_recall_at_k(svd_predictions, k=10)
nmf_precision, nmf_recall = precision_recall_at_k(nmf_predictions, k=10)
ncf_precision, ncf_recall = precision_recall_at_k(list(zip(X_users, X_items, y, y_pred.flatten())), k=10)


In [None]:
results = {
    'SVD': [svd_rmse, svd_precision, svd_recall],
    'NMF': [nmf_rmse, nmf_precision, nmf_recall],
    'NCF': [ncf_rmse, ncf_precision, ncf_recall]
}
metrics_df = pd.DataFrame(results, index=['RMSE', 'Precision@10', 'Recall@10'])
metrics_df.plot(kind='bar', figsize=(10, 6))
plt.title('Model Performance Comparison')
plt.xlabel('Metrics')
plt.ylabel('Scores')
plt.xticks(rotation=0)
plt.show()