# Movie Ratings

In [8]:
import zipfile
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF
from sklearn.metrics import mean_squared_error
from math import sqrt

# Archive to read from and the directory its members are unpacked into.
zip_file_path = 'archive.zip'
extract_to_dir = 'MovieLens'
os.makedirs(extract_to_dir, exist_ok=True)

# Only these three members are unpacked; anything else in the archive is ignored.
wanted_members = ['movies.dat', 'ratings.dat', 'users.dat']
with zipfile.ZipFile(zip_file_path, 'r') as archive:
    archive.extractall(extract_to_dir, members=wanted_members)

# List the target directory to confirm what is now on disk.
extracted_files = os.listdir(extract_to_dir)
print(f"Extracted files: {extracted_files}")


Extracted files: ['movies.dat', 'ratings.dat', 'users.dat']


In [7]:
# Paths of the three data files inside the extraction directory.
movies_file, ratings_file, users_file = (
    os.path.join(extract_to_dir, dat_name)
    for dat_name in ('movies.dat', 'ratings.dat', 'users.dat')
)

# Column names supplied by hand: the files are read with header=None.
movies_columns = ['MovieID', 'Title', 'Genres']
ratings_columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']
users_columns = ['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code']

# Parsing options shared by all three files. Fields are separated by '::';
# a separator longer than one character needs pandas' python parser engine.
dat_read_options = {
    'sep': '::',
    'header': None,
    'engine': 'python',
    'encoding': 'ISO-8859-1',
}

# Load each file into its own DataFrame.
movies_df = pd.read_csv(movies_file, names=movies_columns, **dat_read_options)
ratings_df = pd.read_csv(ratings_file, names=ratings_columns, **dat_read_options)
users_df = pd.read_csv(users_file, names=users_columns, **dat_read_options)

# Preview the first rows of every frame as a quick sanity check on the parse.
for heading, frame in (
    ("Movies DataFrame:", movies_df),
    ("\nRatings DataFrame:", ratings_df),
    ("\nUsers DataFrame:", users_df),
):
    print(heading)
    print(frame.head())


Movies DataFrame:
   MovieID                               Title                        Genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy

Ratings DataFrame:
   UserID  MovieID  Rating  Timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291

Users DataFrame:
   UserID Gender  Age  Occupation Zip-code
0       1      F    1          10    48067
1       2      M   56          16    70072
2       3      M   25          15    55117
3       4      M   45           7    02460
4       5      M   25          20    55

In [11]:
def _pivot_ratings(ratings):
    """Pivot long-format ratings into a UserID x MovieID table of Rating values."""
    return ratings.pivot(index='UserID', columns='MovieID', values='Rating')


# Full user-item table; (user, movie) pairs without a rating come out as NaN.
user_item_matrix = _pivot_ratings(ratings_df)

# Zero-filled copy of the full table (0 stands for "no rating").
user_item_matrix_filled = user_item_matrix.fillna(0)

# Hold out 20% of the individual rating rows for testing; the seed fixes the split.
train_data, test_data = train_test_split(ratings_df, test_size=0.2, random_state=42)

# Pivot each split separately and zero-fill the cells that have no rating.
# The two matrices are built independently, so their row/column labels
# are whatever users and movies happen to occur in each split.
train_matrix = _pivot_ratings(train_data).fillna(0)
test_matrix = _pivot_ratings(test_data).fillna(0)

In [10]:
# Factorise the training matrix into 20 non-negative latent components.
# NOTE(review): train_matrix holds 0 wherever a rating is absent, and NMF fits
# those zeros like any other entry of the matrix.
nmf_model = NMF(n_components=20, init='random', random_state=42)
W = nmf_model.fit_transform(train_matrix)  # one row of factors per row of train_matrix
H = nmf_model.components_                  # one column of factors per column of train_matrix

# Reconstruct the full matrix from the two factors.
predicted_ratings = W @ H

# Re-attach the training matrix's row and column labels to the reconstruction.
predicted_ratings_df = pd.DataFrame(
    data=predicted_ratings,
    index=train_matrix.index,
    columns=train_matrix.columns,
)

# Function to compute RMSE
def compute_rmse(actual, predicted):
    """Root-mean-squared error over the observed entries of ``actual``.

    An entry counts as observed when its value in ``actual`` is greater than
    zero; every other position is left out of the error.

    Parameters
    ----------
    actual : pandas.DataFrame or array-like
        True values, with 0 (or any non-positive value) marking "no rating".
    predicted : pandas.DataFrame or array-like
        Predictions laid out like ``actual``. Entries are matched by
        position after flattening; row/column labels are not consulted.

    Returns
    -------
    float
        Square root of the mean squared difference over the observed entries.

    Raises
    ------
    ValueError
        If the two inputs hold a different number of entries, if ``actual``
        has no entry greater than zero, or if a prediction at an observed
        entry is NaN or infinite.
    """
    # np.asarray accepts DataFrames as well as plain arrays or nested lists.
    actual_flat = np.asarray(actual, dtype=float).ravel()
    predicted_flat = np.asarray(predicted, dtype=float).ravel()

    if actual_flat.size != predicted_flat.size:
        raise ValueError(
            f"actual has {actual_flat.size} entries but predicted has "
            f"{predicted_flat.size}; they must match"
        )

    # Keep only the observed ratings (NaN compares False and is dropped too).
    observed = actual_flat > 0
    if not observed.any():
        raise ValueError("actual contains no entries greater than zero to evaluate")

    residuals = actual_flat[observed] - predicted_flat[observed]
    # A NaN/inf prediction would otherwise silently poison the mean.
    if not np.isfinite(residuals).all():
        raise ValueError("predicted contains NaN or infinity at an observed entry")

    return sqrt(float(np.mean(np.square(residuals))))

# Put the held-out matrix on the same row/column labels as the predictions.
# Labels missing from the test split become 0 (treated as "no rating" by
# compute_rmse); labels present only in the test split are dropped by the
# re-labelling, so those ratings are not part of the score.
test_user_item_matrix = test_matrix.reindex_like(predicted_ratings_df).fillna(0)

# Score the predictions on the held-out ratings.
rmse = compute_rmse(test_user_item_matrix, predicted_ratings_df)

print(f"RMSE for the test set: {rmse}")




RMSE for the test set: 2.7411791762164253


### Discussion

The RMSE of 2.741 for the NMF model on the test set indicates a significant prediction error. Given that movie ratings range from 1 to 5, an RMSE this high means the model's predictions are not very accurate. A likely major contributor is the way the training matrix was built: every user–movie pair without a rating was filled with 0, so NMF fits those cells as if they were observed ratings of 0, and because they make up the vast majority of the matrix, the reconstructed values are pulled toward 0 rather than toward the 1–5 scale. This is a direct consequence of the sparsity of the dataset, where most users have rated only a small fraction of the available movies. Sparse matrices pose a challenge for NMF, as it tries to learn latent features from limited data. Additionally, biases in the data, such as users tending to rate movies they feel strongly about, may not be well captured by the NMF model, leading to less accurate predictions.

In comparison, simpler baseline methods like predicting the global average rating or a user's average rating can sometimes perform better because they capture the central tendency of the ratings. Similarity-based methods, such as k-Nearest Neighbors, often outperform matrix factorization in sparse settings by leveraging user-user or item-item similarities. To improve the NMF model's performance, one could consider hybrid models that combine NMF with similarity-based methods, data augmentation to reduce sparsity, regularization techniques to prevent overfitting, or exploring more advanced matrix factorization techniques like SVD or ALS. Incorporating additional data sources, such as user demographics or movie metadata, could also enhance the model's predictive power.