In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/alx-movie-recommendation-project-2024/sample_submission.csv
/kaggle/input/alx-movie-recommendation-project-2024/movies.csv
/kaggle/input/alx-movie-recommendation-project-2024/imdb_data.csv
/kaggle/input/alx-movie-recommendation-project-2024/genome_tags.csv
/kaggle/input/alx-movie-recommendation-project-2024/genome_scores.csv
/kaggle/input/alx-movie-recommendation-project-2024/train.csv
/kaggle/input/alx-movie-recommendation-project-2024/test.csv
/kaggle/input/alx-movie-recommendation-project-2024/tags.csv
/kaggle/input/alx-movie-recommendation-project-2024/links.csv


# **Importing Our Old Hero Packages**

In [2]:
# Install packages here
# Packages for data processing
import numpy as np
import pandas as pd
import datetime
from sklearn import preprocessing
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from scipy.sparse import csr_matrix
import scipy as sp


# Packages for visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Packages for modeling
from surprise import Reader
from surprise import Dataset
from surprise import KNNWithMeans
from surprise import KNNBasic
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV
from surprise import SVD
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
import heapq

# Packages for model evaluation
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from time import time

# Package to suppress warnings
import warnings
warnings.filterwarnings("ignore")

# Packages for saving models
import pickle

# Reading Our Data

In [3]:
# Single place that knows where the competition files live, so the notebook can be
# pointed at another copy of the data by editing one line.
DATA_DIR = '/kaggle/input/alx-movie-recommendation-project-2024'


def _read_competition_csv(file_name):
    """Load one CSV file located directly inside DATA_DIR into a DataFrame."""
    return pd.read_csv(os.path.join(DATA_DIR, file_name))


# Ratings used for training, and the (userId, movieId) pairs to predict
train_df = _read_competition_csv('train.csv')
test_df = _read_competition_csv('test.csv')

# Movie metadata and external identifiers
movies_df = _read_competition_csv('movies.csv')
imdb_df = _read_competition_csv('imdb_data.csv')
links_df = _read_competition_csv('links.csv')

# Tag data
tags = _read_competition_csv('tags.csv')
genome_scores = _read_competition_csv('genome_scores.csv')
genome_tags = _read_competition_csv('genome_tags.csv')

# Template showing the expected submission layout
sample_submissions = _read_competition_csv('sample_submission.csv')

In [4]:
# Preview the first rows of the movies table (columns: movieId, title, genres).
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# (number of rows, number of columns) of the movies table.
movies_df.shape

(62423, 3)

In [6]:
# Preview the first rows of the training ratings (userId, movieId, rating, timestamp).
train_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,5163,57669,4.0,1518349992
1,106343,5,4.5,1206238739
2,146790,5459,5.0,1076215539
3,106362,32296,2.0,1423042565
4,9041,366,3.0,833375837


# **EDA**

# Outliers

* Identify outliers: Outliers are data points that differ significantly from other observations. They can skew and mislead the training process of a machine learning model.
* Detecting outliers: Use statistical methods such as Z-scores or IQR (Interquartile Range) to detect outliers.
* Handling outliers: Decide whether to remove or transform the outliers depending on their impact on the dataset.
# Understanding Relationships Between Various Attributes and Structure of the Data
* Correlation Analysis: Use correlation matrices to understand the relationships between numerical attributes.
* Visualization Techniques: Employ scatter plots, pair plots, and heatmaps to visualize and explore relationships.
* Data Structure: Understand the structure of the data, including the distribution of values and the presence of any missing values.
# Recognizing Important Variables
* Feature Importance: Use techniques like Random Forests, Gradient Boosting, or SHAP values to determine feature importance.
* Domain Knowledge: Incorporate domain expertise to identify which variables are likely to be important.
* Statistical Tests: Conduct statistical tests to identify variables that have significant effects on the target variable.
By understanding the data through these steps, we ensure a robust foundation for building and evaluating machine learning models.

Let's check whether any of our datasets contain missing values.

In [7]:
# Null-count report: one labelled section per table, with a starred rule between
# consecutive sections.
null_report_tables = (
    ("Train: ", train_df),
    ("Test: ", test_df),
    ("Movies: ", movies_df),
    ("Links: ", links_df),
    ("IMDB: ", imdb_df),
    ("Genome scores: ", genome_scores),
    ("Genome tags: ", genome_tags),
)

for section_number, (heading, table) in enumerate(null_report_tables):
    if section_number > 0:
        # The rule separates sections; nothing is printed after the last one.
        print("************")
    print(heading)
    print(str(table.isnull().sum()))

Train: 
userId       0
movieId      0
rating       0
timestamp    0
dtype: int64
************
Test: 
userId     0
movieId    0
dtype: int64
************
Movies: 
movieId    0
title      0
genres     0
dtype: int64
************
Links: 
movieId      0
imdbId       0
tmdbId     107
dtype: int64
************
IMDB: 
movieId              0
title_cast       10068
director          9874
runtime          12089
budget           19372
plot_keywords    11078
dtype: int64
************
Genome scores: 
movieId      0
tagId        0
relevance    0
dtype: int64
************
Genome tags: 
tagId    0
tag      0
dtype: int64


# Data Preprocessing

In [4]:
# Join the IMDB metadata with the external-id links on movieId, keep only rows
# without any missing value, and make sure movieId is stored as an integer.
# NOTE(review): `merged_df` is reassigned from train_df/movies_df in the feature
# engineering cell, so this intermediate result is not used downstream — confirm
# whether it is still needed.
merged_df = (
    pd.merge(imdb_df, links_df, on='movieId')
    .dropna()
    .astype({'movieId': int})
)

# Feature Engineering

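Feature engineering: expand each movie's pipe-separated genre list into per-genre columns and join them onto the training ratings.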


In [5]:
# Merge train_df with movies_df to get movie details
merged_df = pd.merge(train_df, movies_df, on='movieId')

# Ensure the genres column exists and is processed correctly
if 'genres' in merged_df.columns:
    # Work from ONE row per movie. merged_df holds one row per rating, so counting
    # (movieId, genre) rows straight from it would store "how many ratings the
    # movie has" in every genre column instead of a 0/1 indicator.
    movies_genres = (
        merged_df[['movieId', 'genres']]
        .drop_duplicates()
        # Split genres separated by "|" into a list, then give each genre its own row
        .assign(genre=lambda frame: frame['genres'].str.split('|'))
        .explode('genre')
        .loc[:, ['movieId', 'genre']]
        .reset_index(drop=True)
    )

    # Create a one-hot encoded dataframe for genres (index: movieId, columns: genre).
    # clip() keeps the values at 0/1 even if a genre were listed twice for a movie.
    genres_one_hot = (
        movies_genres
        .pivot_table(index='movieId', columns='genre', aggfunc='size', fill_value=0)
        .clip(upper=1)
    )

    # Attach the genre indicator columns to every rating row
    merged_df = pd.merge(merged_df, genres_one_hot, on='movieId', how='inner')
else:
    print("The 'genres' column is missing from the merged dataframe.")


# Collaborative Filtering

In [6]:
# Fix the random seed so that repeated runs give the same CV folds, the same SVD
# initialisation and therefore the same submission. Surprise draws from NumPy's
# global generator when no explicit random_state is given (see the Surprise FAQ on
# reproducible experiments), so seeding NumPy covers the fold shuffling as well.
SEED = 42
np.random.seed(SEED)

# Prepare data for Surprise library: ratings are on a 0.5-5.0 scale
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(train_df[['userId', 'movieId', 'rating']], reader)

# Build and evaluate a model using SVD (5-fold cross-validated RMSE)
model = SVD(random_state=SEED)
cross_validate(model, data, measures=['RMSE'], cv=5, verbose=True)

# Train the model on the entire dataset; this fitted `model` is what the
# recommendation and submission cells use.
trainset = data.build_full_trainset()
model.fit(trainset)


Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8346  0.8340  0.8339  0.8334  0.8338  0.8339  0.0004  
Fit time          203.17  227.01  217.21  216.89  215.47  215.95  7.60    
Test time         33.96   43.58   50.89   45.49   50.85   44.96   6.21    


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7b616109e4d0>

# Recommendation function

In [8]:
def recommend_movies(user_id, num_recommendations=10):
    """Return the top-N movies a user has not rated, ranked by predicted rating.

    Candidates are every movieId in the global ``train_df`` that ``user_id`` has
    not rated there. Each candidate is scored with the global ``model``'s
    ``predict`` method.

    Parameters
    ----------
    user_id
        User identifier, compared against ``train_df['userId']``.
    num_recommendations : int, default 10
        Maximum number of results to return. Zero or a negative value yields an
        empty list.

    Returns
    -------
    list of tuple
        ``(movie_id, estimated_rating)`` pairs sorted by estimated rating,
        highest first. Shorter than ``num_recommendations`` when fewer unrated
        movies exist.
    """
    # Nothing was asked for; also avoids negative values slicing from the wrong end.
    if num_recommendations <= 0:
        return []

    # Get a list of all movie IDs
    all_movie_ids = train_df['movieId'].unique()

    # Movies the user has already rated, held in a set so that each membership
    # test below is O(1) instead of a scan over an array.
    rated_movies = set(train_df.loc[train_df['userId'] == user_id, 'movieId'])

    # Candidate movies, kept in their original order of appearance
    unrated_movies = [movie_id for movie_id in all_movie_ids if movie_id not in rated_movies]

    # Predict ratings for all unrated movies
    predictions = [model.predict(user_id, movie_id) for movie_id in unrated_movies]

    # Sort the predictions by estimated rating in descending order
    predictions.sort(key=lambda pred: pred.est, reverse=True)

    # Return the recommended movie IDs and their estimated ratings
    return [(pred.iid, pred.est) for pred in predictions[:num_recommendations]]

# Example usage
user_id = 1  # Replace with a user ID from your dataset
# Uses the default of 10 recommendations; each entry is a (movieId, estimated rating) pair.
recommendations = recommend_movies(user_id)
print(f"Top recommendations for user {user_id}: {recommendations}")


Top recommendations for user 1: [(171495, 4.536297628683461), (7926, 4.528681941163437), (89759, 4.526350578668971), (2019, 4.523988926093174), (6669, 4.5191921995891455), (171011, 4.510305709692851), (1201, 4.487020128994383), (170705, 4.484785113013309), (4878, 4.483844391929086), (157373, 4.476746766154687)]


# Generating predictions and Submission


In [9]:
# Build the (userId, movieId, rating) triples that model.test() consumes. The
# rating is a placeholder 0 because the true value is unknown for the test set.
testset_for_prediction = test_df[['userId', 'movieId']].assign(rating=0)
testset_for_prediction = list(testset_for_prediction.itertuples(index=False, name=None))

# Score every test pair with the fitted model
predictions = model.test(testset_for_prediction)

# Collect user, movie and estimated rating from each prediction
submission = pd.DataFrame(
    {
        'userId': [pred.uid for pred in predictions],
        'movieId': [pred.iid for pred in predictions],
        'rating': [pred.est for pred in predictions],
    }
)

# The submission key is "<userId>_<movieId>"
user_part = submission['userId'].astype(str)
movie_part = submission['movieId'].astype(str)
submission['Id'] = user_part + '_' + movie_part

# Keep only the key and the predicted rating, then write the file without the index
submission = submission[['Id', 'rating']]
submission.to_csv('my_submission.csv', index=False)
