In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import heapq
from tqdm import tqdm
import tensorflow as tf
from joblib import Parallel, delayed, cpu_count

import re
import os
import heapq

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Loading the data
# Every competition file lives in the same read-only Kaggle input directory;
# keeping the location in one constant means it only has to change in one place.
DATA_DIR = '/kaggle/input/alx-movie-recommendation-project-2024'

# Ratings and the submission template
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_submission_df = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

# Movie metadata used later to build the text descriptions
movies_df = pd.read_csv(f'{DATA_DIR}/movies.csv')
links_df = pd.read_csv(f'{DATA_DIR}/links.csv')
tags_df = pd.read_csv(f'{DATA_DIR}/tags.csv')
genome_scores_df = pd.read_csv(f'{DATA_DIR}/genome_scores.csv')
genome_tags_df = pd.read_csv(f'{DATA_DIR}/genome_tags.csv')
imdb_data_df = pd.read_csv(f'{DATA_DIR}/imdb_data.csv')

# Displaying the first few rows of each DataFrame (imdb_data_df included)
train_df.head(), test_df.head(), sample_submission_df.head(), movies_df.head(), links_df.head(), tags_df.head(), genome_scores_df.head(), genome_tags_df.head(), imdb_data_df.head()



In [None]:
# Report the (rows, columns) size of each ratings-related dataset
dataset_shapes = {
    "Train": train_df.shape,
    "Test": test_df.shape,
    "Sample Submission": sample_submission_df.shape,
}
for label, shape in dataset_shapes.items():
    print(f"{label} DataFrame shape: {shape}")

# Summary statistics of the training data (last expression, rendered by the notebook)
train_df.describe()

In [None]:
# Histogram (with a KDE overlay) of the individual ratings in the training data
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(train_df['rating'], bins=10, kde=True, ax=ax)
ax.set_title("Distribution of Movie Ratings")
ax.set_xlabel("Rating")
ax.set_ylabel("Frequency")
plt.show()


In [None]:
# How many ratings each movie received, shown as a histogram over movies
ratings_per_movie = train_df.groupby('movieId').size()
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(ratings_per_movie, bins=50, kde=True, ax=ax)
ax.set_title("Number of Ratings per Movie")
ax.set_xlabel("Number of Ratings")
ax.set_ylabel("Frequency")
plt.show()


In [None]:
# How many ratings each user submitted, shown as a histogram over users
ratings_per_user = train_df.groupby('userId').size()
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(ratings_per_user, bins=50, kde=True, ax=ax)
ax.set_title("Number of Ratings per User")
ax.set_xlabel("Number of Ratings")
ax.set_ylabel("Frequency")
plt.show()


In [None]:
# Mean rating of each movie, shown as a histogram over movies
avg_rating_per_movie = train_df.groupby('movieId')['rating'].mean()
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(avg_rating_per_movie, bins=50, kde=True, ax=ax)
ax.set_title("Average Rating per Movie")
ax.set_xlabel("Average Rating")
ax.set_ylabel("Frequency")
plt.show()


 Data Cleaning and Preprocessing


In [None]:
# Checking for missing values: print the per-column null count of every loaded frame
frames_to_check = {
    'train_df': train_df,
    'test_df': test_df,
    'sample_submission_df': sample_submission_df,
    'movies_df': movies_df,
    'links_df': links_df,
    'tags_df': tags_df,
    'genome_scores_df': genome_scores_df,
    'genome_tags_df': genome_tags_df,
    'imdb_data_df': imdb_data_df,
}

for frame_name, frame in frames_to_check.items():
    print(f"Missing values in {frame_name}:")
    print(frame.isnull().sum())


In [None]:
# Clean text data by replacing punctuation with spaces and collapsing whitespace
def clean_text(text):
    """Normalise a raw metadata value into plain, space-separated words.

    Parameters
    ----------
    text : Any
        Value to clean. Non-string values are converted with ``str``.
        ``None`` and float NaN are treated as "no text".

    Returns
    -------
    str
        The text with every non-word, non-whitespace character replaced by a
        space, runs of whitespace collapsed to a single space and the ends
        stripped. Missing values yield ``''``.
    """
    # str() would turn a missing value into the literal word "None" / "nan",
    # which would then be treated as real text downstream.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Substitute a space (not the empty string) so that delimiter-separated
    # values such as "Adventure|Animation" stay separate words.
    text = re.sub(r'[^\w\s]', ' ', str(text))
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Clean relevant columns
movies_df['title'] = movies_df['title'].apply(clean_text)
movies_df['genres'] = movies_df['genres'].apply(clean_text)
imdb_data_df['title_cast'] = imdb_data_df['title_cast'].apply(clean_text)
imdb_data_df['director'] = imdb_data_df['director'].apply(clean_text)
imdb_data_df['plot_keywords'] = imdb_data_df['plot_keywords'].apply(clean_text)
tags_df['tag'] = tags_df['tag'].apply(clean_text)
genome_tags_df['tag'] = genome_tags_df['tag'].apply(clean_text)

# Aggregate user-supplied tags into one space-separated string per movie
tags_agg = tags_df.groupby('movieId')['tag'].apply(' '.join).reset_index()

# Aggregate genome tags into one string per movie.
# The MovieLens tag genome stores a relevance score for every (movie, tag) pair,
# so joining without a filter would give each movie the same full tag vocabulary.
# Keep only the tags that are actually relevant to the movie.
# NOTE(review): assumes genome_scores_df has the standard `relevance` column in
# [0, 1] -- confirm against the CSV; 0.5 is a tunable cut-off.
GENOME_RELEVANCE_THRESHOLD = 0.5
relevant_genome_scores = genome_scores_df.loc[
    genome_scores_df['relevance'] > GENOME_RELEVANCE_THRESHOLD, ['movieId', 'tagId']
]
genome_tags_agg = relevant_genome_scores.merge(genome_tags_df[['tagId', 'tag']], on='tagId')
genome_tags_agg = genome_tags_agg.groupby('movieId')['tag'].apply(' '.join).reset_index()



In [None]:
# Distinct occurrence counts of whole rows in movies_df; a result containing only 1
# means no row is repeated. `unique` is a method, so it must be called to show
# the values instead of the bound-method repr.
movies_df.value_counts().unique()

In [None]:
# Size of the per-movie genome-tag table as (rows, columns); it was built with
# groupby('movieId'), so there is one row per movie that has genome tags.
genome_tags_agg.shape

In [None]:
# Build one text description per movie by joining every metadata source on movieId.
# Left merges keep every row of movies_df in its original order and leave NaN
# wherever a source has no entry for the movie.

# Join on imdb_data_df's own movieId column: comparing that column against imdbId
# values taken from links_df would match unrelated movies (or none at all).
# Only the first imdb row per movie is used.
imdb_columns = ['title_cast', 'director', 'plot_keywords']
imdb_text = imdb_data_df[['movieId'] + imdb_columns].drop_duplicates(subset='movieId')
# clean_text stringifies its input, so a missing value can survive as the literal
# word 'nan'; treat a cell that is exactly 'nan' as "no information".
imdb_text = imdb_text.assign(**{col: imdb_text[col].replace('nan', '') for col in imdb_columns})

movie_features = (
    movies_df[['movieId', 'title', 'genres']]
    .merge(imdb_text, on='movieId', how='left')
    .merge(tags_agg.rename(columns={'tag': 'tags'}), on='movieId', how='left')
    .merge(genome_tags_agg.rename(columns={'tag': 'genome_tags'}), on='movieId', how='left')
)
# Every right-hand table has at most one row per movieId, so the merges must not add rows.
assert len(movie_features) == len(movies_df), "metadata merge changed the number of movies"

# Missing pieces become empty strings so they contribute nothing to the description.
text_columns = ['title', 'genres', 'title_cast', 'director', 'plot_keywords', 'tags', 'genome_tags']
text_parts = movie_features[text_columns].fillna('').astype(str)

# Same layout as "{title} {genres} {title_cast} {director} {plot_keywords} {tags} {genome_tags}"
movie_descriptions = pd.DataFrame({
    'movieId': movie_features['movieId'],
    'movie_description': text_parts['title'].str.cat(text_parts[text_columns[1:]], sep=' '),
})

# Display the first few rows of the movie descriptions
movie_descriptions.head()


In [None]:
# Size of the description table as (rows, columns); its columns are
# 'movieId' and 'movie_description'.
movie_descriptions.shape

In [None]:
# Shuffle the movie_descriptions DataFrame (fixed seed for reproducibility)
movie_descriptions = movie_descriptions.sample(frac=1, random_state=42).reset_index(drop=True)

# Partition the DataFrame into 4 groups of (almost) equal size.
# Row positions are split rather than the DataFrame itself, because passing a
# DataFrame to np.array_split is deprecated in recent pandas versions.
# Each partition gets its own 0..n-1 index so that an index label is also the
# row/column position in that partition's similarity matrix.
N_PARTITIONS = 4
partitions = [
    movie_descriptions.iloc[row_positions].reset_index(drop=True)
    for row_positions in np.array_split(np.arange(len(movie_descriptions)), N_PARTITIONS)
]


In [None]:
import tensorflow as tff

# Per-partition artefacts, in partition order
tfidf_vectorizers = []
tfidf_matrices = []
cosine_sim_matrices = []

for i, partition in enumerate(partitions):
    # One vectorizer per partition, fitted on unigrams and bigrams with English
    # stop words removed. It is deliberately not called `tf`, which is the alias
    # of the imported TensorFlow module.
    vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0.0, stop_words='english')

    # Fit and transform the movie descriptions
    tfidf_matrix = vectorizer.fit_transform(partition['movie_description'])

    # Pairwise cosine similarity between all movies of this partition
    cosine_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # Convert cosine similarity matrix to TensorFlow tensor
    cosine_sim_matrix_tensor = tff.constant(cosine_sim_matrix)

    # Also publish numbered module-level names (tfidf_1_matrix, cosine_sim_1_matrix, ...);
    # the lists below hold the same objects and are the preferred way to access them.
    globals()[f'tfidf_{i+1}_matrix'] = tfidf_matrix
    globals()[f'cosine_sim_{i+1}_matrix'] = cosine_sim_matrix_tensor

    tfidf_vectorizers.append(vectorizer)
    tfidf_matrices.append(tfidf_matrix)
    cosine_sim_matrices.append(cosine_sim_matrix_tensor)


In [None]:
# For every partition, map movieId -> row/column position in that partition's
# similarity matrix. Positions are generated explicitly (0..n-1) rather than
# taken from the partition's index labels, which are not guaranteed to restart
# at zero for each partition.
partition_indices = [
    pd.Series(np.arange(len(partition)), index=partition['movieId'])
    for partition in partitions
]

def content_generate_rating_estimate(movie_id, user, rating_data, k=20, threshold=0.0):
    """Estimate the rating `user` would give `movie_id` from content similarity.

    The estimate is the similarity-weighted mean of the user's own ratings for
    the (at most) `k` movies most similar to `movie_id`. Only movies in the same
    partition as `movie_id` can act as neighbours, because similarities are only
    available within a partition.

    Reads the module-level `partition_indices` (movieId -> matrix position, one
    Series per partition) and `cosine_sim_matrices` (one square similarity
    matrix per partition).

    Parameters
    ----------
    movie_id : int
        Movie to predict a rating for.
    user : int
        User the prediction is for.
    rating_data : pandas.DataFrame
        Known ratings with 'userId', 'movieId' and 'rating' columns.
    k : int, default 20
        Maximum number of neighbours to use.
    threshold : float, default 0.0
        A neighbour only contributes if its similarity is strictly greater.

    Returns
    -------
    float
        The weighted estimate. When no neighbour contributes, the mean rating
        of `movie_id` in `rating_data`; when the movie has no ratings or is not
        in any partition, the mean of all ratings in `rating_data`.
    """
    # Locate the partition holding the target movie and its position in that matrix.
    partition_no = None
    target_pos = -1
    for candidate_no, index_map in enumerate(partition_indices):
        if movie_id in index_map.index:
            partition_no = candidate_no
            target_pos = int(index_map[movie_id])
            break

    # If the movie_id is not found in any partition, return the average rating
    if partition_no is None:
        print(f"Movie ID {movie_id} not found in any partition.")
        return rating_data['rating'].mean()

    index_map = partition_indices[partition_no]
    sim_matrix = cosine_sim_matrices[partition_no]
    n_rows, n_cols = sim_matrix.shape[0], sim_matrix.shape[1]

    neighbors = []
    # A position outside the matrix means the index map and the matrix disagree;
    # no similarity can be read, so the fallbacks below are used.
    if 0 <= target_pos < n_rows:
        # Pull the whole similarity row once instead of one element per rated movie.
        sim_row = np.asarray(sim_matrix[target_pos])

        # NOTE: this filters the full ratings table on every call.
        user_ratings = rating_data.loc[rating_data['userId'] == user, ['movieId', 'rating']]
        for rated_movie, rating in zip(user_ratings['movieId'].to_numpy(),
                                       user_ratings['rating'].to_numpy()):
            # Movies the user rated that live in another partition have no
            # similarity to the target and are skipped.
            neighbor_pos = index_map.get(rated_movie)
            if neighbor_pos is None or not (0 <= neighbor_pos < n_cols):
                continue
            neighbors.append((float(sim_row[int(neighbor_pos)]), rating))

    # Get the top-k neighbors
    k_neighbors = heapq.nlargest(k, neighbors, key=lambda pair: pair[0])

    # Calculate the predicted rating as a similarity-weighted mean
    sim_total = 0.0
    weighted_sum = 0.0
    for sim_score, rating in k_neighbors:
        if sim_score > threshold:
            sim_total += sim_score
            weighted_sum += sim_score * rating

    # Explicit check instead of catching ZeroDivisionError: numpy floats divide
    # to NaN without raising.
    if sim_total > 0:
        return weighted_sum / sim_total

    # Fallback 1: average rating of the movie itself.
    movie_ratings = rating_data.loc[rating_data['movieId'] == movie_id, 'rating']
    if len(movie_ratings) > 0:
        return movie_ratings.mean()

    # Fallback 2: the movie has no ratings at all; use the global mean so the
    # result is never NaN.
    return rating_data['rating'].mean()


In [None]:
# Extract userId and movieId from sample_submission_df: each Id is split on '_'
# once, and the first two pieces are stored as integer columns.
id_parts = sample_submission_df['Id'].str.split('_')
sample_submission_df['userId'] = id_parts.str[0].astype(int)
sample_submission_df['movieId'] = id_parts.str[1].astype(int)

# Function to compute the rating for a single row
def compute_rating(row, train_df):
    """Predict the rating for one submission row.

    `row` must provide 'movieId' and 'userId'. Both are forwarded, together
    with `train_df`, to content_generate_rating_estimate, whose result is
    returned unchanged.
    """
    target_movie = row['movieId']
    target_user = row['userId']
    return content_generate_rating_estimate(target_movie, target_user, train_df)

# Function to process a chunk and return the computed ratings
def process_chunk(chunk_df, train_df):
    """Predict a rating for every row of `chunk_df`.

    Parameters
    ----------
    chunk_df : pandas.DataFrame
        Rows to score; must contain 'Id', 'userId' and 'movieId'.
        It is not modified.
    train_df : pandas.DataFrame
        Known ratings, passed through to compute_rating.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ['Id', 'rating'], one row per input row.
    """
    # apply(axis=1) on an empty frame does not yield a Series of ratings, so
    # handle that case explicitly.
    if chunk_df.empty:
        return chunk_df.assign(rating=pd.Series(dtype=float))[['Id', 'rating']]

    ratings = chunk_df.apply(lambda row: compute_rating(row, train_df), axis=1)
    # assign() builds a new frame; writing the column into chunk_df would modify
    # the caller's slice and trigger pandas' SettingWithCopyWarning.
    return chunk_df.assign(rating=ratings)[['Id', 'rating']]

# Define the chunk size: number of rows scored between two writes to disk
chunk_size = 10000  # Adjust chunk size based on memory limits and performance

# Path to the submission file
submission_file_path = 'submission_2.csv'

# Always start from a fresh file that contains only the header. Results are
# appended below, so reusing a file from an earlier run would duplicate rows.
with open(submission_file_path, 'w') as f:
    f.write('Id,rating\n')

# Split the sample_submission_df into chunks
chunks = [sample_submission_df.iloc[i:i + chunk_size] for i in range(0, sample_submission_df.shape[0], chunk_size)]

num_cores = min(4, cpu_count())  # Adjust number of cores as needed

for chunk in tqdm(chunks, desc="Processing chunks"):
    # Give every worker its own contiguous slice of the chunk; handing Parallel
    # a single task would leave all but one worker idle.
    bounds = np.linspace(0, len(chunk), num_cores + 1, dtype=int)
    sub_chunks = [
        chunk.iloc[start:stop]
        for start, stop in zip(bounds[:-1], bounds[1:])
        if stop > start
    ]

    # Threads share the notebook's memory, so train_df and the similarity
    # matrices do not have to be serialised and copied to worker processes.
    chunk_ratings = Parallel(n_jobs=num_cores, prefer="threads")(
        delayed(process_chunk)(sub_chunk, train_df) for sub_chunk in sub_chunks
    )

    # Parallel returns results in submission order; stitch them back together.
    result_chunk = pd.concat(chunk_ratings, axis=0)

    # Append the current chunk to the CSV file
    result_chunk.to_csv(submission_file_path, mode='a', header=False, index=False)

print("Finished writing to CSV file.")
