# Anime recommendation system: collaborative, content-based, and hybrid filtering

## Notebook setup

### Imports

In [1]:
import numpy as np  # For numerical operations
import pandas as pd  # For data manipulation
from sklearn.decomposition import TruncatedSVD  # For matrix factorization (SVD)
from sklearn.metrics.pairwise import cosine_similarity  # To compute similarity scores

### Dataset

Load animes & ratings data.

In [None]:
# Read the anime metadata table from its CSV file and preview the first rows
ANIME_CSV_PATH = 'anime.csv'
animes = pd.read_csv(ANIME_CSV_PATH)
animes.head()

In [None]:
# Print a summary of the anime table (columns, non-null counts, dtypes)
animes.info()

In [None]:
# Read the user ratings table from its CSV file and preview the first rows
RATINGS_CSV_PATH = 'rating.csv'
ratings = pd.read_csv(RATINGS_CSV_PATH)
ratings.head()

Check the size and structure of the ratings dataset to understand the data volume.

In [None]:
# Print a summary of the ratings table (row count, columns, dtypes)
ratings.info()

In [None]:
# Randomly sample up to 50,000 ratings for faster computation.
# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling without replacement), so the request is capped at the table size.
SAMPLE_SIZE = 50000
sample_ratings = ratings.sample(
    n=min(SAMPLE_SIZE, len(ratings)),
    random_state=315,  # fixed seed keeps the sample reproducible across runs
)
sample_ratings.info()

In [None]:
# Count how many distinct users and distinct animes appear in the sample
num_users, num_animes = (
    sample_ratings[column].nunique() for column in ('user_id', 'anime_id')
)

print(f"Number of unique users in sample: {num_users}")
print(f"Number of unique animes in sample: {num_animes}")

## 1. Collaborative filtering

### 1.1. Memory-based collaborative filtering

Create a user-item matrix where each row is a user, each column is an anime, and values are ratings.

In [None]:
# Build the user-item matrix: one row per user, one column per anime, and
# each cell holding that user's rating for that anime.
# NOTE(review): if the rating column contains sentinel values (e.g. a marker
# for "watched but not rated"), they are treated as real ratings here —
# confirm the value range of `rating` before trusting the similarities.
user_item_matrix = pd.pivot_table(
    sample_ratings,
    values='rating',
    index='user_id',
    columns='anime_id',
)

# (user, anime) pairs with no rating come out of the pivot as NaN; use 0 instead
user_item_filled = user_item_matrix.fillna(value=0)

print('User-Item Matrix shape:', user_item_filled.shape)
user_item_filled.head()

Transpose the user-item matrix so that each row is an anime, then compute cosine similarity between all pairs of animes to find which animes have similar rating patterns across users.

In [None]:
# Preview the transposed matrix: animes become rows and users become columns
user_item_filled.T

In [None]:
# Transposing puts animes on the rows, so cosine_similarity compares animes
# pairwise, each one represented by its vector of user ratings
anime_rating_vectors = user_item_filled.T
item_similarity = cosine_similarity(anime_rating_vectors)

# Label both axes of the similarity matrix with the anime IDs
anime_ids = user_item_matrix.columns
item_similarity_df = pd.DataFrame(item_similarity, index=anime_ids, columns=anime_ids)

print('Item similarity matrix shape:', item_similarity_df.shape)
item_similarity_df.head()

Create helper functions to convert between anime IDs and names for more readable output.

In [None]:
def get_anime_name(anime_id):
    """Return the name of the anime with the given ID.

    The ID is looked up in the notebook-level ``animes`` table. When no row
    matches, the placeholder string ``'Unknown (ID: <anime_id>)'`` is
    returned instead. If several rows match, the first one wins.
    """
    matching_names = animes.loc[animes['anime_id'] == anime_id, 'name']
    if len(matching_names) > 0:
        return matching_names.values[0]
    return f'Unknown (ID: {anime_id})'

def get_anime_id(anime_name):
    """Return the ID of the anime with the given name, or ``None``.

    The name is matched exactly against the notebook-level ``animes`` table.
    If several rows match, the first one wins; if none match, ``None`` is
    returned.
    """
    matching_ids = animes.loc[animes['name'] == anime_name, 'anime_id']
    if len(matching_ids) > 0:
        return matching_ids.values[0]
    return None

# Smoke-test the ID -> name helper by looking up anime ID 1
print(f"Anime ID 1: {get_anime_name(1)}")

Demonstrate memory-based collaborative filtering by finding the top 5 animes most similar to a target anime based on user rating patterns.

In [None]:
# Select target anime
anime_id = 1

# Similarity of every *other* anime to the target, highest first.
# The target is removed by label instead of by skipping the first sorted row:
# another anime can also have similarity 1.0 to the target, and the sort does
# not guarantee the order of tied values, so the target is not necessarily
# the first entry.
similar_animes = (
    item_similarity_df[anime_id]
    .drop(anime_id)
    .sort_values(ascending=False)
)

print(f'Top 5 animes similar to "{get_anime_name(anime_id)}":')
print()

# Display the 5 most similar animes with their similarity scores
for anime_id_similar, score in similar_animes.head(5).items():
    print(f'{get_anime_name(anime_id_similar)}: {score:.4f}')

### 1.2. Model-based collaborative filtering

Use matrix factorization (SVD) to reduce dimensionality and fill in missing ratings by learning latent features of users and animes.

In [None]:
# Number of latent features kept by the factorization
N_LATENT_FEATURES = 50

# Create the SVD model; the fixed random_state keeps results reproducible
svd_model = TruncatedSVD(n_components=N_LATENT_FEATURES, random_state=315)

# Fit the model on the zero-filled user-item matrix and project every user
# onto the latent features (one row of latent features per user)
user_features = svd_model.fit_transform(user_item_filled)

# Reconstruct a dense ratings matrix by multiplying the user features with
# the learned components (the anime side of the factorization)
predicted_ratings = np.dot(user_features, svd_model.components_)

# Convert back to a DataFrame carrying the original user and anime labels
predicted_ratings_df = pd.DataFrame(
    predicted_ratings,
    index=user_item_matrix.index,
    columns=user_item_matrix.columns
)

print('Predicted ratings matrix shape:', predicted_ratings_df.shape)
predicted_ratings_df.head()

Demonstrate model-based collaborative filtering by recommending unwatched animes to a user based on predicted ratings from SVD.

In [None]:
# Use the first user in the matrix as the example
user_id = user_item_matrix.index[0]

# That user's row in the original (un-filled) matrix: NaN marks unrated animes
observed_row = user_item_matrix.loc[user_id]
unrated_animes = observed_row[observed_row.isna()]

# Predicted ratings for the same user, taken from the SVD reconstruction
user_predictions = predicted_ratings_df.loc[user_id]

# Keep only the animes the user has not rated, best predicted rating first
unrated_predictions = user_predictions[unrated_animes.index]
recommendations = unrated_predictions.sort_values(ascending=False)

print(f'Top 5 recommended animes for user {user_id}:')
print(recommendations.head())

## 2. Content-based filtering

Examine the content features (genre, type) available for each anime to use in content-based filtering.

In [None]:
# Display relevant features for content-based filtering:
# ID and name identify each anime; genre and type describe its content
animes[['anime_id', 'name', 'genre', 'type']].head(10)

Create a function to calculate similarity between animes based on their genres using Jaccard similarity (intersection over union).

In [None]:
# Convert genre strings to sets for easier comparison.
# A missing genre must become an EMPTY set: ''.split(', ') returns [''], so
# without the filter a missing genre would turn into the one-element set {''}
# and two animes with no genre information would look like a perfect match.
animes['genre_set'] = animes['genre'].fillna('').apply(
    lambda x: {genre for genre in x.split(', ') if genre}
)

def genre_similarity(genres1, genres2):
    """Return the Jaccard similarity of two genre sets.

    The score is the number of genres the two sets share divided by the
    number of distinct genres across both sets. It is 0 when either set is
    empty.
    """
    # Nothing to compare when one side has no genres
    if not (len(genres1) and len(genres2)):
        return 0

    shared_genres = genres1.intersection(genres2)
    all_genres = genres1.union(genres2)

    # Guard against dividing by zero
    if not len(all_genres) > 0:
        return 0

    return len(shared_genres) / len(all_genres)

Select a target anime to demonstrate content-based filtering using genre similarity.

In [None]:
# Pick the anime whose genre neighbours we want to find
target_anime_id = 1

# Take the first row of the anime table that carries this ID
target_rows = animes.loc[animes['anime_id'] == target_anime_id]
target_anime = target_rows.iloc[0]
target_genres = target_anime['genre_set']

print(f"Target anime: {target_anime['name']}")
print(f"Genres: {target_anime['genre']}")

Demonstrate content-based filtering by finding animes with the most similar genres to the target anime.

In [None]:
# Score every anime by how closely its genre set matches the target's
genre_scores = animes['genre_set'].apply(
    lambda genres: genre_similarity(target_genres, genres)
)
animes['similarity'] = genre_scores

# Rank all animes except the target itself, most similar first, keep 5
is_other_anime = animes['anime_id'] != target_anime_id
ranked_by_genre = animes.loc[is_other_anime].sort_values(
    by='similarity',
    ascending=False,
)
similar_animes = ranked_by_genre[['name', 'genre', 'similarity']].head(5)

print('Top 5 similar animes based on genre:')
similar_animes.head()

## 3. Hybrid filtering

Combine collaborative filtering and content-based filtering using a weighted average to leverage both user behavior and content features.

In [None]:
# Weight given to each scoring method (equal weights: 50% each)
COLLAB_WEIGHT = 0.5
CONTENT_WEIGHT = 0.5

# Get collaborative filtering scores (based on user ratings)
collab_score = item_similarity_df[target_anime_id]

# Get content-based scores (based on genre similarity); the 'similarity'
# column holds each anime's genre similarity to the target anime
content_score = animes.set_index('anime_id')['similarity']

# Find animes that exist in both scoring methods
common_animes = collab_score.index.intersection(content_score.index)

# Combine the two scores as a weighted average
hybrid_score = (
    COLLAB_WEIGHT * collab_score[common_animes] +
    CONTENT_WEIGHT * content_score[common_animes]
)

# Sort and get the top 5, excluding the target anime itself. The target is
# dropped by label rather than by skipping the first sorted row, because
# another anime can tie with it on the combined score and the order of tied
# values is not guaranteed.
hybrid_recommendations = (
    hybrid_score
    .drop(target_anime_id, errors='ignore')
    .sort_values(ascending=False)
    .head(5)
)
print(f'Top 5 hybrid recommendations for anime_id {target_anime_id}:')

for anime_id_rec, score in hybrid_recommendations.items():
    print(f'{get_anime_name(anime_id_rec)}: {score:.4f}')