# Assignment 11: Recommendation System

## Dataset: Anime Ratings

**Topics Covered:**
- Collaborative Filtering
- Content-Based Filtering
- Similarity Measures

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Load data
df = pd.read_csv('datasets/anime.csv')
print("Dataset loaded! Shape:", df.shape)
df.head()

Dataset loaded! Shape: (1000, 7)


Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,1,Anime_1_Death Note,"Romance, Comedy",Special,26,6.3,308613
1,2,Anime_2_Tokyo Ghoul,"Fantasy, Drama, Sci-Fi",ONA,12,8.13,68387
2,3,Anime_3_Tokyo Ghoul,Romance,ONA,50,8.19,590145
3,4,Anime_4_Demon Slayer,Comedy,TV,13,9.75,670010
4,5,Anime_5_My Hero Academia,"Horror, Romance",OVA,1,8.86,928076


In [2]:
# Check data info
print(df.columns.tolist())
print("\nMissing values:")
print(df.isnull().sum())

['anime_id', 'name', 'genre', 'type', 'episodes', 'rating', 'members']

Missing values:
anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64


In [3]:
# Clean data
df = df.dropna(subset=['name'])

# Fill missing ratings with mean
if 'rating' in df.columns:
    df['rating'] = df['rating'].fillna(df['rating'].mean())

print("After cleaning:", df.shape)

After cleaning: (1000, 7)


In [4]:
# Content-Based Filtering using Genre
print("=== Content-Based Filtering ===")

# Check for genre column
if 'genre' in df.columns:
    # Fill missing genres
    df['genre'] = df['genre'].fillna('')
    
    # Create TF-IDF matrix from genres
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(df['genre'])
    
    print("TF-IDF matrix shape:", tfidf_matrix.shape)
    
    # Compute similarity
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
    print("Similarity matrix shape:", cosine_sim.shape)

=== Content-Based Filtering ===
TF-IDF matrix shape: (1000, 9)
Similarity matrix shape: (1000, 1000)


In [5]:
# Create index for lookup
indices = pd.Series(df.index, index=df['name']).drop_duplicates()

# Function to get recommendations
def get_recommendations(title, cosine_sim=cosine_sim, n=10):
    # Get index of the anime
    if title not in indices:
        return "Anime not found!"
    
    idx = indices[title]
    
    # Get similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort by similarity
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get top n (excluding itself)
    sim_scores = sim_scores[1:n+1]
    
    # Get anime indices
    anime_indices = [i[0] for i in sim_scores]
    
    return df['name'].iloc[anime_indices]

print("Recommendation function created!")

Recommendation function created!


In [6]:
# Test recommendations
print("=== Sample Recommendations ===")

# Get a sample anime
sample_anime = df['name'].iloc[0]
print("\nRecommendations for:", sample_anime)
print("-" * 40)

recommendations = get_recommendations(sample_anime)
if isinstance(recommendations, pd.Series):
    for i, anime in enumerate(recommendations):
        print(str(i+1) + ".", anime)

=== Sample Recommendations ===

Recommendations for: Anime_1_Death Note
----------------------------------------
1. Anime_58_One Piece
2. Anime_158_My Hero Academia
3. Anime_230_Demon Slayer
4. Anime_476_Tokyo Ghoul
5. Anime_539_Dragon Ball Z
6. Anime_565_Death Note
7. Anime_616_Death Note
8. Anime_692_Fullmetal Alchemist
9. Anime_784_My Hero Academia
10. Anime_911_Naruto


In [7]:
# Popularity-based recommendations (as baseline)
print("=== Popularity-Based Recommendations ===")

if 'rating' in df.columns and 'members' in df.columns:
    # Create a copy to avoid modifying the original dataframe
    df_popular = df.copy()
    
    # Replace -1 ratings with NaN, as they indicate not-yet-rated anime
    df_popular['rating'] = df_popular['rating'].replace(-1, np.nan)
    
    # Sort by rating and members, dropping NaNs in rating
    popular = df_popular.dropna(subset=['rating']).sort_values(['rating', 'members'], ascending=False)
    
    print("\nTop 10 Most Popular Anime (based on rating and members):")
    print(popular[['name', 'rating', 'members']].head(10))
    
elif 'rating' in df.columns:
    # Create a copy to avoid modifying the original dataframe
    df_popular = df.copy()
    
    # Replace -1 ratings with NaN
    df_popular['rating'] = df_popular['rating'].replace(-1, np.nan)
    
    popular = df_popular.dropna(subset=['rating']).sort_values('rating', ascending=False)
    
    print("\nTop 10 Highest Rated Anime:")
    print(popular[['name', 'rating']].head(10))

=== Popularity-Based Recommendations ===

Top 10 Most Popular Anime (based on rating and members):
                              name  rating  members
624          Anime_625_Tokyo Ghoul   10.00   977954
828          Anime_829_Tokyo Ghoul    9.99   364678
27          Anime_28_Dragon Ball Z    9.99    40780
842         Anime_843_Demon Slayer    9.98   786299
226        Anime_227_Dragon Ball Z    9.97     1793
330      Anime_331_Attack on Titan    9.96   426198
429         Anime_430_Demon Slayer    9.95   807243
731  Anime_732_Fullmetal Alchemist    9.95   134402
5            Anime_6_Dragon Ball Z    9.93   778153
78             Anime_79_Death Note    9.93   541965


## Summary

**Types of Recommendation Systems:**

1. **Content-Based Filtering**
   - Uses item features (genre, description)
   - Recommends similar items

2. **Collaborative Filtering**
   - Uses user-item interactions
   - Recommends based on similar users

3. **Hybrid Methods**
   - Combines both approaches