# Recommendation System

In [31]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Load the dataset
file_path = 'anime.csv'
df = pd.read_csv(file_path)


In [33]:
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [35]:
### Data Preprocessing

# Handle missing values by removing rows with missing values in key columns
df.dropna(subset=['name', 'genre', 'type', 'rating'], inplace=True)

# Fill missing values for 'episodes' and 'members' with median values
df['members'] = df['members'].fillna(df['members'].median())

In [37]:
### Feature Extraction

# Convert genres to a TF-IDF matrix for cosine similarity
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['genre'])

In [39]:
tfidf

In [41]:
# Compute cosine similarity matrix based on genres
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [42]:
# Function to recommend anime based on cosine similarity
def recommend_anime(title, df, cosine_sim=cosine_sim, threshold=0.5):
    # Get index of the anime that matches the title
    idx = df.index[df['name'] == title].tolist()
    if len(idx) == 0:
        print("Anime title not found in dataset.")
        return []
    idx = idx[0]
    
    # Get similarity scores for all anime with the target anime
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort anime based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Filter by threshold and exclude the target anime itself
    sim_scores = [(i, score) for i, score in sim_scores if score >= threshold and i != idx]
    
    # Get top 10 most similar anime
    anime_indices = [i[0] for i in sim_scores[:10]]
    return df['name'].iloc[anime_indices]

In [43]:
# Test the recommendation function
print("Recommendations for 'Naruto':")
print(recommend_anime("Naruto", df))

Recommendations for 'Naruto':
486                              Boruto: Naruto the Movie
615                                    Naruto: Shippuuden
1103    Boruto: Naruto the Movie - Naruto ga Hokage ni...
1343                                          Naruto x UT
1472          Naruto: Shippuuden Movie 4 - The Lost Tower
1573    Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsu...
2458                 Naruto Shippuuden: Sunny Side Battle
2997    Naruto Soyokazeden Movie: Naruto to Mashin to ...
7628                              Kyutai Panic Adventure!
784            Naruto: Shippuuden Movie 6 - Road to Ninja
Name: name, dtype: object


In [47]:
### Evaluation

# For evaluation purposes, split into training and testing (80-20 split)
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Placeholder for evaluation metrics (precision, recall, f1)
precision, recall, f1 = [], [], []

In [49]:
# Test recommendation function on a few test instances
for title in test['name'].sample(10, random_state=42):
    recommended_anime = recommend_anime(title, train)
    # Consider a recommendation as relevant if it includes any anime from the test set
    relevant_anime = test[test['name'].isin(recommended_anime)]
    
    # Calculate metrics
    y_true = [1 if anime in test['name'].values else 0 for anime in recommended_anime]
    y_pred = [1] * len(y_true)
    if len(y_true) > 0:
        precision.append(precision_score(y_true, y_pred, zero_division=0))
        recall.append(recall_score(y_true, y_pred, zero_division=0))
        f1.append(f1_score(y_true, y_pred, zero_division=0))

Anime title not found in dataset.
Anime title not found in dataset.
Anime title not found in dataset.
Anime title not found in dataset.
Anime title not found in dataset.
Anime title not found in dataset.
Anime title not found in dataset.
Anime title not found in dataset.
Anime title not found in dataset.
Anime title not found in dataset.


In [51]:
# Calculate and print average metrics
print("\n### Evaluation Results ###")
print(f"Precision: {np.mean(precision):.2f}")
print(f"Recall: {np.mean(recall):.2f}")
print(f"F1-Score: {np.mean(f1):.2f}")


### Evaluation Results ###
Precision: nan
Recall: nan
F1-Score: nan


## Interview Question Responses

#### What is the difference between user-based and item-based collaborative filtering?
#### User-based collaborative filtering: Recommends items based on similarity between users, assuming similar users like similar items.
#### Item-based collaborative filtering: Recommends items based on similarity between items, assuming that if a user likes one item, they might like similar items.

#### What is collaborative filtering, and how does it work?
#### Collaborative filtering: A recommendation method that relies on past user interactions with items. It works by identifying patterns in user behavior to recommend items to users based on similarities among users or items, without needing detailed item attributes.