In [30]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
def recommend_anime_cosine_similarity(anime_name, anime_df, similarity_features, top_n=10):

    # Check if the target anime exists in the dataset
    if anime_name not in anime_df['name'].values:
        print(f"Anime '{anime_name}' not found in the dataset.")
        return None

    # Get the index of the target anime
    target_anime_index = anime_df[anime_df['name'] == anime_name].index[0]

    # Calculate cosine similarity between the target anime and all other animes
    # The result is a flattened array of similarity scores
    similarity_scores = cosine_similarity(
        similarity_features.iloc[target_anime_index:target_anime_index + 1],
        similarity_features
    ).flatten()

    # Get the indices of the most similar animes.
    # We sort in descending order and slice from the second element (index 1)
    # to exclude the target anime itself.
    similar_anime_indices = similarity_scores.argsort()[::-1][1:top_n + 1]

    # Get the recommended animes and their similarity scores
    recommended_animes = anime_df.iloc[similar_anime_indices].copy()
    recommended_animes['similarity_score'] = similarity_scores[similar_anime_indices]

    return recommended_animes

In [12]:
# Load the dataset
anime_df = pd.read_csv('anime.csv')
anime_df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [13]:
# Handle missing values
# The 'episodes' column has 'Unknown' values, we'll replace them with NaN and then the mode
anime_df['episodes'] = anime_df['episodes'].replace('Unknown', np.nan)
anime_df['episodes'] = pd.to_numeric(anime_df['episodes'])
anime_df['episodes'].fillna(anime_df['episodes'].mode()[0], inplace=True)

In [14]:
# The 'rating' column has some missing values, we'll fill them with the mean rating
anime_df['rating'].fillna(anime_df['rating'].mean(), inplace=True)

In [15]:
# Remove rows with missing genres to simplify one-hot encoding
anime_df.dropna(subset=['genre'], inplace=True)
anime_df.reset_index(drop=True, inplace=True) # Reset index after dropping rows

In [16]:
# --- Feature Extraction ---
# Create a list of all unique genres
all_genres = set()
anime_df['genre'].str.split(', ').apply(all_genres.update)
genre_list = sorted(list(all_genres))

In [17]:
# Create a genre-based DataFrame for one-hot encoding
genre_df = anime_df['genre'].str.get_dummies(sep=', ')

In [19]:
genre_df

Unnamed: 0,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,...,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,1,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12227,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12228,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12229,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12230,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Add any missing genre columns with 0 values to ensure consistent columns
for genre in genre_list:
    if genre not in genre_df.columns:
        genre_df[genre] = 0

In [20]:
# Select and normalize numerical features ('rating' and 'episodes')
numerical_features = anime_df[['rating', 'episodes']]
scaler = MinMaxScaler()
numerical_scaled = scaler.fit_transform(numerical_features)
numerical_scaled_df = pd.DataFrame(numerical_scaled, columns=['rating_scaled', 'episodes_scaled'], index=anime_df.index)

In [21]:
numerical_features

Unnamed: 0,rating,episodes
0,9.37,1.0
1,9.26,64.0
2,9.25,51.0
3,9.17,24.0
4,9.16,51.0
...,...,...
12227,4.15,1.0
12228,4.28,1.0
12229,4.88,4.0
12230,4.98,1.0


In [22]:
numerical_scaled

array([[0.92436975, 0.        ],
       [0.91116447, 0.03467254],
       [0.90996399, 0.02751789],
       ...,
       [0.38535414, 0.00165107],
       [0.39735894, 0.        ],
       [0.45498199, 0.        ]])

In [23]:
numerical_scaled_df

Unnamed: 0,rating_scaled,episodes_scaled
0,0.924370,0.000000
1,0.911164,0.034673
2,0.909964,0.027518
3,0.900360,0.012658
4,0.899160,0.027518
...,...,...
12227,0.297719,0.000000
12228,0.313325,0.000000
12229,0.385354,0.001651
12230,0.397359,0.000000


In [24]:
# Combine all features (one-hot encoded genres and scaled numericals)
features_for_similarity = pd.concat([genre_df.reindex(columns=genre_list), numerical_scaled_df], axis=1)
features_for_similarity

Unnamed: 0,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,...,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri,rating_scaled,episodes_scaled
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0.924370,0.000000
1,1,1,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0.911164,0.034673
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.909964,0.027518
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0.900360,0.012658
4,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.899160,0.027518
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12227,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.297719,0.000000
12228,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.313325,0.000000
12229,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.385354,0.001651
12230,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.397359,0.000000


In [25]:
# Recommendation System
target_anime = 'Kimi no Na wa.'
recommendations = recommend_anime_cosine_similarity(
    target_anime,
    anime_df,
    features_for_similarity,
    top_n=10
)

In [27]:
# Display the recommendations
if recommendations is not None:
    print(f"\nRecommendations for '{target_anime}':")
    print("-" * 50)
    print(recommendations[['name', 'genre', 'rating', 'similarity_score']])


Recommendations for 'Kimi no Na wa.':
--------------------------------------------------
                                                   name  \
5803                        Wind: A Breath of Heart OVA   
6391                       Wind: A Breath of Heart (TV)   
1111              Aura: Maryuuin Kouga Saigo no Tatakai   
208                       Kokoro ga Sakebitagatterunda.   
504   Clannad: After Story - Mou Hitotsu no Sekai, K...   
1201                     Angel Beats!: Another Epilogue   
1435                                         True Tears   
1436                 &quot;Bungaku Shoujo&quot; Memoire   
1494                                           Harmonie   
1631                                Kimikiss Pure Rouge   

                                             genre  rating  similarity_score  
5803          Drama, Romance, School, Supernatural    6.35          0.987372  
6391          Drama, Romance, School, Supernatural    6.14          0.985443  
1111  Comedy, Drama, Ro

In [32]:
# Reload and reprocess the data for evaluation to ensure we have the original genre column for relevance check
df_eval = pd.read_csv('anime.csv')
df_eval

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [33]:
# Handle missing values
df_eval['genre'] = df_eval['genre'].astype(str)
df_eval['genre'].fillna('Unknown', inplace=True)
df_eval['rating'].fillna(df_eval['rating'].mean(), inplace=True)
df_eval['episodes'].fillna('Unknown', inplace=True)
df_eval['episodes'] = pd.to_numeric(df_eval['episodes'], errors='coerce').fillna(0).astype(int)

In [34]:
# Create a column for processed genres (list of genres for each anime)
df_eval['processed_genre'] = df_eval['genre'].apply(lambda x: [g.strip() for g in x.split(',')])

In [35]:
# Drop 'anime_id' and 'type' for feature extraction
df_eval.drop(columns=['anime_id', 'type'], inplace=True)

In [36]:
# Separate features for similarity calculation
df_features_eval = df_eval.copy()

In [37]:
# One-hot encode the 'processed_genre' column
mlb_eval = MultiLabelBinarizer()
genre_encoded_eval = mlb_eval.fit_transform(df_features_eval['processed_genre'])
genre_df_eval = pd.DataFrame(genre_encoded_eval, columns=mlb_eval.classes_, index=df_features_eval.index)
df_features_eval = pd.concat([df_features_eval, genre_df_eval], axis=1)
df_features_eval.drop(['genre', 'processed_genre'], axis=1, inplace=True)

In [38]:
# Normalize numerical features: 'rating', 'episodes', 'members'
scaler_eval = MinMaxScaler()
numerical_cols_eval = ['rating', 'episodes', 'members']
df_features_eval[numerical_cols_eval] = scaler_eval.fit_transform(df_features_eval[numerical_cols_eval])

In [39]:
# Set 'name' as index for the feature DataFrame
df_features_eval.set_index('name', inplace=True)

In [40]:
def evaluate_recommendation_system(df_full, df_features_full, top_n=10, threshold=0.5, test_size=0.2):

    # Split the dataset (anime titles) into training and testing sets
    anime_titles = df_full['name'].tolist()
    train_titles, test_titles = train_test_split(anime_titles, test_size=test_size, random_state=42)

    precisions = []
    recalls = []
    f1_scores = []

    # Create a set of all unique genres in the dataset
    all_genres = list(set(g for genre_list in df_full['processed_genre'] for g in genre_list))

    for anime_title in test_titles:
        # Get recommendations for the current anime
        recommendations = recommend_anime_cosine_similarity(anime_title, df_full, df_features_full, top_n=top_n)

        # If no recommendations are found, skip this anime
        if recommendations is None or recommendations.empty:
            continue

        # Get the actual genres of the target anime
        actual_genres = set(df_full[df_full['name'] == anime_title]['processed_genre'].iloc[0])

        # Get the genres of the recommended anime
        recommended_genres = []
        for recommended_anime_title in recommendations['name']:
            # Find the original genre list for the recommended anime
            recommended_genres.extend(df_full[df_full['name'] == recommended_anime_title]['processed_genre'].iloc[0])
        recommended_genres = set(recommended_genres)

        # Create binary relevance vectors for the current anime
        actual_vector = [1 if genre in actual_genres else 0 for genre in all_genres]
        recommended_vector = [1 if genre in recommended_genres else 0 for genre in all_genres]

        # Calculate metrics if there are any actual or recommended genres
        if sum(actual_vector) > 0 or sum(recommended_vector) > 0:
             precision = precision_score([actual_vector], [recommended_vector], average='weighted', zero_division=0)
             recall = recall_score([actual_vector], [recommended_vector], average='weighted', zero_division=0)
             f1 = f1_score([actual_vector], [recommended_vector], average='weighted', zero_division=0)
             precisions.append(precision)
             recalls.append(recall)
             f1_scores.append(f1)


    # Calculate the average precision, recall, and F1-score across all test anime
    avg_precision = np.mean(precisions) if precisions else 0
    avg_recall = np.mean(recalls) if recalls else 0
    avg_f1_score = np.mean(f1_scores) if f1_scores else 0

    return {
        'Precision': avg_precision,
        'Recall': avg_recall,
        'F1-score': avg_f1_score
    }


In [41]:
# Evaluate the recommendation system
evaluation_results = evaluate_recommendation_system(df_eval, df_features_eval, top_n=10, test_size=0.2)

print("\nRecommendation System Evaluation Results:")
print(pd.DataFrame([evaluation_results]).to_markdown(index=False, numalign="left", stralign="left"))


Recommendation System Evaluation Results:
| Precision   | Recall   | F1-score   |
|:------------|:---------|:-----------|
| 0.996444    | 0.996444 | 0.996444   |


**INTERVIEW QUESTIONS**

### 1. Can you explain the difference between user-based and item-based collaborative filtering?

**User-Based Collaborative Filtering**
1.  **Similarity Focus:** Finds people who are similar to you.
2.  **Recommendation Logic:** Recommends what similar people liked.
3.  **Stability:** Less stable because people's tastes can change.

**Item-Based Collaborative Filtering**
1.  **Similarity Focus:** Finds items that are similar to each other.
2.  **Recommendation Logic:** Recommends items similar to the ones you've already liked.
3.  **Stability:** More stable because item characteristics don't change.

### 2. What is collaborative filtering, and how does it work?

Collaborative filtering is a way to recommend things by using the preferences of many people. It works by finding patterns in how users rate or interact with items. The main idea is that if you and another person have similar tastes in the past, you'll likely have similar tastes in the future.
