# Recommendation System


In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler

In [2]:
# Read the anime catalogue from anime.csv (path is relative to the working directory)
anime_data = pd.read_csv("anime.csv")
print(anime_data)

       anime_id                                               name  \
0         32281                                     Kimi no Na wa.   
1          5114                   Fullmetal Alchemist: Brotherhood   
2         28977                                           Gintama°   
3          9253                                        Steins;Gate   
4          9969                                      Gintama&#039;   
...         ...                                                ...   
12289      9316       Toushindai My Lover: Minami tai Mecha-Minami   
12290      5543                                        Under World   
12291      5621                     Violence Gekiga David no Hoshi   
12292      6133  Violence Gekiga Shin David no Hoshi: Inma Dens...   
12293     26081                   Yasuji no Pornorama: Yacchimae!!   

                                                   genre   type episodes  \
0                   Drama, Romance, School, Supernatural  Movie        1   
1      

In [3]:
# Data Preprocessing

# Fill missing values. The result is assigned back to the column rather than
# calling fillna(inplace=True) on the `anime_data[col]` selection: the in-place
# form operates on an intermediate object, which triggers a FutureWarning on
# pandas 2.x and leaves `anime_data` unchanged once Copy-on-Write is enabled.
anime_data['genre'] = anime_data['genre'].fillna('Unknown')
anime_data['type'] = anime_data['type'].fillna('Unknown')
# Missing ratings are replaced by the mean of the observed ratings
anime_data['rating'] = anime_data['rating'].fillna(anime_data['rating'].mean())

In [4]:
# Turn every genre entry into a list of genre names by splitting on ', '.
# Entries that are already lists are kept as they are, so re-running this cell is harmless.
anime_data['genre'] = anime_data['genre'].map(
    lambda entry: entry if isinstance(entry, list) else str(entry).split(', ')
)

# Multi-label binarization: fit on the genre lists and get one indicator column per genre
mlb = MultiLabelBinarizer()
genre_encoded = mlb.fit_transform(anime_data['genre'])
print(anime_data['genre'])
print(mlb)
print(genre_encoded)

0                   [Drama, Romance, School, Supernatural]
1        [Action, Adventure, Drama, Fantasy, Magic, Mil...
2        [Action, Comedy, Historical, Parody, Samurai, ...
3                                       [Sci-Fi, Thriller]
4        [Action, Comedy, Historical, Parody, Samurai, ...
                               ...                        
12289                                             [Hentai]
12290                                             [Hentai]
12291                                             [Hentai]
12292                                             [Hentai]
12293                                             [Hentai]
Name: genre, Length: 12294, dtype: object
MultiLabelBinarizer()
[[0 0 0 ... 0 0 0]
 [1 1 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [5]:
# Standardize the two numeric columns, rating and members.
# Note: despite its name, `rating_scaled` holds both scaled columns (rating first, members second).
scaler = StandardScaler()
rating_scaled = scaler.fit_transform(anime_data.loc[:, ['rating', 'members']])
print(rating_scaled)
print(scaler)

[[ 2.84753513  3.33024137]
 [ 2.73937967 14.14840622]
 [ 2.72954735  1.75471335]
 ...
 [-1.56717438 -0.32566298]
 [-1.46885123 -0.32646563]
 [-0.9969001  -0.32706762]]
StandardScaler()


In [6]:
# Build the final feature matrix: genre indicator columns followed by the
# scaled rating/members columns, joined column-wise (one row per anime)
features = np.concatenate((genre_encoded, rating_scaled), axis=1)
print(features)

[[ 0.          0.          0.         ...  0.          2.84753513
   3.33024137]
 [ 1.          1.          0.         ...  0.          2.73937967
  14.14840622]
 [ 1.          0.          0.         ...  0.          2.72954735
   1.75471335]
 ...
 [ 0.          0.          0.         ...  0.         -1.56717438
  -0.32566298]
 [ 0.          0.          0.         ...  0.         -1.46885123
  -0.32646563]
 [ 0.          0.          0.         ...  0.         -0.9969001
  -0.32706762]]


In [7]:
def recommend_anime(anime_name, anime_data, features, threshold=0.5):
    """Recommend anime whose feature vectors are similar to a given title.

    Parameters
    ----------
    anime_name : str
        Title to look up; compared for exact equality against ``anime_data['name']``.
        If several rows share the title, the first one is used as the query.
    anime_data : pandas.DataFrame
        Frame with at least the 'name', 'genre' and 'rating' columns. Row ``i``
        (by position) must describe the same anime as ``features[i]``.
    features : array-like of shape (n_rows, n_features)
        Feature matrix aligned row-by-row with ``anime_data``.
    threshold : float, default 0.5
        Only rows whose cosine similarity to the query is strictly greater
        than this value are returned.

    Returns
    -------
    pandas.DataFrame
        The 'name', 'genre' and 'rating' columns of the matching rows, ordered
        from most to least similar. The query row itself is never included.

    Raises
    ------
    ValueError
        If ``anime_name`` does not occur in ``anime_data['name']``.
    """
    # Locate the query by *position*, not by index label. `features` is a plain
    # array and the result is selected with `.iloc`, so a label taken from
    # `.index` would point at the wrong row whenever the frame does not carry a
    # default 0..n-1 index (for example a shuffled train/test subset).
    matching_positions = np.flatnonzero((anime_data['name'] == anime_name).to_numpy())
    if matching_positions.size == 0:
        raise ValueError(f"Anime '{anime_name}' not found in the dataset.")
    anime_pos = matching_positions[0]

    # Cosine similarity between the query row and every row (including itself)
    similarity_scores = cosine_similarity([features[anime_pos]], features)[0]

    # Keep rows above the threshold, then drop the query row itself
    similar_positions = np.flatnonzero(similarity_scores > threshold)
    similar_positions = similar_positions[similar_positions != anime_pos]

    # Order the remaining rows by descending similarity
    order = np.argsort(similarity_scores[similar_positions])[::-1]
    sorted_positions = similar_positions[order]

    return anime_data.iloc[sorted_positions][['name', 'genre', 'rating']]

In [8]:
# Demo: titles similar to 'Kimi no Na wa.' with a similarity cut-off of 0.6
recommendations = recommend_anime(
    anime_name='Kimi no Na wa.',
    anime_data=anime_data,
    features=features,
    threshold=0.6,
)
print(recommendations)

                                                   name  \
504   Clannad: After Story - Mou Hitotsu no Sekai, K...   
60                                   Hotarubi no Mori e   
1201                     Angel Beats!: Another Epilogue   
219   Yahari Ore no Seishun Love Comedy wa Machigatt...   
337                                        Kanon (2006)   
...                                                 ...   
770                         One Punch Man: Road to Hero   
3192                                        Zombie-Loan   
3560  Naruto Movie 3: Dai Koufun! Mikazuki Jima no A...   
9                              Gintama&#039;: Enchousen   
869                               Inu x Boku SS Special   

                                                  genre  rating  
504                            [Drama, Romance, School]    8.02  
60               [Drama, Romance, Shoujo, Supernatural]    8.61  
1201                      [Drama, School, Supernatural]    7.63  
219                    [Com

In [9]:
# Evaluation

# There is no user feedback to score against, so a hold-out split is simulated:
# 80% of the rows (and their feature vectors) go to train, 20% to test, with a fixed seed.
train_df, test_df, train_features, test_features = train_test_split(
    anime_data,
    features,
    test_size=0.2,
    random_state=42,
)

In [10]:
def evaluate_recommendations(test_df, test_features, train_df, train_features, threshold=0.5):
    """Score how often a held-out title shows up in its own recommendations.

    For every title in ``test_df`` the true label is fixed to 1 and the
    predicted label is 1 only if ``recommend_anime`` (run against the training
    data) returns a row with that same name.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Held-out rows; only the 'name' column is read.
    test_features : array-like
        Currently unused; kept so the signature stays unchanged.
    train_df : pandas.DataFrame
        Training rows passed to ``recommend_anime``.
    train_features : array-like
        Feature matrix passed to ``recommend_anime`` together with ``train_df``.
    threshold : float, default 0.5
        Similarity cut-off forwarded to ``recommend_anime``.

    Returns
    -------
    tuple of float
        ``(precision, recall, f1)`` computed with binary averaging.

    Notes
    -----
    NOTE(review): this is a simplification. Every true label is 1, and a title
    that only exists in the test split cannot be found in ``train_df``, so most
    predictions are expected to be 0. Treat the scores as a smoke test rather
    than a quality measure.
    """
    y_true = []
    y_pred = []

    for anime_name in test_df['name']:
        # Consider the anime in the test set as the ground truth relevant item
        y_true.append(1)

        try:
            # Generate recommendations from the training set
            recommendations = recommend_anime(anime_name, train_df, train_features, threshold=threshold)
        except ValueError:
            # The title is not present in the training data: it cannot be
            # recommended, so count it as a miss instead of aborting the run.
            y_pred.append(0)
            continue

        # Check if the anime was recommended (this is a simplification)
        is_recommended = bool((recommendations['name'] == anime_name).any())
        y_pred.append(1 if is_recommended else 0)

    precision = precision_score(y_true, y_pred, average='binary')
    recall = recall_score(y_true, y_pred, average='binary')
    f1 = f1_score(y_true, y_pred, average='binary')

    return precision, recall, f1

In [11]:
def recommend_anime(anime_name, anime_data, features, threshold=0.5):
    """Return the anime most similar to ``anime_name`` by cosine similarity.

    Parameters
    ----------
    anime_name : str
        Title to look up; must equal an entry of ``anime_data['name']`` exactly.
        When the title occurs more than once, the first occurrence is the query.
    anime_data : pandas.DataFrame
        Frame containing at least 'name', 'genre' and 'rating'. Its rows must be
        in the same order as the rows of ``features``.
    features : array-like of shape (n_rows, n_features)
        Feature vectors, one per row of ``anime_data``.
    threshold : float, default 0.5
        Rows are returned only if their similarity to the query is strictly
        greater than this value.

    Returns
    -------
    pandas.DataFrame
        'name', 'genre' and 'rating' of the similar rows, most similar first,
        excluding the query row.

    Raises
    ------
    ValueError
        If ``anime_name`` is not present in ``anime_data['name']``.
    """
    # Work with row positions throughout: `features` is indexed positionally and
    # the result is taken with `.iloc`, so index labels must not be used here
    # (they differ from positions after shuffling or filtering the frame).
    name_matches = (anime_data['name'] == anime_name).to_numpy()
    matching_positions = np.flatnonzero(name_matches)

    if matching_positions.size == 0:
        raise ValueError(f"Anime '{anime_name}' not found in the dataset.")

    anime_pos = matching_positions[0]

    # Compute cosine similarity between the target anime and all others
    similarity_scores = cosine_similarity([features[anime_pos]], features)[0]

    # Candidates: similarity above the threshold, excluding the target itself
    candidate_positions = np.flatnonzero(similarity_scores > threshold)
    candidate_positions = candidate_positions[candidate_positions != anime_pos]

    # Sort candidates from most to least similar
    ranking = np.argsort(similarity_scores[candidate_positions])[::-1]
    sorted_positions = candidate_positions[ranking]

    return anime_data.iloc[sorted_positions][['name', 'genre', 'rating']]

In [12]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_recommendations(test_df, test_features, train_df, train_features, threshold=0.6):
    """Compute precision, recall and F1 for self-recommendation of test titles.

    Each title in ``test_df`` gets a true label of 1. Its predicted label is 1
    when ``recommend_anime`` (queried against the training data) returns a row
    with the same name, and 0 otherwise, including when the title is unknown
    to the training data.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Held-out rows; only the 'name' column is read.
    test_features : array-like
        Currently unused; kept so the signature stays unchanged.
    train_df : pandas.DataFrame
        Training rows passed to ``recommend_anime``.
    train_features : array-like
        Feature matrix passed to ``recommend_anime`` together with ``train_df``.
    threshold : float, default 0.6
        Similarity cut-off forwarded to ``recommend_anime``.

    Returns
    -------
    tuple of float
        ``(precision, recall, f1)``.
    """
    y_true = []
    y_pred = []

    # Iterate over the names directly. The index values yielded by iterrows()
    # are labels, not positions, so feeding them to `test_df.iloc[...]` would
    # raise IndexError or read a different row on a shuffled split.
    for anime_name in test_df['name']:
        # True label (we assume the anime is relevant to itself)
        y_true.append(1)

        try:
            # Generate recommendations from the training set
            recommendations = recommend_anime(anime_name, train_df, train_features, threshold=threshold)

            # Check if the anime was recommended (this is a simplification)
            is_recommended = bool((recommendations['name'] == anime_name).any())
            y_pred.append(1 if is_recommended else 0)

        except ValueError:
            # If the anime is not found in the dataset, we consider it not recommended
            y_pred.append(0)

    # Calculate precision, recall, and F1-score
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    return precision, recall, f1

**Interview Questions:**

- 1. Can you explain the difference between user-based and item-based collaborative filtering?

   - User-based Collaborative Filtering:
   - In user-based collaborative filtering, we predict a user’s preferences based on the preferences of similar users. It identifies pairs or groups of users who share similar tastes, then recommends items that these similar users liked but the target user hasn't yet interacted with.
   - Example: If User A and User B have similar preferences and User A likes Item X, the algorithm will likely recommend Item X to User B.
   - Advantages: Works well when there are many users with similar preferences, especially in diverse datasets.
   - Challenges: Struggles when the user base is large, as finding similarities among thousands or millions of users can be computationally demanding.

   - Item-based Collaborative Filtering:
   - In item-based collaborative filtering, recommendations are based on the similarities between items rather than users. It calculates similarity between items based on users’ ratings, then recommends items that are similar to those a user has liked.
   - Example: If Item X and Item Y are similar, and a user likes Item X, they’ll likely be recommended Item Y.
   - Advantages: Tends to be more stable, as item similarities are often less variable than user preferences. It also scales well when there are many more users than items, because item-item similarities can be precomputed and reused.
   - Challenges: Doesn’t handle new or niche items as effectively since it relies on existing user-item interactions.


- 2. What is collaborative filtering, and how does it work?

  - Collaborative Filtering:
  Collaborative filtering is a technique used in recommendation systems to suggest items to users based on past interactions. It relies on user-item interaction data, such as ratings, clicks, or purchase history, to find patterns among users and items.


**How It Works:**


- 1. Data Collection: The system collects data on user interactions with items (e.g., ratings, clicks, purchases).
- 2. Similarity Calculation:
   - In user-based filtering, it calculates similarities between users (often using metrics like cosine similarity or Pearson correlation).
   - In item-based filtering, it calculates similarities between items.
- 3. Recommendations:
   - For user-based filtering, it recommends items liked by similar users.
   - For item-based filtering, it recommends items similar to those the user has liked or interacted with.
- 4. Evaluation and Updating: The system updates recommendations continuously based on new user interactions.

**Advantages:**

   - No Need for Item Metadata: Only relies on user-item interactions, making it flexible across domains.

   - Accurate Recommendations: Captures patterns effectively in environments with a lot of user-item interaction data.

**Challenges:**
 
   - Cold Start Problem: Limited by the availability of user or item interactions, making it difficult to recommend items to new users or recommend new items.

   - Data Sparsity: Requires enough interactions to work effectively, so sparse datasets can be problematic.
