<a href="https://colab.research.google.com/github/gnani321/basic-ds-/blob/main/Recommendation_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Recommendation System

# Data Preprocessing

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the dataset
df = pd.read_csv('anime.csv')

In [None]:
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [None]:
# Handling missing values
df.isnull().sum()

Unnamed: 0,0
anime_id,0
name,0
genre,62
type,25
episodes,0
rating,230
members,0


# Feature Extraction

In [None]:
# one-hot encoding for the genre
genere_dummies = df['genre'].str.get_dummies(sep=',')
genere_dummies

Unnamed: 0,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,Harem,...,Shoujo,Shounen,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12290,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12291,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12292,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Merge the genre dummy columns back to the orginal dataset
anime_df = pd.concat([df, genere_dummies], axis=1)
anime_df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,Adventure,Cars,Comedy,...,Shoujo,Shounen,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Normalize numerical features
from sklearn.preprocessing import MinMaxScaler
numerical_features = ['rating', 'members']
anime_df[numerical_features] = (anime_df[numerical_features] - anime_df[numerical_features].mean()) / anime_df[numerical_features].std()

In [None]:
# now the data frame contains one-hot encoded genre columns, normalized ratings and episodes
anime_df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,Adventure,Cars,Comedy,...,Shoujo,Shounen,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,2.820656,3.330106,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,2.713522,14.147831,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,2.703782,1.754642,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,2.625866,11.957179,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,2.616127,2.429643,0,0,1,...,0,0,0,0,0,0,0,0,0,0


# Recommendation System

In [None]:
# cosine similarity
# Selecting the relevent features for similarity computation (genre dummies, rating, episodes)
features = ['Action', 'Adventure', 'Cars']
# Assuming you want to select these columns from a DataFrame
# features = df[['Action', 'Adventure', 'Cars']]
features

['Action', 'Adventure', 'Cars']

In [None]:
# Compute cosine similarity between the target anime and all other anime
cosine_sim = cosine_similarity(anime_df[features], anime_df[features])
cosine_sim

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# Store the similarities in a dataframe for easier handling
similarity_df = pd.DataFrame(cosine_sim, index=anime_df['name'], columns=anime_df['name'])
similarity_df

name,Kimi no Na wa.,Fullmetal Alchemist: Brotherhood,Gintama°,Steins;Gate,Gintama&#039;,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Gakuen Koukou,Hunter x Hunter (2011),Ginga Eiyuu Densetsu,Gintama Movie: Kanketsu-hen - Yorozuya yo Eien Nare,Gintama&#039;: Enchousen,...,Super Erotic Anime,Taimanin Asagi 3,Teleclub no Himitsu,Tenshi no Habataki Jun,The Satisfaction,Toushindai My Lover: Minami tai Mecha-Minami,Under World,Violence Gekiga David no Hoshi,Violence Gekiga Shin David no Hoshi: Inma Densetsu,Yasuji no Pornorama: Yacchimae!!
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Kimi no Na wa.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Fullmetal Alchemist: Brotherhood,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Gintama°,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Steins;Gate,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Gintama&#039;,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Toushindai My Lover: Minami tai Mecha-Minami,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Under World,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Violence Gekiga David no Hoshi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Violence Gekiga Shin David no Hoshi: Inma Densetsu,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
def recommend_anime(target_title, top_n=5):
    # Get the similarity scores for the target anime
    similarity_scores = similarity_df[target_title]
    similarity_scores = similarity_scores.sort_values(ascending=False)
    # Return the similarity scores to make them accessible outside the function
    return similarity_scores

# Call the function to calculate and assign similarity_scores
similarity_scores = recommend_anime('Death Note') # Example target title


# sort the scores in descending order and exclude the target anime itself
similarity_scores_sorted = similarity_scores.sort_values(ascending=False)
similarity_scores_sorted

Unnamed: 0_level_0,Death Note
name,Unnamed: 1_level_1
Kimi no Na wa.,0.0
Mirai Nikki,0.0
Ping Pong The Animation,0.0
Koe no Katachi,0.0
Gintama°,0.0
...,...
The Satisfaction,0.0
Toushindai My Lover: Minami tai Mecha-Minami,0.0
Under World,0.0
Violence Gekiga David no Hoshi,0.0


In [None]:
# Get the top_n similar anime
top_n = 5  # Define top_n here
recommended_anime = similarity_scores_sorted.index[1:top_n+1]
recommended_anime

Index(['Mirai Nikki', 'Ping Pong The Animation', 'Koe no Katachi', 'Gintama°',
       'Steins;Gate'],
      dtype='object', name='name')

In [None]:
# Get recommendations for taeget anime
target_anime = 'Death Note'
recommendations = recommend_anime(target_anime)
recommendations

Unnamed: 0_level_0,Death Note
name,Unnamed: 1_level_1
Kimi no Na wa.,0.0
Taku Boda,0.0
Backkom Mission Impossible,0.0
Backkom Specials,0.0
Backstage Idol Story,0.0
...,...
Yuu☆Yuu☆Hakusho: Mu Mu Hakusho,0.0
3-gatsu no Lion meets Bump of Chicken,0.0
Bannou Bunka Neko-Musume,0.0
Choujikuu Seiki Orguss,0.0


# Evaluation

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Assuming the dataset has ratings (for wvaluation purposes)
# Create a simple binary rating based on a threshold(e.g., ratings above 7 are consider "liked")
anime_df['rating_binary'] = (anime_df['rating'] >= 7).astype(int)
anime_df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,Adventure,Cars,Comedy,...,Shounen,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,rating_binary
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,2.820656,3.330106,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,2.713522,14.147831,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,2.703782,1.754642,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,2.625866,11.957179,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,2.616127,2.429643,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Split the dataset into training and tresting  sets
train_data, test_data = train_test_split(anime_df, test_size=0.2, random_state=42)
train_data.head()
test_data.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,Adventure,Cars,Comedy,...,Shounen,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,rating_binary
6329,17209,Suzy&#039;s Zoo: Daisuki! Witzy - Happy Birthday,Kids,Special,1,-0.295985,-0.326762,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2167,173,Tactics,"Comedy, Drama, Fantasy, Mystery, Shounen, Supe...",TV,25,0.843537,0.169401,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2882,3616,Kamen no Maid Guy,"Action, Comedy, Ecchi, Super Power",TV,12,0.648747,0.176752,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4700,18799,Take Your Way,"Action, Music, Seinen, Supernatural",Music,1,0.181251,-0.304344,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7258,18831,Rinkaku,"Dementia, Horror, Music",Music,1,-0.851137,-0.31859,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# For simplicity, We Will use the training set to create recommendations and test on the test set
def evaluate_recommendations(test_data, similarity_df, top_n=5):
    precision = []
    recall = []

    for _,test_anime in test_data.iterrows():
        target_title = test_anime['name']
        recommended_anime = recommend_anime(target_title, top_n)
        relevent_recommendations = [1 if anime in recommended_anime and anime_df.loc[anime_df['name'] == target_title, 'rating_binary'].values[0] == 1 else 0 for anime in recommended_anime]
        precision.append(sum(relevent_recommendations) / len(recommended_anime) if len(recommended_anime) > 0 else 0)
        if len(test_data[test_data['rating_binary'] == 1]) > 0:
            recall.append(sum(relevent_recommendations) / len(test_data[test_data['rating_binary'] == 1]))
        else:
            recall.append(0)
    print(f"Precision: {np.mean(precision)}")
    print(f"Recall: {np.mean(recall)}")
    print(f"Recall: {np.mean(recall)}")
    return np.mean(precision), np.mean(recall)


In [None]:
# Call the evaluate_recommendations function to assign values to target_title, top_n, and similarity_df
evaluate_recommendations(test_data, similarity_df)  # Assuming you have test_data and similarity_df defined

In [None]:
# Modify the recommend_anime function to accept top_n and similarity_df as arguments
def recommend_anime(target_title, top_n=5, similarity_df=similarity_df):# Using the global similarity_df as default
    # Get the similarity scores for the target anime
    similarity_scores = similarity_df[target_title]
    similarity_scores = similarity_scores.sort_values(ascending=False)

    # Retutn the top_n similar anime(excluding the target anime itself)
    recommend_anime = similarity_scores.index[1:top_n+1]
    return recommend_anime

In [None]:
# Calculate Precision and Recall
relevent_recommendations = [1 if anime in recommended_anime and anime_df.loc[anime_df['name'] == target_anime, 'rating_binary'].values[0] == 1 else 0 for anime in recommended_anime]
precision = sum(relevent_recommendations) / len(recommended_anime) if len(recommended_anime) > 0 else 0 # Handle zero division for precision
# Check if the denominator is 0 before calculating recall
if len(test_data[test_data['rating_binary'] == 1]) > 0:
    recall = sum(relevent_recommendations) / len(test_data[test_data['rating_binary'] == 1])
else:
    recall = 0  # or another appropriate value like np.nan if you prefer

In [None]:
# Evaluate the recommendation system
print(f"Precision: {precision}")
print(f"Recall: {recall}")

# Interview Questions

1. Can you explain the difference between user-based and item-based collaborative filtering?



*   User-based Collaborative Filtering: This method recommends items to a user by finding other users who have similar preferences or behaviors (i.e., similar ratings for items). It then recommends items that those similar users have liked.
*   Item-based Collaborative Filtering: Instead of finding similar users, item-based filtering focuses on the similarity between items. It recommends items that are similar to those the user has already liked or interacted with.





2. . What is collaborative filtering, and how does it work?



*   Collaborative filtering is a technique used in recommendation systems to predict a user's interests by collecting preferences or taste information from many users. It is based on the idea that if two users have agreed on one issue, they are likely to agree on other issues as well. Collaborative filtering can be divided into:
*  User-based: Recommending items by identifying similar users.

*   Item-based: Recommending items by finding similarites between items



