In [109]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score

#EDA

In [110]:
#Load dataset
anime = pd.read_csv('anime.csv')

In [111]:
anime

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [112]:
anime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


#Handling Missing Values

In [113]:
#Check for null values
anime.isnull().sum()

Unnamed: 0,0
anime_id,0
name,0
genre,62
type,25
episodes,0
rating,230
members,0


In [114]:
anime1 = anime.dropna(subset=['genre', 'type'])

In [115]:
anime1.isnull().sum()

Unnamed: 0,0
anime_id,0
name,0
genre,0
type,0
episodes,0
rating,193
members,0


In [116]:
#Mean imputation
mean = anime1['rating'].mean()
print(mean)

6.478264125821752


In [117]:
anime1['rating'].fillna(mean, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  anime1['rating'].fillna(mean, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anime1['rating'].fillna(mean, inplace=True)


In [118]:
anime1.isnull().sum()

Unnamed: 0,0
anime_id,0
name,0
genre,0
type,0
episodes,0
rating,0
members,0


#Change datatype

In [119]:
print(anime1['episodes'].unique())

['1' '64' '51' '24' '10' '148' '110' '13' '201' '25' '22' '75' '4' '26'
 '12' '27' '43' '74' '37' '2' '11' '99' 'Unknown' '39' '101' '47' '50'
 '62' '33' '112' '23' '3' '94' '6' '8' '14' '7' '40' '15' '203' '77' '291'
 '120' '102' '96' '38' '79' '175' '103' '70' '153' '45' '5' '21' '63' '52'
 '28' '145' '36' '69' '60' '178' '114' '35' '61' '34' '109' '20' '9' '49'
 '366' '97' '48' '78' '358' '155' '104' '113' '54' '167' '161' '42' '142'
 '31' '373' '220' '46' '195' '17' '1787' '73' '147' '127' '16' '19' '98'
 '150' '76' '53' '124' '29' '115' '224' '44' '58' '93' '154' '92' '67'
 '172' '86' '30' '276' '59' '72' '330' '41' '105' '128' '137' '56' '55'
 '65' '243' '193' '18' '191' '180' '91' '192' '66' '182' '32' '164' '100'
 '296' '694' '95' '68' '117' '151' '130' '87' '170' '119' '84' '108' '156'
 '140' '331' '305' '300' '510' '200' '88' '1471' '526' '143' '726' '136'
 '1818' '237' '1428' '365' '163' '283' '71' '260' '199' '225' '312' '240'
 '1306' '1565' '773' '1274' '90' '475' '263' '8

In [120]:
original_unknown_count = anime1['episodes'].value_counts()['Unknown']

print(f"Number of 'Unknown' values in 'episodes': {original_unknown_count}")

Number of 'Unknown' values in 'episodes': 307


In [121]:
anime1['episodes'] = anime1['episodes'].replace('Unknown', 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anime1['episodes'] = anime1['episodes'].replace('Unknown', 0)


In [122]:
print(anime1['episodes'].unique())

['1' '64' '51' '24' '10' '148' '110' '13' '201' '25' '22' '75' '4' '26'
 '12' '27' '43' '74' '37' '2' '11' '99' 0 '39' '101' '47' '50' '62' '33'
 '112' '23' '3' '94' '6' '8' '14' '7' '40' '15' '203' '77' '291' '120'
 '102' '96' '38' '79' '175' '103' '70' '153' '45' '5' '21' '63' '52' '28'
 '145' '36' '69' '60' '178' '114' '35' '61' '34' '109' '20' '9' '49' '366'
 '97' '48' '78' '358' '155' '104' '113' '54' '167' '161' '42' '142' '31'
 '373' '220' '46' '195' '17' '1787' '73' '147' '127' '16' '19' '98' '150'
 '76' '53' '124' '29' '115' '224' '44' '58' '93' '154' '92' '67' '172'
 '86' '30' '276' '59' '72' '330' '41' '105' '128' '137' '56' '55' '65'
 '243' '193' '18' '191' '180' '91' '192' '66' '182' '32' '164' '100' '296'
 '694' '95' '68' '117' '151' '130' '87' '170' '119' '84' '108' '156' '140'
 '331' '305' '300' '510' '200' '88' '1471' '526' '143' '726' '136' '1818'
 '237' '1428' '365' '163' '283' '71' '260' '199' '225' '312' '240' '1306'
 '1565' '773' '1274' '90' '475' '263' '83' '85' 

In [123]:
anime1['episodes'] = pd.to_numeric(anime1['episodes'], errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anime1['episodes'] = pd.to_numeric(anime1['episodes'], errors='coerce')


In [124]:
anime1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12210 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12210 non-null  int64  
 1   name      12210 non-null  object 
 2   genre     12210 non-null  object 
 3   type      12210 non-null  object 
 4   episodes  12210 non-null  int64  
 5   rating    12210 non-null  float64
 6   members   12210 non-null  int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 763.1+ KB


#Feature extraction

In [125]:
features = anime1[['genre', 'type', 'episodes', 'rating']]
features

Unnamed: 0,genre,type,episodes,rating
0,"Drama, Romance, School, Supernatural",Movie,1,9.37
1,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26
2,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25
3,"Sci-Fi, Thriller",TV,24,9.17
4,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16
...,...,...,...,...
12289,Hentai,OVA,1,4.15
12290,Hentai,OVA,1,4.28
12291,Hentai,OVA,4,4.88
12292,Hentai,OVA,1,4.98


In [126]:
# Multi-hot encoding for 'genre'
features['genre'] = features['genre'].str.split(',').apply(lambda x: [genre.strip() for genre in x])
all_genres = set([genre for sublist in features['genre'] for genre in sublist])
for genre in all_genres:
    features[f'genre_{genre}'] = features['genre'].apply(lambda x: 1 if genre in x else 0)
features = features.drop(['genre'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['genre'] = features['genre'].str.split(',').apply(lambda x: [genre.strip() for genre in x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features[f'genre_{genre}'] = features['genre'].apply(lambda x: 1 if genre in x else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features[f'genre_

In [127]:
features

Unnamed: 0,type,episodes,rating,genre_Military,genre_Shounen Ai,genre_Space,genre_Thriller,genre_Drama,genre_Slice of Life,genre_Music,...,genre_Comedy,genre_Parody,genre_Super Power,genre_School,genre_Mystery,genre_Yaoi,genre_Dementia,genre_Harem,genre_Psychological,genre_Fantasy
0,Movie,1,9.37,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
1,TV,64,9.26,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
2,TV,51,9.25,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
3,TV,24,9.17,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,TV,51,9.16,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,OVA,1,4.15,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12290,OVA,1,4.28,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12291,OVA,4,4.88,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12292,OVA,1,4.98,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [128]:
# One-hot encoding for 'type'
features = pd.get_dummies(features, columns=['type'], prefix=['type'], dummy_na=False)
features

Unnamed: 0,episodes,rating,genre_Military,genre_Shounen Ai,genre_Space,genre_Thriller,genre_Drama,genre_Slice of Life,genre_Music,genre_Sports,...,genre_Dementia,genre_Harem,genre_Psychological,genre_Fantasy,type_Movie,type_Music,type_ONA,type_OVA,type_Special,type_TV
0,1,9.37,0,0,0,0,1,0,0,0,...,0,0,0,0,True,False,False,False,False,False
1,64,9.26,1,0,0,0,1,0,0,0,...,0,0,0,1,False,False,False,False,False,True
2,51,9.25,0,0,0,0,0,0,0,0,...,0,0,0,0,False,False,False,False,False,True
3,24,9.17,0,0,0,1,0,0,0,0,...,0,0,0,0,False,False,False,False,False,True
4,51,9.16,0,0,0,0,0,0,0,0,...,0,0,0,0,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,1,4.15,0,0,0,0,0,0,0,0,...,0,0,0,0,False,False,False,True,False,False
12290,1,4.28,0,0,0,0,0,0,0,0,...,0,0,0,0,False,False,False,True,False,False
12291,4,4.88,0,0,0,0,0,0,0,0,...,0,0,0,0,False,False,False,True,False,False
12292,1,4.98,0,0,0,0,0,0,0,0,...,0,0,0,0,False,False,False,True,False,False


#Normalize numerical features

In [129]:
# Normalize numerical features using StandardScaler
numerical_features = ['episodes', 'rating']
scaler = StandardScaler()
features[numerical_features] = scaler.fit_transform(features[numerical_features])
features.describe()

Unnamed: 0,episodes,rating,genre_Military,genre_Shounen Ai,genre_Space,genre_Thriller,genre_Drama,genre_Slice of Life,genre_Music,genre_Sports,...,genre_Comedy,genre_Parody,genre_Super Power,genre_School,genre_Mystery,genre_Yaoi,genre_Dementia,genre_Harem,genre_Psychological,genre_Fantasy
count,12210.0,12210.0,12210.0,12210.0,12210.0,12210.0,12210.0,12210.0,12210.0,12210.0,...,12210.0,12210.0,12210.0,12210.0,12210.0,12210.0,12210.0,12210.0,12210.0,12210.0
mean,4.6554810000000004e-18,-7.076331e-16,0.034726,0.005324,0.031122,0.007043,0.164537,0.099918,0.07027,0.04439,...,0.380098,0.033415,0.03792,0.09959,0.040459,0.003112,0.019656,0.025962,0.018755,0.188698
std,1.000041,1.000041,0.183092,0.072771,0.173655,0.083632,0.370778,0.299903,0.255612,0.205968,...,0.485431,0.179726,0.19101,0.299465,0.19704,0.055703,0.138821,0.159029,0.135665,0.391284
min,-0.2610482,-4.733986,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.2394986,-0.5693312,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,-0.217949,0.0706277,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,-0.002453228,0.68105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,38.91608,3.467332,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [130]:
# Calculate the similarity matrix for anime using training data
anime_sim = 1 - pairwise_distances(features.values, metric='cosine')

# Get anime names corresponding to train_features indices
anime_names = anime1['name'].tolist()

# Convert the similarity matrix to a pandas DataFrame with anime names
anime_sim_df = pd.DataFrame(anime_sim, index=anime_names, columns=anime_names)

# Display the first few rows and columns of the similarity matrix
anime_sim_df.iloc[0:5, 0:5]

Unnamed: 0,Kimi no Na wa.,Fullmetal Alchemist: Brotherhood,Gintama°,Steins;Gate,Gintama&#039;
Kimi no Na wa.,1.0,0.574421,0.519119,0.649413,0.509375
Fullmetal Alchemist: Brotherhood,0.574421,1.0,0.693779,0.657273,0.689252
Gintama°,0.519119,0.693779,1.0,0.740085,0.999865
Steins;Gate,0.649413,0.657273,0.740085,1.0,0.73256
Gintama&#039;,0.509375,0.689252,0.999865,0.73256,1.0


In [131]:
np.fill_diagonal(anime_sim_df.values, 0)

In [132]:
anime_sim_df.iloc[0:5, 0:5]

Unnamed: 0,Kimi no Na wa.,Fullmetal Alchemist: Brotherhood,Gintama°,Steins;Gate,Gintama&#039;
Kimi no Na wa.,0.0,0.574421,0.519119,0.649413,0.509375
Fullmetal Alchemist: Brotherhood,0.574421,0.0,0.693779,0.657273,0.689252
Gintama°,0.519119,0.693779,0.0,0.740085,0.999865
Steins;Gate,0.649413,0.657273,0.740085,0.0,0.73256
Gintama&#039;,0.509375,0.689252,0.999865,0.73256,0.0


In [133]:
# Most Similar Anime
anime_sim_df.idxmax(axis=1)[0:5]

Unnamed: 0,0
Kimi no Na wa.,Kokoro ga Sakebitagatterunda.
Fullmetal Alchemist: Brotherhood,Fullmetal Alchemist
Gintama°,Gintama&#039;
Steins;Gate,Steins;Gate Movie: Fuka Ryouiki no Déjà vu
Gintama&#039;,Gintama°


#Recommendation system

In [134]:

def get_recommendations(target_anime_name, similarity_threshold=0.8):
    # Get similarity scores for the target anime
    similarity_scores = anime_sim_df.loc[target_anime_name]

    # Filter for similar anime above the threshold
    similar_anime_names = similarity_scores[similarity_scores >= similarity_threshold].index.tolist()

    # Return the list of recommended anime names
    return similar_anime_names

In [135]:
recommendations = get_recommendations('Kimi no Na wa.')
print(recommendations)

['Koe no Katachi', 'Sen to Chihiro no Kamikakushi', 'Suzumiya Haruhi no Shoushitsu', 'Howl no Ugoku Shiro', 'Kara no Kyoukai 5: Mujun Rasen', 'Hotarubi no Mori e', 'Bakemono no Ko', 'Hotaru no Haka', 'Girls und Panzer der Film', 'Doukyuusei (Movie)', 'Kokoro ga Sakebitagatterunda.', 'Tsumiki no Ie', 'Kaze Tachinu', 'Ano Hi Mita Hana no Namae wo Bokutachi wa Mada Shiranai. Movie', 'Byousoku 5 Centimeter', 'Colorful (Movie)', 'Clannad: After Story - Mou Hitotsu no Sekai, Kyou-hen', 'Momo e no Tegami', 'Aura: Maryuuin Kouga Saigo no Tatakai', 'Harmonie', 'Zutto Mae kara Suki deshita.: Kokuhaku Jikkou Iinkai', 'Air Movie', 'Michi', 'Mirai ni Mukete: Bousai wo Kangaeru', 'Oshin']


## Trying different similarity threshold

In [136]:

def get_recommendations2(target_anime_name, similarity_threshold=0.88):
    # Get similarity scores for the target anime
    similarity_scores = anime_sim_df.loc[target_anime_name]

    # Filter for similar anime above the threshold
    similar_anime_names = similarity_scores[similarity_scores >= similarity_threshold].index.tolist()

    # Return the list of recommended anime names
    return similar_anime_names

In [137]:
recommendations = get_recommendations2('Kimi no Na wa.')
print(recommendations)

['Hotarubi no Mori e', 'Kokoro ga Sakebitagatterunda.']


#Evaluation

In [138]:
#Create train and test datasets
train_anime, test_anime = train_test_split(anime1['name'].tolist(), test_size=0.2, random_state=42)

In [139]:
#Establish "ground truth" using the training set
train_recommendations = {}  # Store recommendations for training anime
for anime_name in train_anime:
    train_recommendations[anime_name] = get_recommendations(anime_name)

In [140]:
# Evaluate the recommendations on the testing set
precisions = []
recalls = []
f1_scores = []

for anime_name in test_anime:
    # Get recommendations for the test anime
    recommendations = get_recommendations(anime_name)

    # Find similar anime in the training set (ground truth)
    similar_train_anime = []
    for train_anime_name, train_recs in train_recommendations.items():
        if anime_name in train_recs:  # If test anime is in recommendations of a train anime
            similar_train_anime.append(train_anime_name)

    # Calculate metrics (only if there are recommendations and similar train anime)
    if recommendations and similar_train_anime:
        precision = precision_score([1 if anime in similar_train_anime else 0 for anime in recommendations], [1] * len(recommendations))
        recall = recall_score([1 if anime in similar_train_anime else 0 for anime in recommendations], [1] * len(recommendations))
        f1 = f1_score([1 if anime in similar_train_anime else 0 for anime in recommendations], [1] * len(recommendations))

        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)

# 8. Calculate average metrics
avg_precision = np.mean(precisions) if precisions else 0
avg_recall = np.mean(recalls) if recalls else 0
avg_f1_score = np.mean(f1_scores) if f1_scores else 0

In [141]:
print(f"Average Precision: {avg_precision}")
print(f"Average Recall: {avg_recall}")
print(f"Average F1-score: {avg_f1_score}")

Average Precision: 0.8055351364512866
Average Recall: 1.0
Average F1-score: 0.8878491268375871


**Evaluation Analysis: **

Metrics:

Average Precision: 0.8055 - This indicates that about 80.55% of the recommended anime were actually considered "relevant" based on the training set's recommendations for similar anime. This is a reasonably good precision score, suggesting that the system is making mostly relevant recommendations.

Average Recall: 1.0 - This indicates that the system is recommending all of the relevant anime for the test set. This might seem perfect, but it's important to remember that the definition of "relevance" in this case is based on the training set's recommendations. A recall of 1.0 could also indicate that the system is being too broad in its recommendations, potentially recommending more items than necessary.

Average F1-score: 0.8878 - This is the harmonic mean of precision and recall, providing a balanced measure of performance. An F1-score of 0.8878 is quite good, indicating a good balance between precision and recall.

Overall:

The recommendation system seems to be performing well, with good precision and recall scores. However, the perfect recall score should be interpreted cautiously, as it might indicate a tendency towards over-recommendation.

**Areas of Improvement:**

Refine Relevance Definition: The current evaluation relies on the training set's recommendations as a proxy for relevance. This might not perfectly capture user preferences. If possible, try to incorporate more direct feedback data (ratings, likes, etc.) to define relevance more accurately.

Address Potential Over-Recommendation: The recall of 1.0 suggests that the system might be recommending more items than necessary. Consider adjusting the similarity threshold or the number of recommendations to find a balance between providing enough options and avoiding overwhelming the user.

Explore Other Similarity Metrics: Cosine similarity is a commonly used metric, but other options like Euclidean distance or Pearson correlation could be explored to see if they improve performance.

#Interview questions



> Can you explain the difference between user-based and item-based collaborative filtering?

User-based collaborative filtering: Focuses on finding users with similar tastes to the target user. It recommends items that those similar users have liked or interacted with.

Item-based collaborative filtering: Focuses on finding items similar to the ones the target user has liked or interacted with. It recommends items that are frequently co-rated or co-purchased with the target user's preferred items.

> What is collaborative filtering, and how does it work?

Collaborative filtering is a recommendation technique that leverages the collective preferences of a group of users to predict the interests of an individual user. It's based on the idea that users who have agreed in the past are likely to agree again in the future.

How it works:

Data Collection: Gather data on user interactions with items (ratings, purchases, likes, etc.).
Similarity Calculation: Calculate the similarity between users (user-based) or items (item-based) based on their interaction patterns.
Prediction: Predict the rating or preference of a target user for an item based on the preferences of similar users or the similarity to items they have liked.
Recommendation: Recommend items with the highest predicted ratings or preferences.
Example:

If user A and user B have rated similar movies highly in the past, and user A has recently rated a new movie positively, collaborative filtering might predict that user B would also like that new movie and recommend it to them.

