In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import MaxAbsScaler
from sklearn.neighbors import NearestNeighbors
import warnings
warnings.filterwarnings("ignore")

In [11]:
def _clean_anime_frame(df_raw):
    """Return a cleaned copy of ``df_raw``; the input frame is left untouched.

    Missing ``type`` and ``genre`` values become the placeholder string
    ``'None'``, missing ``rating`` values become the column median, and
    ``episodes`` is coerced to numeric (non-numeric entries become NaN) before
    its gaps are filled with the median as well.  The index is reset so that
    row labels equal row positions.
    """
    df_clean = df_raw.copy()
    df_clean['type'] = df_clean['type'].fillna('None')
    df_clean['genre'] = df_clean['genre'].fillna('None')
    df_clean['rating'] = df_clean['rating'].fillna(df_clean['rating'].median())
    # Assign the filled column back instead of a chained ``inplace=True`` fill,
    # which is not guaranteed to modify the frame under pandas copy-on-write.
    episodes = pd.to_numeric(df_clean['episodes'], errors='coerce')
    df_clean['episodes'] = episodes.fillna(episodes.median())
    return df_clean.reset_index(drop=True)


def _build_unit_features(df_clean):
    """Build the genre one-hot table and the row-normalised feature matrix.

    Returns:
        (genre_encoded, unit_features): ``genre_encoded`` is a 0/1 DataFrame
        with one column per genre label; ``unit_features`` is a NumPy array
        holding the genre columns plus the min-max scaled rating, with every
        row scaled to unit length so that a dot product of two rows is their
        cosine similarity.
    """
    # MultiLabelBinarizer needs an iterable of label collections.  Given the raw
    # strings it would iterate over single characters, so split on commas and
    # trim the surrounding whitespace first.
    genre_lists = [
        [token.strip() for token in entry.split(',') if token.strip()]
        for entry in df_clean['genre'].astype(str)
    ]
    mlb = MultiLabelBinarizer()
    genre_encoded = pd.DataFrame(
        mlb.fit_transform(genre_lists),
        columns=mlb.classes_,
        index=df_clean.index,
    )

    rating_scaled = MinMaxScaler().fit_transform(df_clean[['rating']])
    features = np.hstack([genre_encoded.to_numpy(dtype=float), rating_scaled])

    norms = np.linalg.norm(features, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # keep all-zero rows at zero instead of dividing by 0
    return genre_encoded, features / norms


def _top_similar(unit_features, idx, top_n):
    """Return the positions of the ``top_n`` rows most similar to row ``idx``.

    The query row is removed by position rather than by assuming it sorts
    first, so an equally similar row can never push the query into its own
    recommendations.  Ties keep their original row order (stable sort).
    """
    scores = unit_features @ unit_features[idx]
    order = np.argsort(-scores, kind="stable")
    return order[order != idx][:max(top_n, 0)]


def _evaluate_by_genre(genre_encoded, unit_features, query_positions, top_n):
    """Score the recommender with genre overlap as the relevance signal.

    A candidate counts as relevant to a query when the two share at least one
    genre label (the ``'None'`` placeholder for missing genres is ignored).
    Queries without any relevant candidate are skipped.  Counts are pooled
    over all queries (micro-averaging).

    Returns:
        (precision, recall, f1) as floats; each is 0.0 when its denominator
        would be zero.
    """
    known_genres = (
        genre_encoded.drop(columns=['None'], errors='ignore').to_numpy(dtype=np.int32)
    )
    hits = recommended = relevant_total = 0
    for idx in query_positions:
        relevant = (known_genres @ known_genres[idx]) > 0
        relevant[idx] = False  # the query is not a candidate for itself
        n_relevant = int(relevant.sum())
        if n_relevant == 0:
            continue
        picks = _top_similar(unit_features, idx, top_n)
        hits += int(relevant[picks].sum())
        recommended += len(picks)
        relevant_total += n_relevant

    precision = hits / recommended if recommended else 0.0
    recall = hits / relevant_total if relevant_total else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1


def Recm_Sys(csv_path="anime.csv", example_title="Naruto", top_n=5,
             test_size=0.2, random_state=42):
    """Load the anime table, build a content-based recommender and evaluate it.

    The function prints a summary of the raw data, the recommendations for
    ``example_title`` and the evaluation metrics; it returns ``None``.

    Args:
        csv_path: CSV file to read.  The code uses the columns ``name``,
            ``genre``, ``type``, ``episodes`` and ``rating``.
        example_title: Title whose recommendations are printed as a demo.
            When it is not present, "Anime not found in dataset" is printed.
        top_n: Number of recommendations per title.
        test_size: Fraction of rows sampled as evaluation queries.
        random_state: Seed for that sampling, so reruns give the same numbers.

    Similarity is the cosine similarity between feature rows (one-hot genres
    plus min-max scaled rating).  Rows are normalised once and each similarity
    row is computed on demand, so no n-by-n matrix is ever allocated.

    The metrics use genre overlap as the relevance proxy.  Recall is small by
    construction whenever ``top_n`` is much smaller than the number of titles
    sharing a genre with the query, hence the four-decimal output.
    """
    # Only the file read can raise FileNotFoundError, so only it is guarded.
    try:
        df_raw = pd.read_csv(csv_path)
    except FileNotFoundError:
        print(f"Error: could not find '{csv_path}'")
        return

    df_raw.info()

    print("\nOriginal Data")
    print(df_raw.describe())
    print(df_raw.dtypes)

    print("\nData Cleaning")
    has_nan = df_raw.isna().any().any()
    print("\nDoes the dataframe have any NaN values?",has_nan)
    print("\nBelow are the count of missing values according to columns:")
    print(df_raw.isnull().sum())

    if len(df_raw) < 2:
        # Scaling and the evaluation split both need more than one row.
        print(f"Error: '{csv_path}' needs at least two rows to build recommendations")
        return

    df_clean = _clean_anime_frame(df_raw)
    genre_encoded, unit_features = _build_unit_features(df_clean)
    names = df_clean['name']

    #Example usage
    matches = np.flatnonzero(names.eq(example_title).to_numpy())
    if matches.size:
        # Duplicate titles resolve to the first matching row.
        example = names.iloc[_top_similar(unit_features, matches[0], top_n)].tolist()
    else:
        example = "Anime not found in dataset"
    print("\n",example)

    #Evaluation
    # Nothing is trained, so the split only selects a reproducible sample of
    # query rows.  Rows are addressed by position (labels equal positions after
    # the index reset), which also keeps duplicate titles apart.
    _, test = train_test_split(df_clean, test_size=test_size, random_state=random_state)
    precision, recall, f1 = _evaluate_by_genre(
        genre_encoded, unit_features, test.index.to_numpy(), top_n
    )

    print(f"\nPrecision: {precision:.4f}")
    print(f"\nRecall: {recall:.4f}")
    print(f"\nF1-Score: {f1:.4f}")

# Run the whole pipeline; Recm_Sys reads "anime.csv" from the current working directory.
Recm_Sys()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB

Original Data
           anime_id        rating       members
count  12294.000000  12064.000000  1.229400e+04
mean   14058.221653      6.473902  1.807134e+04
std    11455.294701      1.026746  5.482068e+04
min        1.000000      1.670000  5.000000e+00
25%     3484.250000      5.880000  2.250000e+02
50%    10260.500000      6.570000  1.550000e+03
75%    24794.500000      7.180000  9.437000e+03
max    34527.000000     10.000000  1.013917e+06
anime_id      int64
name      