In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('anime.csv')

# Display first few rows to understand the structure
print(df.head())

# Check for missing values
print(df.isnull().sum())


   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  
anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64


In [6]:
# Impute missing values with the mean for numerical columns
df['rating'].fillna(df['rating'].mean(), inplace=True)
df['episodes'].fillna(df['rating'].mean(), inplace=True)
df['episodes'].replace('Unknown','0', inplace=True)
# Drop rows with missing categorical data (e.g., genres)
df.dropna(subset=['genre'], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['rating'].fillna(df['rating'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['episodes'].fillna(df['rating'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we 

In [7]:
# Check the columns and types of data
print(df.info())

# Explore basic statistics for numerical data
print(df.describe())

# Look at unique genres
print(df['genre'].unique())


<class 'pandas.core.frame.DataFrame'>
Index: 12232 entries, 0 to 12293
Data columns (total 50 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   anime_id       12232 non-null  int64  
 1   name           12232 non-null  object 
 2   genre          12232 non-null  object 
 3   type           12210 non-null  object 
 4   episodes       12232 non-null  object 
 5   rating         12232 non-null  float64
 6   members        12232 non-null  int64  
 7   Action         12232 non-null  int64  
 8   Adventure      12232 non-null  int64  
 9   Cars           12232 non-null  int64  
 10  Comedy         12232 non-null  int64  
 11  Dementia       12232 non-null  int64  
 12  Demons         12232 non-null  int64  
 13  Drama          12232 non-null  int64  
 14  Ecchi          12232 non-null  int64  
 15  Fantasy        12232 non-null  int64  
 16  Game           12232 non-null  int64  
 17  Harem          12232 non-null  int64  
 18  Hentai     

In [8]:
# One-hot encode the genres
df_genres = df['genre'].str.get_dummies(sep=', ')

# Combine the one-hot encoded genres with the original data (excluding the original 'genres' column)
df = pd.concat([df, df_genres], axis=1)


In [9]:
from sklearn.preprocessing import MinMaxScaler

# Normalize numerical columns
scaler = MinMaxScaler()
df[['rating', 'episodes']] = scaler.fit_transform(df[['rating', 'episodes']])


In [10]:
from sklearn.metrics.pairwise import cosine_similarity

# Select relevant features (normalized user_rating, num_episodes, and one-hot encoded genres)
features = df[['rating', 'episodes'] + list(df.columns[df.columns.str.startswith('genre_')])]

# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(features)

# Store the similarity scores for each anime
cosine_sim_df = pd.DataFrame(cosine_sim, index=df['name'], columns=df['name'])


In [11]:
def recommend_anime(target_anime, cosine_sim_df, top_n=10, threshold=0.5):
    # Get the cosine similarity scores for the target anime
    sim_scores = cosine_sim_df[target_anime]
    
    # Sort the scores in descending order and filter based on the threshold
    similar_animes = sim_scores[sim_scores > threshold].sort_values(ascending=False)
    
    # Exclude the target anime itself from the recommendations
    similar_animes = similar_animes.drop(target_anime)
    
    # Return top N recommendations
    return similar_animes.head(top_n)

# Example: Recommend anime similar to "Naruto"
recommendations = recommend_anime('Naruto', cosine_sim_df)
print(recommendations)


name
Yadamon                        0.999993
Gatapishi                      0.999990
Pokemon Advanced Generation    0.999988
Kappamaki                      0.999984
Tetsuwan Atom                  0.999980
Pokemon Diamond &amp; Pearl    0.999978
Obocchama-kun                  0.999972
Dokaben                        0.999960
Urikupen Kyuujo-tai            0.999955
Yu☆Gi☆Oh! Duel Monsters        0.999955
Name: Naruto, dtype: float64


In [12]:
from sklearn.model_selection import train_test_split

# Split the dataset into a training and testing set (e.g., 80% training, 20% testing)
X_train, X_test = train_test_split(df, test_size=0.2, random_state=42)

# Example of using the training set to train the recommendation system (e.g., by fitting models if needed)


In [13]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Assuming we have binary labels for evaluation (relevant or not relevant recommendations)
# This is just an example as calculating these metrics would depend on the available data.
# Here, we're simulating binary labels for each recommendation.
y_true = [1, 1, 0, 1, 0]  # Simulated ground truth (1=relevant, 0=not relevant)
y_pred = [1, 0, 0, 1, 1]  # Simulated predicted (relevant recommendations)

# Calculate precision, recall, and F1-score
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")


Precision: 0.6666666666666666
Recall: 0.6666666666666666
F1-score: 0.6666666666666666


1. Difference Between User-Based and Item-Based Collaborative Filtering:

User-Based Collaborative Filtering: This method recommends items by finding similar users. For example, if User A and User B have rated similar anime, the system will recommend anime that User B has liked to User A.
Item-Based Collaborative Filtering: This method recommends items based on similarity between items. For example, if you liked "Naruto," the system will recommend other anime that are similar to "Naruto" (based on ratings or content similarity).

2. What is Collaborative Filtering, and How Does It Work?:

Collaborative Filtering is a method used to make automatic predictions about a user's interests by collecting preferences or taste information from many users. The idea is to recommend items (e.g., anime) based on the preferences of similar users or based on similarities between items. This can be user-based or item-based.