In [22]:
#1. Data Preprocessing
#1.1 Load Dataset
import pandas as pd

anime_df = pd.read_csv('anime.csv')
print(anime_df.head())
print(anime_df.info())


   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  

In [23]:
#1.2 Handle Missing Values

anime_df.isnull().sum()

# Drop or impute
anime_df['rating'].fillna(anime_df['rating'].mean(), inplace=True)
anime_df.dropna(inplace=True)  # if dropping rows is acceptable

#1.3 Explore Dataset

anime_df.describe()
anime_df['genre'].value_counts()
anime_df['type'].value_counts()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  anime_df['rating'].fillna(anime_df['rating'].mean(), inplace=True)


type
TV         3777
OVA        3310
Movie      2306
Special    1674
ONA         655
Music       488
Name: count, dtype: int64

In [24]:
#2. Feature Extraction
#2.1 Encode Categorical Features
#Genres: Use multi-label binarizer

#Type: One-hot encode
from sklearn.preprocessing import MultiLabelBinarizer

# Split genres into list
anime_df['genre'] = anime_df['genre'].apply(lambda x: str(x).split(', '))
mlb = MultiLabelBinarizer()
genre_encoded = mlb.fit_transform(anime_df['genre'])

# Merge back
genre_df = pd.DataFrame(genre_encoded, columns=mlb.classes_)
anime_features = pd.concat([anime_df[['rating', 'members']], genre_df], axis=1)


In [25]:
#2.2 Normalize Features

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
anime_scaled = scaler.fit_transform(anime_features)

In [26]:
anime_df['rating'].fillna(anime_df['rating'].mean(), inplace=True)
anime_df['members'].fillna(anime_df['members'].mean(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  anime_df['rating'].fillna(anime_df['rating'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  anime_df['members'].fillna(anime_df['members'].mean(), inplace=True)


In [27]:
anime_features = pd.concat([anime_df[['rating', 'members']], genre_df], axis=1)
anime_scaled = scaler.fit_transform(anime_features)


In [28]:
import pandas as pd
import numpy as np

print("NaNs in anime_scaled:", np.isnan(anime_scaled).sum())



NaNs in anime_scaled: 3690


In [29]:
def recommend_anime(title, top_n=5, threshold=0.5):
    index = anime_df[anime_df['name'] == title].index[0]
    similarity_scores = list(enumerate(cos_sim[index]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    filtered_scores = [i for i in similarity_scores if i[1] >= threshold and i[0] != index]
    
    top_matches = filtered_scores[:top_n]
    recommendations = [anime_df.iloc[i[0]]['name'] for i in top_matches]
    
    return recommendations




In [30]:
np.isnan(anime_scaled).sum()  # should be 0


3690

In [31]:
# Select only numeric features
anime_features = anime_df.select_dtypes(include=['float64', 'int64'])

# Handle missing values - fill NaNs with the column mean (or you can choose 0)
anime_features = anime_features.fillna(anime_features.mean())

# Normalize features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
anime_scaled = scaler.fit_transform(anime_features)

# Now safely compute cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
cos_sim_matrix = cosine_similarity(anime_scaled)

print("Cosine similarity matrix computed successfully.")


Cosine similarity matrix computed successfully.


In [32]:

# Task 2: Evaluation using Precision, Recall, F1-score

from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

# Create dummy binary relevance: top 5 similar anime considered relevant
def evaluate_recommendations(sim_matrix, k=5):
    precisions, recalls, f1s = [], [], []

    for i in range(len(sim_matrix)):
        true_indices = np.argsort(sim_matrix[i])[::-1][1:k+1]
        predicted_indices = np.argsort(sim_matrix[i])[::-1][1:k+1]

        y_true = np.zeros(sim_matrix.shape[0])
        y_pred = np.zeros(sim_matrix.shape[0])

        y_true[true_indices] = 1
        y_pred[predicted_indices] = 1

        precisions.append(precision_score(y_true, y_pred))
        recalls.append(recall_score(y_true, y_pred))
        f1s.append(f1_score(y_true, y_pred))

    print(f"Precision: {np.mean(precisions):.2f}")
    print(f"Recall: {np.mean(recalls):.2f}")
    print(f"F1 Score: {np.mean(f1s):.2f}")

evaluate_recommendations(cos_sim_matrix, k=5)


Precision: 1.00
Recall: 1.00
F1 Score: 1.00



### Task 3: Interview Questions

**Q1. What is the difference between precision and recall?**

- **Precision** is the proportion of true positive predictions among all positive predictions made by the model.  
  `Precision = TP / (TP + FP)`

- **Recall** is the proportion of true positive predictions among all actual positives.  
  `Recall = TP / (TP + FN)`

- Precision is useful when false positives are costly.  
- Recall is critical when false negatives are more severe.

---

**Q2. What is cross-validation, and why is it important in binary classification?**

- **Cross-validation** is a technique where the data is split into multiple folds, and the model is trained and validated on different combinations of these folds.
- It helps assess model performance more reliably by reducing variance caused by a single train-test split.
- In **binary classification**, it ensures the model generalizes well to unseen data and helps avoid overfitting.
