In [3]:
import pandas as pd
import pickle
import re
from difflib import get_close_matches
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the preprocessed anime dataset
anime_df = pd.read_csv("../data/anime_cleaned.csv")  # Use relative path

# Ensure 'genre' is a list, handle missing values, and join words correctly
def clean_genre(genre):
    if isinstance(genre, list):
        return " ".join(genre)
    elif isinstance(genre, str):
        try:
            genre_list = eval(genre)
            if isinstance(genre_list, list):
                return " ".join(genre_list)
        except:
            return ""
    return ""

# Apply genre cleaning
anime_df["genre_str"] = anime_df["genre"].apply(clean_genre)
anime_df = anime_df[anime_df["genre_str"].str.strip() != ""]

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words=None, token_pattern=r"\b\w+\b")
tfidf_matrix = tfidf.fit_transform(anime_df["genre_str"])

# Compute cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Save cosine similarity matrix
with open("../models/cosine_sim.pkl", "wb") as f:
    pickle.dump(cosine_sim, f)

# Create an index mapping from anime name to dataframe index
indices = pd.Series(anime_df.index, index=anime_df["name"]).drop_duplicates()

# Normalize anime titles
def normalize_title(title):
    title = title.lower()
    title = re.sub(r"[^\w\s]", "", title)
    return title

anime_df["normalized_name"] = anime_df["name"].apply(normalize_title)

# Find best match for user input
def find_best_match(user_input, anime_names):
    user_input = normalize_title(user_input)
    matches = get_close_matches(user_input, anime_names, n=1, cutoff=0.6)
    return matches[0] if matches else None

# Get recommendations
def get_recommendations(title, cosine_sim=cosine_sim):
    if title not in indices:
        return f"'{title}' not found in dataset."

    idx = indices[title]
    sim_scores = sorted(enumerate(cosine_sim[idx]), key=lambda x: x[1], reverse=True)[1:11]
    anime_indices = [i[0] for i in sim_scores]

    recommended_anime = anime_df["name"].iloc[anime_indices].tolist()

    print(f"\n🔹 Top 10 anime similar to {title}: \n")
    for i, anime in enumerate(recommended_anime, start=1):
        print(f"   {i}. {anime}")

    return recommended_anime

# Save indices mapping
with open("../models/indices.pkl", "wb") as f:
    pickle.dump(indices, f)

print("✅ indices.pkl and cosine_sim.pkl created successfully!")

# Test with sample input
print(get_recommendations("Naruto"))


✅ indices.pkl and cosine_sim.pkl created successfully!

🔹 Top 10 anime similar to Naruto: 

   1. Naruto: Shippuuden
   2. Naruto
   3. Boruto: Naruto the Movie - Naruto ga Hokage ni Natta Hi
   4. Naruto x UT
   5. Naruto: Shippuuden Movie 4 - The Lost Tower
   6. Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsugu Mono
   7. Naruto Shippuuden: Sunny Side Battle
   8. Naruto Soyokazeden Movie: Naruto to Mashin to Mitsu no Onegai Dattebayo!!
   9. Kyutai Panic Adventure!
   10. Naruto: Shippuuden Movie 6 - Road to Ninja
['Naruto: Shippuuden', 'Naruto', 'Boruto: Naruto the Movie - Naruto ga Hokage ni Natta Hi', 'Naruto x UT', 'Naruto: Shippuuden Movie 4 - The Lost Tower', 'Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsugu Mono', 'Naruto Shippuuden: Sunny Side Battle', 'Naruto Soyokazeden Movie: Naruto to Mashin to Mitsu no Onegai Dattebayo!!', 'Kyutai Panic Adventure!', 'Naruto: Shippuuden Movie 6 - Road to Ninja']
