In [4]:
# ====================================================
# Recommendation System..
# ====================================================

# ---------- Imports ----------
%matplotlib inline
import os
import re
import numpy as np
import pandas as pd
from scipy import sparse
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

sns.set()

# ---------- File path (change if needed) ----------
DATA_PATH = r"C:\Users\abhin\Downloads\anime (2).csv"
FALLBACK = r"/mnt/data/anime (2).csv"

# ---------- Load dataset ----------
try:
    df = pd.read_csv(DATA_PATH)
    print("Loaded CSV from:", DATA_PATH)
except Exception as e1:
    try:
        df = pd.read_csv(FALLBACK)
        print("Loaded CSV from fallback:", FALLBACK)
    except Exception as e2:
        raise FileNotFoundError(f"Could not load dataset from either {DATA_PATH} or {FALLBACK}.\nErrors:\n{e1}\n{e2}")

print("Initial shape:", df.shape)
display(df.head(3))

# ---------- Basic cleaning & column mapping ----------
col_map = {}
if 'anime_id' in df.columns: col_map['anime_id'] = 'id'
if 'animeId' in df.columns: col_map['animeId'] = 'id'
if 'name' in df.columns: col_map['name'] = 'title'
if 'title' in df.columns and 'name' not in df.columns: col_map['title'] = 'title'
# prefer 'genres' or 'genre'
if 'genres' in df.columns: col_map['genres'] = 'genres'
elif 'genre' in df.columns: col_map['genre'] = 'genres'
if 'episodes' in df.columns: col_map['episodes'] = 'episodes'
if 'rating' in df.columns: col_map['rating'] = 'rating'
if 'members' in df.columns: col_map['members'] = 'members'
if 'type' in df.columns: col_map['type'] = 'type'

df = df.rename(columns=col_map)
print("Columns after mapping:", df.columns.tolist())

# Ensure required columns exist
if 'title' not in df.columns:
    raise ValueError("Column 'title' not found. Rename your title column to 'title' or 'name' in CSV.")
if 'genres' not in df.columns:
    # try to find any column with 'genre' substring
    cand = [c for c in df.columns if 'genre' in c.lower()]
    if cand:
        df = df.rename(columns={cand[0]:'genres'})
    else:
        raise ValueError("Column 'genres' not found. Please ensure CSV has a genre column.")

# ---------- Genres cleaning ----------
df['genres'] = df['genres'].fillna("").astype(str)

def normalize_genres(s):
    s = s.lower().replace('|',',').replace(';',',')
    # keep words and multi-word phrases; remove extra spaces
    tokens = [t.strip() for t in s.split(',') if t.strip()!='']
    return ','.join(tokens)

df['genres_clean'] = df['genres'].apply(normalize_genres)

# ---------- Safe numeric conversion + imputation ----------
for col in ['episodes','rating','members']:
    if col in df.columns:
        # attempt to extract first integer/float from string if present
        # fallback: coerce to numeric directly
        converted = pd.to_numeric(df[col], errors='coerce')
        # if many NaNs, try extracting numbers from strings
        if converted.isna().sum() > 0:
            def extract_num(x):
                if pd.isna(x):
                    return np.nan
                s = str(x)
                m = re.search(r'\d+(\.\d+)?', s)
                return float(m.group()) if m else np.nan
            extracted = df[col].apply(extract_num)
            # use extracted where converted is NaN
            converted = converted.fillna(extracted)
        med = converted.median()
        if np.isnan(med):
            med = 0.0
        df[col] = converted.fillna(med).astype(float)
        print(f"Column '{col}': converted to numeric, median used = {med}")

# drop exact duplicate titles (keep first)
df = df.drop_duplicates(subset=['title']).reset_index(drop=True)
print("After dedup shape:", df.shape)

# ---------- Feature extraction: TF-IDF on genres + numeric features ----------
tfidf = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")  # tokens are word tokens (genres)
tfidf_genres = tfidf.fit_transform(df['genres_clean'])
print("TF-IDF genres shape:", tfidf_genres.shape)

numeric_cols = [c for c in ['episodes','rating','members'] if c in df.columns]
if numeric_cols:
    scaler = StandardScaler()
    numeric_arr = scaler.fit_transform(df[numeric_cols].fillna(0))
    numeric_sparse = sparse.csr_matrix(numeric_arr)
    features = sparse.hstack([tfidf_genres, numeric_sparse], format='csr')
else:
    features = tfidf_genres

print("Final feature matrix shape:", features.shape)

# ---------- Cosine similarity (item-based) ----------
# Note: for moderately sized datasets this is fine. If memory issues occur, use NearestNeighbors with metric='cosine'.
cos_sim = cosine_similarity(features, features)
print("Cosine similarity matrix computed:", cos_sim.shape)

# ---------- Utility maps & recommender ----------
title_to_idx = {t: i for i,t in enumerate(df['title'].values)}

def recommend_by_title(title, top_n=10, min_score=0.15, include_score=False):
    """
    Recommend anime similar to 'title' using cosine similarity on features.
    - top_n: max number of recommendations
    - min_score: minimal cosine similarity threshold (0..1)
    - include_score: whether to return similarity score
    """
    # exact match
    if title not in title_to_idx:
        # try partial case-insensitive matches
        matches = [t for t in df['title'] if title.lower() in t.lower()]
        if matches:
            raise ValueError(f"Title not exact. Possible matches (partial): {matches[:8]}")
        raise ValueError(f"Title '{title}' not found in dataset.")
    idx = title_to_idx[title]
    scores = cos_sim[idx].copy()
    scores[idx] = -1.0  # exclude self
    # candidates above threshold
    candidates = np.where(scores >= min_score)[0]
    if candidates.size == 0:
        # fallback to top_n highest scores
        top_idx = np.argsort(scores)[-top_n:][::-1]
    else:
        sorted_cand = candidates[np.argsort(scores[candidates])[::-1]]
        top_idx = sorted_cand[:top_n]
    out = df.iloc[top_idx].copy()
    if include_score:
        out = out[['title'] + numeric_cols].copy() if numeric_cols else out[['title']].copy()
        out['score'] = scores[top_idx]
    else:
        out = out[['title'] + numeric_cols] if numeric_cols else out[['title']]
    return out.reset_index(drop=True)

# ---------- Example usage (change the example_title as you want) ----------
print("\nSample titles (pick one for recommendations):")
print(df['title'].sample(8).values)

example_title = df['title'].iloc[0]
print("\nRecommendations for (example):", example_title)
display(recommend_by_title(example_title, top_n=8, min_score=0.12, include_score=True))

# ---------- Save top-10 recommendations for all items (optional) ----------
all_recs = []
for t in df['title']:
    try:
        recs = recommend_by_title(t, top_n=10, min_score=0.12, include_score=True)
        rec_titles = recs['title'].tolist()
        rec_scores = recs['score'].tolist() if 'score' in recs.columns else [None]*len(rec_titles)
        all_recs.append({
            'title': t,
            'recommendations': "|".join(rec_titles),
            'scores': "|".join([str(round(s,3)) for s in rec_scores])
        })
    except Exception:
        all_recs.append({'title': t, 'recommendations': "", 'scores': ""})

recs_df = pd.DataFrame(all_recs)
out_file = "anime_recommendations_top10.csv"
recs_df.to_csv(out_file, index=False)
print("\nSaved recommendations to:", os.path.abspath(out_file))

# ---------- How to call ----------
print("\nTo get recommendations, call:")
print("recommend_by_title('Exact Anime Title', top_n=5, min_score=0.15, include_score=True)")


Loaded CSV from: C:\Users\abhin\Downloads\anime (2).csv
Initial shape: (12294, 7)


Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262


Columns after mapping: ['id', 'title', 'genres', 'type', 'episodes', 'rating', 'members']
Column 'episodes': converted to numeric, median used = 2.0
Column 'rating': converted to numeric, median used = 6.57
Column 'members': converted to numeric, median used = 1550.0
After dedup shape: (12292, 8)
TF-IDF genres shape: (12292, 47)
Final feature matrix shape: (12292, 50)
Cosine similarity matrix computed: (12292, 12292)

Sample titles (pick one for recommendations):
['Slayers: The Motion Picture'
 'Meitantei Hangyodon: Kaitou Ruzu Arawaruno-kan' 'Lady Lady!! (1988)'
 'Crazy for It' 'Nichijou: Tanken Nichijou no Machi'
 'Seikimatsu Occult Gakuin' 'Youkai Ningen Bem'
 'Hyakka Ryouran: Samurai Bride']

Recommendations for (example): Kimi no Na wa.


Unnamed: 0,title,episodes,rating,members,score
0,Hotarubi no Mori e,1.0,8.61,197439.0,0.973841
1,"Clannad: After Story - Mou Hitotsu no Sekai, K...",1.0,8.02,138364.0,0.973362
2,Suzumiya Haruhi no Shoushitsu,1.0,8.81,240297.0,0.964515
3,Sakamichi no Apollon,12.0,8.48,146592.0,0.959406
4,Haikyuu!! Second Season,25.0,8.93,179342.0,0.959179
5,"Clannad: Mou Hitotsu no Sekai, Tomoyo-hen",1.0,8.14,160423.0,0.956675
6,Nekomonogatari: Kuro,4.0,8.06,173264.0,0.956535
7,Yahari Ore no Seishun Love Comedy wa Machigatt...,13.0,8.31,222994.0,0.955931



Saved recommendations to: C:\Users\abhin\anime_recommendations_top10.csv

To get recommendations, call:
recommend_by_title('Exact Anime Title', top_n=5, min_score=0.15, include_score=True)


In [None]:
 # Interview Questions:
# 1. Can you explain the difference between user-based and item-based collaborative filtering?
# 2. What is collaborative filtering, and how does it work?
################# #####User-Based Collaborative Filtering#####################################
# ->It finds similar users based on their past behavior.
# ->Then it recommends items that similar users liked.
# Item-Based Collaborative Filtering
# ->It finds similar items based on user ratings.
# ->Then it recommends items that are similar to what the user already liked.

################## 2. What is Collaborative Filtering, and how does it work?###################################3
# Collaborative Filtering
# ->It is a recommendation method that uses past user behavior (like ratings, purchases, clicks).
# ->It assumes that users who behaved similarly in the past will like similar items in the future.
# How it works (very simple):
#  ->Looks at user–item interactions (what people rated or liked).
#  ->Finds patterns based on similar users or similar items.
# ->Recommends items that similar users liked or similar items related to what the user liked.