
# Assignment: Recommendation System using Cosine Similarity

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack, csr_matrix

sns.set(style="whitegrid")


In [2]:
df = pd.read_csv("anime.csv")
print("Shape:", df.shape)
df.head()


Shape: (12294, 7)


Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama째,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


# 1) Data Preprocessing

In [3]:
df.info()
print("\nMissing values per column:\n", df.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB

Missing values per column:
 anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64


In [4]:
df.describe(include="all").T.head(15)


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
anime_id,12294.0,,,,14058.221653,11455.294701,1.0,3484.25,10260.5,24794.5,34527.0
name,12294.0,12292.0,Saru Kani Gassen,2.0,,,,,,,
genre,12232.0,3264.0,Hentai,823.0,,,,,,,
type,12269.0,6.0,TV,3787.0,,,,,,,
episodes,12294.0,187.0,1,5677.0,,,,,,,
rating,12064.0,,,,6.473902,1.026746,1.67,5.88,6.57,7.18,10.0
members,12294.0,,,,18071.338864,54820.676925,5.0,225.0,1550.0,9437.0,1013917.0



##Basic cleaning

Common columns in this anime dataset:
- anime_id (or similar)
- name (anime title)
- genre(comma-separated text)
- episodes
- rating
- members

We will:
- Standardize column names to lower-case
- Ensure genre is text
- Convert numeric columns safely
- Fill missing values

In [5]:
# Standardize column names
df.columns = [c.strip().lower() for c in df.columns]

# Detect important columns - assumption from my side
title_col = "name" if "name" in df.columns else ("title" if "title" in df.columns else df.columns[1])
genre_col = "genre" if "genre" in df.columns else None

# Numeric columns
possible_numeric = ["episodes", "rating", "members"]
numeric_cols = [c for c in possible_numeric if c in df.columns]

print("Title column:", title_col)
print("Genre column:", genre_col)
print("Numeric columns:", numeric_cols)


Title column: name
Genre column: genre
Numeric columns: ['episodes', 'rating', 'members']


##Handle missing values

In [6]:
df_clean = df.copy()

# Genre: fill missing with empty string
if genre_col is not None:
    df_clean[genre_col] = df_clean[genre_col].fillna("").astype(str)
else:
    df_clean["genre"] = ""
    genre_col = "genre"

# Episodes sometimes has 'Unknown'; coerce to numeric
if "episodes" in df_clean.columns:
    df_clean["episodes"] = pd.to_numeric(df_clean["episodes"], errors="coerce")

# Convert numeric columns; fill missing with median
for c in numeric_cols:
    df_clean[c] = pd.to_numeric(df_clean[c], errors="coerce")
    df_clean[c] = df_clean[c].fillna(df_clean[c].median())

# Drop rows with missing title
df_clean = df_clean.dropna(subset=[title_col])

df_clean[[title_col, genre_col] + numeric_cols].head()


Unnamed: 0,name,genre,episodes,rating,members
0,Kimi no Na wa.,"Drama, Romance, School, Supernatural",1.0,9.37,200630
1,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",64.0,9.26,793665
2,Gintama째,"Action, Comedy, Historical, Parody, Samurai, S...",51.0,9.25,114262
3,Steins;Gate,"Sci-Fi, Thriller",24.0,9.17,673572
4,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",51.0,9.16,151266


# 2) Feature Extraction


We  use
-TF-IDF vectorization
-Scaling

Then we combine them and compute cosine similarity.


##TF-IDF for genres

In [7]:
tfidf = TfidfVectorizer(stop_words="english")
genre_tfidf = tfidf.fit_transform(df_clean[genre_col])

print("TF-IDF matrix shape:", genre_tfidf.shape)


TF-IDF matrix shape: (12294, 46)


##Scaling

In [8]:
if len(numeric_cols) > 0:
    scaler = MinMaxScaler()
    X_num = scaler.fit_transform(df_clean[numeric_cols])
    X_num_sparse = csr_matrix(X_num)
    print("Numeric matrix shape:", X_num_sparse.shape)
else:
    X_num_sparse = csr_matrix((df_clean.shape[0], 0))
    print("No numeric columns found; using only genre features.")


Numeric matrix shape: (12294, 3)


In [9]:
X_features = hstack([genre_tfidf, X_num_sparse]).tocsr()
print("Final feature matrix shape:", X_features.shape)


Final feature matrix shape: (12294, 49)


# 3) Recommendation System (Cosine Similarity)

##index mapping

In [10]:
df_clean["title_lower"] = df_clean[title_col].astype(str).str.lower()
title_to_index = pd.Series(df_clean.index, index=df_clean["title_lower"]).drop_duplicates()

print("Unique titles indexed:", title_to_index.shape[0])


Unique titles indexed: 12294


##Recommendation function

In [11]:
def recommend_anime(target_title, top_n=10, min_similarity=0.20, show_similarity=True):
    """Recommend similar anime using cosine similarity."""
    if not isinstance(target_title, str) or target_title.strip() == "":
        raise ValueError("Please provide a valid anime title (string).")

    key = target_title.strip().lower()
    if key not in title_to_index:
        candidates = df_clean[df_clean["title_lower"].str.contains(key, na=False)][title_col].head(10).tolist()
        raise ValueError(f"Title not found. Try one of these close matches: {candidates}")

    idx = int(title_to_index[key])

    sims = cosine_similarity(X_features[idx], X_features).flatten()

    result = df_clean[[title_col, genre_col] + numeric_cols].copy()
    result["similarity"] = sims
    result = result[result.index != idx]

    if min_similarity is not None:
        result = result[result["similarity"] >= float(min_similarity)]

    result = result.sort_values("similarity", ascending=False).head(top_n)

    if not show_similarity:
        result = result.drop(columns=["similarity"])

    return result.reset_index(drop=True)


In [14]:
# Replace with any title from your dataset
try:
    recommend_anime("Naruto", top_n=10, min_similarity=0.20)
except Exception as e:
    print("Error:", e)


In [13]:
test_title = "Naruto"

for thr in [0.10, 0.20, 0.30, 0.40]:
    try:
        recs = recommend_anime(test_title, top_n=5, min_similarity=thr)
        print(f"Threshold {thr} -> {len(recs)} recommendations")
        display(recs)
    except Exception as e:
        print(f"Threshold {thr} -> Error:", e)
        break


Threshold 0.1 -> 5 recommendations


Unnamed: 0,name,genre,episodes,rating,members,similarity
0,Naruto: Shippuuden,"Action, Comedy, Martial Arts, Shounen, Super P...",2.0,7.94,533578,0.991495
1,Dragon Ball Z,"Action, Adventure, Comedy, Fantasy, Martial Ar...",291.0,8.32,375662,0.94279
2,Dragon Ball,"Adventure, Comedy, Fantasy, Martial Arts, Shou...",153.0,8.16,316102,0.915894
3,Naruto: Shippuuden Movie 4 - The Lost Tower,"Action, Comedy, Martial Arts, Shounen, Super P...",1.0,7.53,84527,0.905891
4,Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsu...,"Action, Comedy, Martial Arts, Shounen, Super P...",1.0,7.5,83515,0.905552


Threshold 0.2 -> 5 recommendations


Unnamed: 0,name,genre,episodes,rating,members,similarity
0,Naruto: Shippuuden,"Action, Comedy, Martial Arts, Shounen, Super P...",2.0,7.94,533578,0.991495
1,Dragon Ball Z,"Action, Adventure, Comedy, Fantasy, Martial Ar...",291.0,8.32,375662,0.94279
2,Dragon Ball,"Adventure, Comedy, Fantasy, Martial Arts, Shou...",153.0,8.16,316102,0.915894
3,Naruto: Shippuuden Movie 4 - The Lost Tower,"Action, Comedy, Martial Arts, Shounen, Super P...",1.0,7.53,84527,0.905891
4,Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsu...,"Action, Comedy, Martial Arts, Shounen, Super P...",1.0,7.5,83515,0.905552


Threshold 0.3 -> 5 recommendations


Unnamed: 0,name,genre,episodes,rating,members,similarity
0,Naruto: Shippuuden,"Action, Comedy, Martial Arts, Shounen, Super P...",2.0,7.94,533578,0.991495
1,Dragon Ball Z,"Action, Adventure, Comedy, Fantasy, Martial Ar...",291.0,8.32,375662,0.94279
2,Dragon Ball,"Adventure, Comedy, Fantasy, Martial Arts, Shou...",153.0,8.16,316102,0.915894
3,Naruto: Shippuuden Movie 4 - The Lost Tower,"Action, Comedy, Martial Arts, Shounen, Super P...",1.0,7.53,84527,0.905891
4,Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsu...,"Action, Comedy, Martial Arts, Shounen, Super P...",1.0,7.5,83515,0.905552


Threshold 0.4 -> 5 recommendations


Unnamed: 0,name,genre,episodes,rating,members,similarity
0,Naruto: Shippuuden,"Action, Comedy, Martial Arts, Shounen, Super P...",2.0,7.94,533578,0.991495
1,Dragon Ball Z,"Action, Adventure, Comedy, Fantasy, Martial Ar...",291.0,8.32,375662,0.94279
2,Dragon Ball,"Adventure, Comedy, Fantasy, Martial Arts, Shou...",153.0,8.16,316102,0.915894
3,Naruto: Shippuuden Movie 4 - The Lost Tower,"Action, Comedy, Martial Arts, Shounen, Super P...",1.0,7.53,84527,0.905891
4,Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsu...,"Action, Comedy, Martial Arts, Shounen, Super P...",1.0,7.5,83515,0.905552
