In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

print("Environment ready ✅")

Environment ready ✅


In [30]:
data = [
    {"item_id": 1, "title": "Rescue Mission", "genre": "Action", "price": 10, "duration": 155,
     "description": "An elite soldier leads a dangerous rescue mission behind enemy lines."},
    {"item_id": 2, "title": "City Under Siege", "genre": "Action", "price": 12, "duration": 140,
     "description": "A tactical unit races to stop a coordinated attack in a modern city."},
    {"item_id": 3, "title": "Love in Paris", "genre": "Romance", "price": 9, "duration": 110,
     "description": "Two strangers meet in Paris and fall in love while chasing their dreams."},
    {"item_id": 4, "title": "Comedy Nights", "genre": "Comedy", "price": 8, "duration": 95,
     "description": "A struggling comedian turns daily disasters into hilarious stand-up routines."},
    {"item_id": 5, "title": "Space Odyssey", "genre": "SciFi", "price": 14, "duration": 165,
     "description": "A crew explores deep space and discovers a mystery that changes humanity."},
    {"item_id": 6, "title": "Haunted Manor", "genre": "Horror", "price": 11, "duration": 105,
     "description": "A group investigates a haunted manor where every room hides a dark secret."},
]

df = pd.DataFrame(data)
df

Unnamed: 0,item_id,title,genre,price,duration,description
0,1,Rescue Mission,Action,10,155,An elite soldier leads a dangerous rescue miss...
1,2,City Under Siege,Action,12,140,A tactical unit races to stop a coordinated at...
2,3,Love in Paris,Romance,9,110,Two strangers meet in Paris and fall in love w...
3,4,Comedy Nights,Comedy,8,95,A struggling comedian turns daily disasters in...
4,5,Space Odyssey,SciFi,14,165,A crew explores deep space and discovers a mys...
5,6,Haunted Manor,Horror,11,105,A group investigates a haunted manor where eve...


In [3]:
# COnvert Text to Vector for easy comparison

tfidf = TfidfVectorizer(stop_words="english")
#Convert each description into a vector of numbers
X_text = tfidf.fit_transform(df["description"]).toarray()

# Normalize text vectors (good practice for cosine similarity)
X_text = X_text / (np.linalg.norm(X_text, axis=1, keepdims=True) + 1e-12)

print("X_text shape:", X_text.shape)
print("First 10 TF-IDF values for movie 1:", X_text[0][:10])
# TF-IDF makes a vector where: words that appear a lot in this description get higher values or 
# words that appear in many descriptions (common words) get lower values
# ONE text vector for ONE movie description below shown

X_text shape: (6, 47)
First 10 TF-IDF values for movie 1: [0.         0.         0.         0.         0.         0.
 0.         0.         0.35355339 0.        ]


In [4]:
# Convert Genre to One-Hot Vectors - Represent genres/categories numerically without inventing fake order.

# List all unique genres
genres = sorted(df["genre"].unique())
print("Genres:", genres)

# Create a mapping: genre -> index
genre_index = {g: i for i, g in enumerate(genres)}

# Create one-hot matrix
X_genre = np.zeros((len(df), len(genres)))

for i, g in enumerate(df["genre"]):
    X_genre[i, genre_index[g]] = 1.0

# Normalize genre vectors (good practice)
X_genre = X_genre / (np.linalg.norm(X_genre, axis=1, keepdims=True) + 1e-12)

X_genre
# Each row is one movie and each column is one genre

Genres: ['Action', 'Comedy', 'Horror', 'Romance', 'SciFi']


array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.]])

In [5]:
# Step 5: Convert numeric features (price, duration) to vectors

scaler = MinMaxScaler()

# Convert price & duration into 0–1 range
X_num = scaler.fit_transform(df[["price", "duration"]].values)

# Normalize numeric vectors (good practice for cosine similarity)
X_num = X_num / (np.linalg.norm(X_num, axis=1, keepdims=True) + 1e-12)

print("X_num shape:", X_num.shape)
print("First 3 numeric vectors:\n", X_num[:3])

X_num shape: (6, 2)
First 3 numeric vectors:
 [[0.36244626 0.93200467]
 [0.7198443  0.69413557]
 [0.61394061 0.78935222]]


In [7]:
# Step 6: Combine vectors CORRECTLY using concatenation

w_text, w_genre, w_num = 0.6, 0.25, 0.15  # weights sum to 1

X_final = np.hstack([
    w_text * X_text,
    w_genre * X_genre,
    w_num * X_num
])

# Normalize final vectors
X_final = X_final / (np.linalg.norm(X_final, axis=1, keepdims=True) + 1e-12)

print("X_text shape:", X_text.shape)
print("X_genre shape:", X_genre.shape)
print("X_num shape:", X_num.shape)
print("X_final shape:", X_final.shape)

X_text shape: (6, 47)
X_genre shape: (6, 5)
X_num shape: (6, 2)
X_final shape: (6, 54)


In [8]:
S = cosine_similarity(X_final)
sim_df = pd.DataFrame(S, index=df["title"], columns=df["title"])
sim_df

title,Rescue Mission,City Under Siege,Love in Paris,Comedy Nights,Space Odyssey,Haunted Manor
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Rescue Mission,1.0,0.186352,0.048448,0.0,0.04628,0.030567
City Under Siege,0.186352,1.0,0.050049,0.0,0.050553,0.044638
Love in Paris,0.048448,0.050049,1.0,0.0,0.050171,0.040812
Comedy Nights,0.0,0.0,0.0,1.0,0.0,0.0
Space Odyssey,0.04628,0.050553,0.050171,0.0,1.0,0.044199
Haunted Manor,0.030567,0.044638,0.040812,0.0,0.044199,1.0


In [28]:
def recommend(title, top_n=3):
    # get the item row (first match)
    row = df[df["title"].str.lower() == title.lower()]
    if len(row) == 0:
        return "Title not found"

    item_id = int(row.iloc[0]["item_id"])

    # find row index i for that item_id
    i = int(df.index[df["item_id"] == item_id][0])

    scores = list(enumerate(S[i]))

    # remove itself using item_id (stable)
    scores = [(j, s) for j, s in scores if int(df.loc[j, "item_id"]) != item_id]

    scores = sorted(scores, key=lambda x: x[1], reverse=True)

    results = []
    for j, score in scores[:top_n]:
        results.append((df.loc[j, "title"], df.loc[j, "genre"], round(float(score), 3)))
    return results

In [23]:
recommend("Rescue Mission", top_n=3)

[('City Under Siege', 'Action', 0.186),
 ('Love in Paris', 'Romance', 0.048),
 ('Space Odyssey', 'SciFi', 0.046)]

In [29]:
# e.g User is looking at Love in Paris.
# Based on its vector, find the top 3 most similar items.
recommend("Love in Paris", top_n=3)
recommend("Space Odyssey", top_n=3)

[('City Under Siege', 'Action', 0.051),
 ('Love in Paris', 'Romance', 0.05),
 ('Rescue Mission', 'Action', 0.046)]

In [26]:
sim_df.loc["Rescue Mission"].sort_values(ascending=False)

title
Rescue Mission      1.000000
City Under Siege    0.186352
Love in Paris       0.048448
Space Odyssey       0.046280
Haunted Manor       0.030567
Comedy Nights       0.000000
Name: Rescue Mission, dtype: float64