In [1]:
import torch

# Report whether a CUDA GPU is visible and, if so, which one.
# The device name is only queried when CUDA is present, because
# torch.cuda.get_device_name(0) raises on a CPU-only runtime.
cuda_available = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if cuda_available else None
cuda_available, gpu_name

(True, 'Tesla T4')

In [2]:
!pip install -q transformers accelerate pandas numpy

In [3]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

# Location of the movie metadata uploaded as a Kaggle input dataset
movies_json_path = "/kaggle/input/movies-dataset/movies.json"

# Parse the JSON file into a DataFrame
df = pd.read_json(movies_json_path)

# Preview the first rows
df.head()

Unnamed: 0,genres,id,keywords,overview,tagline,title,production_companies,cast,crew
0,"[Action, Adventure, Fantasy, Science Fiction]",19995,"[culture clash, future, space war, space colon...","In the 22nd century, a paraplegic Marine is di...",Enter the World of Pandora.,Avatar,"[Ingenious Film Partners, Twentieth Century Fo...","[Sam Worthington, Zoe Saldana, Sigourney Weave...","[James Horner, James Cameron]"
1,"[Adventure, Fantasy, Action]",285,"[ocean, drug abuse, exotic island, east india ...","Captain Barbossa, long believed to be dead, ha...","At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,"[Walt Disney Pictures, Jerry Bruckheimer Films...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...","[Gore Verbinski, Hans Zimmer]"
2,"[Action, Adventure, Crime]",206647,"[spy, based on novel, secret agent, sequel, mi...",A cryptic message from Bond’s past sends him o...,A Plan No One Escapes,Spectre,"[Columbia Pictures, Danjaq, B24]","[Daniel Craig, Christoph Waltz, Léa Seydoux, R...","[Thomas Newman, Sam Mendes]"
3,"[Action, Crime, Drama, Thriller]",49026,"[dc comics, crime fighter, terrorist, secret i...",Following the death of District Attorney Harve...,The Legend Ends,The Dark Knight Rises,"[Legendary Pictures, Warner Bros., DC Entertai...","[Christian Bale, Michael Caine, Gary Oldman, A...","[Hans Zimmer, Christopher Nolan]"
4,"[Action, Adventure, Science Fiction]",49529,"[based on novel, mars, medallion, space travel...","John Carter is a war-weary, former military ca...","Lost in our world, found in another.",John Carter,[Walt Disney Pictures],"[Taylor Kitsch, Lynn Collins, Samantha Morton,...","[Andrew Stanton, Michael Giacchino]"


In [4]:
def join_list(x):
    """
    Flatten one cell of a list-valued column into a single string.

    - list / tuple  -> items joined with single spaces, each item passed
                       through str(); None items are skipped
                       example: ["sci fi", "space"] -> "sci fi space"
    - None or NaN   -> "" (a missing cell must not leak the literal text
                       "None" / "nan" into the text that gets embedded)
    - anything else -> str(x)
    """
    # NaN is the only float that is not equal to itself
    if x is None or (isinstance(x, float) and x != x):
        return ""
    if isinstance(x, (list, tuple)):
        return " ".join(str(item) for item in x if item is not None)
    return str(x)

# Assemble the text that will be embedded from every descriptive column
# (everything except "id"), separated by single spaces.
plain_text_columns = ["title", "tagline", "overview"]
list_valued_columns = ["genres", "keywords", "cast", "crew", "production_companies"]

# Plain-text columns: missing values become empty strings.
# List-valued columns: each list is flattened with join_list.
text_parts = [df[col].fillna("") for col in plain_text_columns]
text_parts += [df[col].apply(join_list) for col in list_valued_columns]

# Concatenate left to right, inserting one space between consecutive parts
combined_text = text_parts[0]
for part in text_parts[1:]:
    combined_text = combined_text + " " + part

df["embedding_text"] = combined_text

df["embedding_text"].head()

0    Avatar Enter the World of Pandora. In the 22nd...
1    Pirates of the Caribbean: At World's End At th...
2    Spectre A Plan No One Escapes A cryptic messag...
3    The Dark Knight Rises The Legend Ends Followin...
4    John Carter Lost in our world, found in anothe...
Name: embedding_text, dtype: object

In [9]:
# Full combined text for the row labelled 0
df.loc[0, "embedding_text"]

'Avatar Enter the World of Pandora. In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Action Adventure Fantasy Science Fiction culture clash future space war space colony society space travel futuristic romance space alien tribe alien planet cgi marine soldier battle love affair anti war power relations mind and soul 3d Sam Worthington Zoe Saldana Sigourney Weaver Stephen Lang Michelle Rodriguez Giovanni Ribisi Joel David Moore CCH Pounder Wes Studi Laz Alonso James Horner James Cameron Ingenious Film Partners Twentieth Century Fox Film Corporation Dune Entertainment Lightstorm Entertainment'

In [10]:
model_name = "BAAI/bge-large-en-v1.5"

from transformers import AutoTokenizer, AutoModel

# Run on the GPU when one is visible, otherwise fall back to the CPU so the
# cell does not crash on a CPU-only runtime.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load model and move its weights to the selected device
model = AutoModel.from_pretrained(model_name).to(device)

# Inference only: make sure dropout is disabled (no-op if already in eval mode)
model.eval()

model_name

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

2026-01-23 08:08:50.153999: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1769155730.437083      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1769155730.527751      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1769155731.177560      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769155731.177605      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769155731.177608      55 computation_placer.cc:177] computation placer alr

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

'BAAI/bge-large-en-v1.5'

In [11]:
# Number of tokens (special tokens included) the tokenizer produces for one text
def _token_length(text):
    return len(tokenizer.encode(text, add_special_tokens=True))

# Token count per movie
token_counts = df["embedding_text"].map(_token_length)

# Summary statistics, to compare against the 512-token limit used when embedding
token_counts.describe()

count    3757.000000
mean      147.142667
std        39.016140
min        47.000000
25%       119.000000
50%       144.000000
75%       169.000000
max       368.000000
Name: embedding_text, dtype: float64

In [12]:
import torch
import numpy as np

# Mean Pooling function
def mean_pooling(model_output, attention_mask):
    """
    Average the token vectors of each sequence, counting only unmasked tokens.

    model_output:   object exposing `.last_hidden_state` ([batch, seq, hidden])
    attention_mask: [batch, seq] tensor, non-zero where a token is real
    returns:        [batch, hidden] tensor of per-sequence mean vectors
    """
    hidden_states = model_output.last_hidden_state  # [batch, seq, hidden]

    # Broadcast the mask over the hidden axis so it can weight every feature
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()

    weighted_sum = torch.sum(hidden_states * weights, dim=1)

    # Clamp the token count so a fully-masked row divides by 1e-9 instead of 0
    token_count = torch.clamp(weights.sum(dim=1), min=1e-9)

    return weighted_sum / token_count


# Function to embed a batch of texts
def embed_texts(text_list, batch_size=16, pooling="cls"):
    """
    Embed texts with the module-level `tokenizer` and `model`.

    text_list:  iterable of strings (list, tuple, pandas Series, ...);
                a single string is treated as one text
    batch_size: number of texts per forward pass, must be >= 1
    pooling:    "cls"  -> hidden state of the first token of each sequence,
                          the pooling the BGE model card prescribes (default)
                "mean" -> mask-aware average via `mean_pooling`

    returns: float32 numpy array of shape (len(text_list), hidden_size)
             with L2-normalised rows; for empty input the shape is
             (0, model.config.hidden_size)

    raises: ValueError if batch_size < 1 or pooling is not "cls"/"mean"

    example returned array:
    array([[0.12, -0.08, ... hidden_size dims ...],
           [0.05,  0.22, ... hidden_size dims ...]])
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size!r}")
    if pooling not in ("cls", "mean"):
        raise ValueError(f"pooling must be 'cls' or 'mean', got {pooling!r}")

    # Materialise the input so slicing always yields a plain list of strings
    # (the tokenizer does not accept e.g. a pandas Series slice).
    texts = [text_list] if isinstance(text_list, str) else list(text_list)

    # np.vstack([]) raises, so answer the empty case explicitly
    if not texts:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    # Send inputs to wherever the model lives instead of assuming "cuda"
    target_device = model.device

    all_embeddings = []

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]

        enc = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        ).to(target_device)

        with torch.no_grad():
            model_output = model(**enc)

        # Pool token vectors → [batch_size, hidden_size]
        if pooling == "cls":
            embeddings = model_output.last_hidden_state[:, 0]
        else:
            embeddings = mean_pooling(model_output, enc["attention_mask"])

        # Normalize embeddings (recommended for BGE)
        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)

        # .float() keeps the numpy conversion valid if the model runs in half precision
        all_embeddings.append(embeddings.float().cpu().numpy())

    return np.vstack(all_embeddings)

In [13]:
# Pull the combined text of every movie out of the DataFrame as a plain list
texts = df["embedding_text"].to_list()

# Embed all movies in batches of 16 (this takes ~6–10 minutes on Kaggle T4)
embeddings = embed_texts(texts, batch_size=16)

# One row per movie, one column per embedding dimension
embeddings.shape

(3757, 1024)

In [14]:
import pickle

# Small helper: pickle one object to the given path
def _dump_pickle(obj, path):
    with open(path, "wb") as fh:
        pickle.dump(obj, fh)

# 1) Persist the embedding matrix (numpy array)
_dump_pickle(embeddings, "movie_embeddings.pkl")

# 2) Persist the row index → movie ID mapping:
#    the "id" column when the DataFrame has one, otherwise the row positions
movie_ids = df["id"].tolist() if "id" in df.columns else list(range(len(df)))

_dump_pickle(movie_ids, "movie_ids.pkl")

embeddings.shape, len(movie_ids)

((3757, 1024), 3757)