In [None]:
# Install the third-party packages this notebook relies on (no versions are pinned).
!pip install transformers accelerate bitsandbytes sentencepiece

In [None]:
import json
import os
import pickle
import re

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Checkpoint to load and the device that tokenized inputs are moved to.
MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
DEVICE = "cuda"

# Tokenizer that belongs to the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Passing `load_in_4bit=True` straight to `from_pretrained` is deprecated; the
# supported way to request 4-bit bitsandbytes quantization is an explicit
# `BitsAndBytesConfig` handed over via `quantization_config`.
# NOTE(review): the 4-bit compute dtype is left at its library default so the
# numerics match the previous call; consider setting
# `bnb_4bit_compute_dtype=torch.float16` to match `torch_dtype` — confirm first.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    torch_dtype=torch.float16,
    device_map="auto",  # let accelerate decide where the weights are placed
    quantization_config=quantization_config,
)

In [None]:
import pandas as pd
# Read the movie metadata into a DataFrame (adjust the path when running elsewhere).
df = pd.read_json("/kaggle/input/movies-dataset/movies.json")
# Number of rows in the dataset; bounds the batching loops further down.
N = len(df.index)
# The same rows as a list of plain dicts, one dict per row.
movies = df.to_dict("records")

In [None]:
# Preview the first two rows to sanity-check the loaded columns.
df.head(2)

In [None]:
def join_list(x):
    """Flatten one cell value into a single space-separated string.

    Behaviour by input type:
      * list / tuple -> items joined with single spaces; every item is passed
        through ``str`` and ``None`` items are skipped.
        example: ["sci fi", "space"] -> "sci fi space"
      * ``None`` or float NaN (a missing cell) -> "" so that missing values do
        not end up as the literal words "None" / "nan" in the combined text.
      * anything else -> ``str(x)``

    Returns:
        str: the flattened text (possibly empty).
    """
    # NaN is the only float that is not equal to itself.
    if x is None or (isinstance(x, float) and x != x):
        return ""
    if isinstance(x, (list, tuple)):
        return " ".join(str(item) for item in x if item is not None)
    return str(x)

# Assemble the text used for embeddings from every descriptive column
# (the "id" column is deliberately left out).
plain_text_columns = ["title", "tagline", "overview"]
list_columns = ["genres", "keywords", "cast", "crew", "production_companies"]

# Plain-text columns get their missing values blanked; the remaining columns
# are flattened to strings with join_list.
pieces = [df[col].fillna("") for col in plain_text_columns]
pieces += [df[col].apply(join_list) for col in list_columns]

# Concatenate left to right, putting one space between consecutive pieces.
combined = pieces[0]
for piece in pieces[1:]:
    combined = combined + " " + piece
df["embedding_text"] = combined

df["embedding_text"].head()

In [None]:
# Show the complete embedding text built for the row labelled 0.
df["embedding_text"][0]

In [None]:
# Number of movies (DataFrame rows) that make up one batch of the main loop.
BATCH_SIZE = 32
# Results are flushed to disk once this many batches have been processed.
SAVE_EVERY = 10  # batches
# Directory that receives the pickled result chunks; created if it is missing.
OUT_DIR = "/kaggle/working/tags/"
os.makedirs(OUT_DIR, exist_ok=True)

def make_prompt(text):
    """Build the tagging prompt for one movie.

    Args:
        text: the movie description to append after the ``Movie: `` label.

    Returns:
        str: two instruction lines, a blank line, then ``Movie: <text>``.
    """
    prompt_lines = [
        'Generate 3–5 short style tags.',
        'Output ONLY a JSON list: ["tag1","tag2","tag3"]',
        '',  # blank line separating the instructions from the movie text
        f'Movie: {text}',
    ]
    return "\n".join(prompt_lines)

In [None]:
# ============================================
# STEP 3 — Resume Support
# ============================================
# Collect the inclusive (first, last) row ranges that already have a saved
# chunk in OUT_DIR. Only files named exactly tags_<first>_<last>.pkl count;
# any other file in the directory is skipped instead of crashing the scan
# with a ValueError while unpacking its name.
_CHUNK_NAME = re.compile(r"tags_(\d+)_(\d+)\.pkl")

done_ranges = []
for fname in sorted(os.listdir(OUT_DIR)):
    match = _CHUNK_NAME.fullmatch(fname)
    if match is None:
        continue
    done_ranges.append((int(match.group(1)), int(match.group(2))))

def is_done(idx):
    """Return True when ``idx`` falls inside any range stored in ``done_ranges``.

    Both ends of every ``(first, last)`` range are inclusive.
    """
    return any(first <= idx <= last for first, last in done_ranges)

In [None]:
# ============================================
# STEP 4 — Main Loop
# ============================================
# Generates one model response per movie for every batch that is not yet
# covered by a saved chunk, and writes the results to OUT_DIR as pickled
# {movie id: response} dicts named tags_<first>_<last>.pkl, where first/last
# are the inclusive row positions held in that file.


def _save_chunk(results, first, last):
    """Pickle ``results`` into OUT_DIR as ``tags_<first>_<last>.pkl``.

    The data is written to a temporary file and then renamed into place, so an
    interrupted run cannot leave a truncated ``.pkl`` behind that a later
    resume scan would mistake for a finished range.
    """
    final_path = os.path.join(OUT_DIR, f"tags_{first:04d}_{last:04d}.pkl")
    tmp_path = final_path + ".tmp"  # does not end in ".pkl", so the resume scan ignores it
    with open(tmp_path, "wb") as fh:
        pickle.dump(results, fh)
    os.replace(tmp_path, final_path)


buffer = {}
batch_count = 0

# find resume point: the first batch that is NOT fully covered by saved ranges.
# A batch whose first row is saved but whose last row is not must be redone,
# otherwise its uncovered rows would never be processed.
start_index = None
for i in range(0, N, BATCH_SIZE):
    idx = min(i + BATCH_SIZE - 1, N - 1)
    if not (is_done(i) and is_done(idx)):
        start_index = i
        break

if start_index is None:
    # Every batch is already on disk: make the loop below a no-op instead of
    # silently starting over from row 0 and regenerating everything.
    start_index = N
    print("Nothing to resume: all batches are already saved.")
else:
    print("Resuming from:", start_index)

# Row position of the first movie currently held in `buffer`; used to name the
# next chunk file so the name always matches what the file actually contains.
chunk_start = start_index

for i in range(start_index, N, BATCH_SIZE):
    batch = df.iloc[i:i+BATCH_SIZE]

    for _, m in batch.iterrows():
        text = m["embedding_text"]
        prompt = make_prompt(text)

        inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
        with torch.no_grad():
            out = model.generate(
                **inputs,
                max_new_tokens=64,
                temperature=0.2,
                top_p=0.9,
                do_sample=True
            )
        # generate() returns the prompt tokens followed by the newly generated
        # ones; drop the prompt so only the model's answer is stored.
        # NOTE: chunks written by earlier runs still contain the echoed prompt.
        prompt_len = inputs["input_ids"].shape[1]
        resp = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)
        buffer[m["id"]] = resp

    batch_count += 1

    # save every SAVE_EVERY batches
    if batch_count % SAVE_EVERY == 0:
        chunk_end = min(i + BATCH_SIZE - 1, N - 1)
        _save_chunk(buffer, chunk_start, chunk_end)
        buffer = {}
        chunk_start = chunk_end + 1

# save last leftover (fewer than SAVE_EVERY batches since the previous save)
if buffer:
    _save_chunk(buffer, chunk_start, N - 1)

print("DONE.")