In [1]:
!pip install -q sentence-transformers faiss-cpu pandas numpy tqdm


In [2]:
from pathlib import Path
import json, math
import numpy as np
import pandas as pd
from tqdm import tqdm

from sentence_transformers import SentenceTransformer
import torch
import faiss

# All locations are relative to the directory the notebook is run from (the
# project root that contains `artifacts/`), so the notebook is not tied to one
# user's home directory.
ARTIFACTS_DIR = Path("artifacts")

# 🔧 EDIT this if your CSV is elsewhere
# (the file name is kept exactly as it is spelled on disk)
CSV_PATH = ARTIFACTS_DIR / "ucf_llm_featrures_2.csv"

# Where to save the vector index + metadata
OUT_DIR = ARTIFACTS_DIR / "index"
OUT_DIR.mkdir(parents=True, exist_ok=True)
FAISS_PATH    = OUT_DIR / "faiss.index"
META_PATH     = OUT_DIR / "meta.jsonl"
MANIFEST_PATH = OUT_DIR / "manifest.json"

# Print the resolved locations so a wrong working directory is obvious at a glance
print("CSV:", CSV_PATH.resolve())
print("Out:", OUT_DIR.resolve())


  from .autonotebook import tqdm as notebook_tqdm


CSV: /Users/dihan_ahmed/DRIVE_1/UTS/4th semester/ilab/artifacts/ucf_llm_featrures_2.csv
Out: /Users/dihan_ahmed/DRIVE_1/UTS/4th semester/ilab/artifacts/index


In [3]:
# Load the extracted-features CSV
assert CSV_PATH.exists(), f"CSV not found at {CSV_PATH}"
df = pd.read_csv(CSV_PATH)

# Free-text columns: missing values become "" and surrounding whitespace is
# trimmed; a column that is absent from the CSV is created as all-empty.
for col in ("attributes", "actions", "interactions", "sentence"):
    if col not in df.columns:
        df[col] = ""
        continue
    df[col] = df[col].fillna("").astype(str).str.strip()

# Identifier and timing columns get explicit dtypes; times that cannot be
# parsed as numbers are coerced to NaN.
df["video"] = df["video"].astype(str)
df["scene_idx"] = df["scene_idx"].astype(int)
for time_col in ("t_start", "t_end"):
    df[time_col] = pd.to_numeric(df[time_col], errors="coerce")

print("Rows:", len(df))
df.head(3)


Rows: 230


Unnamed: 0,video,scene_idx,t_start,t_end,sentence,attributes,actions,interactions
0,Abuse001_x264,0,0.0,5.3,"A woman with short hair, slightly fat, wearing...",female;short hair;slightly fat,picking up a book;opening it to read,table
1,Abuse001_x264,1,7.0,8.5,A man wearing a white shirt and black pants en...,male;white shirt;black pants,entered the house;walked towards,short-haired and fat woman
2,Abuse001_x264,2,7.2,8.5,A man wearing a black shirt and black pants en...,male;black shirt;black pants,entered the house;walked towards,short-haired and fat woman


In [5]:
# Build the interaction strings (the text we'll embed)
def make_interaction_string(row) -> str:
    """Flatten one feature row into a single labelled string for embedding.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by the keys
            "attributes", "actions", "interactions" and "sentence".

    Returns:
        The non-empty fields rendered as "label: value" and joined with
        " | ", always in the order attributes, actions, interacts_with,
        sentence. Returns "" when every field is empty.
    """
    # (label shown in the output, key read from the row). Gender is not a
    # separate field: it stays inside the attributes value.
    labelled_fields = (
        ("attributes", "attributes"),
        ("actions", "actions"),
        ("interacts_with", "interactions"),
        # The natural-language sentence is included as well (helps retrieval phrasing).
        ("sentence", "sentence"),
    )
    return " | ".join(
        f"{label}: {row[key]}"
        for label, key in labelled_fields
        if row[key]
    )

# One embeddable string per row
df["interaction"] = df.apply(make_interaction_string, axis=1)

# Per-row document id of the form "<video>__<scene_idx>"
df["doc_id"] = [
    f"{video}__{int(scene_idx)}"
    for video, scene_idx in zip(df["video"], df["scene_idx"])
]

# Keep only rows that actually produced some text (rare to lose any), and
# renumber so the row position matches the index again.
df = df.loc[df["interaction"].str.len().gt(0)].reset_index(drop=True)
print("Usable rows:", len(df))
df[["doc_id", "interaction"]].head(10)


Usable rows: 230


Unnamed: 0,doc_id,interaction
0,Abuse001_x264__0,attributes: female;short hair;slightly fat | a...
1,Abuse001_x264__1,attributes: male;white shirt;black pants | act...
2,Abuse001_x264__2,attributes: male;black shirt;black pants | act...
3,Abuse001_x264__3,sentence: A man wearing a white shirt and blac...
4,Abuse001_x264__4,attributes: male;black clothes | actions: punc...
5,Abuse001_x264__5,attributes: wooden;red | actions: fell;knocked...
6,Abuse001_x264__6,attributes: female;short hair;fat figure | act...
7,Abuse001_x264__7,attributes: unknown;short hair;slightly fat;wh...
8,Abuse001_x264__8,attributes: unknown;short hair;slightly fat | ...
9,Abuse002_x264__0,attributes: smooth;traffic;green light;vehicle...


In [6]:
# Load the embedding model
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # 384-dim, lightweight, good quality

# Device preference: Apple MPS first, then CUDA, otherwise CPU
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model = SentenceTransformer(EMBED_MODEL, device=device)
print("Model:", EMBED_MODEL, "| device:", device)


Model: sentence-transformers/all-MiniLM-L6-v2 | device: mps


In [7]:
# Embed the interaction strings
texts = df["interaction"].tolist()

# Batched encoding to a numpy array. Normalisation is switched off here on
# purpose and applied by hand just below.
embeddings = model.encode(
    texts,
    batch_size=256,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=False,
)

# Scale every row to unit L2 length so that an inner product equals cosine
# similarity; the small epsilon guards against division by zero.
norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-12
embeddings = np.divide(embeddings, norms)

print("Embeddings:", embeddings.shape, embeddings.dtype)


Batches: 100%|██████████| 1/1 [00:01<00:00,  1.56s/it]

Embeddings: (230, 384) float32





In [8]:
# Build and save the FAISS index
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)         # inner product
index.add(embeddings.astype("float32"))
print("FAISS ntotal:", index.ntotal)

# Vectors and metadata lines are matched purely by position, so a row-count
# mismatch (e.g. `embeddings` left over from an earlier run of the embedding
# cell) would silently mis-label every search hit. Fail loudly instead.
assert index.ntotal == len(df), (
    f"index holds {index.ntotal} vectors but df has {len(df)} rows"
)

# Save FAISS index
faiss.write_index(index, str(FAISS_PATH))

# Save row-level metadata as JSONL aligned to vector order
with META_PATH.open("w", encoding="utf-8") as f:
    # `i` is the row's position (== its vector id in the index). The DataFrame
    # index label is deliberately ignored: it only coincides with the position
    # when the frame carries a fresh RangeIndex.
    for i, (_, row) in enumerate(df.iterrows()):
        meta = {
            "idx": i,
            "doc_id": row["doc_id"],
            "video": row["video"],
            "scene_idx": int(row["scene_idx"]),
            # Missing times are written as JSON null; pd.isna also copes with
            # None, which math.isnan would reject with a TypeError.
            "t_start": None if pd.isna(row["t_start"]) else float(row["t_start"]),
            "t_end":   None if pd.isna(row["t_end"])   else float(row["t_end"]),
            "sentence": row["sentence"],
            "attributes": row["attributes"],
            "actions": row["actions"],
            "interactions": row["interactions"],
            "interaction_string": row["interaction"],
        }
        f.write(json.dumps(meta, ensure_ascii=False) + "\n")

# Write a small manifest for reproducibility
manifest = {
    "embedding_model": EMBED_MODEL,
    "embedding_dim": dim,
    "index_type": "IndexFlatIP (cosine via normalized vectors)",
    "vectors": int(index.ntotal),
    "sources": {
        "csv": str(CSV_PATH.resolve()),
    },
    "files": {
        "faiss_index": str(FAISS_PATH.resolve()),
        "metadata_jsonl": str(META_PATH.resolve()),
    }
}
MANIFEST_PATH.write_text(json.dumps(manifest, indent=2), encoding="utf-8")

print("✅ Saved:")
print(" -", FAISS_PATH)
print(" -", META_PATH)
print(" -", MANIFEST_PATH)


FAISS ntotal: 230
✅ Saved:
 - artifacts/index/faiss.index
 - artifacts/index/meta.jsonl
 - artifacts/index/manifest.json


In [9]:
# Lightweight search function (for quick local testing)

# Utility: load FAISS + metadata back
def load_index_and_meta():
    """Read the saved FAISS index and its metadata back from disk.

    Returns:
        A ``(index, metas)`` tuple: the index read from ``FAISS_PATH`` and a
        list with one decoded JSON object per line of ``META_PATH``, in file
        order.
    """
    loaded_index = faiss.read_index(str(FAISS_PATH))
    with META_PATH.open("r", encoding="utf-8") as meta_file:
        records = [json.loads(raw_line) for raw_line in meta_file]
    return loaded_index, records

# Rebind `index` to the copy read back from disk (and load `metas`), so the
# search below runs against the saved files.
index, metas = load_index_and_meta()

def search(query: str, top_k: int = 5):
    """Return the indexed scenes most similar to a free-text query.

    The query is embedded with the module-level ``model``, scaled to unit L2
    length, and searched against the module-level ``index``. Each hit id is
    resolved through the module-level ``metas`` list by position.

    Args:
        query: Free-text description to look for.
        top_k: Maximum number of hits to return. Values larger than the
            number of indexed vectors are clamped to that number.

    Returns:
        A list of dicts in FAISS result order, each with the keys ``rank``
        (1-based), ``score`` (the value returned by ``index.search``),
        ``video``, ``time`` (``[t_start, t_end]``), ``sentence``,
        ``attributes``, ``actions`` and ``interactions``. The list is empty
        when ``top_k <= 0`` or the index holds no vectors.
    """
    # Nothing requested: answer without touching the model or the index
    # (a non-positive k is not a meaningful FAISS request).
    if top_k <= 0:
        return []

    # Never ask for more neighbours than there are vectors; otherwise the
    # result is padded with id -1 entries that have to be thrown away.
    k = min(top_k, index.ntotal)
    if k == 0:
        return []

    # Embed the query and normalise it to unit length
    qv = model.encode([query], normalize_embeddings=False, convert_to_numpy=True)
    qv = qv / (np.linalg.norm(qv, axis=1, keepdims=True) + 1e-12)

    # Single query, so only the first row of scores / ids is of interest
    D, I = index.search(qv.astype("float32"), k)
    results = []
    for rank, (i, score) in enumerate(zip(I[0], D[0]), start=1):
        # Defensive: -1 marks "no neighbour found" in FAISS results
        if int(i) < 0:
            continue
        m = metas[int(i)]
        results.append({
            "rank": rank,
            "score": float(score),
            "video": m["video"],
            "time": [m["t_start"], m["t_end"]],
            "sentence": m["sentence"],
            "attributes": m["attributes"],
            "actions": m["actions"],
            "interactions": m["interactions"],
        })
    return results


In [10]:
# Sanity check: run a few hand-written queries and eyeball the top hits
tests = [
    "woman picks up a book and reads",
    "policeman watching and retreating",
    "pushing a wheelchair",
]

for query_text in tests:
    print("\n🔎 Query:", query_text)
    hits = search(query_text, top_k=5)
    for hit in hits:
        print(f"  {hit['rank']}. ({hit['score']:.3f}) {hit['video']}  t={hit['time']}  | {hit['sentence']}")



🔎 Query: woman picks up a book and reads
  1. (0.475) Abuse001_x264  t=[0.0, 5.3]  | A woman with short hair, slightly fat, wearing a white top and black pants stood in front of the table, picked up a book from the table, and opened it to read
  2. (0.316) Abuse008_x264  t=[176.2, 197.3]  | The female policeman held a book in her hand and bent down to ask the woman. The male policeman sat behind the woman, and the woman leaned on the male policeman’s hand.
  3. (0.297) Abuse002_x264  t=[14.3, 16.5]  | The woman in the couple picks up the child's things from the ground, and the man in the couple stands holding the child
  4. (0.267) Abuse001_x264  t=[8.9, 11.2]  | The woman fell to the ground in pain, and the book in her hand fell to the ground when she fell. At the same time, she knocked the red wooden table in front of her crookedly. There were three things on the table.
  5. (0.215) Abuse008_x264  t=[145.3, 149.6]  | The police pushed the woman hard into the prison. The woman fell t