In [15]:

!pip install -q spacy pandas tqdm

!pip install -q spacy-transformers
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_trf


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m16.0 MB/s[0m  [33m0:00:00[0mm0:00:01[0m00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Collecting en-core-web-trf==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m46.3 MB/s[0m  [33m0:00:11[0m:00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')


In [None]:
import json, re
from pathlib import Path
from collections import OrderedDict
from typing import List, Dict, Any, Tuple

import spacy
from spacy.pipeline import EntityRuler
from spacy.matcher import Matcher
import pandas as pd
from tqdm import tqdm

# ====== EDIT this to your JSON path ======
UCF_PATH = "/Users/dihan_ahmed/DRIVE_1/UTS/4th semester/ilab/UCFCrime_Train.json"

# Outputs (keep separate from your earlier run)
OUT_DIR = Path("artifacts/ner_plus"); OUT_DIR.mkdir(parents=True, exist_ok=True)
JSON_OUT = OUT_DIR / "ucf_ner_plus_extracted.json"
CSV_OUT  = OUT_DIR / "ucf_ner_plus_features.csv"

# LLM CSV (optional: if present, a tiny eval will run at the end)
LLM_CSV = Path("ucf_llm_featrures_2.csv")

# First 120 videos in file order
LIMIT_VIDEOS = 120

print("Dataset:", UCF_PATH)
print("Outputs:", JSON_OUT, CSV_OUT, sep="\n")


Dataset: /Users/dihan_ahmed/DRIVE_1/UTS/4th semester/ilab/UCFCrime_Train.json
Outputs:
artifacts/ner_plus/ucf_ner_plus_extracted.json
artifacts/ner_plus/ucf_ner_plus_features.csv


In [17]:
def load_ucf_in_order(path: str) -> "OrderedDict[str, dict]":
    """Load the UCFCrime JSON, preserving key order as written in file."""
    text = Path(path).read_text(encoding="utf-8")
    return json.loads(text, object_pairs_hook=OrderedDict)

ucf = load_ucf_in_order(UCF_PATH)
video_items = list(ucf.items())[:LIMIT_VIDEOS]
print(f"Total videos in file: {len(ucf)}")
print(f"Processing first {len(video_items)} videos (preserving input order).")
print("First 3 video ids:", [vid for vid, _ in video_items[:3]])


Total videos in file: 1165
Processing first 120 videos (preserving input order).
First 3 video ids: ['Abuse001_x264', 'Abuse002_x264', 'Abuse003_x264']


In [18]:
# Toggle: set to True to use the transformer model for better accuracy (slower)
USE_TRANSFORMER = False

nlp = spacy.load("en_core_web_trf" if USE_TRANSFORMER else "en_core_web_sm")

# -------- Domain lexicons (you can extend freely) ----------
COLOR_WORDS = {
    "white","black","red","green","blue","yellow","pink","purple","orange","grey","gray",
    "brown","beige","navy","silver"
}
CLOTHING_WORDS = {
    "shirt","t-shirt","tee","top","jacket","coat","hoodie","sweater","jumper","dress","skirt",
    "pants","trousers","jeans","shorts","shoes","sneakers","boots","sandals",
    "cap","hat","scarf","gloves","belt","bag","backpack","sleeves","short-sleeves","long-sleeves"
}
PPE_WORDS = {"helmet","vest","reflective","hi-vis","high-vis"}
VEHICLE_WORDS = {"car","van","truck","bus","motorbike","motorcycle","bicycle","bike","scooter"}

PERSON_WORDS = {
    "man","woman","boy","girl","lady","gentleman","male","female","person","people",
    "policeman","policewoman","old man","old woman","young man","young woman"
}

# Scene objects we don't want as attributes
OBJECT_STOP_IN_ATTRS = {"table","ground","road","street","sidewalk","door","window","house","wall","floor"}

# Synonym/normalization map
SYN_MAP = {
    "t shirt":"t-shirt","tee shirt":"t-shirt","tee":"t-shirt","t- shirt":"t-shirt",
    "grey":"gray","hi vis":"hi-vis","high vis":"hi-vis",
    "police man":"policeman","police woman":"policewoman",
    "short sleeves":"short-sleeves","long sleeves":"long-sleeves"
}

def norm_text(s: str) -> str:
    s = s.lower().strip()
    s = re.sub(r"\s+", " ", s)
    s = SYN_MAP.get(s, s)
    return s

def dedup(seq: List[str]) -> List[str]:
    seen = OrderedDict()
    for x in seq:
        x = (x or "").strip()
        if x and x not in seen:
            seen[x] = True
    return list(seen.keys())

# --------- EntityRuler to boost domain coverage ---------
ruler = nlp.add_pipe("entity_ruler", before="ner")

patterns = []
# Colors
for w in COLOR_WORDS:
    patterns.append({"label": "COLOR", "pattern": w})
# PPE
for w in PPE_WORDS:
    patterns.append({"label": "PPE", "pattern": w})
# Vehicles
for w in VEHICLE_WORDS:
    patterns.append({"label": "VEHICLE", "pattern": w})
# Clothing (single words + common 2-token variants like 't shirt', 'short sleeves')
for w in CLOTHING_WORDS:
    if " " in w:
        patterns.append({"label": "CLOTHING", "pattern": w})
    else:
        patterns.append({"label": "CLOTHING", "pattern": w})
# Extra patterns for t-shirt variants
patterns += [
    {"label": "CLOTHING", "pattern": [{"LOWER": {"IN": ["t","t-","tee","tshirt","t-shirt"]}}, {"LOWER": {"IN": ["shirt","top"]}}]},
    {"label": "CLOTHING", "pattern": [{"LOWER": "short"}, {"LOWER": {"IN": ["sleeve","sleeves"]}}]},
    {"label": "CLOTHING", "pattern": [{"LOWER": "long"}, {"LOWER": {"IN": ["sleeve","sleeves"]}}]},
]
ruler.add_patterns(patterns)
print("EntityRuler patterns:", len(patterns))


EntityRuler patterns: 61


In [20]:
matcher = Matcher(nlp.vocab)

# Phrasal verb pattern: VERB + PART (e.g., pick up, get out, fall down, lean over)
matcher.add("PHRASAL_VERB", [[{"POS":"VERB"},{"POS":"PART"}]])

# Common surveillance verbs (lemma-based) for extra sensitivity
COMMON_VERBS = [
    "walk","run","approach","enter","exit","push","pull","pick","hold","carry","open","close",
    "drive","ride","talk","stand","sit","turn","fall","throw","lift","lean","watch","bump","knock","punch",
    "kick","grab","drop","kneel","bend","gesture","move"
]
for v in COMMON_VERBS:
    matcher.add(f"VERB_{v.upper()}", [[{"LEMMA": v}]])


In [21]:
def infer_gender_from_tokens(doc: spacy.tokens.Doc) -> str:
    toks = {t.lemma_.lower() for t in doc if not t.is_space}
    if {"woman","lady","girl","policewoman","female"} & toks:
        return "female"
    if {"man","gentleman","boy","policeman","male"} & toks:
        return "male"
    tset = {t.text.lower() for t in doc}
    if "she" in tset or "her" in tset:
        return "female"
    if "he" in tset or "his" in tset:
        return "male"
    return "unknown"

def is_person_token(tok: spacy.tokens.Token) -> bool:
    low = tok.lemma_.lower()
    if tok.ent_type_ == "PERSON":
        return True
    if low in PERSON_WORDS:
        return True
    if tok.pos_ == "PRON" and tok.text.lower() in {"he","she","they","him","her"}:
        return True
    return False

def keep_reasonable(items: List[str], max_len: int = 48) -> List[str]:
    out = []
    for x in items:
        x = norm_text(x)
        if 0 < len(x) <= max_len and re.search(r"[a-zA-Z]", x):
            out.append(x)
    return dedup(out)


In [22]:
def extract_attributes(doc: spacy.tokens.Doc) -> List[str]:
    attrs = []

    # 1) Gender (per your schema it lives inside attributes)
    g = infer_gender_from_tokens(doc)
    if g: attrs.append(g)

    # 2) From entities: colors, clothing, PPE
    for ent in doc.ents:
        if ent.label_ in {"COLOR","CLOTHING","PPE"}:
            attrs.append(norm_text(ent.text))

    # 3) Person adjectives (e.g., 'short-haired woman' -> 'short-haired')
    for t in doc:
        if is_person_token(t):
            for ch in t.lefts:
                if ch.dep_ == "amod" and ch.pos_ == "ADJ":
                    attrs.append(norm_text(ch.text))

    # 4) Lexical fallback: color/clothing tokens (with compound/adjective modifiers)
    for t in doc:
        low = t.lemma_.lower()
        if low in COLOR_WORDS or low in CLOTHING_WORDS:
            chunk = t.text
            mods = [m.text for m in t.lefts if m.dep_ in {"amod","compound"} or m.lemma_.lower() in COLOR_WORDS]
            if mods:
                chunk = " ".join([*mods, chunk])
            attrs.append(norm_text(chunk))

    # 5) Filter junk
    attrs = [a for a in attrs if a not in OBJECT_STOP_IN_ATTRS]
    return keep_reasonable(attrs)


def extract_actions_interactions(doc: spacy.tokens.Doc) -> Tuple[List[str], List[str]]:
    actions, interactions = [], []

    # Optional: not used further here but can be leveraged for debugging/flags
    _matches = matcher(doc)

    for v in doc:
        if v.pos_ != "VERB":
            continue

        # Person subject required
        subj = [c for c in v.children if c.dep_ in {"nsubj","nsubjpass"}]
        if not any(is_person_token(s) for s in subj):
            continue

        # Build action phrase: lemma + particle + (first prep) + (first object head)
        parts = [v.lemma_.lower()]

        # Phrasal particle (pick up, get out, fall down, lean over)
        prt = [c for c in v.children if c.dep_ == "prt"]
        if prt:
            parts.append(prt[0].text.lower())

        # First preposition to compactly capture direction (e.g., 'toward', 'into')
        prep_child = next((c for c in v.children if c.dep_ == "prep"), None)
        if prep_child:
            parts.append(prep_child.text.lower())

        # Objects: direct + prepositional
        objs = [c for c in v.children if c.dep_ in {"dobj","obj"} and c.pos_ in {"NOUN","PROPN"}]
        if prep_child:
            objs += [c for c in prep_child.children if c.dep_ == "pobj"]

        obj_terms = []
        for o in objs:
            phrase = o.text.lower()
            mods = [lc.text.lower() for lc in o.lefts if lc.dep_ in {"compound","amod"}]
            if mods:
                phrase = " ".join(mods + [phrase])
            obj_terms.append(norm_text(phrase))

        # Action phrase includes first object head when present
        act = " ".join(parts + ([obj_terms[0]] if obj_terms else []))
        actions.append(act.strip())

        # Interactions = all object terms for this verb
        interactions.extend(obj_terms)

    actions = keep_reasonable([a for a in actions if a and a != "be"])
    interactions = keep_reasonable([x for x in interactions if x and x not in OBJECT_STOP_IN_ATTRS])
    return actions, interactions


In [23]:
def process_sentence(sentence: str) -> Dict[str, List[str]]:
    sentence = (sentence or "").strip()
    if not sentence:
        return {"attributes": [], "actions": [], "interactions": []}

    doc = nlp(sentence)
    attributes = extract_attributes(doc)
    actions, interactions = extract_actions_interactions(doc)

    # Ensure a gender token exists (fallback 'unknown')
    if not any(g in attributes for g in ("male","female","unknown")):
        attributes = ["unknown"] + attributes

    return {
        "attributes": dedup(attributes),
        "actions": dedup(actions),
        "interactions": dedup(interactions)
    }


In [24]:
rows: List[Dict[str, Any]] = []

for vid, blob in tqdm(video_items, desc="Processing videos"):
    timestamps = blob.get("timestamps", [])
    sentences  = blob.get("sentences", [])
    for i, sent in enumerate(sentences):
        t0, t1 = (timestamps[i] if i < len(timestamps) else [None, None])
        feats = process_sentence(sent)

        rows.append({
            "video": vid,
            "scene_idx": i,
            "t_start": t0,
            "t_end": t1,
            "sentence": sent,
            # join lists for CSV (gender stays inside attributes)
            "attributes": ";".join(feats["attributes"]),
            "actions": ";".join(feats["actions"]),
            "interactions": ";".join(feats["interactions"]),
            # keep raw for JSON auditing
            "_raw": feats
        })

print("Total rows produced:", len(rows))
pd.DataFrame(rows).head(8)


Processing videos: 100%|██████████| 120/120 [00:04<00:00, 27.38it/s]

Total rows produced: 1243





Unnamed: 0,video,scene_idx,t_start,t_end,sentence,attributes,actions,interactions,_raw
0,Abuse001_x264,0,0.0,5.3,"A woman with short hair, slightly fat, wearing...",female;white;top;black;pants;white top,stand in front,front,"{'attributes': ['female', 'white', 'top', 'bla..."
1,Abuse001_x264,1,7.0,8.5,A man wearing a white shirt and black pants en...,female;white;shirt;black;pants;haired;white shirt,enter house,,"{'attributes': ['female', 'white', 'shirt', 'b..."
2,Abuse001_x264,2,7.2,8.5,A man wearing a black shirt and black pants en...,female;black;shirt;pants;haired;black shirt,enter house,,"{'attributes': ['female', 'black', 'shirt', 'p..."
3,Abuse001_x264,3,8.2,8.9,A man wearing a white shirt and black pants ap...,female;white;shirt;black;pants;red;haired;fat;...,approach haired fat woman;pay attention;pull o...,haired fat woman;attention;piece;left side,"{'attributes': ['female', 'white', 'shirt', 'b..."
4,Abuse001_x264,4,8.9,11.2,A man in black clothes approached a short-hair...,female;black;white;top;pants;haired;fat;white top,approach haired fat woman;turn;punch in woman,haired fat woman;woman;head,"{'attributes': ['female', 'black', 'white', 't..."
5,Abuse001_x264,5,8.9,11.2,"The woman fell to the ground in pain, and the ...",female;red,fall to ground;fall;knock at red wooden table,red wooden table;same time,"{'attributes': ['female', 'red'], 'actions': [..."
6,Abuse001_x264,6,11.3,13.3,A woman with short hair and a fat figure weari...,female;white;top;black;pants;white top,fall to ground,,"{'attributes': ['female', 'white', 'top', 'bla..."
7,Abuse001_x264,7,15.2,18.9,"A woman with short hair, slightly fat, wearing...",female;white;top;black;pants;white top,fall to ground;touch with forehead;retract lef...,forehead;right hand;left leg,"{'attributes': ['female', 'white', 'top', 'bla..."


In [25]:
# JSON: list of dicts (includes raw lists)
JSON_OUT.write_text(json.dumps(rows, ensure_ascii=False, indent=2), encoding="utf-8")

# CSV: your project schema columns
cols = ["video","scene_idx","t_start","t_end","sentence","attributes","actions","interactions"]
df = pd.DataFrame(rows)[cols]
df.to_csv(CSV_OUT, index=False, encoding="utf-8")

print("✅ Wrote JSON:", JSON_OUT)
print("✅ Wrote CSV :", CSV_OUT)
df.head(10)


✅ Wrote JSON: artifacts/ner_plus/ucf_ner_plus_extracted.json
✅ Wrote CSV : artifacts/ner_plus/ucf_ner_plus_features.csv


Unnamed: 0,video,scene_idx,t_start,t_end,sentence,attributes,actions,interactions
0,Abuse001_x264,0,0.0,5.3,"A woman with short hair, slightly fat, wearing...",female;white;top;black;pants;white top,stand in front,front
1,Abuse001_x264,1,7.0,8.5,A man wearing a white shirt and black pants en...,female;white;shirt;black;pants;haired;white shirt,enter house,
2,Abuse001_x264,2,7.2,8.5,A man wearing a black shirt and black pants en...,female;black;shirt;pants;haired;black shirt,enter house,
3,Abuse001_x264,3,8.2,8.9,A man wearing a white shirt and black pants ap...,female;white;shirt;black;pants;red;haired;fat;...,approach haired fat woman;pay attention;pull o...,haired fat woman;attention;piece;left side
4,Abuse001_x264,4,8.9,11.2,A man in black clothes approached a short-hair...,female;black;white;top;pants;haired;fat;white top,approach haired fat woman;turn;punch in woman,haired fat woman;woman;head
5,Abuse001_x264,5,8.9,11.2,"The woman fell to the ground in pain, and the ...",female;red,fall to ground;fall;knock at red wooden table,red wooden table;same time
6,Abuse001_x264,6,11.3,13.3,A woman with short hair and a fat figure weari...,female;white;top;black;pants;white top,fall to ground,
7,Abuse001_x264,7,15.2,18.9,"A woman with short hair, slightly fat, wearing...",female;white;top;black;pants;white top,fall to ground;touch with forehead;retract lef...,forehead;right hand;left leg
8,Abuse001_x264,8,19.7,25.4,"A woman with short hair, slightly fat, wearing...",female;white;top;black;pants;white top,sit on ground;turn head,head
9,Abuse002_x264,0,0.0,2.3,"At an intersection with smooth traffic, the gr...",unknown;green;white,,
