In [1]:
!pip install -q spacy pandas tqdm
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m711.8 kB/s[0m  [33m0:00:17[0m0:00:01[0m00:01[0m
[?25hInstalling collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
import json, re
from pathlib import Path
from collections import OrderedDict
from typing import List, Dict, Any

import spacy
import pandas as pd
from tqdm import tqdm


In [3]:
# ======= EDIT THIS to your actual path =======
UCF_PATH = "/Users/dihan_ahmed/DRIVE_1/UTS/4th semester/ilab/UCFCrime_Train.json"

# Outputs
OUT_DIR = Path("artifacts/ner"); OUT_DIR.mkdir(parents=True, exist_ok=True)
JSON_OUT = OUT_DIR / "ucf_ner_extracted.json"
CSV_OUT  = OUT_DIR / "ucf_ner_features.csv"

LIMIT_VIDEOS = 120  # first 120 videos in dataset order

print("Dataset:", UCF_PATH)
print("Will write:")
print(" - JSON:", JSON_OUT)
print(" -  CSV:", CSV_OUT)


Dataset: /Users/dihan_ahmed/DRIVE_1/UTS/4th semester/ilab/UCFCrime_Train.json
Will write:
 - JSON: artifacts/ner/ucf_ner_extracted.json
 -  CSV: artifacts/ner/ucf_ner_features.csv


In [None]:
# load dataset and take 120 videos

def load_ucf_in_order(path: str) -> OrderedDict:
    """
    Load JSON while preserving key order (first 120 == first 120 in file).
    """
    text = Path(path).read_text(encoding="utf-8")
    return json.loads(text, object_pairs_hook=OrderedDict)

ucf = load_ucf_in_order(UCF_PATH)
video_items = list(ucf.items())[:LIMIT_VIDEOS]  # [(video_id, blob), ...]

print(f"Total videos in file: {len(ucf)}")
print(f"Processing first {len(video_items)} videos (preserving input order).")
print("First 3 video ids:", [vid for vid, _ in video_items[:3]])


Total videos in file: 1165
Processing first 120 videos (preserving input order).
First 3 video ids: ['Abuse001_x264', 'Abuse002_x264', 'Abuse003_x264']


In [5]:

# Load spaCy & small lexicons
# Small, fast English pipeline (NER + POS + parser)
nlp = spacy.load("en_core_web_sm")

# Domain lexicons (can extend as needed)
COLOR_WORDS = {
    "white","black","red","green","blue","yellow","pink","purple","orange","grey","gray","brown","beige","navy"
}
CLOTHING_WORDS = {
    "shirt","t-shirt","tee","top","jacket","coat","hoodie","sweater","jumper","dress","skirt","pants","trousers",
    "jeans","shorts","shoes","sneakers","boots","sandals","cap","hat","scarf","gloves","belt","bag","backpack"
}
PERSON_WORDS = {
    "man","woman","boy","girl","lady","gentleman","male","female","person","people","policeman","policewoman"
}
OBJECT_STOP_IN_ATTRS = {"table","ground","road","street","sidewalk","car","door","window","house","wall","floor"}

def dedup(seq: List[str]) -> List[str]:
    seen = OrderedDict()
    for x in seq:
        x = (x or "").strip()
        if x and x not in seen:
            seen[x] = True
    return list(seen.keys())


In [6]:
#Gender inference & helpers
def infer_gender_from_tokens(doc: spacy.tokens.Doc) -> str:
    """
    Heuristic gender from obvious lexical cues; else 'unknown'.
    """
    toks = {t.lemma_.lower() for t in doc if not t.is_space}
    if {"woman","lady","girl","policewoman","female"} & toks:
        return "female"
    if {"man","gentleman","boy","policeman","male"} & toks:
        return "male"
    # simple pronoun cues (weak heuristic)
    tset = {t.text.lower() for t in doc}
    if "she" in tset or "her" in tset:
        return "female"
    if "he" in tset or "his" in tset:
        return "male"
    return "unknown"

def is_person_token(tok: spacy.tokens.Token) -> bool:
    if tok.ent_type_ == "PERSON":
        return True
    if tok.lemma_.lower() in PERSON_WORDS:
        return True
    if tok.pos_ == "PRON" and tok.text.lower() in {"he","she","they","him","her"}:
        return True
    return False


In [7]:
#Extract attributes (with gender), actions, interactions
def extract_attributes(doc: spacy.tokens.Doc) -> List[str]:
    """
    Attributes include: gender, clothing (with color modifiers), colors, person-describing adjectives.
    Gender lives inside attributes per your schema.
    """
    attrs = []

    # Gender first
    g = infer_gender_from_tokens(doc)
    if g:
        attrs.append(g)

    # Colors anywhere
    for t in doc:
        low = t.lemma_.lower()
        if low in COLOR_WORDS:
            attrs.append(low)

    # Clothing (+ left modifiers: amod/compound like 'white shirt', 'red jacket')
    for t in doc:
        low = t.lemma_.lower()
        if low in CLOTHING_WORDS:
            mods = []
            for ch in t.lefts:
                if ch.dep_ in {"amod","compound"} or ch.lemma_.lower() in COLOR_WORDS:
                    mods.append(ch.text.lower())
            chunk = (" ".join(mods) + " " if mods else "") + t.text.lower()
            attrs.append(chunk.strip())

    # Person adjectives (e.g., 'short-haired woman' => 'short-haired')
    for t in doc:
        if is_person_token(t):
            for ch in t.lefts:
                if ch.dep_ == "amod" and ch.pos_ == "ADJ":
                    attrs.append(ch.text.lower())

    # Clean obvious scene objects from attrs
    attrs = [a for a in attrs if a not in OBJECT_STOP_IN_ATTRS]
    return dedup(attrs)


def extract_actions_interactions(doc: spacy.tokens.Doc):
    """
    Actions: verb with a person subject + optional particle + (first) prep + (first) object head
             e.g., 'pick up book', 'walk toward door'
    Interactions: list of objects linked to those actions (dobj/pobj).
    """
    actions = []
    interactions = []

    for v in doc:
        if v.pos_ != "VERB":
            continue

        # Subject must likely be a person
        subj = [c for c in v.children if c.dep_ in {"nsubj","nsubjpass"}]
        has_person_subj = any(is_person_token(s) for s in subj)
        if not has_person_subj:
            continue

        # Build action phrase = verb lemma + particle + (prep) + (object head)
        parts = [v.lemma_.lower()]

        # phrasal verb particle (e.g., 'pick up')
        prt = [c for c in v.children if c.dep_ == "prt"]
        if prt:
            parts.append(prt[0].text.lower())

        # preposition (first only: 'toward', 'to', 'from'—keeps it short)
        preps = [c.text.lower() for c in v.children if c.dep_ == "prep"]
        if preps:
            parts.append(preps[0])

        # direct object & prepositional objects
        dobj = [c for c in v.children if c.dep_ in {"dobj","obj"} and c.pos_ in {"NOUN","PROPN"}]
        pobj = []
        for p in [c for c in v.children if c.dep_ == "prep"]:
            pobj.extend([c for c in p.children if c.dep_ == "pobj"])

        obj_terms = []
        for o in dobj + pobj:
            phrase = o.text.lower()
            # include simple compounds/adjectives to tighten: "police officer", "number plate"
            left_mods = [lc.text.lower() for lc in o.lefts if lc.dep_ in {"compound","amod"}]
            if left_mods:
                phrase = " ".join(left_mods + [phrase])
            obj_terms.append(phrase)

        # Action phrase includes first object head if present (e.g., 'pick up book')
        act = " ".join(parts + ([obj_terms[0]] if obj_terms else []))
        actions.append(act.strip())

        # Interactions list = all object terms we found for this verb
        interactions.extend(obj_terms)

    return dedup(actions), dedup(interactions)


In [8]:
# One-sentence → features (attributes/actions/interactions)
def process_sentence(sentence: str) -> Dict[str, List[str]]:
    """
    Convert a raw annotation sentence into structured lists.
    Ensures gender is present in attributes ('male'/'female'/'unknown').
    """
    sentence = (sentence or "").strip()
    if not sentence:
        return {"attributes": [], "actions": [], "interactions": []}

    doc = nlp(sentence)
    attributes = extract_attributes(doc)
    actions, interactions = extract_actions_interactions(doc)

    # Ensure some gender token is present as agreed (fallback 'unknown')
    if not any(g in attributes for g in ("male","female","unknown")):
        attributes = ["unknown"] + attributes

    return {
        "attributes": attributes,
        "actions": actions,
        "interactions": interactions
    }


In [None]:
# Run NER extractor over first 120 videos → create rows
rows: List[Dict[str, Any]] = []

for vid, blob in tqdm(video_items, desc="Processing videos"):
    timestamps = blob.get("timestamps", [])
    sentences  = blob.get("sentences", [])
    for i, sent in enumerate(sentences):
        # timestamps per sentence, may be missing
        t0, t1 = (timestamps[i] if i < len(timestamps) else [None, None])

        feats = process_sentence(sent)

        rows.append({
            "video": vid,
            "scene_idx": i,
            "t_start": t0,
            "t_end": t1,
            "sentence": sent,
            # join lists into your CSV string format (gender remains inside attributes)
            "attributes": ";".join(feats["attributes"]),
            "actions": ";".join(feats["actions"]),
            "interactions": ";".join(feats["interactions"]),
            # keep raw for auditing in JSON
            "_raw": feats
        })

print("Total rows produced:", len(rows))
pd.DataFrame(rows).head(5)


Processing videos: 100%|██████████| 120/120 [00:04<00:00, 28.22it/s]

Total rows produced: 1243





Unnamed: 0,video,scene_idx,t_start,t_end,sentence,attributes,actions,interactions,_raw
0,Abuse001_x264,0,0.0,5.3,"A woman with short hair, slightly fat, wearing...",female;white;black;white top,stand in front,front,"{'attributes': ['female', 'white', 'black', 'w..."
1,Abuse001_x264,1,7.0,8.5,A man wearing a white shirt and black pants en...,female;white;black;white shirt;haired,enter house,house,"{'attributes': ['female', 'white', 'black', 'w..."
2,Abuse001_x264,2,7.2,8.5,A man wearing a black shirt and black pants en...,female;black;black shirt;haired,enter house,house,"{'attributes': ['female', 'black', 'black shir..."
3,Abuse001_x264,3,8.2,8.9,A man wearing a white shirt and black pants ap...,female;white;black;red;white shirt;haired;fat,approach haired fat woman;pay attention;pull o...,haired fat woman;attention;piece;left side,"{'attributes': ['female', 'white', 'black', 'r..."
4,Abuse001_x264,4,8.9,11.2,A man in black clothes approached a short-hair...,female;black;white;white top;haired;fat,approach haired fat woman;turn;punch in woman,haired fat woman;woman;head,"{'attributes': ['female', 'black', 'white', 'w..."


In [10]:
pd.DataFrame(rows)

Unnamed: 0,video,scene_idx,t_start,t_end,sentence,attributes,actions,interactions,_raw
0,Abuse001_x264,0,0.0,5.3,"A woman with short hair, slightly fat, wearing...",female;white;black;white top,stand in front,front,"{'attributes': ['female', 'white', 'black', 'w..."
1,Abuse001_x264,1,7.0,8.5,A man wearing a white shirt and black pants en...,female;white;black;white shirt;haired,enter house,house,"{'attributes': ['female', 'white', 'black', 'w..."
2,Abuse001_x264,2,7.2,8.5,A man wearing a black shirt and black pants en...,female;black;black shirt;haired,enter house,house,"{'attributes': ['female', 'black', 'black shir..."
3,Abuse001_x264,3,8.2,8.9,A man wearing a white shirt and black pants ap...,female;white;black;red;white shirt;haired;fat,approach haired fat woman;pay attention;pull o...,haired fat woman;attention;piece;left side,"{'attributes': ['female', 'white', 'black', 'r..."
4,Abuse001_x264,4,8.9,11.2,A man in black clothes approached a short-hair...,female;black;white;white top;haired;fat,approach haired fat woman;turn;punch in woman,haired fat woman;woman;head,"{'attributes': ['female', 'black', 'white', 'w..."
...,...,...,...,...,...,...,...,...,...
1238,Assault025_x264,0,1.2,14.4,A group of people stood on one side of the roa...,unknown,,,"{'attributes': ['unknown'], 'actions': [], 'in..."
1239,Assault025_x264,1,14.2,17.7,A man wearing a white short-sleeved shirt atta...,male;white;white sleeved shirt,attack man,man,"{'attributes': ['male', 'white', 'white sleeve..."
1240,Assault025_x264,2,18.0,20.8,Another man in dark clothes continued to beat ...,male;injured,continue,,"{'attributes': ['male', 'injured'], 'actions':..."
1241,Assault025_x264,3,20.1,25.2,"The man in white ran back to help, and then left.",male;white,run,,"{'attributes': ['male', 'white'], 'actions': [..."


In [11]:
# JSON: list of dicts (includes _raw lists)
JSON_OUT.write_text(json.dumps(rows, ensure_ascii=False, indent=2), encoding="utf-8")

# CSV: your project schema
cols = ["video","scene_idx","t_start","t_end","sentence","attributes","actions","interactions"]
df = pd.DataFrame(rows)[cols]
df.to_csv(CSV_OUT, index=False, encoding="utf-8")

print("✅ Wrote JSON:", JSON_OUT)
print("✅ Wrote CSV :", CSV_OUT)
df.head(8)


✅ Wrote JSON: artifacts/ner/ucf_ner_extracted.json
✅ Wrote CSV : artifacts/ner/ucf_ner_features.csv


Unnamed: 0,video,scene_idx,t_start,t_end,sentence,attributes,actions,interactions
0,Abuse001_x264,0,0.0,5.3,"A woman with short hair, slightly fat, wearing...",female;white;black;white top,stand in front,front
1,Abuse001_x264,1,7.0,8.5,A man wearing a white shirt and black pants en...,female;white;black;white shirt;haired,enter house,house
2,Abuse001_x264,2,7.2,8.5,A man wearing a black shirt and black pants en...,female;black;black shirt;haired,enter house,house
3,Abuse001_x264,3,8.2,8.9,A man wearing a white shirt and black pants ap...,female;white;black;red;white shirt;haired;fat,approach haired fat woman;pay attention;pull o...,haired fat woman;attention;piece;left side
4,Abuse001_x264,4,8.9,11.2,A man in black clothes approached a short-hair...,female;black;white;white top;haired;fat,approach haired fat woman;turn;punch in woman,haired fat woman;woman;head
5,Abuse001_x264,5,8.9,11.2,"The woman fell to the ground in pain, and the ...",female;red,fall to ground;fall;knock at red wooden table,ground;pain;red wooden table;same time;front
6,Abuse001_x264,6,11.3,13.3,A woman with short hair and a fat figure weari...,female;white;black;white top,fall to ground,ground
7,Abuse001_x264,7,15.2,18.9,"A woman with short hair, slightly fat, wearing...",female;white;black;white top,fall to ground;touch with forehead;retract lef...,ground;forehead;right hand;left leg
