In [None]:
#Pedestrian Identification (this was a trial - REID is Terrible for pedestrian tracking)
import cv2
import csv
import numpy as np
from ultralytics import YOLO
import os

# ===================== CONFIG =====================
VIDEO_PATH = '/Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial2.mp4'
OUTPUT_CSV = "/Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial28.csv"
OUTPUT_VIDEO = "/Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial28output.mp4"
FRAME_LINE = 10  # frame to use for picking zone points

# Zones
ZONES = ["C1", "C2", "C3", "C4"]
ZONE_COLORS = {"C1": (255,0,0), "C2": (0,255,0), "C3": (0,0,255), "C4": (255,255,0)}

# YOLO pedestrian class
PEDESTRIAN_CLASS = [0]
MODEL_PATH = "yolo11m-seg.pt"
WRITE_VIDEO = True

# ===================== HELPERS =====================
def draw_transparent_polygon(frame, points, color=(0, 0, 255), alpha=0.25):
    overlay = frame.copy()
    pts = np.array(points, dtype=np.int32)
    cv2.fillPoly(overlay, [pts], color)
    return cv2.addWeighted(overlay, alpha, frame, 1 - alpha, 0)

def select_zone_points(frame, zone_name, existing_zones):
    pts = []
    fc = frame.copy()
    for zname, zpts in existing_zones.items():
        fc = draw_transparent_polygon(fc, zpts, color=ZONE_COLORS.get(zname, (255,0,0)), alpha=0.2)

    win = f"Select {zone_name} - click 4 points (press q to abort)"
    cv2.namedWindow(win, cv2.WINDOW_NORMAL)
    cv2.imshow(win, fc)

    def click(event, x, y, flags, param):
        if event == cv2.EVENT_LBUTTONDOWN:
            pts.append((x, y))
            cv2.circle(fc, (x, y), 5, (0, 255, 0), -1)
            cv2.imshow(win, fc)
            if len(pts) == 4:
                cv2.destroyWindow(win)

    cv2.setMouseCallback(win, click)
    while len(pts) < 4:
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    cv2.destroyWindow(win)
    return np.array(pts, dtype=np.float32)

def point_in_polygon(point, polygon):
    poly = np.array(polygon, dtype=np.int32)
    return cv2.pointPolygonTest(poly, (int(point[0]), int(point[1])), False) >= 0

def get_zone_from_point(point, zones_src):
    for zone_name, pts in zones_src.items():
        if point_in_polygon(point, pts):
            return zone_name
    return None

def frame_to_hms(frame_num, fps):
    seconds = frame_num / fps
    h = int(seconds // 3600)
    m = int((seconds % 3600) // 60)
    s = int(seconds % 60)
    return f"{h:01d}:{m:02d}:{s:02d}"

last_positions = {}  # PersonID → (x, y)

# def get_direction(prev, curr, threshold=5):
#     dx = curr[0] - prev[0]  # horizontal movement

#     # Ignore very small horizontal movements (jitter)
#     if abs(dx) < threshold:
#         return None

#     # Decide horizontal direction
#     if dx > 0:
#         return "Right"  # moving towards right of screen
#     else:
#         return "Left"   # moving towards left of screen
    

# ===================== VIDEO & ZONE SELECTION =====================
cap = cv2.VideoCapture(VIDEO_PATH)
if not cap.isOpened():
    raise RuntimeError(f"Cannot open video: {VIDEO_PATH}")

cap.set(cv2.CAP_PROP_POS_FRAMES, FRAME_LINE)
ret, frame_line = cap.read()
if not ret:
    cap.release()
    raise RuntimeError(f"Cannot read frame {FRAME_LINE}")

ZONE_SRC_PTS = {}
print("Select 4 points for each zone (C1–C4)")
for zone in ZONES:
    src_pts = select_zone_points(frame_line, zone, ZONE_SRC_PTS)
    if src_pts.shape[0] != 4:
        raise RuntimeError(f"Zone {zone} selection aborted or not enough points.")
    ZONE_SRC_PTS[zone] = src_pts
cv2.destroyAllWindows()

# Zone persistence settings
last_zones = {}       # personID → last known zone
outside_count = {}    # personID → consecutive frames outside
OUTSIDE_GRACE = 30     # allow this many frames of "None" before clearing zone

# ===================== INITIALIZE YOLO =====================
model = YOLO(MODEL_PATH)

frame_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS) or 30.0

if WRITE_VIDEO:
    cap.release()
    cap = cv2.VideoCapture(VIDEO_PATH)
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    vid_out = cv2.VideoWriter(OUTPUT_VIDEO, fourcc, fps, (frame_w, frame_h))

# ===================== TRACKING & CSV LOGGING =====================
all_rows = []
frame_num = 0
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frame_num += 1
        timestamp_text = frame_to_hms(frame_num, fps)

        results = model.track(frame, persist=True, classes=PEDESTRIAN_CLASS, verbose=False, tracker = '/Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/bytetrack_test.yaml')

        # Draw zones
        for zname, pts in ZONE_SRC_PTS.items():
            frame = draw_transparent_polygon(frame, pts, color=ZONE_COLORS.get(zname, (200,200,200)), alpha=0.12)

        if results and len(results) > 0 and results[0].boxes is not None:
            boxes = results[0].boxes
            try:
                ids = boxes.id.int().cpu().tolist() if boxes.id is not None else [None]*len(boxes)
            except:
                ids = [None]*len(boxes)
            xyxy = boxes.xyxy.cpu().numpy()
            try:
                classes = boxes.cls.int().cpu().tolist()
            except:
                classes = [None]*len(boxes)

            for idx, box in enumerate(xyxy):
                pid = ids[idx] or f"det_{frame_num}_{idx}"
                cls = classes[idx] if idx < len(classes) else None
                if cls not in PEDESTRIAN_CLASS:
                    continue

                x1, y1, x2, y2 = box
                cx, cy = int((x1 + x2)/2), int(y2)
                pixel_pos = (cx, cy)

                # Determine zone with persistence
                zone = get_zone_from_point(pixel_pos, ZONE_SRC_PTS)

                if zone is None and pid in last_zones:
                    # If recently inside a zone, keep it until grace expires
                    if outside_count.get(pid, 0) < OUTSIDE_GRACE:
                        zone = last_zones[pid]
                        outside_count[pid] = outside_count.get(pid, 0) + 1
                    else:
                        zone = None
                else:
                    outside_count[pid] = 0  # reset if detected inside

                if zone is not None:
                    last_zones[pid] = zone
                
                # direction = None
                # if pid in last_positions:
                #     prev_pos = last_positions[pid]
                #     direction = get_direction(prev_pos, pixel_pos)

                # last_positions[pid] = pixel_pos  # update for next frame

                # Append row (only once, with direction)
                all_rows.append([timestamp_text, pid, frame_num, cx, cy, zone])

                # Draw detection + ID
                cv2.circle(frame, pixel_pos, 4, (0, 255, 0), -1)
                cv2.putText(frame, f"{pid}", (cx, max(15, cy-10)), 
                            cv2.FONT_HERSHEY_SIMPLEX, 0.45, (0,255,0),1)
                # cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), (255,0,0), 2)
                # Draw timestamp on video
                cv2.putText(frame, f"Time: {timestamp_text}", (10, frame_h - 15),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2, cv2.LINE_AA)


        if WRITE_VIDEO:
            vid_out.write(frame)
        cv2.imshow("Tracking", frame)
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break

finally:
    cap.release()
    if WRITE_VIDEO:
        vid_out.release()
    cv2.destroyAllWindows()

# ===================== SAVE CSV =====================
os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)
with open(OUTPUT_CSV, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Timestamp", "PersonID", "Frame", "PixelX", "PixelY", "Zone"])
    writer.writerows(all_rows)

print(f"Tracking CSV saved to {OUTPUT_CSV}")
print(f"Video with detections saved to {OUTPUT_VIDEO}")


Select 4 points for each zone (C1–C4)
Tracking CSV saved to /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial28.csv
Video with detections saved to /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial28output.mp4


In [1]:
#arrival time and arrival zone per ID - good
import pandas as pd
import os

# ===================== CONFIG =====================
INPUT_CSV = "/Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/5AM.csv"
OUTPUT_CSV = INPUT_CSV.replace(".csv", "-arrivaltimezone.csv")

# ===================== LOAD DATA =====================
df = pd.read_csv(INPUT_CSV)

# ===================== CONVERT NUMERIC IDS =====================
# Attempt to convert numeric IDs; keep others as NaN temporarily
df["PersonID_numeric"] = pd.to_numeric(df["PersonID"], errors="coerce")

# Find the maximum existing numeric ID
max_id = df["PersonID_numeric"].max()
if pd.isna(max_id):
    max_id = 0
max_id = int(max_id)

# Map non-numeric IDs to new numeric IDs
new_ids = {}  # store mapping of old -> new numeric ID
counter = max_id + 1

for i, row in df.iterrows():
    if pd.isna(row["PersonID_numeric"]):
        old_id = row["PersonID"]
        if old_id not in new_ids:
            new_ids[old_id] = counter
            counter += 1
        df.at[i, "PersonID_numeric"] = new_ids[old_id]

# Replace PersonID with numeric version
df["PersonID"] = df["PersonID_numeric"].astype(int)
df = df.drop(columns=["PersonID_numeric"])

# ===================== ARRIVAL INFO =====================
df = df.sort_values(["PersonID", "Frame"]).reset_index(drop=True)
arrival_info = {}
# direction_info = {}

for pid, group in df.groupby("PersonID"):
    first_row = group.iloc[0]
    last_row = group.iloc[-1]

    # Arrival info
    arrival_info[pid] = {
        "ArrivalTime": first_row["Timestamp"],
        "ArrivalZone": first_row["Zone"]
    }

    # # Direction based on first vs last PixelX
    # if last_row["PixelX"] > first_row["PixelX"]:
    #     direction = "Right"
    # elif last_row["PixelX"] < first_row["PixelX"]:
    #     direction = "Left"
    # else:
    #     direction = "Stationary"

    # direction_info[pid] = direction ###--------------- move to later passes

# Map arrival info back to all rows
df["ArrivalTime"] = df["PersonID"].map(lambda x: arrival_info[x]["ArrivalTime"])
df["ArrivalZone"] = df["PersonID"].map(lambda x: arrival_info[x]["ArrivalZone"])
# df["Direction"] = df["PersonID"].map(lambda x: direction_info[x])

# # ===================== DEPARTURE INFO ===================== add to later passes it works yay
# departure_info = {}

# for pid, group in df.groupby("PersonID"):
#     group = group.sort_values("Frame")
#     prev_zone = group.iloc[0]["Zone"]
#     departure_time = None
#     departure_zone = None

#     for i in range(1, len(group)):
#         current_zone = group.iloc[i]["Zone"]
#         if current_zone != prev_zone:
#             departure_time = group.iloc[i]["Timestamp"]
#             departure_zone = prev_zone
#             break  # stop at the first departure
#         prev_zone = current_zone

#     departure_info[pid] = {
#         "DepartureTime": departure_time,
#         "DepartureZone": departure_zone
#     }

# df["DepartureTime"] = df["PersonID"].map(lambda x: departure_info[x]["DepartureTime"])
# df["DepartureZone"] = df["PersonID"].map(lambda x: departure_info[x]["DepartureZone"])

# ===================== DEDUPLICATE STATIONARY ROWS =====================
# Remove consecutive rows for the same PersonID with same PixelX and PixelY
df = df.sort_values(["PersonID", "Frame"]).reset_index(drop=True)

before_rows = len(df)

mask = (df["PersonID"] != df["PersonID"].shift()) | \
       (df["PixelX"] != df["PixelX"].shift()) | \
       (df["PixelY"] != df["PixelY"].shift())

df = df[mask].reset_index(drop=True)

after_rows = len(df)
removed_rows = before_rows - after_rows
print(f"Deduplication removed {removed_rows} rows.")

df["PersonID"] = pd.to_numeric(df["PersonID"], errors="coerce").astype(int)

# ===================== REMOVE SINGLE-FRAME (FLICKER) DETECTIONS =====================
frame_counts = df.groupby("PersonID")["Frame"].count()
valid_pids = frame_counts[frame_counts > 1].index  # only keep PersonIDs with more than 1 row

df = df[df["PersonID"].isin(valid_pids)].reset_index(drop=True)

after_rows = len(df)
removed_rows = before_rows - after_rows
print(f"Deduplication removed {before_rows - after_rows} rows in total (including flickers).")

# Ensure PersonID is integer
df["PersonID"] = pd.to_numeric(df["PersonID"], errors="coerce").astype(int)

# ===================== SAVE =====================
os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)
df.to_csv(OUTPUT_CSV, index=False)

# Save mapping of non-numeric IDs (if any)
if new_ids:
    mapping_df = pd.DataFrame(list(new_ids.items()), columns=["OldID", "NewID"])
    print("Mapping of non-numeric IDs:")
    print(mapping_df)

print(f"CSV with arrival info and deduplication saved to {OUTPUT_CSV}")
print(df.head(10))

Deduplication removed 262730 rows.
Deduplication removed 280626 rows in total (including flickers).
Mapping of non-numeric IDs:
              OldID  NewID
0        det_1848_0   3491
1        det_1851_0   3492
2        det_1853_0   3493
3        det_1855_0   3494
4        det_1856_0   3495
...             ...    ...
17879  det_225974_1  21370
17880  det_225974_2  21371
17881  det_225974_3  21372
17882  det_225975_0  21373
17883  det_225975_1  21374

[17884 rows x 2 columns]
CSV with arrival info and deduplication saved to /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/5AM-arrivaltimezone.csv
  Timestamp  PersonID  Frame  PixelX  PixelY Zone ArrivalTime ArrivalZone
0   0:00:00         1      1     993     402   C2     0:00:00          C2
1   0:00:01         1     92     992     402   C2     0:00:00          C2
2   0:00:01         1     93     993     402   C2     0:00:00          C2
3   0:00:01         1    106     992     402 

In [None]:
# #post porcessing unification
# import pandas as pd
# import os
# import math

# # ===================== CONFIG =====================
# INPUT_CSV = "/Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial23-arrivaltimezone.csv"
# OUTPUT_CSV = INPUT_CSV.replace(".csv", "-firstlastrow.csv")

# # ===================== LOAD =====================
# df = pd.read_csv(INPUT_CSV)

# # Sort by PersonID then Frame
# df = df.sort_values(["PersonID", "Frame"]).reset_index(drop=True)
# # # Keep a mapping of final merged IDs
# # id_mapping = {}

# # # Helper function: Euclidean distance
# # def distance(p1, p2):
# #     return math.sqrt((p1[0]-p2[0])**2 + (p1[1]-p2[1])**2)

# # # =====================MERGE PROCESS =====================
# # # Store last known position of each merged ID
# # last_positions = {}

# # for pid, group in df.groupby("PersonID"):
# #     first_row = group.iloc[0]
# #     last_row = group.iloc[-1]

# #     new_id = pid
# #     first_pos = (first_row["PixelX"], first_row["PixelY"])
# #     first_frame = first_row["Frame"]

# #     best_match = None
# #     best_dist = float("inf")

# #     # Check all existing merged IDs
# #     for existing_id, (ex_frame, ex_pos) in last_positions.items():
# #         if ex_frame < first_frame:  # only look at earlier tracks
# #             d = distance(first_pos, ex_pos)
# #             if DIST_MIN <= d <= DIST_MAX and d < best_dist:
# #                 best_match = existing_id
# #                 best_dist = d

# #     if best_match is not None:
# #         # Merge: map this PersonID to existing one
# #         id_mapping[new_id] = best_match
# #         # Update last position of matched ID with this new track’s last row
# #         last_positions[best_match] = (last_row["Frame"], (last_row["PixelX"], last_row["PixelY"]))
# #     else:
# #         # Keep as new identity
# #         id_mapping[new_id] = new_id
# #         last_positions[new_id] = (last_row["Frame"], (last_row["PixelX"], last_row["PixelY"]))

# # # ===================== APPLY MERGE =====================
# # df["MergedID"] = df["PersonID"].map(id_mapping)


# # ===================== SAVE =====================
# os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)
# df.to_csv(OUTPUT_CSV, index=False)

# print(f"Post-processed merged CSV saved to {OUTPUT_CSV}")


Post-processed merged CSV saved to /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial20-postprocess-postprocess-mergedIDs.csv


In [3]:
#pass 1 : merge 1 
import pandas as pd
import networkx as nx

# ===================== CONFIG =====================
INPUT_CSV = "/Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/5AM-arrivaltimezone.csv"
FPS = 30  # frames per second (needed to convert seconds to frames)
X_THRESH = 10  # pixels
Y_THRESH = 5  # pixels
TIME_THRESH_SEC = 2  # max time difference in seconds
FRAME_THRESH = TIME_THRESH_SEC * FPS

OUTPUT_MERGED = INPUT_CSV.replace(".csv", "-merge1.csv")
OUTPUT_SUMMARY = INPUT_CSV.replace(".csv", "-merge1summary.csv")

# ===================== LOAD DATA =====================
df = pd.read_csv(INPUT_CSV, header=None, names=["Time", "PersonID", "Frame", "X", "Y", "Zone", "Direction", "ArrivalTime", "ArrivalZone"])

# Convert numeric columns
df["Frame"] = pd.to_numeric(df["Frame"], errors="coerce")
df["X"] = pd.to_numeric(df["X"], errors="coerce")
df["Y"] = pd.to_numeric(df["Y"], errors="coerce")
df["PersonID"] = df["PersonID"].astype(str)

df = df.dropna(subset=["PersonID", "Frame", "X", "Y"]) # there shouldn't be any

# ===================== FIRST & LAST OBSERVATIONS =====================
first_obs = df.groupby("PersonID").first().reset_index() # Get the first observation (row) for each PersonID
last_obs = df.groupby("PersonID").last().reset_index() # Get the last observation (row) for each PersonID

# Merge first and last for easier comparison 
first_last_df = pd.merge(first_obs, last_obs, on="PersonID", suffixes=("_first", "_last"))

# ===================== BUILD LINKS BASED ON FINAL→FIRST MATCH =====================
G = nx.Graph()

# Each PersonID is a node
for pid in first_last_df["PersonID"]:
    G.add_node(pid)

for idx_a, row_a in first_last_df.iterrows():
    for idx_b, row_b in first_last_df.iterrows():
        if row_a.PersonID == row_b.PersonID:
            continue

        # Spatial proximity: last of A to first of B
        dx = abs(row_a.X_last - row_b.X_first)
        dy = abs(row_a.Y_last - row_b.Y_first)
        frame_diff = abs(row_a.Frame_last - row_b.Frame_first)

        if dx <= X_THRESH and dy <= Y_THRESH and frame_diff <= FRAME_THRESH:
            # Link these two IDs
            G.add_edge(row_a.PersonID, row_b.PersonID)

# ===================== ASSIGN MERGEIDs =====================
components = list(nx.connected_components(G))
merge_map = {}
for merge_id, comp in enumerate(components, start=1):
    for pid in comp:
        merge_map[pid] = merge_id

# Assign MergeID back to full dataset
df["MergeID"] = df["PersonID"].map(merge_map).fillna(-1).astype(int)

# df["PersonID"] = pd.to_numeric(df["PersonID"], errors="coerce").astype(int)
# df["MergeID"] = pd.to_numeric(df["MergeID"], errors="coerce").astype(int)

# #sort by MergeID, PersonID, then Frame
# df = df.sort_values(["MergeID", "PersonID", "Frame"]).reset_index(drop=True)

# # Reorder columns so MergeID is first
# cols = ["MergeID"] + [c for c in df.columns if c != "MergeID"]
# df = df[cols]

# # ===================== SORT DATA FOR FINAL OUTPUT =====================
# # Sort by MergeID, then PersonID, then Frame
# df = df.sort_values(
#     by=["MergeID", "PersonID", "Frame"]
# ).reset_index(drop=True)

# ===================== SAVE MERGED DATA =====================
df.to_csv(OUTPUT_MERGED, index=False)

# ===================== SUMMARY PER MergeID =====================
summary_rows = []
for merge_id in sorted(df["MergeID"].unique()):
    subset = df[df["MergeID"] == merge_id].sort_values(by="Frame")
    first_row = subset.iloc[0]
    last_row = subset.iloc[-1]
    merged_persons = sorted(subset["PersonID"].unique())

    summary_rows.append({
        "MergeID": merge_id,
        "PersonIDs": merged_persons,
        "FirstTime": first_row["Time"],
        "FirstFrame": first_row["Frame"],
        "FirstX": first_row["X"],
        "FirstY": first_row["Y"],
        "FirstZone": first_row.get("Zone", ""),
        "LastTime": last_row["Time"],
        "LastFrame": last_row["Frame"],
        "LastX": last_row["X"],
        "LastY": last_row["Y"],
        "LastZone": last_row.get("Zone", "")
    })

summary_df = pd.DataFrame(summary_rows)
summary_df.to_csv(OUTPUT_SUMMARY, index=False)

print(f"Saved merged dataset → {OUTPUT_MERGED}")
print(f"Saved summary → {OUTPUT_SUMMARY}")
print(df.head(10))


  df = pd.read_csv(INPUT_CSV, header=None, names=["Time", "PersonID", "Frame", "X", "Y", "Zone", "Direction", "ArrivalTime", "ArrivalZone"])


Saved merged dataset → /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/5AM-arrivaltimezone-merge1.csv
Saved summary → /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/5AM-arrivaltimezone-merge1summary.csv
       Time PersonID  Frame      X      Y Zone Direction ArrivalTime  \
1   0:00:00        1    1.0  993.0  402.0   C2   0:00:00          C2   
2   0:00:01        1   92.0  992.0  402.0   C2   0:00:00          C2   
3   0:00:01        1   93.0  993.0  402.0   C2   0:00:00          C2   
4   0:00:01        1  106.0  992.0  402.0   C2   0:00:00          C2   
5   0:00:01        1  118.0  993.0  402.0   C2   0:00:00          C2   
6   0:00:02        1  121.0  992.0  402.0   C2   0:00:00          C2   
7   0:00:02        1  127.0  993.0  402.0   C2   0:00:00          C2   
8   0:00:02        1  132.0  992.0  402.0   C2   0:00:00          C2   
9   0:00:02        1  133.0 

In [4]:
#passs 2
# ===================== PASS 2 MERGE (Bounding Box Method) =====================
import pandas as pd
import networkx as nx

# ===================== CONFIG =====================
INPUT_CSV = "/Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/5AM-arrivaltimezone-merge1.csv"
FPS = 30  # frames per second
X_THRESH = 15  # max difference in X (pixels)
Y_THRESH = 10  # max difference in Y (pixels)
TIME_THRESH_SEC = 2  # max time difference in seconds
FRAME_THRESH = TIME_THRESH_SEC * FPS

OUTPUT_MERGED = INPUT_CSV.replace("1.csv", "2.csv")
OUTPUT_SUMMARY = INPUT_CSV.replace("1.csv", "2summary.csv")

# ===================== LOAD DATA =====================
df = pd.read_csv(INPUT_CSV)

# Ensure numeric types
df["Frame"] = pd.to_numeric(df["Frame"], errors="coerce")
df["X"] = pd.to_numeric(df["X"], errors="coerce")
df["Y"] = pd.to_numeric(df["Y"], errors="coerce")
df["MergeID"] = df["MergeID"].astype(str)

df = df.dropna(subset=["MergeID", "Frame", "X", "Y"])

# ===================== FIRST & LAST OBSERVATIONS =====================
first_obs = df.groupby("MergeID").first().reset_index()
last_obs = df.groupby("MergeID").last().reset_index()
first_last_df = pd.merge(first_obs, last_obs, on="MergeID", suffixes=("_first", "_last"))

# ===================== BUILD LINKS BASED ON FINAL→FIRST MATCH =====================
G = nx.Graph()

# Each MergeID is a node
for mid in first_last_df["MergeID"]:
    G.add_node(mid)

for idx_a, row_a in first_last_df.iterrows():
    for idx_b, row_b in first_last_df.iterrows():
        if row_a.MergeID == row_b.MergeID:
            continue

        # Bounding box proximity: last of A to first of B
        dx = abs(row_a.X_last - row_b.X_first)
        dy = abs(row_a.Y_last - row_b.Y_first)
        frame_diff = abs(row_a.Frame_last - row_b.Frame_first)

        if dx <= X_THRESH and dy <= Y_THRESH and frame_diff <= FRAME_THRESH:
            G.add_edge(row_a.MergeID, row_b.MergeID)

# ===================== ASSIGN PASS 2 MERGEIDs =====================
components = list(nx.connected_components(G))
merge_map = {}
for merge_id, comp in enumerate(components, start=1):
    for mid in comp:
        merge_map[mid] = merge_id

# Map new MergeID back to full dataset
df["MergeID2"] = df["MergeID"].map(merge_map).fillna(-1).astype(int)

# df["PersonID"] = pd.to_numeric(df["PersonID"], errors="coerce").astype(int)
# df["MergeID2"] = pd.to_numeric(df["MergeID2"], errors="coerce").astype(int)

# Sort and reorder columns
df = df.sort_values(["MergeID2", "MergeID", "Frame"]).reset_index(drop=True)

# cols = ["MergeID2"] + [c for c in df.columns if c != "MergeID2"]
# df = df[cols]

# ===================== SAVE MERGED DATA =====================
df.to_csv(OUTPUT_MERGED, index=False)

# ===================== SUMMARY PER MergeID2 =====================
summary_rows = []
for merge_id in sorted(df["MergeID2"].unique()):
    subset = df[df["MergeID2"] == merge_id].sort_values(by="Frame")
    first_row = subset.iloc[0]
    last_row = subset.iloc[-1]
    merged_persons = sorted(subset["PersonID"].unique())

    summary_rows.append({
        "MergeID2": merge_id,
        "PersonID": merged_persons,
        "FirstTime": first_row["Time"],
        "FirstFrame": first_row["Frame"],
        "FirstX": first_row["X"],
        "FirstY": first_row["Y"],
        "FirstZone": first_row.get("Zone", ""),
        "LastTime": last_row["Time"],
        "LastFrame": last_row["Frame"],
        "LastX": last_row["X"],
        "LastY": last_row["Y"],
        "LastZone": last_row.get("Zone", "")
    })

summary_df = pd.DataFrame(summary_rows)
summary_df.to_csv(OUTPUT_SUMMARY, index=False)

print(f"Saved pass 2 merged dataset → {OUTPUT_MERGED}")
print(f"Saved summary → {OUTPUT_SUMMARY}")
print(df.head(10))

Saved pass 2 merged dataset → /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/5AM-arrivaltimezone-merge2.csv
Saved summary → /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/5AM-arrivaltimezone-merge2summary.csv
      Time  PersonID  Frame      X      Y Zone Direction ArrivalTime  \
0  0:00:00         1    1.0  993.0  402.0   C2   0:00:00          C2   
1  0:00:01         1   92.0  992.0  402.0   C2   0:00:00          C2   
2  0:00:01         1   93.0  993.0  402.0   C2   0:00:00          C2   
3  0:00:01         1  106.0  992.0  402.0   C2   0:00:00          C2   
4  0:00:01         1  118.0  993.0  402.0   C2   0:00:00          C2   
5  0:00:02         1  121.0  992.0  402.0   C2   0:00:00          C2   
6  0:00:02         1  127.0  993.0  402.0   C2   0:00:00          C2   
7  0:00:02         1  132.0  992.0  402.0   C2   0:00:00          C2   
8  0:00:02         1 

In [5]:
#pass 3 : merge 3
# ===================== PASS 3 MERGE (Bounding Box Method) =====================
import pandas as pd
import networkx as nx

# ===================== CONFIG =====================
INPUT_CSV = "/Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/5AM-arrivaltimezone-merge2.csv"
FPS = 30  # frames per second
X_THRESH = 20  # max difference in X (pixels)
Y_THRESH = 15  # max difference in Y (pixels)
TIME_THRESH_SEC = 2  # max time difference in seconds
FRAME_THRESH = TIME_THRESH_SEC * FPS

OUTPUT_MERGED = INPUT_CSV.replace("2.csv", "3.csv")
OUTPUT_SUMMARY = INPUT_CSV.replace("2.csv", "3summary.csv")

# ===================== LOAD DATA =====================
df = pd.read_csv(INPUT_CSV)

# Ensure numeric types
df["Frame"] = pd.to_numeric(df["Frame"], errors="coerce")
df["X"] = pd.to_numeric(df["X"], errors="coerce")
df["Y"] = pd.to_numeric(df["Y"], errors="coerce")
df["MergeID"] = df["MergeID"].astype(str)
df["MergeID2"] = df["MergeID2"].astype(str)


df = df.dropna(subset=["MergeID2", "MergeID", "Frame", "X", "Y"])

# ===================== FIRST & LAST OBSERVATIONS =====================
first_obs = df.groupby("MergeID2").first().reset_index()
last_obs = df.groupby("MergeID2").last().reset_index()
first_last_df = pd.merge(first_obs, last_obs, on="MergeID2", suffixes=("_first", "_last"))

# ===================== BUILD LINKS BASED ON FINAL→FIRST MATCH =====================
G = nx.Graph()

# Each MergeID2 is a node
for mid in first_last_df["MergeID2"]:
    G.add_node(mid)

for idx_a, row_a in first_last_df.iterrows():
    for idx_b, row_b in first_last_df.iterrows():
        if row_a.MergeID2 == row_b.MergeID2:
            continue

        # Bounding box proximity: last of A to first of B
        dx = abs(row_a.X_last - row_b.X_first)
        dy = abs(row_a.Y_last - row_b.Y_first)
        frame_diff = abs(row_a.Frame_last - row_b.Frame_first)

        if dx <= X_THRESH and dy <= Y_THRESH and frame_diff <= FRAME_THRESH:
            G.add_edge(row_a.MergeID2, row_b.MergeID2)

# ===================== ASSIGN PASS 3 MERGEIDs =====================
components = list(nx.connected_components(G))
merge_map = {}
for merge_id, comp in enumerate(components, start=1):
    for mid in comp:
        merge_map[mid] = merge_id

# Map new MergeID back to full dataset
df["MergeID3"] = df["MergeID2"].map(merge_map).fillna(-1).astype(int)

# df["PersonID"] = pd.to_numeric(df["PersonID"], errors="coerce").astype(int)
# df["MergeID2"] = pd.to_numeric(df["MergeID2"], errors="coerce").astype(int)

# Sort and reorder columns
df = df.sort_values(["MergeID3", "MergeID2", "MergeID", "Frame"]).reset_index(drop=True)

# cols = ["MergeID2"] + [c for c in df.columns if c != "MergeID2"]
# df = df[cols]

# ===================== SAVE MERGED DATA =====================
df.to_csv(OUTPUT_MERGED, index=False)

# ===================== SUMMARY PER MergeID3 =====================
summary_rows = []
for merge_id in sorted(df["MergeID3"].unique()):
    subset = df[df["MergeID3"] == merge_id].sort_values(by="Frame")
    first_row = subset.iloc[0]
    last_row = subset.iloc[-1]
    merged_persons = sorted(subset["PersonID"].unique())

    summary_rows.append({
        "MergeID3": merge_id,
        "PersonID": merged_persons,
        "FirstTime": first_row["Time"],
        "FirstFrame": first_row["Frame"],
        "FirstX": first_row["X"],
        "FirstY": first_row["Y"],
        "FirstZone": first_row.get("Zone", ""),
        "LastTime": last_row["Time"],
        "LastFrame": last_row["Frame"],
        "LastX": last_row["X"],
        "LastY": last_row["Y"],
        "LastZone": last_row.get("Zone", "")
    })

summary_df = pd.DataFrame(summary_rows)
summary_df.to_csv(OUTPUT_SUMMARY, index=False)

print(f"Saved pass 3 merged dataset → {OUTPUT_MERGED}")
print(f"Saved summary → {OUTPUT_SUMMARY}")
print(df.head(10))

Saved pass 3 merged dataset → /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/5AM-arrivaltimezone-merge3.csv
Saved summary → /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/5AM-arrivaltimezone-merge3summary.csv
      Time  PersonID  Frame      X      Y Zone Direction ArrivalTime  \
0  0:00:00         1    1.0  993.0  402.0   C2   0:00:00          C2   
1  0:00:01         1   92.0  992.0  402.0   C2   0:00:00          C2   
2  0:00:01         1   93.0  993.0  402.0   C2   0:00:00          C2   
3  0:00:01         1  106.0  992.0  402.0   C2   0:00:00          C2   
4  0:00:01         1  118.0  993.0  402.0   C2   0:00:00          C2   
5  0:00:02         1  121.0  992.0  402.0   C2   0:00:00          C2   
6  0:00:02         1  127.0  993.0  402.0   C2   0:00:00          C2   
7  0:00:02         1  132.0  992.0  402.0   C2   0:00:00          C2   
8  0:00:02         1 

In [6]:
#pass 4 : merge 4 (Euclidean with min/max thresholds)
# ===================== PASS 4 MERGE (Euclidean Distance with Min/Max) =====================
import pandas as pd
import networkx as nx
import numpy as np

# ===================== CONFIG =====================
INPUT_CSV = "/Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/5AM-arrivaltimezone-merge3.csv"
FPS = 30  # frames per second
DIST_MIN = 0   # minimum Euclidean distance to merge (pixels)
DIST_MAX = 10   # maximum Euclidean distance to merge (pixels)
TIME_THRESH_SEC = 3  # max time difference in seconds
FRAME_THRESH = TIME_THRESH_SEC * FPS
TIME_MIN_SEC = 0.3  # minimum time difference in seconds
FRAME_MIN = TIME_MIN_SEC * FPS

OUTPUT_MERGED = INPUT_CSV.replace("3.csv", "4.csv")
OUTPUT_SUMMARY = INPUT_CSV.replace("3.csv", "4summary.csv")

# ===================== LOAD DATA =====================
df = pd.read_csv(INPUT_CSV)

# Ensure numeric types
df["Frame"] = pd.to_numeric(df["Frame"], errors="coerce")
df["X"] = pd.to_numeric(df["X"], errors="coerce")
df["Y"] = pd.to_numeric(df["Y"], errors="coerce")
df["MergeID"] = df["MergeID"].astype(str)
df["MergeID2"] = df["MergeID2"].astype(str)
df["MergeID3"] = df["MergeID3"].astype(str)

df = df.dropna(subset=["MergeID3", "MergeID2", "MergeID", "Frame", "X", "Y"])

# ===================== FIRST & LAST OBSERVATIONS =====================
first_obs = df.groupby("MergeID3").first().reset_index()
last_obs = df.groupby("MergeID3").last().reset_index()
first_last_df = pd.merge(first_obs, last_obs, on="MergeID3", suffixes=("_first", "_last"))

# ===================== BUILD LINKS BASED ON FINAL→FIRST MATCH =====================
G = nx.Graph()

# Each MergeID3 is a node
for mid in first_last_df["MergeID3"]:
    G.add_node(mid)

for idx_a, row_a in first_last_df.iterrows():
    for idx_b, row_b in first_last_df.iterrows():
        if row_a.MergeID3 == row_b.MergeID3:
            continue

        # Euclidean distance
        dist = np.sqrt((row_a.X_last - row_b.X_first)**2 + (row_a.Y_last - row_b.Y_first)**2)
        frame_diff = abs(row_a.Frame_last - row_b.Frame_first)

        # Merge only if distance is within min/max AND frame_diff within min/max
        if DIST_MIN <= dist <= DIST_MAX and FRAME_MIN <= frame_diff <= FRAME_THRESH:
            G.add_edge(row_a.MergeID3, row_b.MergeID3)

# ===================== ASSIGN PASS 4 MERGEIDs =====================
components = list(nx.connected_components(G))
merge_map = {}
for merge_id, comp in enumerate(components, start=1):
    for mid in comp:
        merge_map[mid] = merge_id

# Map new MergeID back to full dataset
df["MergeID4"] = df["MergeID3"].map(merge_map).fillna(-1).astype(int)

# Sort and reorder columns
df = df.sort_values(["MergeID4", "MergeID3", "MergeID2", "MergeID", "Frame"]).reset_index(drop=True)

# ===================== SAVE MERGED DATA =====================
df.to_csv(OUTPUT_MERGED, index=False)

# ===================== SUMMARY PER MergeID4 =====================
summary_rows = []
for merge_id in sorted(df["MergeID4"].unique()):
    subset = df[df["MergeID4"] == merge_id].sort_values(by="Frame")
    first_row = subset.iloc[0]
    last_row = subset.iloc[-1]
    merged_persons = sorted(subset["PersonID"].unique())

    summary_rows.append({
        "MergeID4": merge_id,
        "PersonID": merged_persons,
        "FirstTime": first_row["Time"],
        "FirstFrame": first_row["Frame"],
        "FirstX": first_row["X"],
        "FirstY": first_row["Y"],
        "FirstZone": first_row.get("Zone", ""),
        "LastTime": last_row["Time"],
        "LastFrame": last_row["Frame"],
        "LastX": last_row["X"],
        "LastY": last_row["Y"],
        "LastZone": last_row.get("Zone", "")
    })

summary_df = pd.DataFrame(summary_rows)
summary_df.to_csv(OUTPUT_SUMMARY, index=False)

print(f"Saved pass 4 merged dataset → {OUTPUT_MERGED}")
print(f"Saved summary → {OUTPUT_SUMMARY}")
# print(df.head(10))
print(summary_df.head(10))


Saved pass 4 merged dataset → /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/5AM-arrivaltimezone-merge4.csv
Saved summary → /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/5AM-arrivaltimezone-merge4summary.csv
   MergeID4 PersonID FirstTime  FirstFrame  FirstX  FirstY FirstZone LastTime  \
0         1      [1]   0:00:00         1.0   993.0   402.0        C2  0:00:21   
1         2   [1266]   0:24:17     87341.0  1006.0   404.0        C2  0:24:17   
2         3    [791]   0:17:28     62868.0   828.0   383.0       NaN  0:17:29   
3         4   [2660]   0:46:42    167988.0   128.0   407.0       NaN  0:46:42   
4         5   [2664]   0:46:44    168100.0    62.0   413.0       NaN  0:46:44   
5         6   [2666]   0:46:46    168204.0  1020.0   382.0        C2  0:46:52   
6         7   [2667]   0:46:52    168603.0  1170.0   383.0       NaN  0:46:53   
7         8   [2668] 

In [7]:
# ===================== PASS 5 MERGE (Direct Euclidean Distance Merge) =====================
import pandas as pd
import numpy as np

# ===================== CONFIG =====================
INPUT_CSV = "/Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/5AM-arrivaltimezone-merge4.csv"
FPS = 30  # frames per second
DIST_MIN = 2   # minimum Euclidean distance to merge (pixels)
DIST_MAX = 50   # maximum Euclidean distance to merge (pixels)
TIME_THRESH_SEC = 3  # max time difference in seconds
FRAME_THRESH = TIME_THRESH_SEC * FPS
TIME_MIN_SEC = 0.5  # minimum time difference in seconds
FRAME_MIN = TIME_MIN_SEC * FPS

OUTPUT_MERGED = INPUT_CSV.replace("4.csv", "5.csv")
OUTPUT_SUMMARY = INPUT_CSV.replace("4.csv", "5summary.csv")

# ===================== LOAD DATA =====================
df = pd.read_csv(INPUT_CSV)

# Ensure numeric types
df["Frame"] = pd.to_numeric(df["Frame"], errors="coerce")
df["X"] = pd.to_numeric(df["X"], errors="coerce")
df["Y"] = pd.to_numeric(df["Y"], errors="coerce")
df["MergeID4"] = df["MergeID4"].astype(str)
df = df.dropna(subset=["MergeID4", "Frame", "X", "Y"])

# ===================== FIRST & LAST OBSERVATIONS =====================
first_obs = df.groupby("MergeID4").first().reset_index()
last_obs = df.groupby("MergeID4").last().reset_index()
first_last_df = pd.merge(first_obs, last_obs, on="MergeID4", suffixes=("_first", "_last"))

# ===================== DIRECT MERGE LOGIC =====================
merge_map = {}
next_merge_id = 1
used_nodes = set()

for idx_a, row_a in first_last_df.iterrows():
    if row_a.MergeID4 in used_nodes:
        continue

    # Find valid candidates within thresholds
    candidates = []
    for idx_b, row_b in first_last_df.iterrows():
        if row_a.MergeID4 == row_b.MergeID4 or row_b.MergeID4 in used_nodes:
            continue

        dist = np.sqrt((row_a.X_last - row_b.X_first)**2 + (row_a.Y_last - row_b.Y_first)**2)
        frame_diff = abs(row_a.Frame_last - row_b.Frame_first)

        if DIST_MIN <= dist <= DIST_MAX and FRAME_MIN <= frame_diff <= FRAME_THRESH:
            candidates.append((dist, row_b.MergeID4))

    # Merge with the closest candidate only (or keep alone)
    merge_map[row_a.MergeID4] = next_merge_id
    used_nodes.add(row_a.MergeID4)

    if candidates:
        candidates.sort()  # sort by distance
        closest = candidates[0][1]
        merge_map[closest] = next_merge_id
        used_nodes.add(closest)

    next_merge_id += 1

# ===================== MAP BACK TO FULL DATA =====================
df["MergeID5"] = df["MergeID4"].map(merge_map).fillna(-1).astype(int)
df = df.sort_values(["MergeID5", "MergeID4", "MergeID3", "MergeID2", "MergeID", "Frame"]).reset_index(drop=True)
df.to_csv(OUTPUT_MERGED, index=False)

# ===================== SUMMARY PER MergeID5 =====================
summary_rows = []
for merge_id in sorted(df["MergeID5"].unique()):
    subset = df[df["MergeID5"] == merge_id].sort_values(by="Frame")
    first_row = subset.iloc[0]
    last_row = subset.iloc[-1]
    merged_persons = sorted(subset["PersonID"].unique())

    summary_rows.append({
        "MergeID5": merge_id,
        "PersonID": merged_persons,
        "FirstTime": first_row["Time"],
        "FirstFrame": first_row["Frame"],
        "FirstX": first_row["X"],
        "FirstY": first_row["Y"],
        "FirstZone": first_row.get("Zone", ""),
        "LastTime": last_row["Time"],
        "LastFrame": last_row["Frame"],
        "LastX": last_row["X"],
        "LastY": last_row["Y"],
        "LastZone": last_row.get("Zone", "")
    })

summary_df = pd.DataFrame(summary_rows)
summary_df.to_csv(OUTPUT_SUMMARY, index=False)

print(f"Saved pass 5 merged dataset → {OUTPUT_MERGED}")
print(f"Saved summary → {OUTPUT_SUMMARY}")
print(summary_df.head(10))


Saved pass 5 merged dataset → /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/5AM-arrivaltimezone-merge5.csv
Saved summary → /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/5AM-arrivaltimezone-merge5summary.csv
   MergeID5            PersonID FirstTime  FirstFrame  FirstX  FirstY  \
0         1                 [1]   0:00:00         1.0   993.0   402.0   
1         2        [2380, 2386]   0:42:15    151955.0   816.0   390.0   
2         3        [2776, 2809]   0:48:21    173908.0    27.0   621.0   
3         4  [1453, 1465, 1469]   0:28:51    103798.0  1060.0   396.0   
4         5              [1454]   0:28:52    103827.0    35.0   555.0   
5         6        [1455, 1467]   0:28:52    103864.0    80.0   455.0   
6         7              [1456]   0:28:53    103892.0   907.0   391.0   
7         8              [1037]   0:21:47     78384.0   469.0   512.0   
8         9 

In [8]:
# ===================== PASS 6 MERGE (Direct Euclidean Distance Merge, 6s time threshold) =====================
import pandas as pd
import numpy as np

# ===================== CONFIG =====================
INPUT_CSV = "/Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/5AM-arrivaltimezone-merge5.csv"
FPS = 30  # frames per second
DIST_MIN = 2   # minimum Euclidean distance to merge (pixels)
DIST_MAX = 50   # maximum Euclidean distance to merge (pixels)
TIME_THRESH_SEC = 15  # max time difference in seconds (changed from 3s)
FRAME_THRESH = TIME_THRESH_SEC * FPS
TIME_MIN_SEC = 0.5  # minimum time difference in seconds
FRAME_MIN = TIME_MIN_SEC * FPS

OUTPUT_MERGED = INPUT_CSV.replace("5.csv", "6.csv")
OUTPUT_SUMMARY = INPUT_CSV.replace("5.csv", "6summary.csv")

# ===================== LOAD DATA =====================
df = pd.read_csv(INPUT_CSV)

# Ensure numeric types
df["Frame"] = pd.to_numeric(df["Frame"], errors="coerce")
df["X"] = pd.to_numeric(df["X"], errors="coerce")
df["Y"] = pd.to_numeric(df["Y"], errors="coerce")
df["MergeID5"] = df["MergeID5"].astype(str)
df = df.dropna(subset=["MergeID5", "Frame", "X", "Y"])

# ===================== FIRST & LAST OBSERVATIONS =====================
first_obs = df.groupby("MergeID5").first().reset_index()
last_obs = df.groupby("MergeID5").last().reset_index()
first_last_df = pd.merge(first_obs, last_obs, on="MergeID5", suffixes=("_first", "_last"))

# ===================== DIRECT MERGE LOGIC =====================
merge_map = {}
next_merge_id = 1
used_nodes = set()

for idx_a, row_a in first_last_df.iterrows():
    if row_a.MergeID5 in used_nodes:
        continue

    # Find valid candidates within thresholds
    candidates = []
    for idx_b, row_b in first_last_df.iterrows():
        if row_a.MergeID5 == row_b.MergeID5 or row_b.MergeID5 in used_nodes:
            continue

        dist = np.sqrt((row_a.X_last - row_b.X_first)**2 + (row_a.Y_last - row_b.Y_first)**2)
        frame_diff = abs(row_a.Frame_last - row_b.Frame_first)

        if DIST_MIN <= dist <= DIST_MAX and FRAME_MIN <= frame_diff <= FRAME_THRESH:
            candidates.append((dist, row_b.MergeID5))

    # Merge with the closest candidate only (or keep alone)
    merge_map[row_a.MergeID5] = next_merge_id
    used_nodes.add(row_a.MergeID5)

    if candidates:
        candidates.sort()  # sort by distance
        closest = candidates[0][1]
        merge_map[closest] = next_merge_id
        used_nodes.add(closest)

    next_merge_id += 1

# ===================== MAP BACK TO FULL DATA =====================
df["MergeID6"] = df["MergeID5"].map(merge_map).fillna(-1).astype(int)
df = df.sort_values(["MergeID6", "MergeID5", "MergeID4", "MergeID3", "MergeID2", "MergeID", "Frame"]).reset_index(drop=True)
df.to_csv(OUTPUT_MERGED, index=False)

# ===================== SUMMARY PER MergeID6 =====================
summary_rows = []
for merge_id in sorted(df["MergeID6"].unique()):
    subset = df[df["MergeID6"] == merge_id].sort_values(by="Frame")
    first_row = subset.iloc[0]
    last_row = subset.iloc[-1]
    merged_persons = sorted(subset["PersonID"].unique())

    summary_rows.append({
        "MergeID6": merge_id,
        "PersonID": merged_persons,
        "FirstTime": first_row["Time"],
        "FirstFrame": first_row["Frame"],
        "FirstX": first_row["X"],
        "FirstY": first_row["Y"],
        "FirstZone": first_row.get("Zone", ""),
        "LastTime": last_row["Time"],
        "LastFrame": last_row["Frame"],
        "LastX": last_row["X"],
        "LastY": last_row["Y"],
        "LastZone": last_row.get("Zone", "")
    })

summary_df = pd.DataFrame(summary_rows)
summary_df.to_csv(OUTPUT_SUMMARY, index=False)

print(f"Saved pass 6 merged dataset → {OUTPUT_MERGED}")
print(f"Saved summary → {OUTPUT_SUMMARY}")
print(summary_df.head(10))

Saved pass 6 merged dataset → /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/5AM-arrivaltimezone-merge6.csv
Saved summary → /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/5AM-arrivaltimezone-merge6summary.csv
   MergeID6                                           PersonID FirstTime  \
0         1                                                [1]   0:00:00   
1         2                                       [1460, 1461]   0:28:54   
2         3                                              [267]   0:06:51   
3         4                                 [1236, 1246, 1247]   0:24:02   
4         5                           [3459, 3462, 3463, 3464]   1:02:39   
5         6                                       [2474, 2476]   0:43:46   
6         7                                              [273]   0:06:54   
7         8                                       [3467, 3469

In [9]:
# ===================== PASS 7 MERGE (Direct Euclidean Distance Merge, ) =====================
import pandas as pd
import numpy as np

# ===================== CONFIG =====================
INPUT_CSV = "/Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/5AM-arrivaltimezone-merge6.csv"
FPS = 30  # frames per second
DIST_MIN = 0   # minimum Euclidean distance to merge (pixels)
DIST_MAX = 60   # maximum Euclidean distance to merge (pixels)
TIME_THRESH_SEC = 10  # max time difference in seconds (changed from 3s)
FRAME_THRESH = TIME_THRESH_SEC * FPS
TIME_MIN_SEC = 0.5  # minimum time difference in seconds
FRAME_MIN = TIME_MIN_SEC * FPS

OUTPUT_MERGED = INPUT_CSV.replace("6.csv", "7.csv")
OUTPUT_SUMMARY = INPUT_CSV.replace("6.csv", "7summary.csv")

# ===================== LOAD DATA =====================
df = pd.read_csv(INPUT_CSV)

# Ensure numeric types
df["Frame"] = pd.to_numeric(df["Frame"], errors="coerce")
df["X"] = pd.to_numeric(df["X"], errors="coerce")
df["Y"] = pd.to_numeric(df["Y"], errors="coerce")
df["MergeID6"] = df["MergeID6"].astype(str)
df = df.dropna(subset=["MergeID6", "Frame", "X", "Y"])

# ===================== FIRST & LAST OBSERVATIONS =====================
first_obs = df.groupby("MergeID6").first().reset_index()
last_obs = df.groupby("MergeID6").last().reset_index()
first_last_df = pd.merge(first_obs, last_obs, on="MergeID6", suffixes=("_first", "_last"))

# ===================== DIRECT MERGE LOGIC =====================
merge_map = {}
next_merge_id = 1
used_nodes = set()

for idx_a, row_a in first_last_df.iterrows():
    if row_a.MergeID6 in used_nodes:
        continue

    # Find valid candidates within thresholds
    candidates = []
    for idx_b, row_b in first_last_df.iterrows():
        if row_a.MergeID6 == row_b.MergeID6 or row_b.MergeID6 in used_nodes:
            continue

        dist = np.sqrt((row_a.X_last - row_b.X_first)**2 + (row_a.Y_last - row_b.Y_first)**2)
        frame_diff = abs(row_a.Frame_last - row_b.Frame_first)

        if DIST_MIN <= dist <= DIST_MAX and FRAME_MIN <= frame_diff <= FRAME_THRESH:
            candidates.append((dist, row_b.MergeID6))

    # Merge with the closest candidate only (or keep alone)
    merge_map[row_a.MergeID6] = next_merge_id
    used_nodes.add(row_a.MergeID6)

    if candidates:
        candidates.sort()  # sort by distance
        closest = candidates[0][1]
        merge_map[closest] = next_merge_id
        used_nodes.add(closest)

    next_merge_id += 1

# ===================== MAP BACK TO FULL DATA =====================
df["MergeID7"] = df["MergeID6"].map(merge_map).fillna(-1).astype(int)
df = df.sort_values(["MergeID7", "MergeID6", "MergeID5", "MergeID4", "MergeID3", "MergeID2", "MergeID", "Frame"]).reset_index(drop=True)
df.to_csv(OUTPUT_MERGED, index=False)

# ===================== SUMMARY PER MergeID7 =====================
summary_rows = []
for merge_id in sorted(df["MergeID7"].unique()):
    subset = df[df["MergeID7"] == merge_id].sort_values(by="Frame")
    first_row = subset.iloc[0]
    last_row = subset.iloc[-1]
    merged_persons = sorted(subset["PersonID"].unique())

    summary_rows.append({
        "MergeID7": merge_id,
        "PersonID": merged_persons,
        "FirstTime": first_row["Time"],
        "FirstFrame": first_row["Frame"],
        "FirstX": first_row["X"],
        "FirstY": first_row["Y"],
        "FirstZone": first_row.get("Zone", ""),
        "LastTime": last_row["Time"],
        "LastFrame": last_row["Frame"],
        "LastX": last_row["X"],
        "LastY": last_row["Y"],
        "LastZone": last_row.get("Zone", "")
    })

summary_df = pd.DataFrame(summary_rows)
summary_df.to_csv(OUTPUT_SUMMARY, index=False)

print(f"Saved pass 7 merged dataset → {OUTPUT_MERGED}")
print(f"Saved summary → {OUTPUT_SUMMARY}")
print(summary_df.head(10))

Saved pass 7 merged dataset → /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/5AM-arrivaltimezone-merge7.csv
Saved summary → /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/5AM-arrivaltimezone-merge7summary.csv
   MergeID7            PersonID FirstTime  FirstFrame  FirstX  FirstY  \
0         1         [1, 19, 23]   0:00:00         1.0   993.0   402.0   
1         2        [1237, 1242]   0:24:02     86480.0   749.0   378.0   
2         3                [51]   0:01:41      6080.0   526.0   460.0   
3         4              [2499]   0:43:59    158190.0  1036.0   393.0   
4         5  [1265, 1266, 1268]   0:24:16     87331.0  1044.0   396.0   
5         6          [896, 954]   0:19:25     69862.0   428.0   508.0   
6         7          [510, 514]   0:13:13     47554.0   944.0   395.0   
7         8               [511]   0:13:13     47563.0   871.0   393.0   
8         9 

In [10]:
# ===================== PASS 8 MERGE (Direct Euclidean Distance Merge, ) =====================
import pandas as pd
import numpy as np

# ===================== CONFIG =====================
INPUT_CSV = "/Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/5AM-arrivaltimezone-merge7.csv"
FPS = 30  # frames per second
DIST_MIN = 0   # minimum Euclidean distance to merge (pixels)
DIST_MAX = 90   # maximum Euclidean distance to merge (pixels)
TIME_THRESH_SEC = 10  # max time difference in seconds (changed from 3s)
FRAME_THRESH = TIME_THRESH_SEC * FPS
TIME_MIN_SEC = 0.5  # minimum time difference in seconds
FRAME_MIN = TIME_MIN_SEC * FPS

OUTPUT_MERGED = INPUT_CSV.replace("7.csv", "8.csv")
OUTPUT_SUMMARY = INPUT_CSV.replace("7.csv", "8summary.csv")

# ===================== LOAD DATA =====================
df = pd.read_csv(INPUT_CSV)

# Ensure numeric types
df["Frame"] = pd.to_numeric(df["Frame"], errors="coerce")
df["X"] = pd.to_numeric(df["X"], errors="coerce")
df["Y"] = pd.to_numeric(df["Y"], errors="coerce")
df["MergeID7"] = df["MergeID7"].astype(str)
df = df.dropna(subset=["MergeID7", "Frame", "X", "Y"])

# ===================== FIRST & LAST OBSERVATIONS =====================
first_obs = df.groupby("MergeID7").first().reset_index()
last_obs = df.groupby("MergeID7").last().reset_index()
first_last_df = pd.merge(first_obs, last_obs, on="MergeID7", suffixes=("_first", "_last"))

# ===================== DIRECT MERGE LOGIC =====================
merge_map = {}
next_merge_id = 1
used_nodes = set()

for idx_a, row_a in first_last_df.iterrows():
    if row_a.MergeID7 in used_nodes:
        continue

    # Find valid candidates within thresholds
    candidates = []
    for idx_b, row_b in first_last_df.iterrows():
        if row_a.MergeID7 == row_b.MergeID7 or row_b.MergeID7 in used_nodes:
            continue

        dist = np.sqrt((row_a.X_last - row_b.X_first)**2 + (row_a.Y_last - row_b.Y_first)**2)
        frame_diff = abs(row_a.Frame_last - row_b.Frame_first)

        if DIST_MIN <= dist <= DIST_MAX and FRAME_MIN <= frame_diff <= FRAME_THRESH:
            candidates.append((dist, row_b.MergeID7))

    # Merge with the closest candidate only (or keep alone)
    merge_map[row_a.MergeID7] = next_merge_id
    used_nodes.add(row_a.MergeID7)

    if candidates:
        candidates.sort()  # sort by distance
        closest = candidates[0][1]
        merge_map[closest] = next_merge_id
        used_nodes.add(closest)

    next_merge_id += 1

# ===================== MAP BACK TO FULL DATA =====================
df["MergeID8"] = df["MergeID7"].map(merge_map).fillna(-1).astype(int)
df = df.sort_values(["MergeID8", "MergeID7", "MergeID6", "MergeID5", "MergeID4", "MergeID3", "MergeID2", "MergeID", "Frame"]).reset_index(drop=True)
df.to_csv(OUTPUT_MERGED, index=False)

# ===================== SUMMARY PER MergeID8 =====================
summary_rows = []
for merge_id in sorted(df["MergeID8"].unique()):
    subset = df[df["MergeID8"] == merge_id].sort_values(by="Frame")
    first_row = subset.iloc[0]
    last_row = subset.iloc[-1]
    merged_persons = sorted(subset["PersonID"].unique())

    summary_rows.append({
        "MergeID8": merge_id,
        "PersonID": merged_persons,
        "FirstTime": first_row["Time"],
        "FirstFrame": first_row["Frame"],
        "FirstX": first_row["X"],
        "FirstY": first_row["Y"],
        "FirstZone": first_row.get("Zone", ""),
        "LastTime": last_row["Time"],
        "LastFrame": last_row["Frame"],
        "LastX": last_row["X"],
        "LastY": last_row["Y"],
        "LastZone": last_row.get("Zone", "")
    })

summary_df = pd.DataFrame(summary_rows)
summary_df.to_csv(OUTPUT_SUMMARY, index=False)

print(f"Saved pass 8 merged dataset → {OUTPUT_MERGED}")
print(f"Saved summary → {OUTPUT_SUMMARY}")
print(summary_df.head(10))

Saved pass 8 merged dataset → /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/5AM-arrivaltimezone-merge8.csv
Saved summary → /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/5AM-arrivaltimezone-merge8summary.csv
   MergeID8                                           PersonID FirstTime  \
0         1                                 [1, 9, 19, 23, 27]   0:00:00   
1         2                                             [2742]   0:48:05   
2         3                                       [1615, 1761]   0:31:13   
3         4                                         [969, 978]   0:20:08   
4         5                                              [972]   0:20:08   
5         6                                              [975]   0:20:09   
6         7                                    [979, 983, 985]   0:20:27   
7         8                                              [276

In [11]:
# ===================== PASS 9 MERGE (Direct Euclidean Distance Merge, ) =====================
import pandas as pd
import numpy as np

# ===================== CONFIG =====================
INPUT_CSV = "/Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/5AM-arrivaltimezone-merge8.csv"
FPS = 30  # frames per second
DIST_MIN = 0   # minimum Euclidean distance to merge (pixels)
DIST_MAX = 150   # maximum Euclidean distance to merge (pixels)
TIME_THRESH_SEC = 10  # max time difference in seconds (changed from 3s)
FRAME_THRESH = TIME_THRESH_SEC * FPS
TIME_MIN_SEC = 1  # minimum time difference in seconds
FRAME_MIN = TIME_MIN_SEC * FPS

OUTPUT_MERGED = INPUT_CSV.replace("8.csv", "9.csv")
OUTPUT_SUMMARY = INPUT_CSV.replace("8.csv", "9summary.csv")

# ===================== LOAD DATA =====================
df = pd.read_csv(INPUT_CSV)

# Ensure numeric types
df["Frame"] = pd.to_numeric(df["Frame"], errors="coerce")
df["X"] = pd.to_numeric(df["X"], errors="coerce")
df["Y"] = pd.to_numeric(df["Y"], errors="coerce")
df["MergeID8"] = df["MergeID8"].astype(str)
df = df.dropna(subset=["MergeID8", "Frame", "X", "Y"])

# ===================== FIRST & LAST OBSERVATIONS =====================
first_obs = df.groupby("MergeID8").first().reset_index()
last_obs = df.groupby("MergeID8").last().reset_index()
first_last_df = pd.merge(first_obs, last_obs, on="MergeID8", suffixes=("_first", "_last"))

# ===================== DIRECT MERGE LOGIC =====================
merge_map = {}
next_merge_id = 1
used_nodes = set()

for idx_a, row_a in first_last_df.iterrows():
    if row_a.MergeID8 in used_nodes:
        continue

    # Find valid candidates within thresholds
    candidates = []
    for idx_b, row_b in first_last_df.iterrows():
        if row_a.MergeID8 == row_b.MergeID8 or row_b.MergeID8 in used_nodes:
            continue

        dist = np.sqrt((row_a.X_last - row_b.X_first)**2 + (row_a.Y_last - row_b.Y_first)**2)
        frame_diff = abs(row_a.Frame_last - row_b.Frame_first)

        if DIST_MIN <= dist <= DIST_MAX and FRAME_MIN <= frame_diff <= FRAME_THRESH:
            candidates.append((dist, row_b.MergeID8))

    # Merge with the closest candidate only (or keep alone)
    merge_map[row_a.MergeID8] = next_merge_id
    used_nodes.add(row_a.MergeID8)

    if candidates:
        candidates.sort()  # sort by distance
        closest = candidates[0][1]
        merge_map[closest] = next_merge_id
        used_nodes.add(closest)

    next_merge_id += 1

# ===================== MAP BACK TO FULL DATA =====================
df["MergeID9"] = df["MergeID8"].map(merge_map).fillna(-1).astype(int)
df = df.sort_values(["MergeID9", "MergeID8", "MergeID7", "MergeID6", "MergeID5", "MergeID4", "MergeID3", "MergeID2", "MergeID", "Frame"]).reset_index(drop=True)
df.to_csv(OUTPUT_MERGED, index=False)

# ===================== SUMMARY PER MergeID9 =====================
summary_rows = []
for merge_id in sorted(df["MergeID9"].unique()):
    subset = df[df["MergeID9"] == merge_id].sort_values(by="Frame")
    first_row = subset.iloc[0]
    last_row = subset.iloc[-1]
    merged_persons = sorted(subset["PersonID"].unique())

    summary_rows.append({
        "MergeID9": merge_id,
        "PersonID": merged_persons,
        "FirstTime": first_row["Time"],
        "FirstFrame": first_row["Frame"],
        "FirstX": first_row["X"],
        "FirstY": first_row["Y"],
        "FirstZone": first_row.get("Zone", ""),
        "LastTime": last_row["Time"],
        "LastFrame": last_row["Frame"],
        "LastX": last_row["X"],
        "LastY": last_row["Y"],
        "LastZone": last_row.get("Zone", "")
    })

summary_df = pd.DataFrame(summary_rows)
summary_df.to_csv(OUTPUT_SUMMARY, index=False)

print(f"Saved pass 9 merged dataset → {OUTPUT_MERGED}")
print(f"Saved summary → {OUTPUT_SUMMARY}")
print(summary_df.head(10))

Saved pass 9 merged dataset → /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/5AM-arrivaltimezone-merge9.csv
Saved summary → /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/5AM-arrivaltimezone-merge9summary.csv
   MergeID9                                           PersonID FirstTime  \
0         1                                 [1, 9, 19, 23, 27]   0:00:00   
1         2  [2536, 2537, 2539, 2541, 2543, 2546, 2553, 255...   0:45:32   
2         3  [372, 373, 379, 388, 394, 398, 401, 406, 407, ...   0:10:10   
3         4                                             [1051]   0:22:06   
4         5                                             [1632]   0:31:33   
5         6                           [1237, 1241, 1242, 1245]   0:24:02   
6         7               [1604, 1606, 1608, 1614, 1616, 1620]   0:31:01   
7         8                     [1048, 1103, 1134, 1137, 1154

In [None]:
#pass 4 : merge 4
# ===================== PASS 4 MERGE (Bounding Box Method) =====================
import pandas as pd
import networkx as nx

# ===================== CONFIG =====================
INPUT_CSV = "/Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial22-arrivaltimezone-merge3.csv"
FPS = 30  # frames per second
X_THRESH = 25  # max difference in X (pixels)
Y_THRESH = 20  # max difference in Y (pixels)
TIME_THRESH_SEC = 2  # max time difference in seconds
FRAME_THRESH = TIME_THRESH_SEC * FPS

OUTPUT_MERGED = INPUT_CSV.replace("3.csv", "4.csv")
OUTPUT_SUMMARY = INPUT_CSV.replace("3.csv", "4summary.csv")

# ===================== LOAD DATA =====================
df = pd.read_csv(INPUT_CSV)

# Ensure numeric types
df["Frame"] = pd.to_numeric(df["Frame"], errors="coerce")
df["X"] = pd.to_numeric(df["X"], errors="coerce")
df["Y"] = pd.to_numeric(df["Y"], errors="coerce")
df["MergeID"] = df["MergeID"].astype(str)
df["MergeID2"] = df["MergeID2"].astype(str)
df["MergeID3"] = df["MergeID3"].astype(str)

df = df.dropna(subset=["MergeID3", "MergeID2", "MergeID", "Frame", "X", "Y"])

# ===================== FIRST & LAST OBSERVATIONS =====================
first_obs = df.groupby("MergeID3").first().reset_index()
last_obs = df.groupby("MergeID3").last().reset_index()
first_last_df = pd.merge(first_obs, last_obs, on="MergeID3", suffixes=("_first", "_last"))

# ===================== BUILD LINKS BASED ON FINAL→FIRST MATCH =====================
G = nx.Graph()

# Each MergeID3 is a node
for mid in first_last_df["MergeID3"]:
    G.add_node(mid)

for idx_a, row_a in first_last_df.iterrows():
    for idx_b, row_b in first_last_df.iterrows():
        if row_a.MergeID3 == row_b.MergeID3:
            continue

        # Bounding box proximity: last of A to first of B
        dx = abs(row_a.X_last - row_b.X_first)
        dy = abs(row_a.Y_last - row_b.Y_first)
        frame_diff = abs(row_a.Frame_last - row_b.Frame_first)

        if dx <= X_THRESH and dy <= Y_THRESH and frame_diff <= FRAME_THRESH:
            G.add_edge(row_a.MergeID3, row_b.MergeID3)

# ===================== ASSIGN PASS 4 MERGEIDs =====================
components = list(nx.connected_components(G))
merge_map = {}
for merge_id, comp in enumerate(components, start=1):
    for mid in comp:
        merge_map[mid] = merge_id

# Map new MergeID back to full dataset
df["MergeID4"] = df["MergeID3"].map(merge_map).fillna(-1).astype(int)

# df["PersonID"] = pd.to_numeric(df["PersonID"], errors="coerce").astype(int)
# df["MergeID2"] = pd.to_numeric(df["MergeID2"], errors="coerce").astype(int)

# Sort and reorder columns
df = df.sort_values(["MergeID4", "MergeID3", "MergeID2", "MergeID", "Frame"]).reset_index(drop=True)

# cols = ["MergeID2"] + [c for c in df.columns if c != "MergeID2"]
# df = df[cols]

# ===================== SAVE MERGED DATA =====================
df.to_csv(OUTPUT_MERGED, index=False)

# ===================== SUMMARY PER MergeID4 =====================
summary_rows = []
for merge_id in sorted(df["MergeID4"].unique()):
    subset = df[df["MergeID4"] == merge_id].sort_values(by="Frame")
    first_row = subset.iloc[0]
    last_row = subset.iloc[-1]
    merged_persons = sorted(subset["PersonID"].unique())

    summary_rows.append({
        "MergeID4": merge_id,
        "PersonID": merged_persons,
        "FirstTime": first_row["Time"],
        "FirstFrame": first_row["Frame"],
        "FirstX": first_row["X"],
        "FirstY": first_row["Y"],
        "FirstZone": first_row.get("Zone", ""),
        "LastTime": last_row["Time"],
        "LastFrame": last_row["Frame"],
        "LastX": last_row["X"],
        "LastY": last_row["Y"],
        "LastZone": last_row.get("Zone", "")
    })

summary_df = pd.DataFrame(summary_rows)
summary_df.to_csv(OUTPUT_SUMMARY, index=False)

print(f"Saved pass 4 merged dataset → {OUTPUT_MERGED}")
print(f"Saved summary → {OUTPUT_SUMMARY}")
print(df.head(10))

Saved pass 4 merged dataset → /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial22-arrivaltimezone-merge4.csv
Saved summary → /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial22-arrivaltimezone-merge4summary.csv
      Time  PersonID  Frame      X      Y Zone Direction ArrivalTime  \
0  0:00:00         1    1.0  525.0  371.0   C4       NaN     0:00:00   
1  0:00:00         1    3.0  525.0  370.0   C4       NaN     0:00:00   
2  0:00:00         1    6.0  525.0  371.0   C4       NaN     0:00:00   
3  0:00:00         1    8.0  525.0  370.0   C4       NaN     0:00:00   
4  0:00:00         1   17.0  525.0  371.0   C4       NaN     0:00:00   
5  0:00:00         1   26.0  525.0  370.0   C4       NaN     0:00:00   
6  0:00:00         1   32.0  525.0  371.0   C4       NaN     0:00:00   
7  0:00:00         1   33.0  525.0  370.0   C4       NaN     0:00:00   
8  0:00:00   

In [43]:
# pass 3 : merge 3
# ===================== PASS 3 MERGE (Bounding Box Method) =====================
import pandas as pd
import networkx as nx

# ===================== CONFIG =====================
INPUT_CSV = "/Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial22-arrivaltimezone-merge2.csv"
FPS = 30  # frames per second
X_THRESH = 30  # max difference in X (pixels)
Y_THRESH = 20  # max difference in Y (pixels)
TIME_THRESH_SEC = 5  # max time difference in seconds
FRAME_THRESH = TIME_THRESH_SEC * FPS

OUTPUT_MERGED = INPUT_CSV.replace("2.csv", "3.csv")
OUTPUT_SUMMARY = INPUT_CSV.replace("2.csv", "3summary.csv")

# ===================== LOAD DATA =====================
df = pd.read_csv(INPUT_CSV)

# Ensure numeric types
df["Frame"] = pd.to_numeric(df["Frame"], errors="coerce")
df["X"] = pd.to_numeric(df["X"], errors="coerce")
df["Y"] = pd.to_numeric(df["Y"], errors="coerce")
df["MergeID2"] = df["MergeID2"].astype(str)

df = df.dropna(subset=["MergeID2", "Frame", "X", "Y"])

# ===================== FIRST & LAST OBSERVATIONS =====================
first_obs = df.groupby("MergeID2").first().reset_index()
last_obs = df.groupby("MergeID2").last().reset_index()
first_last_df = pd.merge(first_obs, last_obs, on="MergeID2", suffixes=("_first", "_last"))

# ===================== BUILD LINKS BASED ON FINAL→FIRST MATCH =====================
G = nx.Graph()

# Each MergeID2 is a node
for mid in first_last_df["MergeID2"]:
    G.add_node(mid)

for idx_a, row_a in first_last_df.iterrows():
    for idx_b, row_b in first_last_df.iterrows():
        if row_a.MergeID2 == row_b.MergeID2:
            continue

        # Bounding box proximity: last of A to first of B
        dx = abs(row_a.X_last - row_b.X_first)
        dy = abs(row_a.Y_last - row_b.Y_first)
        frame_diff = abs(row_a.Frame_last - row_b.Frame_first)

        if dx <= X_THRESH and dy <= Y_THRESH and frame_diff <= FRAME_THRESH:
            G.add_edge(row_a.MergeID2, row_b.MergeID2)

# ===================== ASSIGN PASS 3 MERGEID3s =====================
components = list(nx.connected_components(G))
merge_map = {}
for merge_id, comp in enumerate(components, start=1):
    for mid in comp:
        merge_map[mid] = merge_id

# Map new MergeID2 back to full dataset
df["MergeID3"] = df["MergeID2"].map(merge_map).fillna(-1).astype(int)

df["PersonID"] = pd.to_numeric(df["PersonID"], errors="coerce").astype(int)
df["MergeID3"] = pd.to_numeric(df["MergeID3"], errors="coerce").astype(int)

# Sort and reorder columns
df = df.sort_values(["MergeID3", "MergeID", "Frame"]).reset_index(drop=True)

cols = ["MergeID3"] + [c for c in df.columns if c != "MergeID3"]
df = df[cols]

# ===================== SAVE MERGED DATA =====================
df.to_csv(OUTPUT_MERGED, index=False)

# ===================== SUMMARY PER MergeID3 =====================
summary_rows = []
for merge_id in sorted(df["MergeID3"].unique()):
    subset = df[df["MergeID3"] == merge_id].sort_values(by="Frame")
    first_row = subset.iloc[0]
    last_row = subset.iloc[-1]
    merged_persons = sorted(subset["PersonID"].unique())

    summary_rows.append({
        "MergeID3": merge_id,
        "PersonID": merged_persons,
        "FirstTime": first_row["Time"],
        "FirstFrame": first_row["Frame"],
        "FirstX": first_row["X"],
        "FirstY": first_row["Y"],
        "FirstZone": first_row.get("Zone", ""),
        "LastTime": last_row["Time"],
        "LastFrame": last_row["Frame"],
        "LastX": last_row["X"],
        "LastY": last_row["Y"],
        "LastZone": last_row.get("Zone", "")
    })

summary_df = pd.DataFrame(summary_rows)
summary_df.to_csv(OUTPUT_SUMMARY, index=False)

print(f"Saved pass 3 merged dataset → {OUTPUT_MERGED}")
print(f"Saved summary → {OUTPUT_SUMMARY}")
print(df.head(10))

Saved pass 3 merged dataset → /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial22-arrivaltimezone-merge3.csv
Saved summary → /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial22-arrivaltimezone-merge3summary.csv
   MergeID3 MergeID2  MergeID     Time  PersonID  Frame      X      Y Zone  \
0         1        1        1  0:00:00         1    1.0  525.0  371.0   C4   
1         1        1        1  0:00:00         1    3.0  525.0  370.0   C4   
2         1        1        1  0:00:00         1    6.0  525.0  371.0   C4   
3         1        1        1  0:00:00         1    8.0  525.0  370.0   C4   
4         1        1        1  0:00:00         1   17.0  525.0  371.0   C4   
5         1        1        1  0:00:00         1   26.0  525.0  370.0   C4   
6         1        1        1  0:00:00         1   32.0  525.0  371.0   C4   
7         1        1        1  0:00:0

In [25]:
#pass three
import pandas as pd
import networkx as nx

# ===================== CONFIG =====================
INPUT_CSV = "/Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial20-postprocess-summary1-summary2.csv"
FPS = 30
X_THRESH = 100  # spatial threshold for third pass
Y_THRESH = 100
TIME_THRESH_SEC = 6  # temporal threshold
FRAME_THRESH = TIME_THRESH_SEC * FPS

OUTPUT_MERGED = INPUT_CSV.replace(".csv", "-mergeID3.csv")
OUTPUT_SUMMARY = INPUT_CSV.replace(".csv", "-summary3.csv")

# ===================== LOAD DATA =====================
df = pd.read_csv(INPUT_CSV)
df = df.dropna(subset=["FirstFrame", "FirstX", "FirstY", "LastFrame", "LastX", "LastY"])

# ===================== BUILD LINKS BASED ON FINAL→FIRST MATCH =====================
G = nx.Graph()

# Each MergeID_2 is a node
for mid in df["MergeID_2"]:
    G.add_node(mid)

for idx_a, row_a in df.iterrows():
    for idx_b, row_b in df.iterrows():
        if row_a.MergeID_2 == row_b.MergeID_2:
            continue

        dx = abs(row_a.LastX - row_b.FirstX)
        dy = abs(row_a.LastY - row_b.FirstY)
        frame_diff = abs(row_a.LastFrame - row_b.FirstFrame)

        # Merge if within thresholds
        if dx <= X_THRESH and dy <= Y_THRESH and frame_diff <= FRAME_THRESH:
            G.add_edge(row_a.MergeID_2, row_b.MergeID_2)

# ===================== ASSIGN NEW MERGEIDs =====================
components = list(nx.connected_components(G))
merge_map = {}
for new_merge_id, comp in enumerate(components, start=1):
    for old_merge_id in comp:
        merge_map[old_merge_id] = new_merge_id

df["MergeID_3"] = df["MergeID_2"].map(merge_map).fillna(-1).astype(int)

# ===================== SAVE THIRD PASS =====================
df.to_csv(OUTPUT_MERGED, index=False)

# ===================== SUMMARY =====================
summary_rows = []
for merge_id in sorted(df["MergeID_3"].unique()):
    subset = df[df["MergeID_3"] == merge_id].sort_values(by="FirstFrame")
    first_row = subset.iloc[0]
    last_row = subset.iloc[-1]

    merged_persons = []
    for p in subset["PersonIDs"]:
        if isinstance(p, str):
            merged_persons.extend(eval(p))
        else:
            merged_persons.append(p)

    summary_rows.append({
        "MergeID_3": merge_id,
        "PersonIDs": merged_persons,
        "FirstTime": first_row["FirstTime"],
        "FirstFrame": first_row["FirstFrame"],
        "FirstX": first_row["FirstX"],
        "FirstY": first_row["FirstY"],
        "FirstZone": first_row.get("FirstZone", ""),
        "LastTime": last_row["LastTime"],
        "LastFrame": last_row["LastFrame"],
        "LastX": last_row["LastX"],
        "LastY": last_row["LastY"],
        "LastZone": last_row.get("LastZone", "")
    })

summary_df = pd.DataFrame(summary_rows)
summary_df.to_csv(OUTPUT_SUMMARY, index=False)

print(f"Saved third-pass merged dataset → {OUTPUT_MERGED}")
print(f"Saved third-pass summary → {OUTPUT_SUMMARY}")


Saved third-pass merged dataset → /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial20-postprocess-summary1-summary2-mergeID3.csv
Saved third-pass summary → /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial20-postprocess-summary1-summary2-summary3.csv


In [26]:
#final clean up - delete row if time is 1s or less
import pandas as pd

# ===================== CONFIG =====================
INPUT_FILE = "/Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial20-postprocess-summary1-summary2-summary3.csv"
OUTPUT_FILE = "/Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial20-postprocess-summary1-summary2-summary3-cleaned.csv"

# ===================== LOAD =====================
df = pd.read_csv(INPUT_FILE)

# ===================== CONVERT TIMES =====================
# Convert to timedelta for comparison
df["FirstTime_td"] = pd.to_timedelta(df["FirstTime"])
df["LastTime_td"] = pd.to_timedelta(df["LastTime"])

# ===================== FILTER ROWS =====================
# Keep only rows where duration > 0 seconds
df_clean = df[df["LastTime_td"] != df["FirstTime_td"]].copy()

# ===================== REMOVE TEMP COLUMNS =====================
df_clean = df_clean.drop(columns=["FirstTime_td", "LastTime_td"], errors="ignore")

# ===================== SAVE =====================
df_clean.to_csv(OUTPUT_FILE, index=False)

print(f"Cleaned table saved to {OUTPUT_FILE}")

Cleaned table saved to /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial20-postprocess-summary1-summary2-summary3-cleaned.csv


In [None]:
#add arrival and departure time
import pandas as pd

# ===================== CONFIG =====================
INPUT_CSV = "/Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial13.csv"
OUTPUT_CSV = INPUT_CSV.replace(".csv", "-arrivaldeparture.csv")

# ===================== LOAD DATA =====================
df = pd.read_csv(INPUT_CSV)

# Ensure proper types
df["PersonID"] = df["PersonID"].astype(str)
df["Frame"] = pd.to_numeric(df["Frame"], errors="coerce")
df["PixelX"] = pd.to_numeric(df["PixelX"], errors="coerce")
df["PixelY"] = pd.to_numeric(df["PixelY"], errors="coerce")
df["Zone"] = df["Zone"].astype(str)

# ===================== COMPUTE ARRIVAL & DEPARTURE =====================
arrival_info = {}

for pid, group in df.groupby("PersonID"):
    # Only rows where Zone is assigned
    zone_rows = group[group["Zone"].notna() & (group["Zone"] != "")]
    if not zone_rows.empty:
        first_zone_row = zone_rows.iloc[0]
        last_zone_row = zone_rows.iloc[-1]
        arrival_info[pid] = {
            "ArrivalTime": first_zone_row["Timestamp"],
            "ArrivalZone": first_zone_row["Zone"],
            "DepartureTime": last_zone_row["Timestamp"]
        }
    else:
        arrival_info[pid] = {
            "ArrivalTime": None,
            "ArrivalZone": None,
            "DepartureTime": None
        }

# ===================== ADD COLUMNS TO FULL DATA =====================
df["ArrivalTime"] = df["PersonID"].map(lambda pid: arrival_info[pid]["ArrivalTime"])
df["ArrivalZone"] = df["PersonID"].map(lambda pid: arrival_info[pid]["ArrivalZone"])
df["DepartureTime"] = df["PersonID"].map(lambda pid: arrival_info[pid]["DepartureTime"])

# ===================== SAVE =====================
os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)
df.to_csv(OUTPUT_CSV, index=False)

print(f"Saved full detection log with arrival info → {OUTPUT_CSV}")

Saved full detection log with arrival info → /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial13-arrivaldeparture.csv


In [16]:
# post processing
import pandas as pd
import os

# ===================== CONFIG =====================
INPUT_CSV = "/Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial13-arrivaldeparture.csv"
OUTPUT_CSV = INPUT_CSV.replace("-arrivaldeparture.csv", "-postprocess.csv")

# ===================== LOAD DATA =====================
df = pd.read_csv(INPUT_CSV)
df["PersonID"] = df["PersonID"].astype(str)

# ===================== FILTER FIRST & LAST =====================
rows = []
for pid, group in df.groupby("PersonID"):
    group_sorted = group.sort_values("Frame")
    first_row = group_sorted.iloc[0]
    last_row = group_sorted.iloc[-1]
    rows.append(first_row)
    if len(group_sorted) > 1:  # only add last row if different from first
        rows.append(last_row)

df_out = pd.DataFrame(rows)

# ===================== SAVE =====================
os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)
df_out.to_csv(OUTPUT_CSV, index=False)

print(f"First & last observation CSV saved to {OUTPUT_CSV}")


First & last observation CSV saved to /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial13-postprocess.csv


In [17]:
#post porcessing unification (can delete) - use to check?
import pandas as pd
import os
import math

# ===================== CONFIG =====================
INPUT_CSV = "/Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial13-postprocess.csv"
OUTPUT_CSV = INPUT_CSV.replace(".csv", "-postprocess-mergedIDs.csv")

DIST_MIN = 70    # minimum distance threshold
DIST_MAX = 100   # maximum distance threshold

# ===================== LOAD =====================
df = pd.read_csv(INPUT_CSV)

# Sort by PersonID then Frame
df = df.sort_values(["PersonID", "Frame"]).reset_index(drop=True)

# Keep a mapping of final merged IDs
id_mapping = {}

# Helper function: Euclidean distance
def distance(p1, p2):
    return math.sqrt((p1[0]-p2[0])**2 + (p1[1]-p2[1])**2)

# ===================== PROCESS =====================
# Store last known position of each merged ID
last_positions = {}

for pid, group in df.groupby("PersonID"):
    first_row = group.iloc[0]
    last_row = group.iloc[-1]

    new_id = pid
    first_pos = (first_row["PixelX"], first_row["PixelY"])
    first_frame = first_row["Frame"]

    best_match = None
    best_dist = float("inf")

    # Check all existing merged IDs
    for existing_id, (ex_frame, ex_pos) in last_positions.items():
        if ex_frame < first_frame:  # only look at earlier tracks
            d = distance(first_pos, ex_pos)
            if DIST_MIN <= d <= DIST_MAX and d < best_dist:
                best_match = existing_id
                best_dist = d

    if best_match is not None:
        # Merge: map this PersonID to existing one
        id_mapping[new_id] = best_match
        # Update last position of matched ID with this new track’s last row
        last_positions[best_match] = (last_row["Frame"], (last_row["PixelX"], last_row["PixelY"]))
    else:
        # Keep as new identity
        id_mapping[new_id] = new_id
        last_positions[new_id] = (last_row["Frame"], (last_row["PixelX"], last_row["PixelY"]))

# ===================== APPLY MERGE =====================
df["MergedID"] = df["PersonID"].map(id_mapping)

# ===================== SAVE =====================
os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)
df.to_csv(OUTPUT_CSV, index=False)

print(f"Post-processed merged CSV saved to {OUTPUT_CSV}")


Post-processed merged CSV saved to /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial13-postprocess-postprocess-mergedIDs.csv


In [19]:
#pass 1
import pandas as pd
import networkx as nx

# ===================== CONFIG =====================
INPUT_CSV = "/Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial13-postprocess.csv"
FPS = 30  # frames per second (needed to convert seconds to frames)
X_THRESH = 40  # pixels
Y_THRESH = 40  # pixels
TIME_THRESH_SEC = 2  # max time difference in seconds
FRAME_THRESH = TIME_THRESH_SEC * FPS

OUTPUT_MERGED = INPUT_CSV.replace(".csv", "-mergeID1.csv")
OUTPUT_SUMMARY = INPUT_CSV.replace(".csv", "-summary1.csv")

# ===================== LOAD DATA =====================
df = pd.read_csv(INPUT_CSV, header=None, names=["Time", "PersonID", "Frame", "X", "Y", "Zone"])

# Convert numeric columns
df["Frame"] = pd.to_numeric(df["Frame"], errors="coerce")
df["X"] = pd.to_numeric(df["X"], errors="coerce")
df["Y"] = pd.to_numeric(df["Y"], errors="coerce")
df["PersonID"] = df["PersonID"].astype(str)

df = df.dropna(subset=["PersonID", "Frame", "X", "Y"])

# ===================== FIRST & LAST OBSERVATIONS =====================
first_obs = df.groupby("PersonID").first().reset_index()
last_obs = df.groupby("PersonID").last().reset_index()

# Merge first and last for easier comparison
first_last_df = pd.merge(first_obs, last_obs, on="PersonID", suffixes=("_first", "_last"))

# ===================== BUILD LINKS BASED ON FINAL→FIRST MATCH =====================
G = nx.Graph()

# Each PersonID is a node
for pid in first_last_df["PersonID"]:
    G.add_node(pid)

for idx_a, row_a in first_last_df.iterrows():
    for idx_b, row_b in first_last_df.iterrows():
        if row_a.PersonID == row_b.PersonID:
            continue

        # Spatial proximity: last of A to first of B
        dx = abs(row_a.X_last - row_b.X_first)
        dy = abs(row_a.Y_last - row_b.Y_first)
        frame_diff = abs(row_a.Frame_last - row_b.Frame_first)

        if dx <= X_THRESH and dy <= Y_THRESH and frame_diff <= FRAME_THRESH:
            # Link these two IDs
            G.add_edge(row_a.PersonID, row_b.PersonID)

# ===================== ASSIGN MERGEIDs =====================
components = list(nx.connected_components(G))
merge_map = {}
for merge_id, comp in enumerate(components, start=1):
    for pid in comp:
        merge_map[pid] = merge_id

# Assign MergeID back to full dataset
df["MergeID"] = df["PersonID"].map(merge_map).fillna(-1).astype(int)

# ===================== SAVE MERGED DATA =====================
df.to_csv(OUTPUT_MERGED, index=False)

# ===================== SUMMARY PER MergeID =====================
summary_rows = []
for merge_id in sorted(df["MergeID"].unique()):
    subset = df[df["MergeID"] == merge_id].sort_values(by="Frame")
    merged_persons = sorted(subset["PersonID"].unique())

    # Filter rows with non-empty zone
    zone_rows = subset[subset["Zone"].notna() & (subset["Zone"].astype(str) != "")]

    if not zone_rows.empty:
        # Arrival: first zone
        arrival_row = zone_rows.iloc[0]
        arrival_time = arrival_row["Time"]
        arrival_zone = arrival_row["Zone"]

        # Departure: last time in same arrival zone
        same_zone_rows = zone_rows[zone_rows["Zone"] == arrival_zone]
        departure_time = same_zone_rows.iloc[-1]["Time"]

        # Last time / last zone: last non-empty zone in subset
        last_row = zone_rows.iloc[-1]
        last_time = last_row["Time"]
        last_zone = last_row["Zone"]
    else:
        arrival_time, arrival_zone, departure_time, last_time, last_zone = None, None, None, None, None

    first_row = subset.iloc[0]

    summary_rows.append({
        "MergeID": merge_id,
        "PersonIDs": merged_persons,
        "FirstTime": first_row["Time"],
        "FirstFrame": first_row["Frame"],
        "FirstX": first_row["X"],
        "FirstY": first_row["Y"],
        "FirstZone": first_row.get("Zone", ""),
        "ArrivalTime": arrival_time,
        "ArrivalZone": arrival_zone,
        "DepartureTime": departure_time,
        "LastTime": last_time,
        "LastFrame": subset["Frame"].iloc[-1],
        "LastX": subset["X"].iloc[-1],
        "LastY": subset["Y"].iloc[-1],
        "LastZone": last_zone
    })

summary_df = pd.DataFrame(summary_rows)
summary_df.to_csv(OUTPUT_SUMMARY, index=False)

print(f"Saved merged dataset → {OUTPUT_MERGED}")
print(f"Saved summary → {OUTPUT_SUMMARY}")


Saved merged dataset → /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial13-postprocess-mergeID1.csv
Saved summary → /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial13-postprocess-summary1.csv


In [7]:
#passs 2
import pandas as pd
import networkx as nx
import ast

# ===================== CONFIG =====================
INPUT_CSV = "/Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial13-postprocess-summary1.csv"
FPS = 30
X_THRESH = 80  # larger threshold
Y_THRESH = 80
TIME_THRESH_SEC = 5  # larger temporal threshold
FRAME_THRESH = TIME_THRESH_SEC * FPS

OUTPUT_MERGED = INPUT_CSV.replace(".csv", "-mergeID2.csv")
OUTPUT_SUMMARY = INPUT_CSV.replace(".csv", "-summary2.csv")

# ===================== LOAD DATA =====================
df = pd.read_csv(INPUT_CSV)

# Use first/last positions from first pass
df = df.dropna(subset=["FirstFrame", "FirstX", "FirstY", "LastFrame", "LastX", "LastY"])

# ===================== BUILD LINKS BASED ON MERGEID FINAL→FIRST =====================
G = nx.Graph()

# Each MergeID is a node
for mid in df["MergeID"]:
    G.add_node(mid)

for idx_a, row_a in df.iterrows():
    for idx_b, row_b in df.iterrows():
        if row_a.MergeID == row_b.MergeID:
            continue

        dx = abs(row_a.LastX - row_b.FirstX)
        dy = abs(row_a.LastY - row_b.FirstY)
        frame_diff = abs(row_a.LastFrame - row_b.FirstFrame)

        if dx <= X_THRESH and dy <= Y_THRESH and frame_diff <= FRAME_THRESH:
            G.add_edge(row_a.MergeID, row_b.MergeID)

# ===================== ASSIGN NEW MERGEIDs =====================
components = list(nx.connected_components(G))
merge_map = {}
for new_merge_id, comp in enumerate(components, start=1):
    for old_merge_id in comp:
        merge_map[old_merge_id] = new_merge_id

# Map new MergeID to the first-pass summary
df["MergeID_2"] = df["MergeID"].map(merge_map).fillna(-1).astype(int)

# ===================== SUMMARY =====================
#passs 2
import pandas as pd
import networkx as nx
import ast

# ===================== CONFIG =====================
INPUT_CSV = "/Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial13-postprocess-summary1.csv"
FPS = 30
X_THRESH = 80  # larger threshold
Y_THRESH = 80
TIME_THRESH_SEC = 5  # larger temporal threshold
FRAME_THRESH = TIME_THRESH_SEC * FPS

OUTPUT_MERGED = INPUT_CSV.replace(".csv", "-mergeID2.csv")
OUTPUT_SUMMARY = INPUT_CSV.replace(".csv", "-summary2.csv")

# ===================== LOAD DATA =====================
df = pd.read_csv(INPUT_CSV)

# Use first/last positions from first pass
df = df.dropna(subset=["FirstFrame", "FirstX", "FirstY", "LastFrame", "LastX", "LastY"])

# ===================== BUILD LINKS BASED ON MERGEID FINAL→FIRST =====================
G = nx.Graph()

# Each MergeID is a node
for mid in df["MergeID"]:
    G.add_node(mid)

for idx_a, row_a in df.iterrows():
    for idx_b, row_b in df.iterrows():
        if row_a.MergeID == row_b.MergeID:
            continue

        dx = abs(row_a.LastX - row_b.FirstX)
        dy = abs(row_a.LastY - row_b.FirstY)
        frame_diff = abs(row_a.LastFrame - row_b.FirstFrame)

        if dx <= X_THRESH and dy <= Y_THRESH and frame_diff <= FRAME_THRESH:
            G.add_edge(row_a.MergeID, row_b.MergeID)

# ===================== ASSIGN NEW MERGEIDs =====================
components = list(nx.connected_components(G))
merge_map = {}
for new_merge_id, comp in enumerate(components, start=1):
    for old_merge_id in comp:
        merge_map[old_merge_id] = new_merge_id

# Map new MergeID to the first-pass summary
df["MergeID_2"] = df["MergeID"].map(merge_map).fillna(-1).astype(int)

# ===================== SUMMARY =====================
summary_rows = []
for merge_id in sorted(df["MergeID_2"].unique()):
    subset = df[df["MergeID_2"] == merge_id].sort_values(by="FirstFrame")
    first_row = subset.iloc[0]
    last_row = subset.iloc[-1]

    # Flatten PersonIDs
    merged_persons = []
    for p in subset["PersonIDs"]:
        if isinstance(p, str):
            merged_persons.extend(ast.literal_eval(p))
        else:
            merged_persons.append(p)
    merged_persons = sorted(set(merged_persons))

    # Determine Arrival/Departure based on first-pass info
    # ArrivalTime: earliest among subset
    arrival_times = subset["ArrivalTime"].dropna()
    arrival_zones = subset["ArrivalZone"].dropna()
    departure_times = subset["DepartureTime"].dropna()

    arrival_time = arrival_times.min() if not arrival_times.empty else None
    arrival_zone = arrival_zones.iloc[0] if not arrival_zones.empty else None
    # DepartureTime: last time in the SAME zone as arrival_zone
    if arrival_zone is not None:
        dep_rows = subset[subset["ArrivalZone"] == arrival_zone]
        departure_time = dep_rows["DepartureTime"].max() if not dep_rows.empty else None
    else:
        departure_time = None

    summary_rows.append({
        "MergeID_2": merge_id,
        "PersonIDs": merged_persons,
        "FirstTime": first_row["FirstTime"],
        "FirstFrame": first_row["FirstFrame"],
        "FirstX": first_row["FirstX"],
        "FirstY": first_row["FirstY"],
        "FirstZone": first_row.get("FirstZone", ""),
        "ArrivalTime": arrival_time,
        "ArrivalZone": arrival_zone,
        "DepartureTime": departure_time,
        "LastTime": last_row["LastTime"],
        "LastFrame": last_row["LastFrame"],
        "LastX": last_row["LastX"],
        "LastY": last_row["LastY"],
        "LastZone": last_row.get("LastZone", "")
    })


summary_df = pd.DataFrame(summary_rows)
summary_df.to_csv(OUTPUT_SUMMARY, index=False)

print(f"Saved second-pass merged dataset → {OUTPUT_MERGED}")
print(f"Saved second-pass summary → {OUTPUT_SUMMARY}")


summary_df = pd.DataFrame(summary_rows)
summary_df.to_csv(OUTPUT_SUMMARY, index=False)

print(f"Saved second-pass merged dataset → {OUTPUT_MERGED}")
print(f"Saved second-pass summary → {OUTPUT_SUMMARY}")

Saved second-pass merged dataset → /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial13-postprocess-summary1-mergeID2.csv
Saved second-pass summary → /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial13-postprocess-summary1-summary2.csv
Saved second-pass merged dataset → /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial13-postprocess-summary1-mergeID2.csv
Saved second-pass summary → /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial13-postprocess-summary1-summary2.csv


In [8]:
#pass 3
import pandas as pd
import networkx as nx
import ast

# ===================== CONFIG =====================
INPUT_CSV = "/Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial13-postprocess-summary1-summary2.csv"
FPS = 30
X_THRESH = 100  # spatial threshold for third pass
Y_THRESH = 100
TIME_THRESH_SEC = 6  # temporal threshold
FRAME_THRESH = TIME_THRESH_SEC * FPS

OUTPUT_MERGED = INPUT_CSV.replace(".csv", "-mergeID3.csv")
OUTPUT_SUMMARY = INPUT_CSV.replace(".csv", "-summary3.csv")

# ===================== LOAD DATA =====================
df = pd.read_csv(INPUT_CSV)
df = df.dropna(subset=["FirstFrame", "FirstX", "FirstY", "LastFrame", "LastX", "LastY"])

# ===================== BUILD LINKS BASED ON FINAL→FIRST MATCH =====================
G = nx.Graph()

# Each MergeID_2 is a node
for mid in df["MergeID_2"]:
    G.add_node(mid)

for idx_a, row_a in df.iterrows():
    for idx_b, row_b in df.iterrows():
        if row_a.MergeID_2 == row_b.MergeID_2:
            continue

        dx = abs(row_a.LastX - row_b.FirstX)
        dy = abs(row_a.LastY - row_b.FirstY)
        frame_diff = abs(row_a.LastFrame - row_b.FirstFrame)

        # Merge if within thresholds
        if dx <= X_THRESH and dy <= Y_THRESH and frame_diff <= FRAME_THRESH:
            G.add_edge(row_a.MergeID_2, row_b.MergeID_2)

# ===================== ASSIGN NEW MERGEIDs =====================
components = list(nx.connected_components(G))
merge_map = {}
for new_merge_id, comp in enumerate(components, start=1):
    for old_merge_id in comp:
        merge_map[old_merge_id] = new_merge_id

df["MergeID_3"] = df["MergeID_2"].map(merge_map).fillna(-1).astype(int)

# ===================== SAVE THIRD PASS =====================
df.to_csv(OUTPUT_MERGED, index=False)

# ===================== SUMMARY =====================
summary_rows = []
for merge_id in sorted(df["MergeID_3"].unique()):
    subset = df[df["MergeID_3"] == merge_id].sort_values(by="FirstFrame")
    first_row = subset.iloc[0]
    last_row = subset.iloc[-1]

    # Flatten PersonIDs safely
    merged_persons = []
    for p in subset["PersonIDs"]:
        if isinstance(p, str):
            merged_persons.extend(ast.literal_eval(p))
        else:
            merged_persons.append(p)
    merged_persons = sorted(set(merged_persons))

    # Preserve earliest ArrivalZone from previous-pass groups
    arrival_subset = subset.dropna(subset=["ArrivalTime", "ArrivalZone"])
    if not arrival_subset.empty:
        # Earliest arrival
        arrival_row = arrival_subset.loc[arrival_subset["ArrivalTime"].idxmin()]
        arrival_time = arrival_row["ArrivalTime"]
        arrival_zone = arrival_row["ArrivalZone"]

        # DepartureTime: last time in the same arrival_zone across all merged groups
        same_zone_rows = subset[subset["ArrivalZone"] == arrival_zone]
        if not same_zone_rows.empty:
            departure_time = same_zone_rows["DepartureTime"].max()
        else:
            departure_time = arrival_time
    else:
        arrival_time, arrival_zone, departure_time = None, None, None

    # LastTime / LastZone: last detection overall among merged rows
    last_detection_rows = subset.dropna(subset=["LastTime", "LastZone"])
    if not last_detection_rows.empty:
        last_row_final = last_detection_rows.iloc[-1]
        last_time = last_row_final["LastTime"]
        last_zone = last_row_final["LastZone"]
    else:
        last_time, last_zone = None, None

    summary_rows.append({
        "MergeID_3": merge_id,
        "PersonIDs": merged_persons,
        "FirstTime": first_row["FirstTime"],
        "FirstFrame": first_row["FirstFrame"],
        "FirstX": first_row["FirstX"],
        "FirstY": first_row["FirstY"],
        "FirstZone": first_row.get("FirstZone", ""),
        "ArrivalTime": arrival_time,
        "ArrivalZone": arrival_zone,
        "DepartureTime": departure_time,
        "LastTime": last_time,
        "LastFrame": last_row["LastFrame"],
        "LastX": last_row["LastX"],
        "LastY": last_row["LastY"],
        "LastZone": last_zone
    })

summary_df = pd.DataFrame(summary_rows)
summary_df.to_csv(OUTPUT_SUMMARY, index=False)

print(f"Saved third-pass merged dataset → {OUTPUT_MERGED}")
print(f"Saved third-pass summary → {OUTPUT_SUMMARY}")


Saved third-pass merged dataset → /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial13-postprocess-summary1-summary2-mergeID3.csv
Saved third-pass summary → /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial13-postprocess-summary1-summary2-summary3.csv


In [9]:
#final clean up - delete row if time is 1s or less
import pandas as pd

# ===================== CONFIG =====================
INPUT_FILE = "/Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial13-postprocess-summary1-summary2-summary3.csv"
OUTPUT_FILE = "/Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial13-postprocess-summary1-summary2-summary3-cleaned.csv" 

# ===================== LOAD =====================
df = pd.read_csv(INPUT_FILE)

# ===================== CONVERT TIMES =====================
# Convert to timedelta for comparison
df["FirstTime_td"] = pd.to_timedelta(df["FirstTime"])
df["LastTime_td"] = pd.to_timedelta(df["LastTime"])

# ===================== FILTER ROWS =====================
# Keep only rows where duration > 0 seconds
df_clean = df[df["LastTime_td"] != df["FirstTime_td"]].copy()

# ===================== REMOVE TEMP COLUMNS =====================
df_clean = df_clean.drop(columns=["FirstTime_td", "LastTime_td"], errors="ignore")

# ===================== SAVE =====================
df_clean.to_csv(OUTPUT_FILE, index=False)

print(f"Cleaned table saved to {OUTPUT_FILE}")


Cleaned table saved to /Users/cherrychoy/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/Honours Thesis/Videos/TRIAL/trial13-postprocess-summary1-summary2-summary3-cleaned.csv
