In [1]:
from pathlib import Path
import pandas as pd

from rekall.predicates import _area, _iou, during_inv, overlaps
from stsearch.stdlib import tiou

In [26]:
OKUTAMA_DIR = Path("/home/zf/okutama/Train-Set")
# OKUTAMA_LABEL_DIR = OKUTAMA_DIR / "Labels/SingleActionLabels/3840x2160"
OKUTAMA_LABEL_DIR = OKUTAMA_DIR / "Labels/SingleActionTrackingLabels/3840x2160"
LIST_CLIP_ID = [p.stem for p in OKUTAMA_LABEL_DIR.iterdir() if p.suffix=='.txt']
print(LIST_CLIP_ID)
assert len(LIST_CLIP_ID) == 33

['1.1.6', '2.1.1', '2.1.5', '2.1.3', '2.1.10', '1.2.8', '1.2.4', '1.2.2', '2.1.4', '2.2.11', '2.2.4', '1.1.1', '2.1.2', '1.2.11', '1.1.2', '2.2.7', '1.2.9', '2.1.6', '1.2.6', '1.1.4', '2.1.7', '2.2.6', '1.1.5', '1.2.7', '1.2.5', '2.2.5', '1.1.7', '2.2.9', '1.1.3', '2.2.8', '2.2.2', '1.1.10', '1.1.11']


In [27]:
class EVENTYPE(object):
    PUSHING = "Pushing/Pulling"
    HANDSHAKE = "Hand Shaking"

In [28]:
def parse_event_list_of_clip(clip_id:str, event:str) -> pd.DataFrame:
    path = OKUTAMA_LABEL_DIR / f"{clip_id}.txt"
    columns = ['track_id', 'x1', 'y1', 'x2', 'y2', 'frame_id', 'lost', 'occluded', 'generated', 'label', 'action']
    df = pd.read_csv(path, sep=' ', names=columns, index_col=False)
    
    # remove occluded and lost
    df = df[(df['lost']==0) & (df['occluded']==0)]
    
    # select event
    df = df[df['action'] == event]
    
    # compress track into one record
    rv = []
    for track_id in df['track_id'].unique():
        v = df[df['track_id']==track_id]
        rv.append(
        {
            'track_id': track_id,
            'x1': v['x1'].min(),
            'x2': v['x2'].max(),
            'y1': v['y1'].min(),
            'y2': v['y2'].max(),
            't1': v['frame_id'].min(),
            't2': v['frame_id'].max()
        })
    
    return pd.DataFrame(rv)

# parse_event_list_of_clip('1.1.1', EVENTYPE.PUSHING)

def parse_event_list_all_clip(event:str) -> pd.DataFrame:
    all_df = []
    for clip_id in LIST_CLIP_ID:
        df = parse_event_list_of_clip(clip_id, event)
        if not df.empty:
            df['clip_id'] = clip_id
            all_df.append(df)
    return pd.concat(all_df, ignore_index=True)

In [29]:
parse_event_list_all_clip(EVENTYPE.PUSHING)

Unnamed: 0,track_id,x1,x2,y1,y2,t1,t2,clip_id
0,14,2856,3443,1014,1979,310,540,1.1.6
1,47,1245,2389,427,2153,830,1440,1.1.6
2,45,1194,2040,0,1487,1621,1980,1.1.6
3,88,1422,3468,674,2142,1980,2329,1.1.6
4,11,1279,1366,922,1037,420,459,2.1.1
...,...,...,...,...,...,...,...,...
129,50,297,832,1613,2150,1360,1399,1.1.10
130,76,0,1686,70,1309,2080,2279,1.1.10
131,85,744,1813,275,2094,2380,2529,1.1.10
132,87,511,1526,1028,1993,2400,2549,1.1.10


In [36]:
def parse_result(result_file):
    """
    Parse experiment result cvs file. Convert relative coords to pixel coords.
    """
    df = pd.read_csv(result_file, index_col=0)
    # convert relative coord to pixel coord
    df['x1'] = df['x1'] * df['width']
    df['x2'] = df['x2'] * df['width']
    df['y1'] = df['y1'] * df['height']
    df['y2'] = df['y2'] * df['height']
    return df

def is_hit(ipred, igt):
    return _iou(ipred, igt) > 0.01 and overlaps()(ipred, igt) #  tiou(i1, i2)>0.1

# compute precision, recall, result length
def eval_prediction(result_file, event_type):
    """
    Given a result file and event_type, evaluate metrics against GT annotation
    """
    GT = parse_event_list_all_clip(event_type)
    prediction = parse_result(result_file)
    
    GT['hit'] = False
    prediction['hit'] = False

    for i, pred in prediction.iterrows():
        for j, gt in GT[GT['clip_id']==pred['clip_id']].iterrows():
            if is_hit(pred, gt):
                prediction.loc[i, ['hit']] = True
                GT.loc[j, ['hit']] = True

    GT_hit = len(GT[GT['hit']==True])
    prediction_hit = len(prediction[prediction['hit']==True])
    video_hrs = sum((prediction['t2']-prediction['t1'])/prediction['fps'])/3600

    print(f"Precision={prediction_hit}/{len(prediction)}={prediction_hit/len(prediction):.2f}")
    print(f"Recall={GT_hit}/{len(GT)}={GT_hit/len(GT):.2f}")
    print(f"Video length={video_hrs:.3f} hrs")
    print(f"Productivity={GT_hit/video_hrs} /hr")
    
    return GT, prediction

In [38]:
# pushing L2
_ = eval_prediction("pushing_result.csv", EVENTYPE.PUSHING)

Precision=83/125=0.66
Recall=77/134=0.57
Video length=0.195 hrs
Productivity=395.34109816971716 /hr


In [39]:
# pushing L1
_ = eval_prediction("person_and_object_result.csv", EVENTYPE.PUSHING)

Precision=80/151=0.53
Recall=100/134=0.75
Video length=0.428 hrs
Productivity=233.457988370334 /hr


In [33]:
parse_event_list_all_clip(EVENTYPE.HANDSHAKE)

Unnamed: 0,track_id,x1,x2,y1,y2,t1,t2,clip_id
0,46,1200,1444,629,1723,1280,1349,1.1.6
1,45,1256,1537,669,1734,1280,1349,1.1.6
2,5,1267,1470,964,1551,230,439,2.1.1
3,4,1338,1534,981,1565,230,439,2.1.1
4,5,1920,2108,674,1014,270,319,2.1.5
...,...,...,...,...,...,...,...,...
66,2,1841,1905,986,1073,480,540,2.2.5
67,0,1897,1956,1014,1071,420,459,2.2.5
68,7,1869,1964,919,1017,420,540,2.2.5
69,8,2889,3752,924,1113,901,1659,2.2.8


In [40]:
# handshake L1 (two person)
_ = eval_prediction("two_person_result.csv", EVENTYPE.HANDSHAKE)

Precision=27/145=0.19
Recall=48/71=0.68
Video length=0.387 hrs
Productivity=123.88576890906938 /hr


In [42]:
# handshake L2 
GT, prediction = eval_prediction("handshake_result.csv", EVENTYPE.HANDSHAKE)

Precision=31/150=0.21
Recall=48/71=0.68
Video length=0.368 hrs
Productivity=130.41509433962284 /hr


In [44]:
GT[GT['hit']==False]

Unnamed: 0,track_id,x1,x2,y1,y2,t1,t2,clip_id,hit
10,28,2206,2305,761,1054,1410,1479,2.1.5,False
13,9,2009,2068,610,789,210,219,2.1.4,False
14,8,1942,2012,615,778,210,219,2.1.4,False
15,7,1897,1942,680,896,210,229,2.1.4,False
16,4,1841,1894,685,893,210,229,2.1.4,False
22,36,1655,1768,666,896,1180,1209,2.1.4,False
23,37,1804,1936,385,584,1081,1129,2.1.4,False
31,3,2099,2181,430,525,1090,1109,2.1.2,False
32,4,2136,2209,446,548,1090,1109,2.1.2,False
35,37,1984,2077,1501,1605,1200,1209,1.2.11,False


In [12]:
df = pd.read_csv("okutama_metadata.csv")

In [14]:
df['frame_count'].sum()/30/3600

0.5540925925925926