In [1]:
import json
import os
import pandas as pd
from mmif import Mmif, AnnotationTypes, DocumentTypes, View, Annotation
from mmif.utils import video_document_helper as vdh
from typing import Dict, List, Set, Optional, Union, Tuple
from collections import defaultdict
from clams_utils.aapb import guidhandler
from io import StringIO

In [2]:
gold_file = './test/cpb-aacip-81-902z3f9j.gold.csv'
df = pd.read_csv(gold_file).dropna(subset=['ANNOTATIONS'])
df

Unnamed: 0,GUID,FRAME,SKIPPED,ANNOTATIONS
0,cpb-aacip-81-902z3f9j,240,False,",PROGRAM,WOMAN*\n,AIR DATE,TBA*\n,PROG LENGTH,..."
1,cpb-aacip-81-902z3f9j,255,True,DUPLICATE
2,cpb-aacip-81-902z3f9j,270,True,DUPLICATE
3,cpb-aacip-81-902z3f9j,285,True,DUPLICATE
4,cpb-aacip-81-902z3f9j,300,True,DUPLICATE
...,...,...,...,...
65,cpb-aacip-81-902z3f9j,52605,True,DUPLICATE
66,cpb-aacip-81-902z3f9j,52620,True,DUPLICATE
67,cpb-aacip-81-902z3f9j,52635,True,DUPLICATE
68,cpb-aacip-81-902z3f9j,52650,True,DUPLICATE


In [3]:
def csv_string_to_set(csv_string: str) -> Set[Tuple[str, str]]:
    """
    Convert csv-string to a set of tuples which represent (role, filler) pairs
    
    :params: csv_string: the input csv-formatted string
    :return: a set of tuples of (role, filler) 
    """
    rf_set = set()
    for pair in csv_string.split('\n'):
        _, role, filler = pair.split(',', maxsplit=2)
        rf_set.add((role, filler))
    return rf_set

In [6]:
min_frame, max_frame = -1, -1
anns = set()
frames = defaultdict(set)
for _, frame in df.iterrows():
    if not frame['SKIPPED']:
        if anns:
            frames[(min_frame, max_frame)] = anns
        anns = csv_string_to_set(frame['ANNOTATIONS'])
        min_frame = frame['FRAME']
    else:
        if frame['ANNOTATIONS'] == 'DUPLICATE':
            max_frame = frame['FRAME']
        else:
            if anns:
                frames[(min_frame, max_frame)] = anns
                anns = set()
    
frames

defaultdict(set,
            {(240, 495): {('*OR', '*'),
              ('AGENCY', '*'),
              ('AIR DATE', 'TBA*'),
              ('CLIENT', 'WNFD'),
              ('CUT', '*'),
              ('DIRECTOR', '*'),
              ('PROG LENGTH', '*'),
              ('PROGRAM', 'WOMAN*'),
              ('RECORD DATE', '2/15/7?')},
             (51870, 52035): {('advisory board', 'A. Wilmot Jacobsen, M.D.'),
              ('advisory board', 'Dr. Gloria Roblin'),
              ('advisory board', 'Marjorie Plumb, M.D.'),
              ('advisory board', 'Stanford Copley, M.D.')},
             (52050, 52155): {("Samantha Dean's fashions by", 'Jenss')},
             (52170, 52260): {('director', 'Will George')},
             (52275, 52380): {('producer', 'Sandra Elkin')},
             (52395,
              52470): {('executive producer', 'John L. Hutchinson, Jr.')}})

In [26]:
error_str = []
for root, _, anns in os.walk(gold_dir):
    for ann in anns:
        file_path = os.path.join(root, ann)
        df = pd.read_csv(file_path).dropna(subset=['ANNOTATIONS'])
        for _, frame in df.iterrows():
            if not frame['SKIPPED']:
                input_string = frame['ANNOTATIONS']
                try:
                    output = csv_string_to_set(input_string)
                except ValueError:
                    error_str.append(input_string)
                    print(f"{guidhandler.get_aapb_guid_from(ann)}:\n\n{input_string}\n")

In [20]:
sample_str = error_str[2]
sample_str

',Produced by,Tim Smith\n,Produced by,Tony Van Witsen\n,Videotape Courtesy Of:,WCBS, NEW YORK\n,Videotape Courtesy Of:,KCTA, SEATTLE\n,Videotape Courtesy Of:,KQED, SAN FRANCISCO\n,Videotape Courtesy Of:,WFLI, INDIANA\n,Videotape Courtesy Of:,WFAA, DALLAS'

In [22]:
sample_str.split('\n')

[',Produced by,Tim Smith',
 ',Produced by,Tony Van Witsen',
 ',Videotape Courtesy Of:,WCBS, NEW YORK',
 ',Videotape Courtesy Of:,KCTA, SEATTLE',
 ',Videotape Courtesy Of:,KQED, SAN FRANCISCO',
 ',Videotape Courtesy Of:,WFLI, INDIANA',
 ',Videotape Courtesy Of:,WFAA, DALLAS']

In [24]:
',Videotape Courtesy Of:,WCBS, NEW YORK'.split(',', 2)

['', 'Videotape Courtesy Of:', 'WCBS, NEW YORK']

In [2]:
pred_file = '/home/bhj-dev/clams/cpb-aacip-191-40ksn47s.swt.paddleocr.rfb.mmif'
rfb_mmif = Mmif(json.load(open(pred_file)))
rfb_view = rfb_mmif.views.get_last_contentful_view()
ocr_view = rfb_mmif.get_view_by_id('v_1')
swt_view = rfb_mmif.get_view_by_id('v_0')

In [3]:
sample_tp = swt_view.get_annotation_by_id('v_0:tp_343') 
print(f"TimePoint: {sample_tp.get_property('timePoint')}")
print(f"Frame: {vdh.convert_timepoint(rfb_mmif, sample_tp, 'frames')}")

TimePoint: 342009
Frame: 10250


In [8]:
SWT_APP = 'http://apps.clams.ai/swt-detection/v5.0'
RFB_APP = 'http://apps.clams.ai/role-filler-binder/41cb5b8'

def get_align_to(ann: Annotation, view: View) -> Optional[Annotation]:
    for al in view.get_annotations(AnnotationTypes.Alignment):
        if aligned_ann := ann.aligned_to_by(al):
            return aligned_ann

def get_aligned_ann_of(mmif: Mmif, source: Annotation, source_app: str, target_app: str) -> Optional[Annotation]:
    valid_views = {view.metadata.app: view for view in mmif.views if not (view.has_error() or view.has_warnings())}
    # Validate if two apps are in mmif
    if not (source_app and target_app) in valid_views:
        raise ValueError(f"Either {source_app} or {target_app} is not in mmif")
    
    current_view, target_view = valid_views[source_app], valid_views[target_app]
    current_ann = source
    while current_view.id != target_view.id:
        next_ann = get_align_to(current_ann, current_view)
        current_view = mmif.get_view_by_id(next_ann.parent)
        current_ann = next_ann
    return current_ann

def csv_string_to_pair(csv_string: str) -> List[Dict]:
    return set(pd.read_csv(StringIO(csv_string), index_col=0).fillna('nan').itertuples(index=False, name=None))

def load_pred(file: Union[str, os.PathLike]) -> Dict[str, Dict]:
    guid = guidhandler.get_aapb_guid_from(file)
    
    rfb_mmif = Mmif(json.load(open(file)))
    rfb_view = rfb_mmif.views.get_last_contentful_view()
     
    frames_dict = {}
    for rfb_td in rfb_view.get_documents():
        aligned_tp = get_aligned_ann_of(rfb_mmif, rfb_td, RFB_APP, SWT_APP)
        aligned_frame = vdh.convert_timepoint(rfb_mmif, aligned_tp, 'frames')
        frames_dict[aligned_frame] = csv_string_to_pair(rfb_td.text_value)
    
    return {guid: frames_dict} 

In [11]:
output = load_pred(pred_file)
for frame, anns in output['cpb-aacip-191-40ksn47s'].items():
    print(f"{frame}:")
    for ann in anns:
        print(f"\t{ann}")

10250:
	('Dir., Behavior Health Services', 'Lynn Brady')
12888:
	('Knauer DSanta Fe', 'Rep. Patsy Trujillo')
17413:
	('Hands Across Gultures', 'Verna Roybal')
39591:
	('Knauer DSanta Fe', 'Rep. Patsy Trujillo')
43157:
	('DirBehavior Health Services', 'Lynn Brady')
47203:
	('Hands Across Cultures', 'Verna Roybal')
52028:
	('Technical Director', 'Randy Lantz')
	('Director', 'Eric Mathes')
52148:
	('Audio', 'Kevin Richard Lee')
	('Video Tape', 'Monica F. P. Williams')
52298:
	('nan', 'Randy ALantz')
	('Prompter Operator', 'Alicia Maldonado')
	('nan', 'Deborah = Starke')
52448:
	('nan', 'Trissel')
	('nan', 'Deborah Starke')
	('nan', 'Maldonado')
52598:
	('Studio Lighting', 'Kevin Lee')
	('Studio Lighting', 'Michael Kamins')
	('Studio Engineers', 'Bob Henry')
	('Studio Engineers', 'Al Deme')
52718:
	('Design Apprentice', 'Joseph Barron')
	('nan', 'Gordon Kennedy')


In [14]:
set(output.keys())

{'cpb-aacip-191-40ksn47s'}

In [16]:
guid = next(iter(output))
frame_set = set(output[guid].keys())
print(frame_set)

{52448, 47203, 17413, 39591, 10250, 52298, 52718, 52148, 43157, 52598, 12888, 52028}


In [17]:
set.intersection(frame_set, {17413, 47203})

{17413, 47203}