# load the packages

Make sure that you have installed the `cltl-face-all` Python package from https://github.com/leolani/cltl-face-all

Also install `grmc` by running `pip install .` in the root directory of the repo.

In [2]:
import os
from grmc.representation import annotation, container, entity, scenario, util
from cltl_face_all.face_alignment import FaceDetection
from cltl_face_all.arcface import ArcFace
from cltl_face_all.agegender import AgeGender

# All three models run on the same device. Change this one constant rather
# than editing three constructor calls.
# NOTE(review): which device strings besides 'cpu' are accepted depends on
# cltl-face-all -- confirm before switching.
DEVICE = 'cpu'

# Face detector; used later via detect_faces / detect_landmarks / crop_and_align.
fd = FaceDetection(device=DEVICE, face_detector='sfd')
# Age / gender predictor; ag.predict(faces) is unpacked later as (ages, genders).
ag = AgeGender(device=DEVICE)
# Face embedding model; af.predict(faces) is used later to get the embeddings.
af = ArcFace(device=DEVICE)

[*] load ckpt from /home/tk/.virtualenvs/dev-python-3.7/lib/python3.7/site-packages/cltl_face_all/arcface/./pretrained_models/arc_res50/e_8_b_40000.ckpt


# Download small dataset and its annotation

In [1]:
os.makedirs('data', exist_ok=True)

%cd data
!wget https://raw.githubusercontent.com/declare-lab/MELD/master/data/MELD/train_sent_emo.csv
!wget https://raw.githubusercontent.com/cltl/ma-communicative-robots/master/multimodal/dataset-small.json
!wget https://raw.githubusercontent.com/cltl/ma-communicative-robots/master/multimodal/dataset-medium.json
!wget https://raw.githubusercontent.com/cltl/ma-communicative-robots/master/multimodal/dataset-large.json

!gdown --id 1-2LeHC_5Cm2gWWT6vBrVhp8jorbjkN1_
!unzip visual-features.zip
!rm visual-features.zip
!gdown --id 16ck7plW9v9eSHGCs5wuB2AhhufPRt3Wi
!unzip smaller-dataset.zip
!rm smaller-dataset.zip
%cd ..

taset/dia576_utt0.mp4  
  inflating: smaller-dataset/dia576_utt1.mp4  
  inflating: smaller-dataset/dia576_utt2.mp4  
  inflating: smaller-dataset/dia576_utt3.mp4  
  inflating: smaller-dataset/dia576_utt4.mp4  
  inflating: smaller-dataset/dia576_utt5.mp4  
  inflating: smaller-dataset/dia576_utt6.mp4  
  inflating: smaller-dataset/dia576_utt7.mp4  
  inflating: smaller-dataset/dia576_utt8.mp4  
  inflating: smaller-dataset/dia576_utt9.mp4  
  inflating: smaller-dataset/dia576_utt10.mp4  
  inflating: smaller-dataset/dia577_utt0.mp4  
  inflating: smaller-dataset/dia577_utt1.mp4  
  inflating: smaller-dataset/dia577_utt2.mp4  
  inflating: smaller-dataset/dia577_utt3.mp4  
  inflating: smaller-dataset/dia577_utt4.mp4  
  inflating: smaller-dataset/dia577_utt5.mp4  
  inflating: smaller-dataset/dia577_utt6.mp4  
  inflating: smaller-dataset/dia577_utt7.mp4  
  inflating: smaller-dataset/dia585_utt0.mp4  
  inflating: smaller-dataset/dia585_utt1.mp4  
  inflating: smaller-dataset/dia585

# Simplify the data. After this cell, the dict `datasets` will have everything you need

In [28]:
import os
from glob import glob
import csv
import json


# Only train_sent_emo.csv is relevant to us, since the smaller datasets
# are subsets of the original train dataset.
annotation_path = 'data/train_sent_emo.csv'

VIDS_DIR = "data/smaller-dataset/"

# Paths of the JSON split definitions, keyed by dataset size. Kept separate
# from `datasets_` so that one name never holds both paths and parsed content.
dataset_paths = {
    'small': "data/dataset-small.json",
    'medium': "data/dataset-medium.json",
    'large': "data/dataset-large.json",
}

# datasets_[size] is the parsed JSON of the corresponding split file; the cells
# below index it as datasets_[size]['train' | 'dev' | 'test'].
datasets_ = {}
for datasize, path in dataset_paths.items():
    with open(path, 'r') as stream:
        datasets_[datasize] = json.load(stream)

# newline='' is what the csv module asks for on file objects: it lets the
# reader handle line endings itself, including newlines inside quoted fields.
with open(annotation_path, newline='') as f:
    # One list of strings per CSV row; row 0 is whatever the file starts with
    # (the code below skips it).
    annotations = list(csv.reader(f))


# See if we have all of the videos. Collect every missing path first so the
# failure message says which files are absent, not just that something is.
missing_videos = [
    os.path.join(VIDS_DIR, diautt)
    for datasize in ['small', 'medium', 'large']
    for datatype in ['train', 'dev', 'test']
    for diautt in datasets_[datasize][datatype]
    if not os.path.isfile(os.path.join(VIDS_DIR, diautt))
]
assert not missing_videos, \
    f"{len(missing_videos)} video file(s) missing, e.g. {missing_videos[:5]}"

# Find the corresponding speaker / emotion / sentiment from the annotations

# List the video directory once and keep the names in a set: the membership
# test below is then an exact file-name match, and the directory is not
# re-read (and re-stringified) for every CSV row.
available_videos = set(os.listdir(VIDS_DIR))

# Maps "dia<Dialogue_ID>_utt<Utterance_ID>" to the annotation fields of that row.
diautt2anno = {}
# annotations[0] is skipped -- presumably the CSV header row.
for row in annotations[1:]:
    # Raises ValueError if a row does not have exactly 11 columns.
    SrNo, Utterance, Speaker, Emotion, Sentiment, Dialogue_ID,\
        Utterance_ID, Season, Episode, StartTime, EndTime = row

    # Keep only the utterances whose video clip was actually downloaded.
    if f"dia{Dialogue_ID}_utt{Utterance_ID}.mp4" not in available_videos:
        continue

    diautt2anno[f"dia{Dialogue_ID}_utt{Utterance_ID}"] = \
        {'SrNo': SrNo,
         'Utterance': Utterance,
         'Speaker': Speaker,
         'Emotion': Emotion,
         'Sentiment': Sentiment,
         'Dialogue_ID': Dialogue_ID,
         'Utterance_ID': Utterance_ID,
         'Season': Season,
         'Episode': Episode,
         'StartTime': StartTime,
         'EndTime': EndTime}


# datasets[size][split][utterance key] -> annotation dict, where the utterance
# key is the video file name with the ".mp4" part split off.
datasets = {}
for size_name in ['small', 'medium', 'large']:
    splits = {}
    for split_name in ['train', 'dev', 'test']:
        keys = (fname.split('.mp4')[0] for fname in datasets_[size_name][split_name])
        # A KeyError here means a listed video has no entry in diautt2anno.
        splits[split_name] = {key: diautt2anno[key] for key in keys}
    datasets[size_name] = splits

Let's see what `datasets` has

In [38]:
# Keys have the form "dia<Dialogue_ID>_utt<Utterance_ID>" (the video file name
# without ".mp4"). This one is also used by the video-loading cell further down.
diautt_chosen = 'dia1000_utt0'

# Display the annotation dict of the chosen utterance in the small train split.
datasets['small']['train'][diautt_chosen]

{'SrNo': '10029',
 'Utterance': 'Would you look at this dump? He hated us. This is his final revenge!',
 'Speaker': 'Monica',
 'Emotion': 'disgust',
 'Sentiment': 'negative',
 'Dialogue_ID': '1000',
 'Utterance_ID': '0',
 'Season': '2',
 'Episode': '3',
 'StartTime': '00:06:45,780',
 'EndTime': '00:06:49,783'}

Load the video and get the visual features

In [37]:
import av
import numpy as np
from tqdm.notebook import tqdm

VIDPATH = os.path.join(VIDS_DIR, f'{diautt_chosen}.mp4')

# visual_features[frame index] -> list with one dict per face detected in that
# frame (empty list when no face was found).
visual_features = {}

# Open the video in a `with` block so the file is closed when decoding finishes
# or fails. The handle is deliberately not named `container`: that name is
# already bound to the module imported from grmc.representation.
with av.open(VIDPATH) as video:
    for frame in tqdm(video.decode(video=0)):
        idx = frame.index
        visual_features[idx] = []
        numpy_RGB = np.array(frame.to_image())
        # The models are called on a batch, so add a leading axis of size 1.
        batch = numpy_RGB[np.newaxis, ...]
        bboxes = fd.detect_faces(batch)
        print(f"number of faces detected in the frame {idx} is {len(bboxes[0])}")

        # Nothing to align or embed in this frame; leave its list empty.
        if len(bboxes[0]) == 0:
            continue

        landmarks = fd.detect_landmarks(batch, bboxes)

        faces = fd.crop_and_align(batch, bboxes, landmarks)
        # Join the returned face crops along axis 0 before handing them to the
        # embedding and age/gender models.
        faces = np.concatenate(faces, axis=0)
        embeddings = af.predict(faces)
        ages, genders = ag.predict(faces)

        # bboxes / landmarks are indexed per batch item; [0] is our only frame.
        for bb, lm, a, g, emb in zip(bboxes[0], landmarks[0], ages, genders, embeddings):
            visual_features[idx].append({'bbox': bb,
                                         'landmark': lm,
                                         'age': a,
                                         'gender': g,
                                         'embedding': emb})

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

number of faces detected in the frame 0 is 3
number of faces detected in the frame 1 is 3
number of faces detected in the frame 2 is 3
number of faces detected in the frame 3 is 3
number of faces detected in the frame 4 is 2
number of faces detected in the frame 5 is 2
number of faces detected in the frame 6 is 2
number of faces detected in the frame 7 is 2
number of faces detected in the frame 8 is 2
number of faces detected in the frame 9 is 3
number of faces detected in the frame 10 is 3
number of faces detected in the frame 11 is 3
number of faces detected in the frame 12 is 3
number of faces detected in the frame 13 is 3
number of faces detected in the frame 14 is 3
number of faces detected in the frame 15 is 3
number of faces detected in the frame 16 is 3
number of faces detected in the frame 17 is 3
number of faces detected in the frame 18 is 3
number of faces detected in the frame 19 is 3
number of faces detected in the frame 20 is 3
number of faces detected in the frame 21 is 

Let's take a look at the visual features

In [45]:
# visual_features[frame index][detection index] is a dict with 5 attributes
# (bbox, landmark, age, gender, and embedding). Shown here: frame 0, detection 0.
visual_features[0][0].keys()

dict_keys(['bbox', 'landmark', 'age', 'gender', 'embedding'])

# TODOs: 

1. Match each face to the speaker if possible (Tae is working on face recognition of the Friends characters).

2. Save the visual features in the grmc format

3. etc.