In [82]:
import csv
import statistics

from torchvision import transforms
import torch
import torchvision
import cv2
import numpy as np
from pathlib import Path
from emonet.models import EmoNet
from PIL import Image
from facenet_pytorch import MTCNN
import torchvision.transforms as T
from statistics import mode

# --- Tensor <-> PIL converters ----------------------------------------------
transform_image = transforms.Compose([transforms.ToTensor()])
t_p_transform = T.ToPILImage()

# --- Face detector ----------------------------------------------------------
# keep_all=True asks the detector to report every face it finds.
mtcnn = MTCNN(keep_all=True)

# --- Expression labels, indexed by the model's class id ---------------------
classes = {
    0: "Neutral",
    1: "Happy",
    2: "Sad",
    3: "Surprise",
    4: "Fear",
    5: "Disgust",
    6: "Anger",
    7: "Contempt",
}
n_expression = 8
image_size = 256

# --- EmoNet weights ---------------------------------------------------------
# The checkpoint is looked up relative to the working directory:
# pretrained/emonet_<n_expression>.pth
state_dict_path = Path().parent / 'pretrained' / f'emonet_{n_expression}.pth'
state_dict = torch.load(str(state_dict_path), map_location='cpu')
# Remove 'module.' from every key so the names line up with a bare EmoNet
# (presumably the checkpoint was saved from a DataParallel wrapper - confirm).
state_dict = dict(
    (key.replace('module.', ''), weights) for key, weights in state_dict.items()
)

net = EmoNet(n_expression=n_expression)
net = net.to("cpu")
# strict=False: keys that do not match the model are ignored rather than raising.
net.load_state_dict(state_dict, strict=False)
net.eval()

def recognize_emotion(pil_image, mtcnn, emonet):
    """Estimate valence, arousal and expression for the first detected face.

    The first bounding box reported by the detector is cropped out of the
    image, center-cropped to a square, resized to 256 and fed to the model as
    a batch of one.

    Args:
        pil_image: PIL image to analyse (its ``crop`` method is used).
        mtcnn: Face detector exposing ``detect(image, landmarks=True)`` that
            returns ``(boxes, probs, points)``.
        emonet: Callable model returning a mapping with ``'valence'``,
            ``'arousal'`` and ``'expression'`` tensors.

    Returns:
        Tuple ``(valence, arousal, label, face)`` where ``valence`` and
        ``arousal`` are Python floats, ``label`` is the entry of the
        module-level ``classes`` dict for the arg-max expression, and ``face``
        is the PIL crop of the detected box (before squaring/resizing).

    Raises:
        ValueError: If the detector reports no face in ``pil_image``.
    """
    boxes, _probs, _points = mtcnn.detect(pil_image, landmarks=True)
    # The detector hands back None (or nothing) when it finds no face; fail
    # with a clear message instead of an opaque subscript error.
    if boxes is None or len(boxes) == 0:
        raise ValueError("no face detected in image")

    face = pil_image.crop(boxes[0].tolist())

    # Square the crop around its centre, then scale it to the model input size.
    centercrop = torchvision.transforms.CenterCrop(min(face.size))
    resize = torchvision.transforms.Resize(256)
    face_tensor = resize(centercrop(transform_image(face)))

    # Pure inference: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        out = emonet(face_tensor[None, :])

    val = np.squeeze(out['valence'].detach().numpy()).item()
    ar = np.squeeze(out['arousal'].detach().numpy()).item()
    expr = int(np.argmax(np.squeeze(out['expression'].detach().numpy())))
    return val, ar, classes[expr], face

In [84]:
def snapshot_emotion(video, frame_sec, frame_rate, window = 5, step_sec = 0.5):
    """Run emotion recognition on a few frames sampled after a timestamp.

    Starting at ``frame_sec`` seconds, ``window`` frames spaced ``step_sec``
    seconds apart are read from ``video`` and passed to ``recognize_emotion``
    together with the module-level ``mtcnn`` detector and ``net`` model.

    Args:
        video: Opened ``cv2.VideoCapture``; its read position is moved.
        frame_sec: Start of the sampling window, in seconds.
        frame_rate: Frames per second used to turn seconds into a frame index.
        window: Number of frames to sample.
        step_sec: Spacing between sampled frames, in seconds.

    Returns:
        Tuple ``(vals, arls, emotions)`` of parallel lists holding valence,
        arousal and expression label for each frame that could be read.
        The lists are shorter than ``window`` (possibly empty) when some
        frames could not be read.
    """
    vals = []
    arls = []
    emotions = []
    for i in range(window):
        video.set(cv2.CAP_PROP_POS_FRAMES, frame_rate * (frame_sec + i * step_sec))
        ret, frame = video.read()
        if not ret or frame is None:
            # Nothing was decoded (e.g. the position is past the end of the
            # clip); skip this sample rather than passing None to cvtColor.
            continue
        # OpenCV delivers BGR; convert to RGB before building the PIL image.
        frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        val, arousal, emotion, _cropped_image = recognize_emotion(frame, mtcnn, net)
        vals.append(val)
        arls.append(arousal)
        emotions.append(emotion)
    return vals, arls, emotions



# Column order of result.csv. Declared up front so the header can still be
# written when no timestamp produced a result.
RESULT_FIELDS = ["filename", "time", "valence", "arousal", "emotion"]

# Each row of target_time.csv: a video file name followed by H:M:S timestamps.
result = []
with open("target_time.csv", newline="") as f:
    for row in csv.reader(f):
        if not row:
            # Blank line in the CSV.
            continue
        filename = row[0]
        print(filename)

        # Open the video named in this row so results match their label.
        video = cv2.VideoCapture(filename)
        try:
            if not video.isOpened():
                print(f"  could not open {filename}; skipping")
                continue
            video_fps = video.get(cv2.CAP_PROP_FPS)

            for timestr in row[1:]:
                # The first cell that is not a timestamp ends the row.
                if ':' not in timestr:
                    break
                timeelems = timestr.split(':')
                hour = int(timeelems[0])
                minute = int(timeelems[1])
                sec = int(timeelems[2])
                frame_sec = hour * 60 * 60 + minute * 60 + sec

                vals, arls, emotions = snapshot_emotion(video, frame_sec, video_fps)
                if not emotions:
                    # mean()/mode() raise on empty data; report and move on.
                    print(f"  no usable frames at {timestr}; skipping")
                    continue
                result.append({
                    "filename": filename,
                    "time": timestr,
                    "valence": statistics.mean(vals),
                    "arousal": statistics.mean(arls),
                    "emotion": mode(emotions),
                })
        finally:
            # Always free the decoder handle, even if processing failed.
            video.release()


# newline="" lets the csv module control line endings (avoids blank rows on
# platforms that translate "\n").
with open("result.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=RESULT_FIELDS)
    writer.writeheader()
    writer.writerows(result)


s1.mp4
s2.mp4
s3.mp4
