# Coleta de dados

Esse script usa a [API de reconhecimento facial da Microsoft](https://azure.microsoft.com/services/cognitive-services/face/) para detectar rostos e reconhecer a expressão facial dos candidatos durante o debate realizado pelo Estadão/TV Gazeta.

In [None]:
import json
import glob
import re
import requests
import pprint
import matplotlib.pyplot as plt
from matplotlib import patches
from PIL import Image
from io import BytesIO
import cv2
import os
import time
from datetime import datetime
import pandas as pd
%matplotlib inline

### 1. OpenCV - dividir vídeo em frames
O primeiro passo é usar o pacote [OpenCV](https://opencv.org/) para ler o vídeo e dividir em frames. O código é uma adaptação [deste](https://gist.github.com/keithweaver/70df4922fec74ea87405b83840b45d57), de autoria do usuário [keithweaver](https://gist.github.com/keithweaver) no GitHub. 

In [None]:
def split_into_frames(fp, out, save_rate, preffix):
    '''
    Reads a video file and saves one frame as a .jpg image every
    `save_rate` seconds of video.

    INPUT:
        fp = path to the video file
    OUTPUT:
        out = directory of the image output (created when missing)
    PARAMS:
        save_rate = save frame as img every x seconds
        preffix = preffix of the filename
    RAISES:
        ValueError = when the frame rate reported for the video multiplied
        by save_rate is not positive, since no sampling interval can be
        derived from it

    Every saved file is named <preffix>-<position>.jpg, where <position>
    is the string returned by ms_to_hms() for the capture position.
    '''
    try:
        # exist_ok avoids the check-then-create race of a separate exists() test
        os.makedirs(out, exist_ok=True)
    except OSError:
        print ('Error: Creating directory for output')
        # Without an output directory no image could be written, so
        # decoding the whole video would be wasted work
        return

    # Playing video from file:
    cap = cv2.VideoCapture(fp)
    try:
        fps = round(cap.get(cv2.CAP_PROP_FPS))
        # Number of frames between two saved images
        frames_per_save = fps * save_rate
        if frames_per_save <= 0:
            raise ValueError(
                'Cannot sample frames from ' + str(fp) + ': fps=' + str(fps)
                + ', save_rate=' + str(save_rate)
            )

        currentFrame = 0
        while True:
            # Capture frame-by-frame
            ret, frame = cap.read()
            # Break condition: no more frames could be read
            if not ret:
                break

            # Save only some specific frames
            if currentFrame % frames_per_save == 0:
                # The capture position is embedded in the filename
                ms = cap.get(cv2.CAP_PROP_POS_MSEC)
                filename = os.path.join(out, preffix + '-' + ms_to_hms(ms) + '.jpg')
                # Saves image of the current frame in jpg file
                cv2.imwrite(filename, frame)

            currentFrame += 1
    finally:
        # Release the capture even when reading or writing failed
        cap.release()
        cv2.destroyAllWindows()

Também definimos uma função auxiliar que converte um número de milissegundos para uma string no formato `00h-00m-00s` (horas, minutos e segundos). Ela é usada na função acima para salvar o arquivo com a posição do vídeo já embutida no nome.

In [None]:
def ms_to_hms(ms):
    '''
    Formats a millisecond count as a zero-padded position label.

    INPUT:
        ms = an arbitrary number of milliseconds
    OUTPUT:
        hms = a string such as '01h-02m-03s' (hours, minutes and seconds
        joined by dashes); the hour field wraps around at 24
    '''
    hours = int(ms / (1000 * 60 * 60) % 24)
    minutes = int(ms / (1000 * 60) % 60)
    seconds = int(ms / 1000 % 60)
    return f'{hours:02d}h-{minutes:02d}m-{seconds:02d}s'

Agora, rodamos a função com os parâmetros desejados.

In [None]:
# Directory that receives the extracted frames
out = "../data/imgs/frames/debate-tv-gazeta/"
# NOTE: here `fps` is the list of video file paths, not a frame rate
fps = glob.glob("../data/video/debate-tv-gazeta/camera*.mp4")
for fp in fps:
    # The camera id (e.g. "camera1") becomes the filename prefix of the frames.
    # Raw string so that '\d' is not an invalid escape sequence in the literal.
    match = re.match(r'.*(camera\d)\.mp4', fp)
    if match is None:
        # The glob is looser than the regex (e.g. "camera_x.mp4"); skip such
        # files instead of failing on the missing match
        print('Skipping', fp)
        continue
    print('Splitting', fp)
    pat = match[1]
    split_into_frames(fp, out, 1, pat)

### 2. Analisa imagens usando API da Microsoft

In [None]:
# Access credential: the subscription key sent in the
# 'Ocp-Apim-Subscription-Key' header of every API request.
# Replace the placeholder with your own key.
key = "INSIRA SUA CHAVE AQUI"

A função abaixo cria um *grupo facial*, a partir do qual a API consegue identificar a quem pertence cada rosto.

In [None]:
def make_face_group(key, face_list_id):
    """Send the PUT request that creates a face list in the Microsoft API.

    key: subscription key sent in the 'Ocp-Apim-Subscription-Key' header.
    face_list_id: identifier of the list; also sent as the list's name.

    Nothing is returned; the response object is printed.
    """
    endpoint = "https://brazilsouth.api.cognitive.microsoft.com/face/v1.0/facelists/" + face_list_id
    payload = dict(
        name=face_list_id,
        userData='Sample faces of 2018 Brasil presidential candidates',
    )
    request_headers = {'Content-Type': 'application/json', 'Ocp-Apim-Subscription-Key': key}
    print(requests.put(endpoint, json=payload, headers=request_headers))

In [None]:
# Identifier of the face list; make_face_group() also sends it as the list name
face_list_id = 'candidatos-presidente-2018'
# Sends the request that creates the list and prints the response object
make_face_group(key, face_list_id)

Aqui, adicionamos uma imagem clara do rosto de cada candidato no grupo facial criado anteriormente. Armazenamos a correspondência entre o id gerado pela API e o nome de cada candidato em um dicionário.

In [None]:
def add_face(key, fp, face_list_id, dict_object):
    """Store a sample face in the face list and record whose face it is.

    By adding sample faces to the list, the faces captured in the frames can
    later be matched against them. The correspondence between the id
    returned by the API and the person's name is kept locally, so it can be
    resolved to a human-readable description without another request.

    key: subscription key sent in the 'Ocp-Apim-Subscription-Key' header.
    fp: path to a .jpg image; the word right before '.jpg' is used as the
        person's name.
    face_list_id: identifier of the face list that receives the image.
    dict_object: dict updated in place with {persistedFaceId: name}.

    Raises ValueError when no name can be derived from `fp`, and KeyError
    when the API response carries no 'persistedFaceId' (e.g. an error body).
    """
    # Gets the name of the person using a regex pattern
    match = re.search(r"(\w+)\.jpg", fp)
    if match is None:
        raise ValueError("Cannot derive a person name from file path: " + str(fp))
    name = match.group(1)

    # Reads the file as a binary object, closing the handle right away
    with open(fp, "rb") as image_file:
        image_data = image_file.read()

    headers = {
        # The image is sent as raw bytes in the request body
        'Content-Type': 'application/octet-stream',
        'Ocp-Apim-Subscription-Key': key,
    }

    params = {
        # Stored alongside the face on the API side
        'userData': name
    }

    # The query string is built (and URL-encoded) by requests from `params`;
    # appending it to the URL by hand as well would send userData twice.
    url = "https://brazilsouth.api.cognitive.microsoft.com/face/v1.0/facelists/" + face_list_id + "/persistedFaces"
    response = requests.post(url, params=params, headers=headers, data=image_data)
    payload = response.json()
    if 'persistedFaceId' not in payload:
        raise KeyError(
            "No persistedFaceId in API response for " + str(fp) + ": " + str(payload)
        )

    # Saves correspondence to a local dict
    dict_object[payload['persistedFaceId']] = name

In [None]:
# Sample images; add_face() derives each person's name from the filename
fps = glob.glob('../data/imgs/control/*.jpg')
# Filled in place by add_face(): maps each persistedFaceId returned by the
# API to the corresponding name
face_dict = {}
for fp in fps:
     add_face(key, fp, face_list_id, face_dict)

As próximas funções são usadas para passar as imagens para a API.

Esta recebe um id temporário, gerado pela Microsoft ao receber uma foto. Ela então compara este rosto com os que estão armazenados no grupo facial, identificando de quem se trata.

In [None]:
def identify_face(key, face_id, face_list_id):
    """Match a temporary face id against the permanent ids in a face list.

    key: subscription key sent in the 'Ocp-Apim-Subscription-Key' header.
    face_id: temporary id of a detected face.
    face_list_id: identifier of the face list to search in.

    Returns the 'persistedFaceId' of the first candidate in the response, or
    the string 'no_match' when the request does not return status 200 or the
    candidate list is empty.

    Any exception is reported with a short message and re-raised.
    """
    url = "https://brazilsouth.api.cognitive.microsoft.com/face/v1.0/findsimilars/"

    headers = {
        # Request headers
        'Content-Type': 'application/json',
        'Ocp-Apim-Subscription-Key': key,
    }

    data = {
        "faceId": face_id,
        "faceListId": face_list_id,
        # Only the best candidate is needed
        "maxNumOfCandidatesReturned": 1,
        "mode": "matchFace"
    }

    try:
        response = requests.post(url, headers=headers, json=data)
        if response.status_code != 200:
            return 'no_match'

        candidates = response.json()
        # A successful response may still carry no candidates; indexing an
        # empty list would raise IndexError
        if not candidates:
            return 'no_match'

        return candidates[0]['persistedFaceId']
    except Exception:
        print("Exception")
        raise

A próxima função passa todos os rostos encontrados em uma imagem para a função `identify_face()` e salva o output para um arquivo json.

In [None]:
def identify(key, faces, face_list_id, timestamp, timeout):
    """Identify every face detected in one image and save each as a JSON file.

    key: subscription key forwarded to identify_face().
    faces: list of face dicts detected in a single image; each must carry a
        'faceId'. The dicts are updated in place with 'timestamp',
        'permanentId' and 'name'.
    face_list_id: identifier of the face list used for matching.
    timestamp: position label of the frame (e.g. '00h-01m-05s'); stored in
        each face and used in the output filename.
    timeout: seconds to sleep before each identification request.

    Names are resolved through the module-level `face_dict`. One file named
    <name>-<timestamp>.json is written per face under
    ../data/jsons/debate-tv-gazeta/.

    Raises TypeError when `faces` is not a list (for instance when an API
    error body was passed in). Other exceptions are reported with a short
    message and re-raised.
    """
    if not isinstance(faces, list):
        raise TypeError("Expected a list of detected faces, got: " + repr(faces))

    try:
        # The timestamp passed in by the caller is used as is; re-parsing it
        # from a global file path would tie this function to the caller's
        # loop variable.
        for face in faces:
            # Add the timestamp data into the object
            face['timestamp'] = timestamp

            # Now we need to match the temporary face_id with the persistent face_id
            face_id = face["faceId"]
            # Waits timeout so we don't burn all our requests/minute
            time.sleep(timeout)
            # Sends request
            permanent_id = identify_face(key, face_id, face_list_id)
            face['permanentId'] = permanent_id

            # Now we use the dict to get an actual name
            if permanent_id != 'no_match':
                name = face_dict[permanent_id]
            else:
                name = 'no_match'
            face['name'] = name

            # Write to json, one file per person and timestamp
            filename = '../data/jsons/debate-tv-gazeta/' + name + '-' + timestamp + '.json'
            with open(filename, mode='w', encoding='utf-8') as f:
                json.dump(face, f)
    except Exception:
        print('Exception')
        raise

Essa função recebe uma imagem, identifica os rostos presentes e passa para a função `identify()`.

In [None]:
def run_api(key, fp, face_dict, timeout):
    """Run face detection on one image and hand the result to identify().

    key: subscription key sent in the 'Ocp-Apim-Subscription-Key' header.
    fp: path to the image; it must contain a position label such as
        '00h-01m-05s', which is extracted and used as the timestamp.
    face_dict: not read here; identify() resolves names through the
        module-level `face_dict`.
    timeout: seconds to sleep before calling identify(); also forwarded to it.

    The face list is taken from the module-level `face_list_id`.
    Failures are best-effort: any exception is printed together with the
    file path and swallowed, so one bad file does not stop a batch.
    """
    try:
        # Takes the timestamp from the fp string
        timestamp = re.search(r"\d{2}h\-\d{2}m\-\d{2}s", fp)
        timestamp = timestamp.group(0)

        # Reads the file as a binary object, closing the handle right away
        with open(fp, "rb") as image_file:
            image_data = image_file.read()

        # Defines parameters and sends request to API
        headers = {'Ocp-Apim-Subscription-Key': key,
                   'Content-Type': 'application/octet-stream'}
        params = {
            'returnFaceId': 'true',
            'returnFaceLandmarks': 'false',
            'returnFaceAttributes': 'age,gender,headPose,smile,facialHair,glasses,' +
            'emotion,hair,makeup,occlusion,accessories,blur,exposure,noise'
        }
        url = "https://brazilsouth.api.cognitive.microsoft.com/face/v1.0/detect"
        # Only `data` is passed: the body is the raw image bytes, and requests
        # ignores `json` whenever `data` is given
        response = requests.post(url, params=params, headers=headers, data=image_data)
        faces = response.json()

        # Waits and sends request to the face identification routine
        time.sleep(timeout)
        identify(key, faces, face_list_id, timestamp, timeout)
    except Exception as e:
        # Report the path: the timestamp is None when parsing it was what failed
        print("Error on file", fp)
        print(e)

In [None]:
# Every .jpg frame found in the frames directory
files = glob.glob("../data/imgs/frames/debate-tv-gazeta/*.jpg")
# Seconds to sleep between API requests (ends up in time.sleep)
timeout = .1 
for fp in files:
     run_api(key, fp, face_dict, timeout)
    

### 3. Concatena jsons em um único array

Aqui, lemos o output salvo por run_api() arquivo por arquivo e montamos um único array com todos eles.

In [None]:
def concatenate_jsons(array):
    """Append the parsed content of every saved JSON file to `array`.

    array: list extended in place with one parsed object per file found in
        ../data/jsons/debate-tv-gazeta/. Nothing is returned.
    """
    for json_path in glob.glob("../data/jsons/debate-tv-gazeta/*.json"):
        with open(json_path) as handle:
            array.append(json.load(handle))

In [None]:
# Accumulator filled in place by concatenate_jsons() with one parsed JSON
# object per file
array = []
concatenate_jsons(array)

### 4. Cria um dataframe linear com os campos desejados

Depois, criamos um dataframe do Pandas a partir desse array e salvamos em um arquivo CSV.

In [None]:
# Columns of the output table: who, when, the emotion fields and blur.
# The emotion names presumably mirror the keys of faceAttributes['emotion']
# in the API output - confirm against a saved JSON file.
columns = ['name', 'timestamp', 'anger', 'contempt', 'disgust', 'fear', 'happiness', 'neutral', 'sadness', 'surprise', 'blur']
# Empty frame with those columns; the rows are added by array_to_df()
df = pd.DataFrame(columns=columns)

In [None]:
def array_to_df(array, df):
    """Flatten face objects into rows and add them to a dataframe.

    array: iterable of face dicts carrying 'name', 'timestamp',
        faceAttributes['blur']['value'] and a faceAttributes['emotion'] dict.
    df: dataframe whose columns define the row layout; it is not modified.

    Returns a dataframe holding the rows of `df` followed by one row per
    item, with a fresh 0..n-1 index. Every column of `df` is present in each
    new row (None when the item has no value for it); emotion keys that are
    not columns of `df` become additional columns. When `array` is empty,
    `df` itself is returned.
    """
    rows = []
    for item in array:
        # Start from the known columns so that every row has the same layout
        row = {label: None for label in df.columns}
        row['name'] = item['name']
        row['timestamp'] = item['timestamp']
        row['blur'] = item['faceAttributes']['blur']['value']
        # One column per emotion score
        row.update(item['faceAttributes']['emotion'])
        rows.append(row)

    if not rows:
        return df

    # Build all rows at once: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call
    new_rows = pd.DataFrame(rows)
    if len(df) == 0:
        # Nothing to keep from df; the new rows already follow its column order
        return new_rows
    return pd.concat([df, new_rows], ignore_index=True)

In [None]:
# Adds one row per face object collected in `array`
df = array_to_df(array, df)

In [None]:
# Puts into chronological order and rebuilds the index as 0..n-1.
# The timestamps are zero-padded strings such as '00h-01m-05s', so a plain
# string sort orders them chronologically.
df = df.sort_values(by='timestamp').reset_index(drop=True)

In [None]:
# Saves with a timestamp in the filename (YYYY-MM-DD_HH-MM-SS).
# strftime replaces slicing str(datetime.now()): the string form has no
# fractional part when the microsecond is 0, so a fixed [:-7] slice would
# then cut into the time itself.
now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
df.to_csv("../data/output_files/debate-gazeta/api_output_" + now + ".csv", index=False)