<a href="https://colab.research.google.com/github/dcafarelli/CMT-ABAW2020-EXPR/blob/main/affwild2_labeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook binds each cropped-aligned frame to its expression label. It builds two pandas DataFrames with the columns `path` (frame path) and `label`, one for the training set and one for the validation set, and saves each of them as a pickle file.

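Download the cropped-aligned frames and the annotations from the competition, then set the following paths in the PATHS cell below:
- `train_set_dir`: directory of the training frames
- `validation_set_dir`: directory of the validation frames
- `annot_dir`: directory of the annotations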

In [None]:
import pickle
import os
import numpy as np
from matplotlib import pyplot as plt
import glob
import pandas as pd
from tqdm import tqdm
import sys
import csv
from google.colab import drive

In [None]:
# Mount Google Drive under /content/gdrive; the annotation directory configured
# in the PATHS cell lives below this mount point.
drive.mount('/content/gdrive')

In [None]:
#!unzip '/content/gdrive/My Drive/TESI/FER/AffWild2/cropped_aligned_train.zip' -d '/content/cropped_aligned_train/'

In [None]:
#!unzip '/content/gdrive/MyDrive/TESI/FER/AffWild2/cropped_aligned/cropped_aligned_val.zip' -d '/content/cropped_aligned_val/'

In [None]:
# --------- PATHS ---------
# Edit these three locations to match where the data was downloaded / unzipped.

# Annotation root: the label files are read from
# <annot_dir>/EXPR_Set/<Training_Set|Validation_Set>/*.txt and the resulting
# pickles are written back into this directory.
annot_dir = '/content/gdrive/My Drive/TESI/FER/AffWild2/annotations'
# Training frames: one sub-folder of .jpg files per annotation file name.
train_set_dir = '/content/cropped_aligned_train/'
# Validation frames: same layout as the training directory.
validation_set_dir = '/content/cropped_aligned_val/'

In [None]:
def read_Expr(txt_file):
    """Read one expression annotation file into an integer array.

    The first line of the file is treated as a header and skipped; every
    following line must hold a single integer label.

    Args:
        txt_file: path of the annotation ``.txt`` file.

    Returns:
        1-D integer ``np.ndarray`` with one entry per annotated line
        (empty, but still integer-typed, when the file has no label lines).

    Raises:
        ValueError: if a non-trailing line is not a valid integer.
    """
    with open(txt_file, 'r') as f:
        lines = f.readlines()
    lines = [x.strip() for x in lines[1:]]  # skip first line
    # Drop only *trailing* blank lines: a file ending in an empty line must not
    # crash int(), while a blank line in the middle would shift the
    # line-number -> frame mapping and therefore still fails loudly.
    while lines and not lines[-1]:
        lines.pop()
    # dtype is pinned so an empty file yields an int array, not float64.
    return np.array([int(x) for x in lines], dtype=int)

In [None]:
def frames_to_label(name,label_array, frames, discard_value):
    """Pair the extracted frames of one video with their labels.

    Frame files are named by their 1-based frame number (``00001.jpg`` is
    row 0 of ``label_array``). A frame is kept only when its row exists in
    ``label_array`` and that row does not contain ``discard_value``.

    Args:
        name: video name, used to build the returned paths.
        label_array: array with one row (or scalar) of labels per video frame.
        frames: paths of the frame images that exist for this video.
        discard_value: label value that marks a frame as not annotated.

    Returns:
        Tuple ``(labels, return_frames, frames_ids)`` of equal length and in
        the order of ``frames``:
        ``labels`` is a 2-D array of the kept rows, ``return_frames`` the
        paths ``'/<name>/<frame number, 5 digits>.jpg'`` and ``frames_ids``
        the 0-based frame indices.
    """
    label_array = np.asarray(label_array)
    N = label_array.shape[0]
    if N < len(frames):
        # Not fatal: frames without a label row are skipped below.
        print('Warning: more frames than labels; frames without a label are skipped.')
        print(name)

    # One row per frame, so the discard test works for scalar and vector labels.
    # reshape((0, -1)) is ambiguous for an empty array, hence the special case.
    label_array = label_array.reshape((N, -1)) if N else label_array.reshape((0, 1))
    discarded = (label_array == discard_value).any(axis=-1)

    frames_ids = []
    for frame in frames:
        frame_id = int(os.path.basename(frame).split('.')[0]) - 1  # frame_id starts from 0
        # Keep only frames that have a label row which is not marked as discarded.
        if 0 <= frame_id < N and not discarded[frame_id]:
            frames_ids.append(frame_id)

    # Index by frame id so labels and frames stay aligned element by element.
    label_array = label_array[np.asarray(frames_ids, dtype=int)]
    return_frames = ['/' + name + '/{0:05d}.jpg'.format(frame_id + 1) for frame_id in frames_ids]
    return label_array, return_frames, frames_ids

In [None]:
def create_annotations_dict(mode, path):
    """Build one path/label DataFrame per annotated video of a split.

    Args:
        mode: name of the split sub-folder below ``EXPR_Set`` in the
            module-level ``annot_dir``.
        path: directory holding one folder of ``.jpg`` frames per video.

    Returns:
        dict that maps the base name of every annotation file to a DataFrame
        with the columns 'path' and 'label'. The dict additionally carries an
        'EXPR_Set' entry equal to ``{mode: {}}``, which is created but never
        filled. The dict is empty when ``annot_dir`` has no 'EXPR_Set' entry.
    """
    task = 'EXPR_Set'
    data_file = {}
    if task not in os.listdir(annot_dir):
        return data_file

    data_file[task] = {mode: {}}
    annotation_pattern = os.path.join(annot_dir, task, mode, '*.txt')
    for txt_file in tqdm(glob.glob(annotation_pattern)):
        # The annotation file name (without extension) is also the frame folder name.
        name = os.path.basename(txt_file).split('.')[0]
        print("Folder Name", name)
        labels = read_Expr(txt_file)
        video_frames = sorted(glob.glob(os.path.join(path, name, '*.jpg')))
        labels, kept_paths, _ = frames_to_label(name, labels, video_frames, discard_value = -1)
        print(len(kept_paths), len(labels))
        data_file[name] = pd.DataFrame.from_dict({'path': kept_paths, 'label': labels.reshape(-1)})
    return data_file

In [None]:
# Training set: per-video path/label DataFrames, keyed by annotation file name.
data_file_train = create_annotations_dict('Training_Set', train_set_dir)

In [None]:
# Validation set: per-video path/label DataFrames, keyed by annotation file name.
data_file_val = create_annotations_dict('Validation_Set', validation_set_dir)

# Create Pandas DataFrame TRAIN SET

---



In [None]:
# Empty frame that will hold all training rows (columns 'path' and 'label').
train_set = pd.DataFrame()

In [None]:
# Stack the per-video training frames into one DataFrame.
txt_files = glob.glob(os.path.join(annot_dir,'EXPR_Set', 'Training_Set', '*.txt'))
train_frames = []
for txt_file in tqdm(txt_files):
    name = os.path.basename(txt_file).split('.')[0]
    train_frames.append(data_file_train[name])
# Concatenate once: DataFrame.append was removed in pandas 2.0 and growing a
# frame inside the loop is quadratic. The per-video index is kept as before.
train_set = pd.concat(train_frames) if train_frames else pd.DataFrame(columns=['path', 'label'])

In [None]:
# Class distribution of the training labels, with as many bins as distinct labels.
histogram = train_set['label'].hist(bins = train_set['label'].nunique())

In [None]:
# Share of every label in the training set, in percent.
print(train_set['label'].value_counts(normalize=True) * 100)

In [None]:
# Absolute number of training frames per label.
print(train_set['label'].value_counts())

In [None]:
# Persist the training DataFrame as a pickle file inside the annotation directory.
save_path = os.path.join(annot_dir, 'train_set.pkl')
print(save_path)
train_set.to_pickle(save_path)

# Create Pandas DataFrame VALIDATION SET

---



In [None]:
# Empty frame that will hold all validation rows (columns 'path' and 'label').
val_set = pd.DataFrame()

In [None]:
# Stack the per-video validation frames into one DataFrame.
txt_files = glob.glob(os.path.join(annot_dir, 'EXPR_Set', 'Validation_Set', '*.txt'))

val_frames = []
for txt_file in tqdm(txt_files):
    name = os.path.basename(txt_file).split('.')[0]
    val_frames.append(data_file_val[name])
# Concatenate once: DataFrame.append was removed in pandas 2.0 and growing a
# frame inside the loop is quadratic. The per-video index is kept as before.
val_set = pd.concat(val_frames) if val_frames else pd.DataFrame(columns=['path', 'label'])

In [None]:
# Display the assembled validation DataFrame.
val_set

In [None]:
# Absolute number of validation frames per label.
print(val_set.label.value_counts())

In [None]:
# Class distribution of the validation labels, with as many bins as distinct labels,
# followed by the absolute counts and the percentage share of every label.
histogram_val = val_set['label'].hist(bins = val_set['label'].nunique())
print(val_set['label'].value_counts())
print(val_set['label'].value_counts(normalize=True) * 100)

In [None]:
# Persist the validation DataFrame as a pickle file inside the annotation directory.
save_path = os.path.join(annot_dir, 'val_set.pkl')
print(save_path)
val_set.to_pickle(save_path)