# Libraries

In [17]:
import os
import shutil

import numpy as np
import pandas as pd

# Paths

In [37]:
DATASETS_PATH = "/home/nicolas/trabajo-profesional/datasets/"
IFEED_PATH = DATASETS_PATH + "IFEED/images/"
IFEED_TEMP_PATH = DATASETS_PATH + "IFEED_Base"
IFEED_LABELS_PATH = DATASETS_PATH + "IFEED/labels/"

# Viability fact checking

In [19]:
def search_files_in_folder(path, files):
    for file in os.listdir(path):
        complete_path = f"{path}/{file}"
        if os.path.isdir(complete_path):
            files = search_files_in_folder(complete_path, files)
        else:
            files.append(complete_path)
    return files

In [20]:
complete_paths = []
complete_paths = search_files_in_folder(IFEED_TEMP_PATH, complete_paths)
len(complete_paths)

34946

In [21]:
sorted(complete_paths)

['/home/nicolas/trabajo-profesional/datasets/IFEED_Base/Crossing/dia0_utt0/frame32_Phoebe_sad.jpg',
 '/home/nicolas/trabajo-profesional/datasets/IFEED_Base/Crossing/dia0_utt0/frame36_Phoebe_sad.jpg',
 '/home/nicolas/trabajo-profesional/datasets/IFEED_Base/Crossing/dia0_utt0/frame40_Phoebe_sad.jpg',
 '/home/nicolas/trabajo-profesional/datasets/IFEED_Base/Crossing/dia0_utt0/frame60_Phoebe_happy.jpg',
 '/home/nicolas/trabajo-profesional/datasets/IFEED_Base/Crossing/dia103_utt0/frame32_Monica_happy.jpg',
 '/home/nicolas/trabajo-profesional/datasets/IFEED_Base/Crossing/dia103_utt3/frame20_Monica_happy.jpg',
 '/home/nicolas/trabajo-profesional/datasets/IFEED_Base/Crossing/dia103_utt3/frame40_Monica_happy.jpg',
 '/home/nicolas/trabajo-profesional/datasets/IFEED_Base/Crossing/dia103_utt3/frame44_Monica_sad.jpg',
 '/home/nicolas/trabajo-profesional/datasets/IFEED_Base/Crossing/dia103_utt3/frame48_Monica_sad.jpg',
 '/home/nicolas/trabajo-profesional/datasets/IFEED_Base/Crossing/dia103_utt3/frame

Tengo la sensación de que hay archivos repetidos. Veamos qué pasa al calcular sus hashes

In [22]:
import hashlib

def sha256sum(filename):
    h  = hashlib.sha256()
    b  = bytearray(128*1024)
    mv = memoryview(b)
    with open(filename, 'rb', buffering=0) as f:
        while n := f.readinto(mv):
            h.update(mv[:n])
    return h.hexdigest()


In [23]:
hashes = list(map(sha256sum, complete_paths))
len(hashes)

34946

In [24]:
len(set(hashes))

34770

Pareciera haber 34770 files distintos. A ver si folder + name permiten identificarlos unívocamente

In [25]:
names = ["-".join(file.split("/")[-2:]) for file in complete_paths]
len(names)

34946

In [26]:
names[:20]

['dia568_utt7-frame24_Chandler_neutral.jpg',
 'dia568_utt7-frame28_Chandler_sad.jpg',
 'dia568_utt7-frame52_Chandler_happy.jpg',
 'dia568_utt7-frame56_Chandler_sad.jpg',
 'dia568_utt7-frame48_Chandler_sad.jpg',
 'dia568_utt7-frame12_Chandler_fear.jpg',
 'dia392_utt6-frame52_Phoebe_happy.jpg',
 'dia392_utt6-frame4_Phoebe_happy.jpg',
 'dia392_utt6-frame32_Phoebe_happy.jpg',
 'dia392_utt6-frame36_Phoebe_happy.jpg',
 'dia392_utt6-frame60_Phoebe_happy.jpg',
 'dia392_utt6-frame56_Phoebe_happy.jpg',
 'dia392_utt6-frame48_Phoebe_happy.jpg',
 'dia392_utt6-frame8_Phoebe_happy.jpg',
 'dia392_utt6-frame104_Phoebe_happy.jpg',
 'dia392_utt6-frame24_Phoebe_happy.jpg',
 'dia392_utt6-frame40_Phoebe_happy.jpg',
 'dia392_utt6-frame0_Phoebe_happy.jpg',
 'dia392_utt6-frame68_Phoebe_happy.jpg',
 'dia392_utt6-frame44_Phoebe_happy.jpg']

In [27]:
len(set(names))

34913

In [28]:
# for file in complete_paths:
#     source = file
#     target = IFEED_PATH + "-".join(file.split("/")[-2:])
#     shutil.copyfile(source, target)

In [29]:
len(os.listdir(IFEED_PATH))

34913

In [30]:
assert len(os.listdir(IFEED_PATH)) == len(set(names))

# Generate csv

In [32]:
df = pd.DataFrame()
df["file_name"] = names
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34946 entries, 0 to 34945
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   file_name  34946 non-null  object
dtypes: object(1)
memory usage: 273.1+ KB


In [36]:
df["labels_expression"] = df["file_name"].apply(lambda x: x.replace(".jpg", "").split("_")[-1])
df["labels_expression"] = df["labels_expression"].replace({"fea": "fear"})
df["labels_expression"].value_counts()

labels_expression
happy       10167
sad          8247
neutral      8152
angry        5157
fear         2009
surprise      802
disgust       412
Name: count, dtype: int64

In [38]:
df.to_csv(IFEED_LABELS_PATH + "labels.csv", sep=",", encoding="utf-8", index=False)