In [None]:
# 3_data_compiling

# get cleaned dataframes created in 1_data_cleaning
# set filters (number of words, number of frames per video, number of batches)
# compile data into small batches
# (since videos do not all have the same length, pad videos shorter than desired frame number with zeros)
# join batches
# save data

In [None]:
# import libraries

import os

import pandas as pd
import numpy as np

In [None]:
# set parameters (filters used when compiling the data)

# number of batches the selected files are split into before being joined
batch_nb = 10

# maximum number of frames per video; longer videos are discarded,
# shorter ones are zero-padded up to this length
frame_nb = 50

# number of words to keep
# NOTE(review): not read by the cells below, which use word_list directly — confirm it is still needed
word_nb = 10

In [None]:
# get number of files per word that meet frame_nb criteria

# directory to be scanned
path_from = "/path_of_directory_where_cleaned_dataframes_were_saved/"

# word of every csv file that passes the frame filter
# (the word is the part of the file name before the first "_")
data_word = []

# number of csv files scanned so far (progress output only)
files_scanned = 0

# scan the directory; the context manager closes the scandir iterator when done
with os.scandir(path_from) as obj_file:
    for entry_file in obj_file:
        if not (entry_file.is_file() and entry_file.name.endswith(".csv")):
            continue

        files_scanned += 1
        print(files_scanned)

        # open current data
        data_current = pd.read_csv(entry_file.path)

        # keep files that are not empty and have at most frame_nb rows
        if 0 < data_current.shape[0] <= frame_nb:
            data_word.append(entry_file.name.split("_")[0])

# count csv files per word in one pass (np.unique returns the words sorted
# alphabetically together with how often each one occurs)
words_unique, word_counts = np.unique(data_word, return_counts = True)
data_info = pd.DataFrame({"word": words_unique, "count": word_counts})

# sort info, most frequent word first; a stable sort keeps ties in alphabetical
# order so the result is reproducible. Index labels are left untouched (still
# 0..n-1, just reordered) because later cells look rows up by label.
data_info = data_info.sort_values("count", ascending = False, kind = "stable")

In [None]:
# visually check data info
# data_info is sorted by descending count, so this displays the 11 most frequent words
# NOTE(review): 11 looks like word_nb + 1 (one extra row in case a word is excluded) — confirm
data_info.head(11)

In [None]:
# set word list based on data info
# (word "NS" corresponds to geographical places and is not taken further in our analysis)

word_list = [
    "AUSSI",
    "LS",
    "OUI",
    "AVOIR",
    "SOURD",
    "QUOI",
    "MAIS",
    "NON",
    "PLUS.P",
    "REGARDER",
]

In [None]:
# get number of files after filtering

# sum the per-word file counts of the words kept in word_list.
# Rows are selected with a boolean mask instead of data_info.loc[i, ...] with a
# positional counter, so the result does not depend on data_info having index
# labels 0..n-1.
rows_selected = data_info["word"].isin(word_list)

# int() gives a plain Python integer whatever the dtype of the "count" column
# (the sum of an empty selection is 0)
file_nb = int(data_info.loc[rows_selected, "count"].sum())

print(file_nb)

In [None]:
# compile data

# directory to be scanned
path_from = "/path_of_directory_where_cleaned_dataframes_were_saved/"

# directory to save batches of data
path_to = "/path_of_directory_where_batches_will_be_saved/"

# number of columns every cleaned csv is expected to have
feature_nb = 1659

# number of files per batch (does not change inside the loop, so compute it once)
batch_size = np.round((file_nb / batch_nb), 0)


def _save_batch(batch_id, data_parts, labels_parts):
    """Stack the accumulated blocks and write one data file and one labels file.

    Data and labels get distinct names ("data_temp_<id>.npy" and
    "labels_temp_<id>.npy") so the labels no longer overwrite the data, and so
    the files match the "data_temp" / "labels_temp" prefixes that the joining
    step of this notebook looks for.
    """
    np.save(os.path.join(path_to, "data_temp_" + str(batch_id)), np.vstack(data_parts))
    np.save(os.path.join(path_to, "labels_temp_" + str(batch_id)), np.concatenate(labels_parts))


# initialise variables
# every batch starts with a block of zeros labelled "init_label"; the joining
# step removes these rows again by filtering on that label
data_parts = [np.zeros((frame_nb, feature_nb))]
labels_parts = [["init_label"] * frame_nb]
count_all = 0   # files compiled so far, over all batches
count = 0       # files compiled into the current batch
batch_id = 0

# scan the directory; the context manager closes the scandir iterator when done
with os.scandir(path_from) as obj_file:

    # loop through files
    for entry_file in obj_file:
        if not (entry_file.is_file() and entry_file.name.endswith(".csv")):
            continue

        # process only if word is in word list
        word = entry_file.name.split("_")[0]
        if word not in word_list:
            continue

        data_current = pd.read_csv(entry_file.path)

        # check number of frames (skip empty files and files longer than frame_nb)
        if not (0 < data_current.shape[0] <= frame_nb):
            continue

        # fail early with the offending file name instead of at stacking time
        if data_current.shape[1] != feature_nb:
            raise ValueError(
                entry_file.name + " has " + str(data_current.shape[1])
                + " columns, expected " + str(feature_nb)
            )

        count_all += 1
        count += 1
        print(count)

        # pad data with rows of zeros up to frame_nb
        if data_current.shape[0] < frame_nb:
            pad_length = frame_nb - data_current.shape[0]
            pad = pd.DataFrame(0, index = range(pad_length), columns = data_current.columns)
            data_current = pd.concat([data_current, pad], axis = 0)

        # compile data: collect the blocks in lists and stack them once per batch
        # (stacking inside the loop re-copies the whole batch for every file)
        data_parts.append(data_current.to_numpy())
        labels_parts.append([word] * frame_nb)

        # save when batch size is reached or when all files are processed
        if (count == batch_size) or (count_all == file_nb):
            batch_id += 1
            _save_batch(batch_id, data_parts, labels_parts)

            count = 0
            data_parts = [np.zeros((frame_nb, feature_nb))]
            labels_parts = [["init_label"] * frame_nb]

# save whatever is left if the loop ended in the middle of a batch
# (happens when file_nb does not match the files actually found on disk)
if count > 0:
    batch_id += 1
    _save_batch(batch_id, data_parts, labels_parts)


In [None]:
# join batches of data

# directory to be scanned
path_from = "/path_of_directory_where_batches_were_saved/"

# directory to save compiled data
path_to = "/path_of_directory_where_compiled_data_will_be_saved/"

# file name prefixes of the data and labels batches
data_prefix = "data_temp"
labels_prefix = "labels_temp"

# initialise variables
# the first block is a placeholder labelled "init_label"; it fixes the number of
# columns and is removed again further down by filtering on that label
data_parts = [np.zeros((50, 1659))]
labels_parts = [np.array(["init_label"] * 50)]

# scan the directory; sorting the names makes the batch order reproducible
# (os.scandir yields entries in arbitrary order)
with os.scandir(path_from) as obj_file:
    data_names = sorted(
        entry_file.name
        for entry_file in obj_file
        if entry_file.is_file() and entry_file.name.startswith(data_prefix)
    )

# loop through data batches and load each one together with the labels batch
# that has the same name suffix, so rows and labels cannot get out of step
for data_name in data_names:
    labels_name = labels_prefix + data_name[len(data_prefix):]

    data_current = np.load(os.path.join(path_from, data_name))
    labels_current = np.load(os.path.join(path_from, labels_name))

    if data_current.shape[0] != labels_current.shape[0]:
        raise ValueError(
            data_name + " has " + str(data_current.shape[0]) + " rows but "
            + labels_name + " has " + str(labels_current.shape[0]) + " labels"
        )

    data_parts.append(data_current)
    labels_parts.append(labels_current)

# stack everything once (stacking inside the loop re-copies all data per batch)
data_all = np.vstack(data_parts)
labels_all = np.concatenate(labels_parts)

# drop columns that still contain nans
columns_tokeep = np.flatnonzero(~np.isnan(data_all).any(axis = 0))
data_all = data_all[:,columns_tokeep]

# remove the placeholder rows that were only used for concatenation
mask = labels_all != "init_label"
data = data_all[mask,:]
labels = labels_all[mask]

# save compiled data as "data.npy" and "labels.npy"
# (file names must be strings; np.save appends the ".npy" extension)
np.save(os.path.join(path_to, "data"), data)
np.save(os.path.join(path_to, "labels"), labels)