# Main working file

In [1]:
# imports

## system
import glob
import os
from pathlib import Path

## data
import numpy as np
import pandas as pd



# Preprocessing

## Rewrapping

In [4]:
# Import the function
from util.video_transformation import rewrap_video

%load_ext autoreload
%autoreload 2 

base_path = "data-in"
input_folder = "avi"
teams_to_rewrap = ('09_10', '21_22')

# Rewrap every .avi recording of the selected team folders. Each video is
# handed to rewrap_video together with its team folder as second argument.
for team_folder in os.listdir(base_path):
    if team_folder in teams_to_rewrap:
        team_dir = os.path.join(base_path, team_folder)
        for video_path in glob.glob(os.path.join(team_dir, input_folder, "*.avi")):
            rewrap_video(video_path, team_dir)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
data-in\09_10\avi\Camera_pp10_navigator_0_20240703_1008.avi
Rewrapping video
Video was already rewrapped
data-in\09_10\avi\Camera_pp10_navigator_0_20240703_1008_rewrapped.avi
Video already has metadata
data-in\09_10\avi\Camera_pp9_pilot_0_20240703_1008.avi
Video already has metadata
data-in\21_22\avi\Camera_pp21_pilot_0_20240925_1339.avi
Video already has metadata
data-in\21_22\avi\Camera_pp22_navigator_0_20240925_1338.avi
Rewrapping video


## Merge audio and video

## Trim Video

# Extraction of Action Units (AUs)

In [15]:
from util.feature_extraction import extract_features
data_in = r'data-in'
data_out = r'data-out'

# Call extract_features once per '_'-separated participant id found in each
# entry of data_in, passing <data_out>/<pair>/au as the output folder.
for pair in os.listdir(data_in):
    pair_source = os.path.join(data_in, pair)
    au_target = os.path.join(data_out, pair, 'au')
    participant_ids = pair.split('_')
    for participant in participant_ids:
        # TODO: handle missing data (NaN or recovered)
        aus = extract_features(pair_source, participant, au_target)


pp05_navigator_instructional_video_0.csv has already been processed. Continuing...
pp05_navigator_discussion_phase_0.csv has already been processed. Continuing...
pp05_navigator_discussion_phase_1.csv has already been processed. Continuing...
pp06_pilot_instructional_video_0.csv has already been processed. Continuing...
pp06_pilot_discussion_phase_0.csv has already been processed. Continuing...
pp06_pilot_discussion_phase_1.csv has already been processed. Continuing...
pp07_navigator_instructional_video_0.csv has already been processed. Continuing...
pp07_navigator_discussion_phase_0.csv has already been processed. Continuing...
pp07_navigator_discussion_phase_1.csv has already been processed. Continuing...
pp08_pilot_instructional_video_0.csv has already been processed. Continuing...
pp08_pilot_discussion_phase_0.csv has already been processed. Continuing...
pp08_pilot_discussion_phase_1.csv has already been processed. Continuing...
pp09_navigator_instructional_video_0.csv has already

# Feature selection


In [14]:
location = r"data-out"

# Make sure every entry under data-out has a "selection" and an "extraction"
# subfolder; folders that already exist are left untouched.
for pair in os.listdir(location):
    for subfolder in ("selection", "extraction"):
        os.makedirs(os.path.join(location, pair, subfolder), exist_ok=True)
    

## Facial factors

In [None]:
import util.feature_selection as fs
location = r"data-out"
names = ['discussion_phase_0', 'discussion_phase_1', 'instructional_video_0']

# Convert every per-phase AU file of every pair into facial factors and store
# them as <pair>/selection/<participant>_<phase>_factors.csv, where
# <participant> is the part of the AU file name before the first underscore.
for pair in os.listdir(location):
    au_folder = os.path.join(location, pair, "au")
    selection_folder = os.path.join(location, pair, "selection")
    au_files = os.listdir(au_folder)

    # Create the output folder here too, so this cell also works on a fresh
    # kernel without the folder-setup cell having been run first.
    os.makedirs(selection_folder, exist_ok=True)

    # TODO: make it check which pair the file is from and make two dataframes of that. Then combine
    for file in au_files:
        # Test the extension, not a substring: names such as "x.csv.bak"
        # must not be parsed as AU data.
        if not file.endswith(".csv"):
            continue

        # Only read the file when it belongs to one of the known phases.
        matching_names = [name for name in names if name in file]
        if not matching_names:
            continue

        participant, _ = file.split("_", 1)
        df = pd.read_csv(os.path.join(au_folder, file))
        for name in matching_names:
            factors_df = fs.au_to_factors(df)
            factors_df.to_csv(
                os.path.join(selection_folder, f"{participant}_{name}_factors.csv"),
                index=False,
            )

## Correlated Component Analysis

In [None]:
import util.feature_selection as fs

# Correlated component analysis (corrCA), done in two passes over every pair
# folder in `location`:
#   1. fit one set of weights per pair from all AU files of that pair and
#      save them as <pair>/<pair>_corrca_weights.csv;
#   2. apply those weights to every per-phase AU file and save the result as
#      <pair>/selection/<participant>_<phase>_corrca.csv.
location = r"data-out"
names = ['discussion_phase_0', 'discussion_phase_1', 'instructional_video_0']

# Pass 1: corrCA_weights takes one frame per participant, each holding all
# AU files of that participant.
for pair in os.listdir(location):
    filename = os.path.join(location, pair)
    au_folder = os.path.join(filename, "au")
    nav, pil = pair.split("_")

    nav_frames = []
    pil_frames = []
    # os.listdir returns entries in arbitrary order. Sorting makes the
    # stacking order reproducible and, as long as both participants' files
    # share the same phase suffixes, identical for both of them.
    for file in sorted(os.listdir(au_folder)):
        if not file.endswith(".csv"):
            continue
        df = pd.read_csv(os.path.join(au_folder, file))
        # A file is assigned by participant id occurring in its name.
        if nav in file:
            nav_frames.append(df)
        if pil in file:
            pil_frames.append(df)

    # Concatenate once instead of growing a frame inside the loop.
    nav_df = pd.concat(nav_frames) if nav_frames else pd.DataFrame()
    pil_df = pd.concat(pil_frames) if pil_frames else pd.DataFrame()

    nav_df, pil_df = fs.make_equal_length(pair, nav_df, pil_df)
    w = fs.corrCA_weights(nav_df, pil_df)
    w.to_csv(os.path.join(filename, f"{pair}_corrca_weights.csv"), index=False)

# Pass 2: project each per-phase AU file onto the weights of its pair.
for pair in os.listdir(location):
    pair_folder = os.path.join(location, pair)
    au_folder = os.path.join(pair_folder, "au")
    selection_folder = os.path.join(pair_folder, "selection")
    au_files = sorted(os.listdir(au_folder))

    # Do not rely on another cell having created the output folder.
    os.makedirs(selection_folder, exist_ok=True)

    # The weights are the same for every file of a pair: read them once.
    w = pd.read_csv(os.path.join(pair_folder, f"{pair}_corrca_weights.csv"))

    for file in au_files:
        if not file.endswith(".csv"):
            continue
        matching_names = [name for name in names if name in file]
        if not matching_names:
            continue

        participant, _ = file.split("_", 1)
        df = pd.read_csv(os.path.join(au_folder, file))
        for name in matching_names:
            corrca = fs.apply_corrCA_weights(df, w)
            corrca.to_csv(
                os.path.join(selection_folder, f"{participant}_{name}_corrca.csv"),
                index=False,
            )


files of pairs in 05_06 do not have the same amount of datapoints - temporary fix has made them equal length
(17, 17)
files of pairs in 07_08 do not have the same amount of datapoints - temporary fix has made them equal length
(17, 17)
(17, 17)


# Correlation measure

## Pearson


In [27]:
from scipy.stats import pearsonr
import util.feature_selection as fs

location = r"data-out"
results_folder = "results"
sets = ['corrca', 'factors']
phases = ['instructional_video_0', 'discussion_phase_0', 'discussion_phase_1']
factors = ['f1', 'f2', 'f3', 'f4', 'f5', 'f6']

# Pearson correlation between the two participants of each pair, computed per
# phase on the six facial factors and on the first corrCA component.
# Create the shared output folder up front so the final to_csv cannot fail
# just because the folder is missing.
os.makedirs(results_folder, exist_ok=True)

# each pair gets a separate file
for pair in os.listdir(location):
    nav, pil = pair.split("_")
    selection_folder = os.path.join(location, pair, 'selection')
    extraction_folder = os.path.join(location, pair, "extraction")
    # Do not rely on another cell having created the per-pair output folder.
    os.makedirs(extraction_folder, exist_ok=True)

    # Layout: one column per phase; rows are f1..f6 followed by the corrCA
    # component, in that order (the rows carry no labels).
    df = pd.DataFrame()
    for phase in phases:
        correlation = []

        # factors
        nav_factors = pd.read_csv(os.path.join(selection_folder, f"pp{nav}_{phase}_factors.csv"))
        pil_factors = pd.read_csv(os.path.join(selection_folder, f"pp{pil}_{phase}_factors.csv"))
        nav_factors, pil_factors = fs.make_equal_length(f"{pair}_{phase}", nav_factors, pil_factors)
        for f in factors:
            corr, _ = pearsonr(nav_factors[f].values, pil_factors[f].values)
            correlation.append(corr)

        # corrca
        nav_corrca = pd.read_csv(os.path.join(selection_folder, f"pp{nav}_{phase}_corrca.csv"))
        pil_corrca = pd.read_csv(os.path.join(selection_folder, f"pp{pil}_{phase}_corrca.csv"))
        nav_corrca, pil_corrca = fs.make_equal_length(f"{pair}_{phase}", nav_corrca, pil_corrca)

        corr, _ = pearsonr(nav_corrca['component1'].values, pil_corrca['component1'].values)
        correlation.append(corr)

        df[phase] = correlation

    # The same table is written next to the pair's data and to the shared
    # results folder.
    df.to_csv(os.path.join(extraction_folder, f"{pair}_pearson.csv"))
    df.to_csv(os.path.join(results_folder, f"{pair}_pearson.csv"))

files of pairs in 05_06_discussion_phase_0 do not have the same amount of datapoints - temporary fix has made them equal length
files of pairs in 05_06_discussion_phase_0 do not have the same amount of datapoints - temporary fix has made them equal length
files of pairs in 07_08_instructional_video_0 do not have the same amount of datapoints - temporary fix has made them equal length
files of pairs in 07_08_instructional_video_0 do not have the same amount of datapoints - temporary fix has made them equal length
files of pairs in 07_08_discussion_phase_0 do not have the same amount of datapoints - temporary fix has made them equal length
files of pairs in 07_08_discussion_phase_0 do not have the same amount of datapoints - temporary fix has made them equal length
files of pairs in 07_08_discussion_phase_1 do not have the same amount of datapoints - temporary fix has made them equal length
files of pairs in 07_08_discussion_phase_1 do not have the same amount of datapoints - temporary f

## cRQA