# Main working file

In [8]:
# imports

## system
import os
import numpy as np
import pandas as pd

import util.feature_selection as fs
import util.correlation_measure as cm

LOCATION = "data"
ACTION_UNITS = "data/au"
FEATURE_FOLDER = "features"
CORRELATION_FOLDER = 'correlations'
PHASES = [f'{name}_{i}' for name, num in  [("instructional_video", 1), ("discussion_phase", 2), ('reschu_run',8)] for i in range(num)]#, ("reschu_run", 8)] for i in range(num)]
SETS = ['corrca', 'factors']
FACTORS = ['f1', 'f2', 'f3', 'f4', 'f5', 'f6']
COMPONENTS = ['c1', 'c2', 'c3']
PAIRS = [f'0{i}_0{i+1}' for i in np.arange(1,9,2)]
PAIRS.append("09_10")
PAIRS.extend([f'{i}_{i+1}' for i in np.arange(11,104,2)])


AVAILABLE_PAIRS = []
SKIP_PAIRS = ['53_54', '55_56', "63_64", "89_90"]
for file in os.listdir(ACTION_UNITS):
    if ".csv" in file and 'Data' in file: 
        filename = os.path.join(ACTION_UNITS, file)
        participant, _ = file.split("_",1)
        participant = participant[2:]
        for p in PAIRS:
            p1,p2 = p.split("_")
            if participant == p1 or participant == p2:
                pair = p
                break
        if pair in SKIP_PAIRS:
            continue
        if pair not in AVAILABLE_PAIRS:
            AVAILABLE_PAIRS.append(pair)
            dir = os.path.join(LOCATION, pair)
            os.makedirs(dir, exist_ok=True)
            dir = os.path.join(LOCATION, pair, FEATURE_FOLDER)
            os.makedirs(dir, exist_ok=True)
            dir = os.path.join(LOCATION, pair, CORRELATION_FOLDER)
            os.makedirs(dir, exist_ok=True)

print(len(AVAILABLE_PAIRS))


12


# Feature selection


## Facial factors

In [9]:
## CREATING WEIGHTS FOR EACH PAIR SEPARATELY

# corrCA takes a df as input. This df should be all files for a pair. 
for pair in AVAILABLE_PAIRS:
    print(pair)
    weights_path = os.path.join(LOCATION, pair, f'{pair}_corrca_weights.csv')
    if os.path.exists(weights_path):
        print(f"Already computed corrca weights for {pair}, skipping...")
        continue

    p1, p2 = pair.split("_")
    p1_df = pd.DataFrame()
    p2_df = pd.DataFrame()

    for file in os.listdir(ACTION_UNITS):
        if '.csv' in file and 'Data' in file and p1 in file: 
            df = pd.read_csv(os.path.join(ACTION_UNITS, file))
            p1_df = pd.concat([p1_df, df])
        if '.csv' in file and 'Data' in file and p2 in file: 
            df = pd.read_csv(os.path.join(ACTION_UNITS, file))
            p2_df = pd.concat([p2_df, df])
    if len(p1_df) < 1 or len(p2_df) < 1:
        print(f"Pair {pair} misses either navigator or pilot action unit files, REMOVING FROM AVAILABLE PAIRS...")
        AVAILABLE_PAIRS = [i for i in AVAILABLE_PAIRS if i!=pair]
        continue
    p1_df = p1_df.iloc[:len(p2_df)]
    p2_df = p2_df.iloc[:len(p1_df)]

    mask = (p1_df.isna().any(axis=1).values) | (p2_df.isna().any(axis=1).values)
    p1_df = p1_df[~mask].reset_index(drop=True)
    p2_df = p2_df[~mask].reset_index(drop=True)
    w = fs.corrCA_weights(p1_df, p2_df) #output = pair/pair_corrca_weights.csv
    w.to_csv(os.path.join(LOCATION, pair, f"{pair}_corrca_weights.csv"), index=False)

print(len(AVAILABLE_PAIRS))

05_06
Already computed corrca weights for 05_06, skipping...
07_08
Already computed corrca weights for 07_08, skipping...
09_10
Already computed corrca weights for 09_10, skipping...
103_104
Already computed corrca weights for 103_104, skipping...
27_28
Already computed corrca weights for 27_28, skipping...
83_84
Already computed corrca weights for 83_84, skipping...
85_86
Already computed corrca weights for 85_86, skipping...
87_88
Already computed corrca weights for 87_88, skipping...
91_92
Already computed corrca weights for 91_92, skipping...
93_94
Already computed corrca weights for 93_94, skipping...
95_96
Already computed corrca weights for 95_96, skipping...
97_98
Already computed corrca weights for 97_98, skipping...
12


In [12]:
## EXTRACTING FACIAL FACTORS AND CORRCA FOR EACH FILE SEPARATELY
for pair in AVAILABLE_PAIRS:
    # check whether we can perform corrca
    if not os.path.exists(os.path.join(LOCATION, pair, f'{pair}_corrca_weights.csv')):
        print(f"Found no corrca weights for {pair}")    
    else:
        w = pd.read_csv(os.path.join(LOCATION, pair, f'{pair}_corrca_weights.csv'))

    p1, p2 = pair.split("_")
    for phase in PHASES: 
        p1_file = os.path.join(ACTION_UNITS, f"pp{p1}_navigator_{phase}_AU_withMissingData.csv")
        p2_file = os.path.join(ACTION_UNITS, f"pp{p2}_pilot_{phase}_AU_withMissingData.csv")

        if os.path.exists(p1_file) and os.path.exists(p2_file):
            p1_df = pd.read_csv(p1_file)
            p2_df = pd.read_csv(p2_file)

            # make equal length
            p1_df = p1_df.iloc[:len(p2_df)]
            p2_df = p2_df.iloc[:len(p1_df)]

            # remove all missing data
            mask = (p1_df.isna().any(axis=1).values) | (p2_df.isna().any(axis=1).values)
            p1_df = p1_df[~mask].reset_index(drop=True)
            p2_df = p2_df[~mask].reset_index(drop=True)

            p1_factors = fs.au_to_factors(p1_df)
            p1_factors.to_csv(os.path.join(LOCATION, pair, FEATURE_FOLDER, f"pp{p1}_{phase}_factors.csv"), index=False)
            
            p2_factors = fs.au_to_factors(p2_df)
            p2_factors.to_csv(os.path.join(LOCATION, pair, FEATURE_FOLDER, f"pp{p2}_{phase}_factors.csv"), index=False)

            if not os.path.exists(os.path.join(LOCATION, pair, f'{pair}_corrca_weights.csv')):
                continue

            p1_corrca = fs.apply_corrCA_weights(p1_df, w)
            p1_corrca.to_csv(os.path.join(LOCATION, pair, FEATURE_FOLDER, f"{p1}_{phase}_corrca.csv"), index=False)

            p2_corrca = fs.apply_corrCA_weights(p2_df, w)
            p2_corrca.to_csv(os.path.join(LOCATION, pair, FEATURE_FOLDER, f"{p2}_{phase}_corrca.csv"), index=False)
            



# Correlation measure: cRQA

In [13]:
# TODO: cRQA at different time lags for everything
# TODO: RR at diag 0 for synchrony
# TODO: RR max lag for mimicry

# chosen radii that work to get meaningful results: real pairs are still captured and fake pairs are not
RADII = {
    'f1': 0.5,
    'f2': 0.3,
    'f3': 0.5,
    'f4': 0.5,
    'f5': 0.5,
    'f6': 0.5,
    'c1': 0.1,
    'c2': 0.1,
    'c3': 0.1 
}

# EXPERIMENT AREA

In [None]:
import pandas as pd
import os
import numpy as np

# Initialize DataFrame to store all results
results_df = pd.DataFrame(columns=[
    'pair', 'phase', 'method', 'component_factor', 'radius', 
    'non_event_matches', 'condition', 'RR'
])

# Define radii to test
radii = [0.1, 0.2, 0.3, 0.4, 0.5]

def process_analysis(p1_df, p2_df, component, radius, remove_non_events, method, pair, phase, condition):
    output = cm.crqa_lag_analysis(
        p1_df[component].values, 
        p2_df[component].values, 
        radius=radius,
        remove_non_event_matches=remove_non_events
    )
    
    new_row = {
        'pair': pair,
        'phase': phase,
        'method': method,
        'component_factor': component,
        'radius': radius,
        'non_event_matches': 'excluded' if remove_non_events else 'included',
        'condition': condition,
        'RR': output['RR']
    }
    
    return new_row

# Process real pairs - Factors
print("\nREAL PAIRS - FACTORS\n")
for f in FACTORS:
    print(f"Processing factor {f}")
    for r in radii:
        for pair in AVAILABLE_PAIRS[:10]:
            p1, p2 = pair.split("_")
            for phase in PHASES:
                p1_loc = os.path.join(LOCATION, pair, FEATURE_FOLDER, f'pp{p1}_{phase}_factors.csv')
                p2_loc = os.path.join(LOCATION, pair, FEATURE_FOLDER, f'pp{p2}_{phase}_factors.csv')
                
                if os.path.exists(p1_loc) and os.path.exists(p2_loc):
                    p1_df = pd.read_csv(p1_loc)
                    p2_df = pd.read_csv(p2_loc)
                    
                    # With non-event matches
                    new_row = process_analysis(
                        p1_df, p2_df, f, r, False, 
                        'factor', pair, phase, 'real'
                    )
                    results_df = pd.concat([results_df, pd.DataFrame([new_row])], ignore_index=True)
                    
                    # Without non-event matches
                    new_row = process_analysis(
                        p1_df, p2_df, f, r, True, 
                        'factor', pair, phase, 'real'
                    )
                    results_df = pd.concat([results_df, pd.DataFrame([new_row])], ignore_index=True)

# Process real pairs - CORRCA
print("\nREAL PAIRS - CORRCA\n")
for c in COMPONENTS:
    print(f"Processing component {c}")
    for r in radii:
        for pair in AVAILABLE_PAIRS[:10]:
            p1, p2 = pair.split("_")
            for phase in PHASES:
                p1_loc = os.path.join(LOCATION, pair, FEATURE_FOLDER, f'pp{p1}_{phase}_corrca.csv')
                p2_loc = os.path.join(LOCATION, pair, FEATURE_FOLDER, f'pp{p2}_{phase}_corrca.csv')
                
                if os.path.exists(p1_loc) and os.path.exists(p2_loc):
                    p1_df = pd.read_csv(p1_loc)
                    p2_df = pd.read_csv(p2_loc)
                    
                    # With non-event matches
                    new_row = process_analysis(
                        p1_df, p2_df, c, r, False, 
                        'corrca', pair, phase, 'real'
                    )
                    results_df = pd.concat([results_df, pd.DataFrame([new_row])], ignore_index=True)
                    
                    # Without non-event matches
                    new_row = process_analysis(
                        p1_df, p2_df, c, r, True, 
                        'corrca', pair, phase, 'real'
                    )
                    results_df = pd.concat([results_df, pd.DataFrame([new_row])], ignore_index=True)

# Process surrogate pairs - Factors
print("\nSURROGATE PAIRS - FACTORS\n")
index_real = np.arange(10)
index_fake = np.append(np.arange(1,10),0)

for f in FACTORS:
    print(f"Processing factor {f} for surrogate pairs")
    for r in radii:
        for i in range(10):
            pair1 = AVAILABLE_PAIRS[index_real[i]]
            pair2 = AVAILABLE_PAIRS[index_fake[i]]
            p1, _ = pair1.split("_")
            _, p2 = pair2.split("_")
            
            for phase in PHASES:
                p1_loc = os.path.join(LOCATION, pair1, FEATURE_FOLDER, f'pp{p1}_{phase}_factors.csv')
                p2_loc = os.path.join(LOCATION, pair2, FEATURE_FOLDER, f'pp{p2}_{phase}_factors.csv')
                
                if os.path.exists(p1_loc) and os.path.exists(p2_loc):
                    p1_df = pd.read_csv(p1_loc)
                    p2_df = pd.read_csv(p2_loc)
                    
                    # With non-event matches
                    new_row = process_analysis(
                        p1_df, p2_df, f, r, False, 
                        'factor', f"{pair1}_{pair2}", phase, 'fake'
                    )
                    results_df = pd.concat([results_df, pd.DataFrame([new_row])], ignore_index=True)
                    
                    # Without non-event matches
                    new_row = process_analysis(
                        p1_df, p2_df, f, r, True, 
                        'factor', f"{pair1}_{pair2}", phase, 'fake'
                    )
                    results_df = pd.concat([results_df, pd.DataFrame([new_row])], ignore_index=True)

# Process surrogate pairs - CORRCA
print("\nSURROGATE PAIRS - CORRCA\n")
for c in COMPONENTS:
    print(f"Processing component {c} for surrogate pairs")
    for r in radii:
        for i in range(10):
            pair1 = AVAILABLE_PAIRS[index_real[i]]
            pair2 = AVAILABLE_PAIRS[index_fake[i]]
            p1, _ = pair1.split("_")
            _, p2 = pair2.split("_")
            
            for phase in PHASES:
                p1_loc = os.path.join(LOCATION, pair1, FEATURE_FOLDER, f'pp{p1}_{phase}_corrca.csv')
                p2_loc = os.path.join(LOCATION, pair2, FEATURE_FOLDER, f'pp{p2}_{phase}_corrca.csv')
                
                if os.path.exists(p1_loc) and os.path.exists(p2_loc):
                    p1_df = pd.read_csv(p1_loc)
                    p2_df = pd.read_csv(p2_loc)
                    
                    # With non-event matches
                    new_row = process_analysis(
                        p1_df, p2_df, c, r, False, 
                        'corrca', f"{pair1}_{pair2}", phase, 'fake'
                    )
                    results_df = pd.concat([results_df, pd.DataFrame([new_row])], ignore_index=True)
                    
                    # Without non-event matches
                    new_row = process_analysis(
                        p1_df, p2_df, c, r, True, 
                        'corrca', f"{pair1}_{pair2}", phase, 'fake'
                    )
                    results_df = pd.concat([results_df, pd.DataFrame([new_row])], ignore_index=True)

# Save results to CSV
results_df.to_csv('crqa_results_all_pairs.csv', index=False)
print("Processing complete. Results saved to crqa_results_all_pairs.csv")

# Display sample of the results
print("\nSample of the results DataFrame:")
print(results_df.head())


REAL PAIRS - FACTORS

Processing factor f1


  results_df = pd.concat([results_df, pd.DataFrame([new_row])], ignore_index=True)


Processing factor f2
