<a href="https://colab.research.google.com/github/evansalv/social-perception-convo/blob/main/Visual_Preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
import os
import re
import pandas as pd
from glob import glob

In [22]:
vis_dir = '/content/drive/My Drive/Closeness_Project_Materials/facial_exp_extracted/'
output_dir = '/content/drive/My Drive/Closeness_Project_Materials/FeaturePipeline/Visual_Preprocessing/Outputs/'

In [23]:
# Define the directory containing CSV files
pyfeat_files = glob(os.path.join(vis_dir, "*.csv"))
print(len(pyfeat_files))

45


In [24]:
# Function to parse mm:ss or mm:ss:ms into total seconds
def parse_time(time_str):
    try:
        parts = time_str.strip().split(":")
        if len(parts) == 2:
            minutes, seconds = map(float, parts)
            return minutes * 60 + seconds
        elif len(parts) == 3:
            minutes, seconds, ms = map(float, parts)
            return minutes * 60 + seconds + ms / 100
    except:
        return None

In [30]:
# Load and annotate each CSV
pyfeat_list = []

for file_path in pyfeat_files:
    filename = os.path.basename(file_path)
    #print(filename)
    base_name = os.path.splitext(filename)[0]
    #print(base_name)
    match = re.match(r"Pair(\d+)\s+(\d+)", base_name)

    if match:
        pair, speaker = match.groups()
        print("Pair ID: ", pair)
        print("Speaker Displayed  ID: ", speaker)
        df = pd.read_csv(file_path)
        df["Pair"] = pair
        df["Speaker_Displayed"] = int(speaker)
        pyfeat_list.append(df)

    if not match:
        print(f"Skipping file: {filename}")

# Combine all files into one DataFrame
if pyfeat_list:
    pyfeat_dfs = pd.concat(pyfeat_list, ignore_index=True)
else:
    pyfeat_dfs = pd.DataFrame()

# Parse time and clean columns
if not pyfeat_dfs.empty:
    pyfeat_dfs["approx_time"] = pyfeat_dfs["approx_time"].astype(str)
    #pyfeat_dfs["TimestampID"] = pyfeat_dfs["approx_time"].apply(parse_time)
    #pyfeat_dfs["TimestampID"] = pyfeat_dfs["TimestampID"].fillna(0).astype(int)
    #pyfeat_dfs["TimestampID"] = pyfeat_dfs["TimestampID"].replace(0, 1)

    #pyfeat_dfs["Speaker_Displayed"] = pyfeat_dfs["Speaker_Displayed"].replace({
        #"speaker1": "Speaker 1", "speaker2": "Speaker 2"
    #})

    # Identify AU columns
    au_columns = [col for col in pyfeat_dfs.columns if "AU" in col]

    # Select and summarize desired columns
    cols_to_keep = ["Pair", "Speaker_Displayed", "approx_time", "Pitch", "Roll", "Yaw",
                    "anger", "disgust", "fear", "happiness", "sadness", "surprise", "neutral"] + au_columns

    pyfeat_df = pyfeat_dfs[cols_to_keep]

    # Group by VideoID, TimestampID, and SpeakerID, then compute means
    pyfeat_df = (
        pyfeat_df.groupby(["Pair", "Speaker_Displayed", "approx_time"], as_index=False)
        .agg({
                "Pitch": "mean",
                "Roll": "mean",
                "Yaw": "mean",
                "anger": "mean",
                "disgust": "mean",
                "fear": "mean",
                "happiness": "mean",
                "sadness": "mean",
                "surprise": "mean",
                "neutral": "mean",
                **{col: "mean" for col in au_columns}
            })
            .rename(columns={
                "Pitch": "Pitch_facial",
                "Roll": "Roll_facial",
                "Yaw": "Yaw_facial",
                "anger": "anger_facial",
                "disgust": "disgust_facial",
                "fear": "fear_facial",
                "happiness": "happiness_facial",
                "sadness": "sadness_facial",
                "surprise": "surprise_facial",
                "neutral": "neutral_facial"
        })
    )


pyfeat_df.head()


Pair ID:  31
Speaker Displayed  ID:  1
Pair ID:  29
Speaker Displayed  ID:  1
Pair ID:  5
Speaker Displayed  ID:  2
Pair ID:  33
Speaker Displayed  ID:  1
Pair ID:  2
Speaker Displayed  ID:  2
Pair ID:  4
Speaker Displayed  ID:  2
Pair ID:  31
Speaker Displayed  ID:  2
Pair ID:  2
Speaker Displayed  ID:  1
Pair ID:  27
Speaker Displayed  ID:  1
Pair ID:  4
Speaker Displayed  ID:  1
Pair ID:  7
Speaker Displayed  ID:  2
Pair ID:  24
Speaker Displayed  ID:  1
Pair ID:  10
Speaker Displayed  ID:  2
Pair ID:  20
Speaker Displayed  ID:  1
Pair ID:  11
Speaker Displayed  ID:  2
Pair ID:  11
Speaker Displayed  ID:  1
Pair ID:  12
Speaker Displayed  ID:  2
Pair ID:  8
Speaker Displayed  ID:  1
Pair ID:  8
Speaker Displayed  ID:  2
Pair ID:  6
Speaker Displayed  ID:  1
Pair ID:  3
Speaker Displayed  ID:  2
Pair ID:  7
Speaker Displayed  ID:  1
Pair ID:  6
Speaker Displayed  ID:  2
Pair ID:  5
Speaker Displayed  ID:  1
Pair ID:  29
Speaker Displayed  ID:  2
Pair ID:  27
Speaker Displayed  ID:  2

Unnamed: 0,Pair,Speaker_Displayed,approx_time,Pitch_facial,Roll_facial,Yaw_facial,anger_facial,disgust_facial,fear_facial,happiness_facial,...,AU14,AU15,AU17,AU20,AU23,AU24,AU25,AU26,AU28,AU43
0,1,1,00:00,-1.137874,-0.759117,11.746366,0.009239,0.041927,0.003011,0.860669,...,0.0,0.5,0.5,0.5,0.5,0.5,0.5,0.0,0.5,1.0
1,1,1,00:01,11.514537,6.562951,9.117919,0.007451,0.001733,0.003434,0.929481,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
2,1,1,00:02,12.616027,9.631932,0.232633,0.014891,0.001362,0.018632,0.149057,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,1,1,00:03,19.226876,4.371184,3.468467,0.013003,0.001567,0.013661,0.126781,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
4,1,1,00:04,17.699673,13.535509,9.090072,0.013212,0.001789,0.007152,0.277127,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [31]:
# Define the output file path
output_path = output_dir + 'visual_data.csv'

# Save the DataFrame to CSV
pyfeat_df.to_csv("output_path", index=False)

print(f"✅ CSV file successfully saved to: {output_path}")

✅ CSV file successfully saved to: /content/drive/My Drive/Closeness_Project_Materials/FeaturePipeline/Visual_Preprocessing/Outputs/visual_data.csv
