<a href="https://colab.research.google.com/github/evansalv/social-perception-convo/blob/main/Semantic_Preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive; the project paths configured below live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Step 1: Enable rpy2 (the extension that provides %%R cells) and import dependencies
%load_ext rpy2.ipython

import pandas as pd
import numpy as np
from rpy2.robjects import pandas2ri
pandas2ri.activate()
import rpy2.robjects as ro
import os
import sys
from rpy2.robjects import r, globalenv, StrVector
from scipy.spatial import distance
import re
import inflect

p = inflect.engine()

In [3]:
# Step 2: Set up transcriptions
# Directory where the semantic helper code is stored (change this path to match your Drive layout)
sem_dir  = '/content/drive/My Drive/Closeness_Project_Materials/FeaturePipeline/Semantic_Preprocessing/'
# Add the semantic directory to sys.path so modules stored there can be imported
# (presumably this is where BERT_helper lives; verify against the Drive folder)
sys.path.append(sem_dir)
# Input directory that holds the transcript CSV of the study being processed
transcript_dir = '/content/drive/My Drive/Closeness_Project_Materials/FeaturePipeline/Studies/Study2/'
# Output directory for the semantic data files written by this notebook
output_dir = '/content/drive/My Drive/Closeness_Project_Materials/FeaturePipeline/Semantic_Preprocessing/Outputs/'


In [4]:
from BERT_helper import bert_embedding

In [5]:
# Build the full path to the transcript CSV inside transcript_dir
transcript_path = os.path.join(transcript_dir, 'transcript.csv')

# Load the transcript into a DataFrame
# NOTE(review): pandas' default NA parsing reads cells such as "NA", "null" or "nan" as
# missing values; confirm that no spoken words in the Transcript column are lost this way.
transcript = pd.read_csv(transcript_path)

In [6]:
# Sanity check: show the first rows of the loaded transcript
print("\nWord-level transcription loaded:")
print(transcript.head())


Word-level transcription loaded:
  Transcript Start_Timestamp End_Timestamp    Speaker  Pair  Question  \
0       What     00:00:00:04   00:00:00:21  Speaker 1     1         2   
1         in     00:00:00:21   00:00:00:28  Speaker 1     1         2   
2       your     00:00:00:28   00:00:00:37  Speaker 1     1         2   
3       life     00:00:00:37   00:00:00:49  Speaker 1     1         2   
4         do     00:00:00:49   00:00:00:57  Speaker 1     1         2   

   Start_Seconds  End_Seconds  Speaker_Displayed      VideoID  turnID  \
0           0.04         0.21                  1  3YxFNCQJXz0       1   
1           0.21         0.28                  1  3YxFNCQJXz0       1   
2           0.28         0.37                  1  3YxFNCQJXz0       1   
3           0.37         0.49                  1  3YxFNCQJXz0       1   
4           0.49         0.57                  1  3YxFNCQJXz0       1   

   turnSpeaker1  turnSpeaker2  turnIDSpeaker1  turnIDSpeaker2 2segIDSpeaker1  \
0       

In [21]:
# Helper functions:

# Convert number: replace a matched digit string with the word(s) for that number ('4' -> 'four')
def replace_numbers(match):
    """
    Regex substitution callback that spells out a matched run of digits.

    Parameters:
    - match: re.Match object whose matched text is a digit string (e.g. '21')

    Returns:
    - the number in words, with hyphens turned into spaces ('twenty one') so that
      the punctuation removal in clean_text cannot fuse the parts into one token
    """
    return p.number_to_words(match.group()).replace('-', ' ')

# Clean text: clean text for dictionary word embeddings (AffectR, wav2vec, GloVe, etc.)

def clean_text(text):
    """
    Cleans a transcript string for dictionary-based word embedding.

    Steps, in order: lowercase, spell out digit runs, remove punctuation,
    normalize whitespace.

    Parameters:
    - text: transcript string to clean

    Returns:
    - cleaned string
    """

    # Make all lowercase (str.lower returns a new string, so the result must be kept)
    text = text.lower()

    # Replace numbers with words
    text = re.sub(r'\d+', replace_numbers, text)

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Normalize whitespace: collapse runs into one space and trim both ends
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [22]:
%%R
# Install AffectR
# (%%R stays on the first line of the cell: IPython only recognises a cell magic there,
#  and everything below it is handed to R, where '#' also starts a comment)
# Install devtools if it's not already installed
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}
# Install the affectr package from GitHub
devtools::install_github("markallenthornton/affectr")


Skipping install of 'affectr' from a github remote, the SHA1 (d6727a81) has not changed since last install.
  Use `force = TRUE` to force installation


In [23]:
# Load R package
r('library(affectr)')

# Model name handed to the BERT helper, and the number of values stored per segment
BERT_MODEL = "FacebookAI/roberta-large-mnli"
BERT_DIM = 1024   # embedding columns written per speaker
N_AFFECT = 3      # affect3d values used: rationality, social impact, valence

# Output list: one dict per (video, turn, speaker, segment)
results = []
video_ids = transcript['VideoID'].unique()
total_vids = len(video_ids)
vid_count = 0
# Iterate over each VideoID and turn
for vid in video_ids:
    vid_count += 1
    print(f"\n📽️ Processing Video {vid_count} / {total_vids} → VideoID: {vid}")

    vid_data = transcript[transcript['VideoID'] == vid]

    for turn in vid_data['turnID'].dropna().unique():
        turn_data = vid_data[vid_data['turnID'] == turn]

        for speaker in ['Speaker 1', 'Speaker 2']:
            is_speaker1 = speaker == 'Speaker 1'
            listener = 'Speaker 2' if is_speaker1 else 'Speaker 1'
            # Column holding this speaker's segment IDs, e.g. '5segIDSpeaker1'
            active_flag = f'5segID{speaker.replace(" ", "")}'

            if active_flag not in turn_data.columns:
                continue

            seg_ids = turn_data[active_flag].dropna().unique()
            if len(seg_ids) == 0:
                continue

            for seg in seg_ids:
                seg_data = turn_data[turn_data[active_flag] == seg]
                # Drop missing tokens first: astype(str) would otherwise turn each NaN into
                # the literal word "nan" and feed it to AffectR / BERT as if it were spoken.
                tokens = seg_data['Transcript'].dropna().astype(str)
                text = ' '.join(tokens).strip()
                cleaned_text = clean_text(text)
                # Defaults, kept when the segment has no text or processing fails
                # NOTE(review): zero-filled rows cannot be told apart from genuine zeros downstream.
                affect_scores = [0.0] * N_AFFECT
                bert_embed = [0.0] * BERT_DIM

                # If speaker is talking and text exists
                if text:
                    try:
                        # Compute affectr on the cleaned text via the R global environment
                        globalenv['texts'] = StrVector([cleaned_text])
                        r('scores <- affect3d(texts[1])')
                        affect_scores = list(r('as.numeric(scores)'))

                        # Compute BERT on the raw (uncleaned) text
                        bert_embed = bert_embedding(BERT_MODEL, text)
                    except Exception as e:
                        print(f"⚠️ Processing failed for VideoID={vid}, Turn={turn}, Segment={seg}, Speaker={speaker}: {e}")
                        # Reset both outputs so a half-finished segment is never recorded
                        affect_scores = [0.0] * N_AFFECT
                        bert_embed = [0.0] * BERT_DIM

                # Unpack BERT embedding into one column per dimension and speaker;
                # the non-speaking side is filled with 0.0
                embed_dict = {}
                for i in range(BERT_DIM):
                    embed_dict[f'BERT_Spkr1_{i}'] = bert_embed[i] if is_speaker1 else 0.0
                    embed_dict[f'BERT_Spkr2_{i}'] = 0.0 if is_speaker1 else bert_embed[i]

                # Record results
                results.append({
                    'VideoID': vid,
                    'Transcript': text,
                    'Clean_Transcript': cleaned_text,
                    'turnID': turn,
                    '5segIDSpeaker1': seg if is_speaker1 else np.nan,
                    '5segIDSpeaker2': np.nan if is_speaker1 else seg,
                    'Speaker': speaker,
                    'Listener': listener,
                    'rationality_Spkr1': affect_scores[0] if is_speaker1 else 0,
                    'social_impact_Spkr1': affect_scores[1] if is_speaker1 else 0,
                    'valence_Spkr1': affect_scores[2] if is_speaker1 else 0,
                    'rationality_Spkr2': 0 if is_speaker1 else affect_scores[0],
                    'social_impact_Spkr2': 0 if is_speaker1 else affect_scores[1],
                    'valence_Spkr2': 0 if is_speaker1 else affect_scores[2],
                    **embed_dict
                })

                print(f"✅ Processed → VideoID: {vid}, TurnID: {int(turn)}, Speaker: {speaker}, Segment: {seg}")

# Convert to DataFrame
word_embeddings = pd.DataFrame(results)

# Preview the shape and first few rows
print(f"\n Final output shape: {word_embeddings.shape}")
word_embeddings.head()



📽️ Processing Video 1 / 46 → VideoID: 3YxFNCQJXz0
✅ Processed → VideoID: 3YxFNCQJXz0, TurnID: 1, Speaker: Speaker 1, Segment: T1_SEG1
✅ Processed → VideoID: 3YxFNCQJXz0, TurnID: 2, Speaker: Speaker 2, Segment: T2_SEG1
✅ Processed → VideoID: 3YxFNCQJXz0, TurnID: 2, Speaker: Speaker 2, Segment: T2_SEG2
✅ Processed → VideoID: 3YxFNCQJXz0, TurnID: 2, Speaker: Speaker 2, Segment: T2_SEG3
✅ Processed → VideoID: 3YxFNCQJXz0, TurnID: 2, Speaker: Speaker 2, Segment: T2_SEG4
✅ Processed → VideoID: 3YxFNCQJXz0, TurnID: 2, Speaker: Speaker 2, Segment: T2_SEG5
✅ Processed → VideoID: 3YxFNCQJXz0, TurnID: 2, Speaker: Speaker 2, Segment: T2_SEG6
✅ Processed → VideoID: 3YxFNCQJXz0, TurnID: 3, Speaker: Speaker 1, Segment: T3_SEG1
✅ Processed → VideoID: 3YxFNCQJXz0, TurnID: 4, Speaker: Speaker 2, Segment: T4_SEG1
✅ Processed → VideoID: 3YxFNCQJXz0, TurnID: 5, Speaker: Speaker 1, Segment: T5_SEG1
✅ Processed → VideoID: 3YxFNCQJXz0, TurnID: 6, Speaker: Speaker 2, Segment: T6_SEG1
✅ Processed → VideoID: 3Y

KeyboardInterrupt: 

In [10]:
# Affect score columns (both speakers) that are screened for missing values
columns_to_check = [
    'rationality_Spkr1', 'social_impact_Spkr1', 'valence_Spkr1',
    'rationality_Spkr2', 'social_impact_Spkr2', 'valence_Spkr2'
]

# Cell-wise missing-value mask over those columns, computed once and reused below
affect_nan_mask = word_embeddings[columns_to_check].isna()

# Per-column flag: does the column contain at least one NaN?
has_nan = affect_nan_mask.any()

print("🔍 NaN Check:")
print(has_nan)

# Rows in which any of the screened columns is NaN
rows_with_nan = word_embeddings.loc[affect_nan_mask.any(axis=1)]
print(f"\n⚠️ Total rows with NaNs in these columns: {len(rows_with_nan)}")
rows_with_nan

🔍 NaN Check:
rationality_Spkr1      False
social_impact_Spkr1    False
valence_Spkr1          False
rationality_Spkr2      False
social_impact_Spkr2    False
valence_Spkr2          False
dtype: bool

⚠️ Total rows with NaNs in these columns: 0


Unnamed: 0,VideoID,Transcript,Clean_Transcript,turnID,5segIDSpeaker1,5segIDSpeaker2,Speaker,Listener,rationality_Spkr1,social_impact_Spkr1,...,BERT_Spkr1_1019,BERT_Spkr2_1019,BERT_Spkr1_1020,BERT_Spkr2_1020,BERT_Spkr1_1021,BERT_Spkr2_1021,BERT_Spkr1_1022,BERT_Spkr2_1022,BERT_Spkr1_1023,BERT_Spkr2_1023


In [11]:
# Define the output file path (os.path.join, as used for the transcript path, so the
# result does not depend on output_dir ending with a slash)
output_path = os.path.join(output_dir, 'word_embeddings.csv')

# Save the DataFrame to CSV without the index column
word_embeddings.to_csv(output_path, index=False)

print(f"✅ CSV file successfully saved to: {output_path}")

✅ CSV file successfully saved to: /content/drive/My Drive/Closeness_Project_Materials/FeaturePipeline/Semantic_Preprocessing/Outputs/word_embeddings.csv


In [6]:
# Reload the saved word embeddings so the distance cells below can run without
# repeating the extraction loop
output_path = os.path.join(output_dir, 'word_embeddings.csv')
word_embeddings = pd.read_csv(output_path)

In [7]:
# Compute distance measures for Affect3D word embeddings

# Step 1: Mean-pooling affectR vector
def mean_pool_affectr_vector(group, speaker):
    """Return the (rationality, social impact, valence) vector of the active speaker.

    The three affectR columns belonging to `speaker` are read from `group`; when the
    group has more than one row the rows are averaged, otherwise the single row is
    returned as-is.
    """
    if speaker == 'Speaker 1':
        column_names = ('rationality_Spkr1', 'social_impact_Spkr1', 'valence_Spkr1')
    else:
        column_names = ('rationality_Spkr2', 'social_impact_Spkr2', 'valence_Spkr2')

    # One row per segment, one column per affect dimension
    stacked = np.stack([group[name] for name in column_names], axis=1)

    if len(stacked) > 1:
        return stacked.mean(axis=0)
    return stacked[0]

# Step 2: Compute Euclidean distances for each affect dimension
def compute_euclidean_distances(turn_vectors):
    """Per-dimension Euclidean distance between each turn and the turn before it.

    Returns three lists (rationality, social impact, valence), each as long as
    `turn_vectors` (or of length one when it is empty); the first entry is NaN
    because the first turn has no predecessor.
    """
    def distances_along(dim):
        # Distance on a single coordinate between consecutive turn vectors
        steps = [
            distance.euclidean([turn_vectors[idx - 1][dim]], [turn_vectors[idx][dim]])
            for idx in range(1, len(turn_vectors))
        ]
        return [np.nan] + steps

    rationality_dist = distances_along(0)
    social_impact_dist = distances_along(1)
    valence_dist = distances_along(2)
    return rationality_dist, social_impact_dist, valence_dist

# Step 3: Main function to compute Mahalanobis + Euclidean distances
def compute_affectr_distances(df, verbose=True):
    """Compute Mahalanobis and Euclidean distances for affectR vectors by turn and VideoID.

    For every VideoID the rows are grouped by turnID, each turn is mean-pooled into one
    affectR vector for the speaker of its first row, and each turn is compared with the
    turn before it.

    Parameters:
    - df: DataFrame with 'VideoID', 'turnID', 'Speaker' and the affectR score columns
    - verbose: print per-video / per-turn progress (default True, the original behaviour)

    Returns:
    - DataFrame with one row per (VideoID, turnID) and the columns VideoID, turnID,
      Speaker, 3D_Dist (Mahalanobis), Rationality_Dist, Social_Impact_Dist, Valence_Dist.
      The first turn of each video has NaN distances. An empty frame with these columns
      is returned when there is nothing to process.
    """
    output_columns = ['VideoID', 'turnID', 'Speaker', '3D_Dist',
                      'Rationality_Dist', 'Social_Impact_Dist', 'Valence_Dist']
    results = []

    for vid in df['VideoID'].unique():
        if verbose:
            print(f"\n📼 Processing VideoID: {vid}")
        vid_data = df[df['VideoID'] == vid]
        grouped = vid_data.groupby('turnID')

        turn_vectors = []
        turn_ids = []
        speakers = []

        for turn_id, group in grouped:
            speaker = group['Speaker'].iloc[0]
            vector = mean_pool_affectr_vector(group, speaker)

            if verbose:
                print(f"\n🎬 TurnID: {turn_id}")
                print(f"   🗣️ Active Speaker: {speaker}")
                print(f"   ✅ Mean-pooled AffectR vector: {vector}")

            turn_vectors.append(vector)
            turn_ids.append(turn_id)
            speakers.append(speaker)

        # groupby drops rows whose turnID is missing, so a video can end up with no turns
        if not turn_vectors:
            continue

        # Step 3a: Mahalanobis distances
        maha_dists = [np.nan]
        # A covariance needs at least two turns; with one turn there is no pair to compare
        if len(turn_vectors) > 1:
            cov = np.cov(np.stack(turn_vectors).T)
            # Pseudo-inverse, since the per-video covariance can be singular
            inv_covmat = np.linalg.pinv(cov)
            for i in range(1, len(turn_vectors)):
                d = distance.mahalanobis(turn_vectors[i], turn_vectors[i - 1], inv_covmat)
                maha_dists.append(d)
                if verbose:
                    print(f"📏 Mahalanobis distance between Turn {turn_ids[i - 1]} and Turn {turn_ids[i]}: {d}")

        # Step 3b: Euclidean distances
        rationality_euc, social_euc, valence_euc = compute_euclidean_distances(turn_vectors)

        # Step 4: Save DataFrame
        results_df = pd.DataFrame({
            'VideoID': [vid] * len(turn_ids),
            'turnID': turn_ids,
            'Speaker': speakers,
            '3D_Dist': maha_dists,
            'Rationality_Dist': rationality_euc,
            'Social_Impact_Dist': social_euc,
            'Valence_Dist': valence_euc
        })

        results.append(results_df)

    # pd.concat raises on an empty list, so return an empty, correctly-shaped frame instead
    if not results:
        return pd.DataFrame(columns=output_columns)

    return pd.concat(results, ignore_index=True)

# Step 5: Apply to your word_embeddings data (prints progress for every video and turn)
affectR_dist = compute_affectr_distances(word_embeddings)

# Step 6: Inspect result (shape, then the first rows as the cell output)
print(f"\n✅ Final output shape of distance measurement: {affectR_dist.shape}")
affectR_dist.head()



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
   ✅ Mean-pooled AffectR vector: [0.95304143 0.14159757 0.22041   ]

🎬 TurnID: 104
   🗣️ Active Speaker: Speaker 1
   ✅ Mean-pooled AffectR vector: [ 0.89273354 -0.19064277  0.08991877]

🎬 TurnID: 105
   🗣️ Active Speaker: Speaker 2
   ✅ Mean-pooled AffectR vector: [ 0.64147439 -0.02664852 -0.04302597]

🎬 TurnID: 106
   🗣️ Active Speaker: Speaker 1
   ✅ Mean-pooled AffectR vector: [ 0.81272138 -0.12029763 -0.00490773]
📏 Mahalanobis distance between Turn 1 and Turn 2: 2.0132152394817493
📏 Mahalanobis distance between Turn 2 and Turn 3: 2.502104957564272
📏 Mahalanobis distance between Turn 3 and Turn 4: 2.407924105626881
📏 Mahalanobis distance between Turn 4 and Turn 5: 1.4462572112230156
📏 Mahalanobis distance between Turn 5 and Turn 6: 1.7103668989007967
📏 Mahalanobis distance between Turn 6 and Turn 7: 1.352231747212943
📏 Mahalanobis distance between Turn 7 and Turn 8: 1.432884357769352
📏 Mahalanobis distance between Tur

Unnamed: 0,VideoID,turnID,Speaker,3D_Dist,Rationality_Dist,Social_Impact_Dist,Valence_Dist
0,3YxFNCQJXz0,1,Speaker 1,,,,
1,3YxFNCQJXz0,2,Speaker 2,1.899565,0.068287,0.036051,0.13621
2,3YxFNCQJXz0,3,Speaker 1,1.36495,0.031562,0.064577,0.035693
3,3YxFNCQJXz0,4,Speaker 2,0.871419,0.034613,0.039632,0.065932
4,3YxFNCQJXz0,5,Speaker 1,1.173527,0.049777,0.012239,0.071755


In [8]:
# Compute cosine distance measures for BERT word embeddings

def mean_pool_bert_vector(group, speaker, dim=1024):
    """Extract and mean-pool the BERT embedding columns of the active speaker.

    Parameters:
    - group: DataFrame rows of one turn, holding 'BERT_Spkr1_<i>' / 'BERT_Spkr2_<i>' columns
    - speaker: 'Speaker 1' selects the Spkr1 columns; any other value selects Spkr2
    - dim: number of embedding columns per speaker (default 1024, as written by this notebook)

    Returns:
    - 1-D array of length `dim`: the column-wise mean when the group has several rows,
      otherwise the single row unchanged
    """
    prefix = 'BERT_Spkr1_' if speaker == 'Speaker 1' else 'BERT_Spkr2_'
    cols = [f'{prefix}{i}' for i in range(dim)]
    matrix = group[cols].to_numpy()
    return matrix.mean(axis=0) if len(matrix) > 1 else matrix[0]

def compute_bert_distances(df, verbose=True):
    """Compute cosine distances between BERT embeddings for consecutive turns.

    For every VideoID the rows are grouped by turnID, each turn is mean-pooled into one
    embedding for the speaker of its first row, and each turn is compared with the turn
    before it.

    Parameters:
    - df: DataFrame with 'VideoID', 'turnID', 'Speaker' and the BERT embedding columns
    - verbose: print per-video / per-turn progress (default True, the original behaviour)

    Returns:
    - DataFrame with one row per (VideoID, turnID) and the columns VideoID, turnID,
      Speaker, BERT_Dist. The first turn of each video has a NaN distance. An empty
      frame with these columns is returned when there is nothing to process.
    """
    output_columns = ['VideoID', 'turnID', 'Speaker', 'BERT_Dist']
    results = []

    for vid in df['VideoID'].unique():
        if verbose:
            print(f"\n📼 Processing VideoID: {vid}")
        vid_data = df[df['VideoID'] == vid]
        grouped = vid_data.groupby('turnID')

        turn_vectors = []
        turn_ids = []
        speakers = []

        for turn_id, group in grouped:
            speaker = group['Speaker'].iloc[0]
            vector = mean_pool_bert_vector(group, speaker)

            if verbose:
                print(f"\n🎬 TurnID: {turn_id}")
                print(f"   🗣️ Active Speaker: {speaker}")
                print(f"   ✅ Mean-pooled BERT vector (dim: {vector.shape}): {vector[:5]}...")

            turn_vectors.append(vector)
            turn_ids.append(turn_id)
            speakers.append(speaker)

        # groupby drops rows whose turnID is missing, so a video can end up with no turns;
        # building the frame below would then fail on mismatched column lengths
        if not turn_vectors:
            continue

        # Compute cosine distances between turns
        bert_dists = [np.nan]
        for i in range(1, len(turn_vectors)):
            d = distance.cosine(turn_vectors[i], turn_vectors[i - 1])
            bert_dists.append(d)
            if verbose:
                print(f"📏 BERT cosine distance between Turn {turn_ids[i - 1]} and Turn {turn_ids[i]}: {d}")

        results_df = pd.DataFrame({
            'VideoID': [vid] * len(turn_ids),
            'turnID': turn_ids,
            'Speaker': speakers,
            'BERT_Dist': bert_dists
        })

        results.append(results_df)

    # pd.concat raises on an empty list, so return an empty, correctly-shaped frame instead
    if not results:
        return pd.DataFrame(columns=output_columns)

    return pd.concat(results, ignore_index=True)

# ✅ Run the function on your word_embeddings DataFrame (one BERT_Dist row per video turn)
bert_dist = compute_bert_distances(word_embeddings)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
   ✅ Mean-pooled BERT vector (dim: (1024,)): [-0.44409513  0.27576613 -0.69757384 -0.50134343  0.96607256]...

🎬 TurnID: 104
   🗣️ Active Speaker: Speaker 1
   ✅ Mean-pooled BERT vector (dim: (1024,)): [-0.48887336  0.47527048 -0.72809726 -0.56744319  1.02051461]...

🎬 TurnID: 105
   🗣️ Active Speaker: Speaker 2
   ✅ Mean-pooled BERT vector (dim: (1024,)): [ 0.05914995  0.23127605 -0.4425548  -0.64224428  0.30802384]...

🎬 TurnID: 106
   🗣️ Active Speaker: Speaker 1
   ✅ Mean-pooled BERT vector (dim: (1024,)): [-0.43453151  0.33578998 -0.63941151 -0.57423699  0.91238803]...
📏 BERT cosine distance between Turn 1 and Turn 2: 0.034177772994581246
📏 BERT cosine distance between Turn 2 and Turn 3: 0.06492263881606486
📏 BERT cosine distance between Turn 3 and Turn 4: 0.06277214736878567
📏 BERT cosine distance between Turn 4 and Turn 5: 0.046815748698818216
📏 BERT cosine distance between Turn 5 and Turn 6: 0.03215661294143135
📏 

In [9]:
# Merge affectR_dist and bert_dist on VideoID, turnID and Speaker
turn_distances = pd.merge(affectR_dist, bert_dist, on=['VideoID', 'turnID', 'Speaker'], how='outer')

# Drop duplicate columns if any
turn_distances = turn_distances.loc[:, ~turn_distances.columns.duplicated()]

# Define the output file path (os.path.join, as used for the transcript path)
output_path = os.path.join(output_dir, 'turn_distance.csv')

# Save the DataFrame to CSV
turn_distances.to_csv(output_path, index=False)

print(f"✅ CSV file successfully saved to: {output_path}")

# Merge combined distances back with the original word_embeddings dataframe.
# word_embeddings already carries 'Speaker'; leaving it on both sides makes pandas rename
# the clash to Speaker_x / Speaker_y, which the duplicate-column filter below cannot
# remove. Dropping it from the right-hand side keeps a single 'Speaker' column.
semantic_data = pd.merge(
    word_embeddings,
    turn_distances.drop(columns='Speaker'),
    on=['VideoID', 'turnID'],
    how='left'
)

# Drop duplicate columns if any
semantic_data = semantic_data.loc[:, ~semantic_data.columns.duplicated()]

# Define the output file path
output_path = os.path.join(output_dir, 'semantic_data.csv')

# Save the DataFrame to CSV
semantic_data.to_csv(output_path, index=False)

print(f"✅ CSV file successfully saved to: {output_path}")

✅ CSV file successfully saved to: /content/drive/My Drive/Closeness_Project_Materials/FeaturePipeline/Semantic_Preprocessing/Outputs/turn_distance.csv
✅ CSV file successfully saved to: /content/drive/My Drive/Closeness_Project_Materials/FeaturePipeline/Semantic_Preprocessing/Outputs/semantic_data.csv
