<a href="https://colab.research.google.com/github/evansalv/social-perception-convo/blob/main/Turn_Segmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Install libraries
import os
import pandas as pd
import numpy as np
import re
import inflect

In [3]:
# Load paths
transcription_dir = '/content/drive/My Drive/Closeness_Project_Materials/Transcriptions/'
video_dir = '/content/drive/My Drive/Closeness_Project_Materials/Observation_Study_Segments/'
output_dir = '/content/drive/My Drive/Closeness_Project_Materials/FeaturePipeline/Studies/Study2/'

In [4]:
#turn_level_path = os.path.join(video_dir, 'Observer_Segments_Data.xlsx')
#word_level_path = os.path.join(video_dir, 'dataset_transcripts.csv')
word_level_path = os.path.join(video_dir, 'Observer_Word_Data.csv')
observer_segments_path = os.path.join(video_dir, 'Video_Time_Breaks.xlsx')

# Load the datasets
#turn_transcript = pd.read_excel(turn_level_path)
word_transcript = pd.read_csv(word_level_path)
observer_segments = pd.read_excel(observer_segments_path)

In [5]:
print("\nWord-level transcription loaded:")
print(word_transcript.head())


Word-level transcription loaded:
   Speaker_Displayed  Pair  Question Transcript Start_Timestamp End_Timestamp  \
0                  1     1         2       What     00:00:00:04   00:00:00:21   
1                  1     1         2         in     00:00:00:21   00:00:00:28   
2                  1     1         2       your     00:00:00:28   00:00:00:37   
3                  1     1         2       life     00:00:00:37   00:00:00:49   
4                  1     1         2         do     00:00:00:49   00:00:00:57   

     Speaker  
0  Speaker 1  
1  Speaker 1  
2  Speaker 1  
3  Speaker 1  
4  Speaker 1  


In [6]:
# Video observer segments time chunks
print("\nObserver segments chunk timing:")
print(observer_segments.head())


Observer segments chunk timing:
   Pair  Question  Speaker_Displayed Start_Timestamp End_Timestamp  \
0     1         2                  1     00:00:00:04   00:01:31:10   
1     1         1                  2     00:00:00:03   00:01:17:31   
2     2         5                  1     00:00:00:17   00:01:13:04   
3     2         7                  2     00:00:02:03  00:00:49:13    
4     3         1                  1     00:00:00:19   00:02:57:16   

       Start_Word   End_Word Start_Speaker End_Speaker      VideoID  
0            What       big.     Speaker 1   Speaker 1  3YxFNCQJXz0  
1              In     spend,     Speaker 2   Speaker 2  4SOtva6huN8  
2             How        Um,     Speaker 1   Speaker 1  ZIUxkzM48sg  
3  Mm-hmm. What's       you?     Speaker 2   Speaker 1  Wq3NHuI7SvE  
4        Um, take  Um, yeah.     Speaker 2   Speaker 2  oufrU7mdSGw  


In [7]:
# Group by Pair and Question
df = observer_segments
grouped = df.groupby(['Pair', 'Question'])

# Store all matched word-level CSVs
word_data = []

# Find matching CSV for each pair and question — avoiding sentence-level files
def find_valid_csv(pair, question):
    matching_paths = []
    for root, dirs, files in os.walk(transcription_dir):
        path_parts = root.split(os.sep)
        if "Transcriptions" in path_parts and "Sentence Level" not in path_parts and "transcriptions" not in path_parts:
            last_folder = path_parts[-1]
            if f"Pair {pair}" in last_folder and "Corrupted" not in last_folder:
                for f in files:
                    if f.endswith('.csv') and (
                        f'Pair{pair}_Q{question}' in f or f'Pair{pair}_Question{question}' in f):
                        matching_paths.append(os.path.join(root, f))

    # Pick the first path that does NOT contain 'Transcript' and 'Notes' columns
    for path in matching_paths:
        try:
            preview = pd.read_csv(path, nrows=1)
            if not all(col in preview.columns for col in ['Transcript', 'Notes']):
                return path
        except Exception as e:
            print(f"⚠️ Could not preview {path}: {e}")
    return None

# Loop through all pairs/questions and collect matched word transcripts CSVs
for (pair, question), _ in grouped:
    print(f"\n🔎 Pair {pair} | Question {question}")
    csv_path = find_valid_csv(pair, question)
    if csv_path:
        try:
            word_df = pd.read_csv(csv_path)
            print("✅ Using CSV:", csv_path)
            print("📄 Columns:", word_df.columns.tolist())
            word_df = word_df.dropna(subset=['Start Timecode', 'End Timecode'])

            # ➕ Annotate and store
            word_df['Pair'] = pair
            word_df['Question'] = question
            word_data.append(word_df.copy())
        except Exception as e:
            print(f"❌ Failed to read {csv_path}: {e}")
    else:
        print("⚠️ No suitable CSV found.")

# Combine all collected CSVs into one DataFrame
full_word_transcript = pd.concat(word_data, ignore_index=True)

# Display
print("Combined DataFrame shape:", full_word_transcript.shape)
full_word_transcript.head()



🔎 Pair 1 | Question 1
✅ Using CSV: /content/drive/My Drive/Closeness_Project_Materials/Transcriptions/Pair 1 - Sonix csv export 08-01-2023 at 17 49pm/Pair1_Q1.mp4.csv
📄 Columns: ['Word', 'Start Timecode', 'End Timecode', 'Speaker']

🔎 Pair 1 | Question 2
✅ Using CSV: /content/drive/My Drive/Closeness_Project_Materials/Transcriptions/Pair 1 - Sonix csv export 08-01-2023 at 17 49pm/Pair1_Q2.mp4.csv
📄 Columns: ['Word', 'Start Timecode', 'End Timecode', 'Speaker']

🔎 Pair 2 | Question 5
✅ Using CSV: /content/drive/My Drive/Closeness_Project_Materials/Transcriptions/Pair 2 - Sonix csv export 08-01-2023 at 17 51pm/Pair2_Question5.mp4.csv
📄 Columns: ['Word', 'Start Timecode', 'End Timecode', 'Speaker']

🔎 Pair 2 | Question 7
✅ Using CSV: /content/drive/My Drive/Closeness_Project_Materials/Transcriptions/Pair 2 - Sonix csv export 08-01-2023 at 17 51pm/Pair2_Question7.mp4.csv
📄 Columns: ['Word', 'Start Timecode', 'End Timecode', 'Speaker']

🔎 Pair 3 | Question 1
✅ Using CSV: /content/drive/My 

Unnamed: 0,Word,Start Timecode,End Timecode,Speaker,Pair,Question
0,In,00:00:00:03,00:00:00:08,Speaker 2,1,1
1,a,00:00:00:08,00:00:00:09,Speaker 2,1,1
2,few,00:00:00:10,00:00:00:22,Speaker 2,1,1
3,"minutes,",00:00:00:22,00:00:00:46,Speaker 2,1,1
4,you,00:00:00:46,00:00:00:53,Speaker 2,1,1


In [8]:
# Helper functions
def timestamp_to_seconds(ts):
    try:
        h, m, s, f = map(int, ts.split(":"))
        # Assuming the last part is centiseconds (100 per second)
        return h * 3600 + m * 60 + s + f / 100
    except:
        return None

def bad_speaker(transcript):
  # Convert 'Speaker' to string for safe matching
  speaker_str = transcript['Speaker'].astype(str)

  # Filter and print bad rows
  bad_speaker_rows = transcript[
      transcript['Speaker'].isna() |
      speaker_str.str.contains('3', na=False)
  ]

  return bad_speaker_rows

def fix_bad_speaker_segments(transcript, bad_rows):
    for _, row in bad_rows.iterrows():
        pair = row['Pair']
        question = row['Question']
        start_ts = row['Start_Timestamp']
        end_ts = row['End_Timestamp']
        correct_speaker = None

        # Define correction rules
        # Change below if needed
        if pair == 2 and question == 7:
            correct_speaker = 'Speaker 2'
        elif pair == 14 and question == 1:
            correct_speaker = 'Speaker 2'
        elif pair == 20 and question == 1:
            correct_speaker = 'Speaker 1'

        if correct_speaker:
            condition = (
                (transcript['Pair'] == pair) &
                (transcript['Question'] == question) &
                (transcript['Start_Timestamp'] >= start_ts) &
                (transcript['End_Timestamp'] <= end_ts)
            )
            transcript.loc[condition, 'Speaker'] = correct_speaker
            print(f"✅ Fixed speaker for Pair {pair}, Question {question} between {start_ts} and {end_ts} → {correct_speaker}")
        else:
            print(f"⚠️ No fix rule defined for Pair {pair}, Question {question}")
    return transcript

# Standardize Speaker columns as "Speaker 1" or "Speaker 2"
def standardize_speaker_column(transcript):
    transcript["Speaker"] = transcript["Speaker"].astype(str).str.extract(r"(\d)").fillna("1")
    transcript["Speaker"] = "Speaker " + transcript["Speaker"]
    return transcript

p = inflect.engine()
# Convert number: replace the number string to the word of the number ('4' -> 'four')
def replace_numbers(match):
    return p.number_to_words(match.group())

# Define function to check if a string contains no word characters
def has_no_words(text):
    return not bool(re.search(r'\w', str(text)))

# Define function to check if a string is only punctuation or whitespace
def is_only_punctuation(text):
    return bool(re.fullmatch(r'\s*[\W_]+\s*', str(text)))


In [9]:
# Cleaning full transcript:

# 1) Rename columns for consistency
full_word_transcript = full_word_transcript.rename(columns={
    'Start Timecode': 'Start_Timestamp',
    'End Timecode': 'End_Timestamp'
})


# 2) Get numeric seconds
full_word_transcript['Start_Seconds'] = full_word_transcript['Start_Timestamp'].apply(timestamp_to_seconds)
full_word_transcript['End_Seconds'] = full_word_transcript['End_Timestamp'].apply(timestamp_to_seconds)


# 3) Identify and fix bad 'Speaker' values
print("Unique Speaker values:", full_word_transcript['Speaker'].unique())

# 4) Get bad speaker rows
bad_speaker_rows = bad_speaker(full_word_transcript)

print("\n⚠️ Rows with NaN or '3' in 'Speaker':")
print(bad_speaker_rows[['Pair', 'Question', 'Speaker']])


# 5) Clean up speaker column only within specific timestamps per bad row
# Fix speaker labels based on timestamp segments
full_word_transcript = fix_bad_speaker_segments(full_word_transcript, bad_speaker_rows)

# 6) Then standardize the formatting of speaker values
full_word_transcript = standardize_speaker_column(full_word_transcript)

# Show unique values in 'Speaker' column after cleaning and standardizing
print("Unique Speaker values:", full_word_transcript['Speaker'].unique())

# 7)  This will replace *only whole numbers* in each transcript entry
full_word_transcript['Word'] = full_word_transcript['Word'].astype(str).apply(
    lambda text: re.sub(r'\b\d+\b', replace_numbers, text)
)

# Find rows with no words
no_word_mask = full_word_transcript['Word'].apply(has_no_words)
no_word_rows = full_word_transcript[no_word_mask]

# Within those, find which are only punctuation
punct_only_mask = no_word_rows['Word'].apply(is_only_punctuation)
punct_only_rows = no_word_rows[punct_only_mask]

# Output
print("Rows with NO words:")
print(no_word_rows[['Word']])

print("\n Among those, rows with ONLY punctuation:")
print(punct_only_rows[['Word']])

# 8) Merge each punctuation row (no word) with its previous row
for idx in punct_only_rows.index:
    if idx - 1 in full_word_transcript.index:
        prev_idx = idx - 1

        # Append punctuation to Transcript of previous row
        prev_transcript = str(full_word_transcript.at[prev_idx, 'Word']).strip()
        punct = str(full_word_transcript.at[idx, 'Word']).strip()

        # Ensure space before punctuation only if needed
        if punct and not punct[0].isalnum():
            merged_transcript = prev_transcript + punct
        else:
            merged_transcript = prev_transcript + ' ' + punct

        full_word_transcript.at[prev_idx, 'Word'] = merged_transcript

        # Update End_Timestamp of previous row to punctuation row's End_Timestamp
        full_word_transcript.at[prev_idx, 'End_Timestamp'] = full_word_transcript.at[idx, 'End_Timestamp']

# Drop the punctuation-only rows after merging
full_word_transcript = full_word_transcript.drop(index=punct_only_rows.index).reset_index(drop=True)

print(f"✅ Merged {len(punct_only_rows)} punctuation rows into previous rows (Transcript + End_Timestamp).")


# Check if still rows with no words
no_word_mask = full_word_transcript['Word'].apply(has_no_words)
no_word_rows = full_word_transcript[no_word_mask]
print("🟨 Rows with NO words:")
print(no_word_rows[['Word']])

Unique Speaker values: ['Speaker 2' 'Speaker 1' nan 'Speaker 3 ' 'Speaker 2 ' 'Speaker 3'
 'Speaker1' 'Speaker2']

⚠️ Rows with NaN or '3' in 'Speaker':
       Pair  Question    Speaker
2045      2         7        NaN
2046      2         7        NaN
2047      2         7        NaN
2048      2         7        NaN
2049      2         7        NaN
...     ...       ...        ...
25163    24         3  Speaker 3
25164    24         3  Speaker 3
25165    24         3  Speaker 3
25166    24         3  Speaker 3
25167    24         3  Speaker 3

[279 rows x 3 columns]
✅ Fixed speaker for Pair 2, Question 7 between 00:00:26:14 and 00:00:27:09 → Speaker 2
✅ Fixed speaker for Pair 2, Question 7 between 00:00:27:09 and 00:00:27:11 → Speaker 2
✅ Fixed speaker for Pair 2, Question 7 between 00:00:27:11 and 00:00:27:14 → Speaker 2
✅ Fixed speaker for Pair 2, Question 7 between 00:00:27:14 and 00:00:27:23 → Speaker 2
✅ Fixed speaker for Pair 2, Question 7 between 00:00:27:23 and 00:00:28:01 → Sp

In [10]:
# Define the output file path
output_path = output_dir + 'full_transcript.csv'

# Save the DataFrame to CSV
full_word_transcript.to_csv(output_path, index=False)

print(f"✅ CSV file successfully saved to: {output_path}")

✅ CSV file successfully saved to: /content/drive/My Drive/Closeness_Project_Materials/FeaturePipeline/Studies/Study2/full_transcript.csv


In [11]:
# Segmenting full transcript to observer segments

segmented_chunks = []

for idx, row in observer_segments.iterrows():
    pair = row['Pair']
    question = row['Question']
    start_ts = row['Start_Timestamp']
    end_ts = row['End_Timestamp']
    start_word = str(row['Start_Word']).strip()
    end_word = str(row['End_Word']).strip()
    start_speaker = str(row['Start_Speaker']).strip()
    end_speaker = str(row['End_Speaker']).strip()

    # Subset entire transcript for that pair and question
    sub = full_word_transcript[
        (full_word_transcript['Pair'] == pair) &
        (full_word_transcript['Question'] == question)
    ].reset_index(drop=True)

    # Match start row: Start_Timestamp + Start Word + Start Speaker
    start_idx = None
    for i in range(len(sub)):
        r = sub.iloc[i]
        if r['Start_Timestamp'] >= start_ts and str(r['Word']).strip() == start_word and str(r['Speaker']).strip() == start_speaker:
            start_idx = i
            break

    # Match end row: End_Timestamp + End Word + End Speaker
    end_idx = None
    for i in range(len(sub) - 1, -1, -1):
        r = sub.iloc[i]
        if r['End_Timestamp'] <= end_ts and str(r['Word']).strip() == end_word and str(r['Speaker']).strip() == end_speaker:
            end_idx = i
            break

    if start_idx is not None and end_idx is not None and start_idx <= end_idx:
        chunk = sub.iloc[start_idx:end_idx + 1].copy()

         # Print observer and transcript match for confirmation
        print(f"\n✅ Match for Pair {pair}, Q{question}")
        print("Observer Segment:")
        print(f"  Start: [{start_ts}] Word='{start_word}' Speaker='{start_speaker}'")
        print(f"  End  : [{end_ts}] Word='{end_word}' Speaker='{end_speaker}'")
        print("Transcript Segment:")
        print(f"  Start: [{chunk.iloc[0]['Start_Timestamp']}] Word='{chunk.iloc[0]['Word']}' Speaker='{chunk.iloc[0]['Speaker']}'")
        print(f"  End  : [{chunk.iloc[-1]['End_Timestamp']}] Word='{chunk.iloc[-1]['Word']}' Speaker='{chunk.iloc[-1]['Speaker']}'")

        # Add observer metadata
        for col in ['Start_Seconds', 'End_Seconds', 'Speaker_Displayed', 'VideoID']:
            if col in row:
                chunk[col] = row[col]

        segmented_chunks.append(chunk)
    else:
        print(f"\n⚠️ Could not find matching start/end for Pair {pair} Q{question}")
        if start_idx is None:
            print(f"  → Start mismatch: Word='{start_word}', Speaker='{start_speaker}'")
        if end_idx is None:
            print(f"  → End mismatch  : Word='{end_word}', Speaker='{end_speaker}'")

# Combine all valid chunks
word_transcript = pd.concat(segmented_chunks, ignore_index=True)
print("✅ Final segmented word transcript shape:", word_transcript.shape)
word_transcript.head()


✅ Match for Pair 1, Q2
Observer Segment:
  Start: [00:00:00:04] Word='What' Speaker='Speaker 1'
  End  : [00:01:31:10] Word='big.' Speaker='Speaker 1'
Transcript Segment:
  Start: [00:00:00:04] Word='What' Speaker='Speaker 1'
  End  : [00:01:31:10] Word='big.' Speaker='Speaker 1'

✅ Match for Pair 1, Q1
Observer Segment:
  Start: [00:00:00:03] Word='In' Speaker='Speaker 2'
  End  : [00:01:17:31] Word='spend,' Speaker='Speaker 2'
Transcript Segment:
  Start: [00:00:00:03] Word='In' Speaker='Speaker 2'
  End  : [00:01:17:31] Word='spend,' Speaker='Speaker 2'

✅ Match for Pair 2, Q5
Observer Segment:
  Start: [00:00:00:17] Word='How' Speaker='Speaker 1'
  End  : [00:01:13:04] Word='Um,' Speaker='Speaker 1'
Transcript Segment:
  Start: [00:00:00:17] Word='How' Speaker='Speaker 1'
  End  : [00:01:13:04] Word='Um,' Speaker='Speaker 1'

✅ Match for Pair 2, Q7
Observer Segment:
  Start: [00:00:02:03] Word='Mm-hmm. What's' Speaker='Speaker 2'
  End  : [00:00:49:13 ] Word='you?' Speaker='Speake

Unnamed: 0,Word,Start_Timestamp,End_Timestamp,Speaker,Pair,Question,Start_Seconds,End_Seconds,Speaker_Displayed,VideoID
0,What,00:00:00:04,00:00:00:21,Speaker 1,1,2,0.04,0.21,1,3YxFNCQJXz0
1,in,00:00:00:21,00:00:00:28,Speaker 1,1,2,0.21,0.28,1,3YxFNCQJXz0
2,your,00:00:00:28,00:00:00:37,Speaker 1,1,2,0.28,0.37,1,3YxFNCQJXz0
3,life,00:00:00:37,00:00:00:49,Speaker 1,1,2,0.37,0.49,1,3YxFNCQJXz0
4,do,00:00:00:49,00:00:00:57,Speaker 1,1,2,0.49,0.57,1,3YxFNCQJXz0


In [12]:
# Show unique values in 'Speaker' column after observer video segmentation
print("Unique Speaker values:", word_transcript['Speaker'].unique())

Unique Speaker values: ['Speaker 1' 'Speaker 2']


In [13]:
# Turn Segmentation:

# Add empty turn columns
word_transcript['turnID'] = None
word_transcript['turnSpeaker1'] = None
word_transcript['turnSpeaker2'] = None
word_transcript['turnIDSpeaker1'] = None
word_transcript['turnIDSpeaker2'] = None

# Loop through each unique VideoID
for vid in word_transcript['VideoID'].unique():
    # Subset data for this VideoID and sort by time if needed
    vid_subset = word_transcript[word_transcript['VideoID'] == vid].copy()
    vid_subset = vid_subset.sort_values(by='Start_Timestamp')

    turn_num = 0
    turn_numspkr1 = 0
    turn_numspkr2 = 0
    previous_speaker = None

    for idx, row in vid_subset.iterrows():
        current_speaker = row['Speaker']

        # If speaker changes from the previous row, increment turn
        if current_speaker != previous_speaker:
            turn_num += 1
            previous_speaker = current_speaker
            if current_speaker == 'Speaker 1':
              turn_numspkr1 += 1

            elif current_speaker == 'Speaker 2':
              turn_numspkr2 += 1


        # Assign turn number
        word_transcript.loc[idx, 'turnID'] = turn_num

        if current_speaker == 'Speaker 1':
            word_transcript.loc[idx, 'turnSpeaker1'] = 1
            word_transcript.loc[idx, 'turnSpeaker2'] = 0
            word_transcript.loc[idx, 'turnIDSpeaker1'] = turn_numspkr1

        elif current_speaker == 'Speaker 2':
            word_transcript.loc[idx, 'turnSpeaker1'] = 0
            word_transcript.loc[idx, 'turnSpeaker2'] = 1
            word_transcript.loc[idx, 'turnIDSpeaker2'] = turn_numspkr2

# Preview result
print(word_transcript)


               Word Start_Timestamp End_Timestamp    Speaker  Pair  Question  \
0              What     00:00:00:04   00:00:00:21  Speaker 1     1         2   
1                in     00:00:00:21   00:00:00:28  Speaker 1     1         2   
2              your     00:00:00:28   00:00:00:37  Speaker 1     1         2   
3              life     00:00:00:37   00:00:00:49  Speaker 1     1         2   
4                do     00:00:00:49   00:00:00:57  Speaker 1     1         2   
...             ...             ...           ...        ...   ...       ...   
16977  one hundred%     00:03:16:55   00:03:17:32  Speaker 1    33         6   
16978            on     00:03:17:33   00:03:17:41  Speaker 1    33         6   
16979     that one.     00:03:17:41   00:03:17:50  Speaker 1    33         6   
16980         Yeah.     00:03:19:24   00:03:20:04  Speaker 2    33         6   
16981       Number.     00:03:19:24   00:03:20:04  Speaker 1    33         6   

       Start_Seconds  End_Seconds  Spea

In [14]:
word_transcript

Unnamed: 0,Word,Start_Timestamp,End_Timestamp,Speaker,Pair,Question,Start_Seconds,End_Seconds,Speaker_Displayed,VideoID,turnID,turnSpeaker1,turnSpeaker2,turnIDSpeaker1,turnIDSpeaker2
0,What,00:00:00:04,00:00:00:21,Speaker 1,1,2,0.04,0.21,1,3YxFNCQJXz0,1,1,0,1,
1,in,00:00:00:21,00:00:00:28,Speaker 1,1,2,0.21,0.28,1,3YxFNCQJXz0,1,1,0,1,
2,your,00:00:00:28,00:00:00:37,Speaker 1,1,2,0.28,0.37,1,3YxFNCQJXz0,1,1,0,1,
3,life,00:00:00:37,00:00:00:49,Speaker 1,1,2,0.37,0.49,1,3YxFNCQJXz0,1,1,0,1,
4,do,00:00:00:49,00:00:00:57,Speaker 1,1,2,0.49,0.57,1,3YxFNCQJXz0,1,1,0,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16977,one hundred%,00:03:16:55,00:03:17:32,Speaker 1,33,6,196.55,197.32,2,NYbcSl1rX0A,41,1,0,21,
16978,on,00:03:17:33,00:03:17:41,Speaker 1,33,6,197.33,197.41,2,NYbcSl1rX0A,41,1,0,21,
16979,that one.,00:03:17:41,00:03:17:50,Speaker 1,33,6,197.41,197.50,2,NYbcSl1rX0A,41,1,0,21,
16980,Yeah.,00:03:19:24,00:03:20:04,Speaker 2,33,6,199.24,200.04,2,NYbcSl1rX0A,42,0,1,,21


In [15]:
# Display video information
# Group by VideoID
for vid in word_transcript['VideoID'].unique():
    vid_data = word_transcript[word_transcript['VideoID'] == vid].copy()
    total_duration = round(vid_data['End_Seconds'].max() - vid_data['Start_Seconds'].min(), 2)
    min_turn = vid_data['turnID'].min()
    total_turns = vid_data['turnID'].nunique()

    print(f"\n🎥 VideoID: {vid}")
    print(f"   📼 Video duration: {total_duration} seconds")
    print(f"   ➤ Starts at turnID = {int(min_turn)} {'✅' if min_turn == 1 else '❌'}")
    print(f"   ➤ Total Turns: {total_turns}")

    for speaker in vid_data['Speaker'].unique():
        speaker_data = vid_data[vid_data['Speaker'] == speaker]
        word_count = len(speaker_data)
        speaker_turns = speaker_data['turnID'].nunique()

        # Compute duration per unique turn, then sum
        turn_durations = (
            speaker_data.groupby('turnID')
            .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
        )
        speaker_duration = round(turn_durations.sum(), 2)

        print(f"      • {speaker}: {speaker_turns} turns | {word_count} words | {speaker_duration} seconds")



🎥 VideoID: 3YxFNCQJXz0
   📼 Video duration: 91.06 seconds
   ➤ Starts at turnID = 1 ✅
   ➤ Total Turns: 25
      • Speaker 1: 13 turns | 59 words | 20.96 seconds
      • Speaker 2: 12 turns | 195 words | 66.32 seconds

🎥 VideoID: 4SOtva6huN8
   📼 Video duration: 77.28 seconds
   ➤ Starts at turnID = 1 ✅
   ➤ Total Turns: 5
      • Speaker 2: 3 turns | 32 words | 9.31 seconds
      • Speaker 1: 2 turns | 169 words | 67.02 seconds

🎥 VideoID: ZIUxkzM48sg
   📼 Video duration: 72.87 seconds
   ➤ Starts at turnID = 1 ✅
   ➤ Total Turns: 11
      • Speaker 1: 6 turns | 15 words | 4.33 seconds
      • Speaker 2: 5 turns | 159 words | 63.49 seconds

🎥 VideoID: Wq3NHuI7SvE
   📼 Video duration: 47.1 seconds
   ➤ Starts at turnID = 1 ✅
   ➤ Total Turns: 10
      • Speaker 2: 5 turns | 23 words | 5.31 seconds
      • Speaker 1: 5 turns | 130 words | 39.27 seconds

🎥 VideoID: oufrU7mdSGw
   📼 Video duration: 176.97 seconds
   ➤ Starts at turnID = 1 ✅
   ➤ Total Turns: 95
      • Speaker 2: 48 turn

  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x


🎥 VideoID: yaJcfRNzzzs
   📼 Video duration: 64.25 seconds
   ➤ Starts at turnID = 1 ✅
   ➤ Total Turns: 12
      • Speaker 2: 6 turns | 173 words | 59.2 seconds
      • Speaker 1: 6 turns | 9 words | 3.82 seconds

🎥 VideoID: qYUbcQokhVo
   📼 Video duration: 245.16 seconds
   ➤ Starts at turnID = 1 ✅
   ➤ Total Turns: 67
      • Speaker 2: 34 turns | 592 words | 162.99 seconds
      • Speaker 1: 33 turns | 276 words | 78.14 seconds

🎥 VideoID: XP1g9LcplB8
   📼 Video duration: 107.95 seconds
   ➤ Starts at turnID = 1 ✅
   ➤ Total Turns: 30
      • Speaker 1: 15 turns | 310 words | 94.99 seconds
      • Speaker 2: 15 turns | 32 words | 12.36 seconds

🎥 VideoID: yVo1bFCQfv0
   📼 Video duration: 181.0 seconds
   ➤ Starts at turnID = 1 ✅
   ➤ Total Turns: 38
      • Speaker 1: 19 turns | 453 words | 114.08 seconds
      • Speaker 2: 19 turns | 183 words | 61.81 seconds

🎥 VideoID: _D77faGnzxw
   📼 Video duration: 55.0 seconds
   ➤ Starts at turnID = 1 ✅
   ➤ Total Turns: 10
      • Speaker 

  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x


🎥 VideoID: 8eexk7ZQ7YY
   📼 Video duration: 244.84 seconds
   ➤ Starts at turnID = 1 ✅
   ➤ Total Turns: 106
      • Speaker 2: 53 turns | 207 words | 52.37 seconds
      • Speaker 1: 53 turns | 715 words | 189.2 seconds

🎥 VideoID: s-djtoC8SjQ
   📼 Video duration: 183.27 seconds
   ➤ Starts at turnID = 1 ✅
   ➤ Total Turns: 63
      • Speaker 1: 32 turns | 75 words | 21.06 seconds
      • Speaker 2: 31 turns | 620 words | 158.57 seconds

🎥 VideoID: mVRMa5MAqGo
   📼 Video duration: 255.4 seconds
   ➤ Starts at turnID = 1 ✅
   ➤ Total Turns: 110
      • Speaker 1: 55 turns | 598 words | 196.91 seconds
      • Speaker 2: 55 turns | 180 words | 57.24 seconds

🎥 VideoID: kLdBvkC8JEc
   📼 Video duration: 70.0 seconds
   ➤ Starts at turnID = 1 ✅
   ➤ Total Turns: 7
      • Speaker 2: 4 turns | 13 words | 4.54 seconds
      • Speaker 1: 3 turns | 140 words | 63.05 seconds

🎥 VideoID: CErmhd_IiM8
   📼 Video duration: 134.08 seconds
   ➤ Starts at turnID = 1 ✅
   ➤ Total Turns: 28
      • Spea

  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x

      • Speaker 2: 52 turns | 174 words | 53.12 seconds
      • Speaker 1: 51 turns | 549 words | 179.27 seconds

🎥 VideoID: ALToDlcVvsE
   📼 Video duration: 157.0 seconds
   ➤ Starts at turnID = 1 ✅
   ➤ Total Turns: 76
      • Speaker 2: 38 turns | 434 words | 143.66 seconds
      • Speaker 1: 38 turns | 48 words | 15.71 seconds

🎥 VideoID: nddHI5aPQvw
   📼 Video duration: 68.06 seconds
   ➤ Starts at turnID = 1 ✅
   ➤ Total Turns: 13
      • Speaker 1: 7 turns | 178 words | 67.81 seconds
      • Speaker 2: 6 turns | 6 words | 1.97 seconds

🎥 VideoID: 15dG8_uXmFg
   📼 Video duration: 76.97 seconds
   ➤ Starts at turnID = 1 ✅
   ➤ Total Turns: 27
      • Speaker 2: 14 turns | 205 words | 67.39 seconds
      • Speaker 1: 13 turns | 38 words | 10.26 seconds

🎥 VideoID: GmGJrUAj21k
   📼 Video duration: 99.1 seconds
   ➤ Starts at turnID = 1 ✅
   ➤ Total Turns: 45
      • Speaker 2: 23 turns | 115 words | 32.52 seconds
      • Speaker 1: 22 turns | 218 words | 56.41 seconds

🎥 VideoID: yC

  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x['End_Seconds'].max() - x['Start_Seconds'].min())
  .apply(lambda x: x

In [16]:
# 2 Second Segmentation:

# Initialize segment ID columns
word_transcript['2segIDSpeaker1'] = None
word_transcript['2segIDSpeaker2'] = None

# Loop over each VideoID
for vid in word_transcript['VideoID'].unique():
    vid_data = word_transcript[word_transcript['VideoID'] == vid]

    for turn in vid_data['turnID'].dropna().unique():
        turn_data = vid_data[vid_data['turnID'] == turn]
        speaker = turn_data['Speaker'].iloc[0]
        start_time = turn_data['Start_Seconds'].min()
        end_time = turn_data['End_Seconds'].max()
        duration = end_time - start_time

        if duration < 2:
            # Not enough to split — assign one segment
            segment_mask = (
                (word_transcript['VideoID'] == vid) &
                (word_transcript['turnID'] == turn)
            )
            seg_col = '2segIDSpeaker1' if speaker == "Speaker 1" else '2segIDSpeaker2'
            word_transcript.loc[segment_mask, seg_col] = f"T{int(turn)}_SEG1"
        else:
            # Split into 2-second chunks
            num_segments = int(duration // 2)
            segment_edges = [start_time + i * 2 for i in range(num_segments)]
            segment_edges.append(end_time)  # final segment may be < 2 sec

            for i in range(len(segment_edges) - 1):
                seg_start = segment_edges[i]
                seg_end = segment_edges[i + 1]
                #seg_mask = (
                    #(word_transcript['VideoID'] == vid) &
                    #(word_transcript['turnID'] == turn) &
                    #(word_transcript['Start_Seconds'] >= seg_start) &
                    #(word_transcript['End_Seconds'] <= seg_end)
                #)

                seg_mask = (
                    (word_transcript['VideoID'] == vid) &
                    (word_transcript['turnID'] == turn) &
                    (word_transcript['End_Seconds'] > seg_start) &
                    (word_transcript['Start_Seconds'] < seg_end)
                )
                seg_col = '2segIDSpeaker1' if speaker == "Speaker 1" else '2segIDSpeaker2'
                seg_label = f"T{int(turn)}_SEG{i+1}"
                word_transcript.loc[seg_mask, seg_col] = seg_label


In [17]:
word_transcript

Unnamed: 0,Word,Start_Timestamp,End_Timestamp,Speaker,Pair,Question,Start_Seconds,End_Seconds,Speaker_Displayed,VideoID,turnID,turnSpeaker1,turnSpeaker2,turnIDSpeaker1,turnIDSpeaker2,2segIDSpeaker1,2segIDSpeaker2
0,What,00:00:00:04,00:00:00:21,Speaker 1,1,2,0.04,0.21,1,3YxFNCQJXz0,1,1,0,1,,T1_SEG1,
1,in,00:00:00:21,00:00:00:28,Speaker 1,1,2,0.21,0.28,1,3YxFNCQJXz0,1,1,0,1,,T1_SEG1,
2,your,00:00:00:28,00:00:00:37,Speaker 1,1,2,0.28,0.37,1,3YxFNCQJXz0,1,1,0,1,,T1_SEG1,
3,life,00:00:00:37,00:00:00:49,Speaker 1,1,2,0.37,0.49,1,3YxFNCQJXz0,1,1,0,1,,T1_SEG1,
4,do,00:00:00:49,00:00:00:57,Speaker 1,1,2,0.49,0.57,1,3YxFNCQJXz0,1,1,0,1,,T1_SEG1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16977,one hundred%,00:03:16:55,00:03:17:32,Speaker 1,33,6,196.55,197.32,2,NYbcSl1rX0A,41,1,0,21,,T41_SEG1,
16978,on,00:03:17:33,00:03:17:41,Speaker 1,33,6,197.33,197.41,2,NYbcSl1rX0A,41,1,0,21,,T41_SEG1,
16979,that one.,00:03:17:41,00:03:17:50,Speaker 1,33,6,197.41,197.50,2,NYbcSl1rX0A,41,1,0,21,,T41_SEG1,
16980,Yeah.,00:03:19:24,00:03:20:04,Speaker 2,33,6,199.24,200.04,2,NYbcSl1rX0A,42,0,1,,21,,T42_SEG1


In [18]:
# 5 Second Segmentation:

# Initialize segment ID columns
word_transcript['5segIDSpeaker1'] = None
word_transcript['5segIDSpeaker2'] = None

# Loop over each VideoID
for vid in word_transcript['VideoID'].unique():
    vid_data = word_transcript[word_transcript['VideoID'] == vid]

    for turn in vid_data['turnID'].dropna().unique():
        turn_data = vid_data[vid_data['turnID'] == turn]
        speaker = turn_data['Speaker'].iloc[0]
        start_time = turn_data['Start_Seconds'].min()
        end_time = turn_data['End_Seconds'].max()
        duration = end_time - start_time

        if duration < 5:
            # Not enough to split — assign one segment
            segment_mask = (
                (word_transcript['VideoID'] == vid) &
                (word_transcript['turnID'] == turn)
            )
            seg_col = '5segIDSpeaker1' if speaker == "Speaker 1" else '5segIDSpeaker2'
            word_transcript.loc[segment_mask, seg_col] = f"T{int(turn)}_SEG1"
        else:
            # Split into 5-second chunks
            num_segments = int(duration // 5)
            segment_edges = [start_time + i * 5 for i in range(num_segments)]
            segment_edges.append(end_time)  # final segment may be < 5 sec

            for i in range(len(segment_edges) - 1):
                seg_start = segment_edges[i]
                seg_end = segment_edges[i + 1]

                seg_mask = (
                    (word_transcript['VideoID'] == vid) &
                    (word_transcript['turnID'] == turn) &
                    (word_transcript['End_Seconds'] > seg_start) &
                    (word_transcript['Start_Seconds'] < seg_end)
                )
                seg_col = '5segIDSpeaker1' if speaker == "Speaker 1" else '5segIDSpeaker2'
                seg_label = f"T{int(turn)}_SEG{i+1}"
                word_transcript.loc[seg_mask, seg_col] = seg_label


In [19]:
# 1) Rename columns for consistency
word_transcript = word_transcript.rename(columns={
    'Word': 'Transcript'
})

In [20]:
# Define the output file path
output_path = output_dir + 'transcript.csv'

# Save the DataFrame to CSV
word_transcript.to_csv(output_path, index=False)

print(f"✅ CSV file successfully saved to: {output_path}")

✅ CSV file successfully saved to: /content/drive/My Drive/Closeness_Project_Materials/FeaturePipeline/Studies/Study2/transcript.csv
