In [40]:
import os
import shutil
import pandas as pd
import numpy as np

This section reads two VtV transcript files and prepares them to be merged. When `method == 0`, it reads files ending in `_single.txt`: single-speaker transcriptions produced from videos that contain only one speaker. When `method != 0`, it reads files ending in `_single_X.txt` and `_single_Y.txt`: single-speaker transcriptions produced by splitting the speakers of a video that contains two speakers.

In [47]:
# Choose which pair of single-speaker transcripts to load:
#   method == 0 -> separately produced transcripts (*_single.txt)
#   otherwise   -> transcripts pre-split from one two-speaker video
#                  (*_single_X.txt / *_single_Y.txt)
method = 0
directory = "Transcripts/Study 2 (Fall2023)/VTV/"

if method == 0:
    file_start = "VtV36"
    suffix_a, suffix_b = "A_single.txt", "B_single.txt"
else:
    file_start = "VtV17"
    suffix_a, suffix_b = "A_single_X.txt", "B_single_Y.txt"

# Both transcripts are pipe-delimited and are read from the "single/" subfolder.
single_prefix = directory + "single/" + file_start
dfA = pd.read_csv(single_prefix + suffix_a, sep="|")
dfB = pd.read_csv(single_prefix + suffix_b, sep="|")

To merge, we select one timestamp in each transcription (two in total) that have been manually determined to mark the same moment in real time. We then calculate the difference between them and subtract it from every timestamp in the first transcription, which synchronizes the timestamps of the two files.

In [48]:
# One manually chosen sync point per transcript: the timestamp at which the
# same real-world moment appears in transcript A and in transcript B.
dfA_time, dfB_time = 123.28999999999999, 122.24000000000001

# How far A's timestamp is ahead of B's at the sync point.
difference = dfA_time - dfB_time

In [49]:
# Shift A's timestamps by the measured offset so they line up with B's.
# NOTE: dfA is modified in place, so re-running this cell without reloading
# the transcripts subtracts `difference` a second time.
for time_column in ("Start Time", "End Time"):
    dfA[time_column] = dfA[time_column] - difference

# Tag every row with its speaker so rows stay attributable once combined.
for frame, label in ((dfA, "Speaker 1"), (dfB, "Speaker 2")):
    frame["Speaker"] = label

We can then concatenate both transcriptions and sort the result by the start time of each line of speech. This produces a merged transcription that we save as a pipe-delimited text file.

In [50]:
# Stack both speakers' rows, then order them by when each line starts.
df_new = pd.concat([dfA, dfB]).sort_values(by=["Start Time"])

# Write the merged transcript pipe-delimited, without the DataFrame index.
dyad_path = directory + "dyad/" + file_start + "_dyad.txt"
df_new.to_csv(dyad_path, sep="|", index=False)

# Merge the two recordings into a single audio file

In [1]:
from pydub import AudioSegment
import os

def _pad_to_frame_count(segment, frame_count):
    """Return ``segment`` with trailing silence appended so it has exactly ``frame_count`` frames."""
    missing_frames = frame_count - int(segment.frame_count())
    if missing_frames <= 0:
        return segment
    # Build the silence from raw zero bytes in the segment's own format so the
    # result is exact to the frame; millisecond-based padding cannot guarantee that.
    silence = AudioSegment(
        data=b"\0" * (missing_frames * segment.frame_width),
        sample_width=segment.sample_width,
        frame_rate=segment.frame_rate,
        channels=segment.channels,
    )
    return segment + silence


def combine_dual_recordings(
    file_a,
    file_b,
    output_path="combined.wav",
    align_offset_ms=0,
    stereo=True,
    target_sample_rate=16000
):
    """
    Combines two recordings (e.g., from two speakers) into a single WAV file.

    Each input is loaded, resampled to ``target_sample_rate`` and down-mixed to
    mono. The optional offset is applied as leading silence, both tracks are
    padded with trailing silence to the same number of frames, and the result
    is written to ``output_path``. Progress messages are printed along the way.

    Parameters:
        file_a (str): Path to speaker A's audio file
        file_b (str): Path to speaker B's audio file
        output_path (str): Output WAV path for the combined audio
        align_offset_ms (int): Offset in milliseconds to delay speaker B
            (a negative value delays speaker A instead)
        stereo (bool): If True, saves as stereo (speaker A = left, B = right). If False, mixes into mono
        target_sample_rate (int): Output sample rate (Hz), default is 16000

    Returns:
        None. The combined audio is written to ``output_path``.
    """
    print(f"🔄 Loading audio files: {file_a} and {file_b}")
    a = AudioSegment.from_file(file_a).set_frame_rate(target_sample_rate).set_channels(1)
    b = AudioSegment.from_file(file_b).set_frame_rate(target_sample_rate).set_channels(1)

    # Apply the offset. The silence is created at the target rate so it is
    # concatenated as-is rather than being resampled from silent()'s default rate.
    if align_offset_ms > 0:
        print(f"⏱️ Delaying speaker B by {align_offset_ms} ms")
        b = AudioSegment.silent(duration=align_offset_ms, frame_rate=target_sample_rate) + b
    elif align_offset_ms < 0:
        print(f"⏱️ Delaying speaker A by {-align_offset_ms} ms")
        a = AudioSegment.silent(duration=-align_offset_ms, frame_rate=target_sample_rate) + a

    # Equalize lengths by padding to the same FRAME count. len() only has
    # millisecond resolution, so two tracks can report equal lengths while still
    # differing by a few frames, which from_mono_audiosegments does not accept.
    total_frames = max(int(a.frame_count()), int(b.frame_count()))
    a = _pad_to_frame_count(a, total_frames)
    b = _pad_to_frame_count(b, total_frames)

    # Combine (stereo option)
    if stereo:
        print("🔊 Combining as stereo (A = left, B = right)")
        combined = AudioSegment.from_mono_audiosegments(a, b)
    else:
        print("🔊 Combining as mono overlay")
        combined = a.overlay(b)

    # Export
    combined.export(output_path, format="wav")
    print(f"✅ Combined audio saved to: {output_path}")

In [10]:
# The two recordings to combine: A goes to the left channel, B to the right.
study_a, study_b = (
    "./Data/DECEPTION/ATA/NATA41 (SUSPECTED).m4a",
    "./Data/DECEPTION/ATA/NATA41-R (UNPINNED).mp4",
)

# align_offset_ms stays at 0 unless a delay between the recordings has been
# measured manually.
combine_dual_recordings(
    study_a,
    study_b,
    output_path="combined_3_s.wav",
    align_offset_ms=0,
    stereo=True,
)

🔄 Loading audio files: ./Data/DECEPTION/ATA/NATA41 (SUSPECTED).m4a and ./Data/DECEPTION/ATA/NATA41-R (UNPINNED).mp4
🔊 Combining as stereo (A = left, B = right)
✅ Combined audio saved to: combined_3_s.wav
