In [2]:
import os
import shutil
import pandas as pd
import numpy as np

This reads the full transcription for a video file. The full transcription is the raw transcription produced by the transcription pipeline, and can be used to reproduce filtered transcriptions like the single and dyad files.

By default (`method = None`), the script automatically treats files whose names start with VtV (or VTV) as single speaker files, while treating VGC and FTF files as dual speaker files. Setting `method = 0` forces the script to treat the file as a dual speaker file; any other value forces it to be treated as a single speaker file.

In [34]:
# Folder for this study condition; the raw input is read from its "full/" subfolder.
directory = "Transcripts/Study 2 (Fall2023)/VTV/"
# Base name of the recording, without the "_full.txt" suffix.
file_name = "VTV39AB (CamSwitch)"
ext = "_full.txt"
# Filtering mode: 0 forces dyad (two-speaker) output, None decides from the
# file-name prefix, and any other value forces single-speaker output.
method = 0

# The full transcription is stored as a pipe-separated text file.
full_transcript_path = f"{directory}full/{file_name}{ext}"
df = pd.read_csv(full_transcript_path, sep="|")

In [35]:
# Columns carried over from the full transcription into every filtered output.
output_columns = ["Start Time", "End Time", "Speaker", "Transcription"]


def _save_transcript(frame, subfolder, suffix):
    """Write `frame` to <directory>/<subfolder>/<file_name><suffix>, pipe-separated, without the index."""
    out_dir = os.path.join(directory, subfolder)
    # Create the output folder on first use so to_csv does not fail when it is missing.
    os.makedirs(out_dir, exist_ok=True)
    frame.to_csv(os.path.join(out_dir, file_name + suffix), sep="|", index=False)


# Position of the row with the highest match_count, i.e. the line that best
# matches "8 minute conversation".
index_RA = df.index.get_loc(df['match_count'].idxmax())

# Cut off everything up to and including that line, but only when it lies in the
# first half of the recording (by start time) and has more than one match.
# Otherwise the whole transcription is kept.
marker_in_first_half = df.iloc[index_RA]["Start Time"] / df.iloc[len(df) - 1]["Start Time"] < 0.5
if marker_in_first_half and df['match_count'].max() > 1:
    filtered_df = df.iloc[index_RA + 1:][output_columns]
else:
    filtered_df = df[output_columns]

# We take only the 2 speakers that speak the most (by number of transcription rows).
value_counts = filtered_df['Speaker'].value_counts()
top_2_speakers = value_counts.nlargest(2).index.tolist()
if not top_2_speakers:
    raise ValueError(f"No speaker labels found in the transcription for {file_name!r}; nothing to export.")

# The most frequent speaker becomes "Speaker 1", the runner-up (if any) "Speaker 2".
replace_map = {speaker: f"Speaker {rank}" for rank, speaker in enumerate(top_2_speakers, start=1)}

# Drop every other speaker *before* relabelling: a minor speaker whose raw label
# already happens to be "Speaker 1"/"Speaker 2" must not be merged into the output.
# The explicit copy makes the column assignment below act on an independent frame
# instead of a slice of df (avoids SettingWithCopyWarning).
filtered_df = filtered_df[filtered_df['Speaker'].isin(top_2_speakers)].copy()
filtered_df['Speaker'] = filtered_df['Speaker'].map(replace_map)

# Now we split the transcription based on whether the file is a single speaker or dual speaker file.
# method == 0 forces the dyad case; method None decides from the file-name prefix.
has_single_speaker_prefix = file_name.startswith(('VtV', 'VTV'))
if method == 0 or (method is None and not has_single_speaker_prefix):
    # Dyad case: one combined file plus one file per speaker.
    _save_transcript(filtered_df, "dyad", '_dyad.txt')
    _save_transcript(filtered_df[filtered_df["Speaker"] == "Speaker 1"], "single", '_single_X.txt')
    _save_transcript(filtered_df[filtered_df["Speaker"] == "Speaker 2"], "single", '_single_Y.txt')
else:
    # Single speaker case: keep only the most frequent speaker.
    filtered_df = filtered_df[filtered_df["Speaker"] == "Speaker 1"]
    _save_transcript(filtered_df, "single", '_single.txt')