In [17]:
import numpy as np
import pandas as pd
import os

# Do not truncate long cell contents when a DataFrame is displayed
pd.options.display.max_colwidth = None

# Load a single cleaned transcript as a sample; the file is tab-delimited
df = pd.read_csv("./DAIC_cleaned/414_TRANSCRIPT_cleaned.csv", delimiter='\t')

df


Unnamed: 0,start_time,stop_time,speaker,value
0,59.638,61.908,Ellie,so how are you doing today
1,63.490,64.510,Participant,i'm okay
2,66.521,67.841,Ellie,where are you from originally
3,68.540,70.220,Participant,burbank california
4,70.436,71.026,Ellie,really
...,...,...,...,...
83,877.360,879.860,Participant,go for a walk i don't know go to the beach *laughter*
84,878.333,880.633,Ellie,what are you most proud of in your life
85,883.490,946.390,Participant,"um that is a hard question, i mean i think i've, done a lot of great things in my life, and um, i have achieved a lot, but, i think one of the proudest things, proudest moments in my life is something really small and insignificant, i was doing this exercise boot camp and, the last person to like cross the finish line had to do like, thirty push ups and, it was some guy i didn't really know him at all and the trainer said does anyone wanna help him if someone helps him you have to do half each, and no one wanted to help him and i just decided to start doing push ups with him, and i just, it wasn't 'cause i wanted to get credit or because i thought it would make me look good i just did it, and no one really knows about that but it just i feel really good about that"
86,947.184,948.084,Ellie,that's great


In [26]:
def process_transcripts(folder_path, version):
    """
    Combine each transcript CSV in a folder into one conversation string per file.

    Every entry in ``folder_path`` whose name ends with ".csv" is read as a
    tab-separated table that must provide ``speaker`` and ``value`` columns.
    The part of the filename before the first underscore is used as the
    participant ID (kept as a string).

    Parameters:
    - folder_path (str): The path to the folder containing the transcript CSV files.
    - version (int): How the rows of a transcript are combined:
        1 -> every row, formatted as "<speaker>: <value>"
        2 -> every row, 'value' text only
        3 -> only rows whose speaker is exactly 'Participant', 'value' text only
      In every version the pieces are joined with '. ' and a final '.' is appended.

    Returns:
    - combined_data (DataFrame): One row per transcript file with the columns
      'ParticipantID' and 'Conversation', sorted by 'ParticipantID' (compared
      as strings). When the folder holds no ".csv" files an empty DataFrame
      with those two columns is returned.

    Raises:
    - ValueError: If version is not 1, 2, or 3.
    """
    # Validate once, before touching the file system, so a bad version is
    # reported even when the folder contains no transcripts.
    if version not in (1, 2, 3):
        raise ValueError("Invalid version specified. Choose 1, 2, or 3.")

    # One plain record per transcript; the DataFrame is built once at the end.
    records = []

    for filename in os.listdir(folder_path):
        if not filename.endswith(".csv"):
            continue

        df = pd.read_csv(os.path.join(folder_path, filename), sep='\t')

        # Text before the first underscore of the filename
        participant_id = filename.split('_')[0]

        if version == 1:
            # Pair the columns directly instead of using iterrows(), which
            # builds a Series for every row.
            utterances = [
                f"{speaker}: {value}"
                for speaker, value in zip(df['speaker'], df['value'])
            ]
        elif version == 2:
            utterances = df['value'].astype(str)
        else:
            # version == 3: keep only the participant's own utterances
            utterances = df.loc[df['speaker'] == 'Participant', 'value'].astype(str)

        records.append({
            'ParticipantID': participant_id,
            'Conversation': '. '.join(utterances) + '.',
        })

    # Passing the column names explicitly keeps the schema (and makes the
    # sort below valid) even when no transcript was found.
    combined_data = pd.DataFrame(records, columns=['ParticipantID', 'Conversation'])
    combined_data = combined_data.sort_values(by='ParticipantID', ascending=True)

    return combined_data



# Source directory of the cleaned transcripts and the combination mode
# to apply (valid choices: 1, 2, or 3)
folder_path = './DAIC_cleaned'
version = 1
transcript_combined = process_transcripts(folder_path=folder_path, version=version)

# Write the combined transcripts to disk, leaving out the DataFrame index
transcript_combined.to_csv('ellieAndParticipantTranscripts.csv', index=False)
