# Transcript Data Processing
Combines every participant's transcript with the corresponding labels into a single file, `raw_compiled_transcripts.csv`.

# File paths

In [1]:
transcript_directory = "./Transcript" # directory containing all the transcript CSV files
label_filepath = "../data/Y_data.csv" # CSV file containing all the participants' labels

# Data Example

## X data

In [2]:
import os
import re

import pandas as pd

In [3]:
# Example transcript used in this section; change the filename to inspect another file
eg_transcript_filepath = f"{transcript_directory}/300_TRANSCRIPT.csv"
eg_transcript_filename = os.path.split(eg_transcript_filepath)[1]

# The transcript is read as tab-separated text despite its .csv extension
df_transcript = pd.read_csv(eg_transcript_filepath, sep="\t")
df_transcript.head()

Unnamed: 0,start_time,stop_time,speaker,value
0,36.588,39.668,Ellie,hi i'm ellie thanks for coming in today
1,39.888,43.378,Ellie,i was created to talk to people in a safe and ...
2,43.728,48.498,Ellie,think of me as a friend i don't judge i can't ...
3,49.188,52.388,Ellie,i'm here to learn about people and would love ...
4,52.658,58.958,Ellie,i'll ask a few questions to get us started and...


## Y data

In [4]:
# Load the label table from label_filepath and preview its first rows
df_labels = pd.read_csv(label_filepath)
df_labels.head()

Unnamed: 0,Participant_ID,PHQ_Binary,PHQ_Score
0,300,0,2
1,301,0,3
2,302,0,4
3,303,0,0
4,304,0,6


# Process Data
1. Drop the `start_time` and `stop_time` columns
2. Drop the rows spoken by Ellie (the interviewer), keeping only the participant's utterances
3. Concatenate the participant's utterances, separated by spaces, into a single string

In [5]:
def get_transcript(filepath):
    """Return the participant number and the participant's speech for one transcript file.

    The participant number is the run of digits at the start of the file name
    (e.g. ``300_TRANSCRIPT.csv`` -> ``300``). The file is read as tab-separated
    text and must provide ``speaker`` and ``value`` columns.

    Parameters
    ----------
    filepath : str
        Path to a transcript file.

    Returns
    -------
    tuple of (int, str)
        The participant number and every non-Ellie utterance joined by single
        spaces, in file order. The text is empty when no such rows exist.

    Raises
    ------
    ValueError
        If the file name does not start with a digit.
    """
    filename = os.path.basename(filepath)

    # Parse the whole leading digit run rather than a fixed 3-character slice,
    # so IDs that are not exactly three digits long are not truncated.
    id_match = re.match(r"\d+", filename)
    if id_match is None:
        raise ValueError(
            "transcript filename must start with the participant number: %r" % filename
        )
    participant_no = int(id_match.group())

    df_transcript = pd.read_csv(filepath, sep='\t')

    # Keep only what the participant said. A boolean mask (instead of dropping
    # the 'Ellie' index label) also works when a file has no Ellie rows at all.
    is_participant = df_transcript['speaker'] != 'Ellie'
    utterances = df_transcript.loc[is_participant, 'value']

    # str() mirrors the previous behaviour for non-string cells
    # (a missing value is rendered as the text 'nan').
    transcript_concat = ' '.join(str(utterance) for utterance in utterances)

    return participant_no, transcript_concat

In [6]:
# Sanity check on the example file: the call should return its participant number
# together with the concatenated transcript text

get_transcript(eg_transcript_filepath)

(300,
 "good atlanta georgia um my parents are from here um i love it i like the weather i like the opportunities um yes um it took a minute somewhat easy congestion that's it um i took up business and administration uh yeah i am here and there i'm on a break right now but i plan on going back in the uh next semester uh probably to open up my own business no um no specific reason i just don't travel a lot i'm pretty local once a year can you be a little bit more specific no answer i like reading books i enjoy i enjoy cooking um exercising is great i'm i'm i'm pretty good at it um yeah um probably about two weeks ago uh frustrated um i don't like bias um i don't like um when someone says they're gonna do something and they don't uh somewhat friendship i like to play sports i enjoy uh going out with friends and family playing games grandparents parents um yeah i mean they've always given me great advice they've always kept it real real close i would say going to college right after high 

# Compile all transcripts into a global dataframe

In [7]:
def combine_transcripts(directory, label_filepath):
    """Build one table with a row per transcript file found in ``directory``.

    Every file in ``directory`` whose name ends with ``.csv`` is passed to
    ``get_transcript``; the returned participant number is then looked up in
    the label file to fetch ``PHQ_Score`` and ``PHQ_Binary``.

    Parameters
    ----------
    directory : str
        Directory containing the transcript files.
    label_filepath : str
        CSV file with ``Participant_ID``, ``PHQ_Binary`` and ``PHQ_Score`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``Participant_ID``, ``Transcript``, ``PHQ_Score``, ``PHQ_Binary``;
        rows follow the sorted order of the transcript file names.

    Raises
    ------
    KeyError
        If a participant number has no entry in the label file.
    """
    column_names_final = ["Participant_ID", "Transcript", "PHQ_Score", "PHQ_Binary"]

    # os.listdir gives no ordering guarantee; sort for a reproducible row order
    csv_files = sorted(name for name in os.listdir(directory) if name.endswith('.csv'))
    if not csv_files:
        return pd.DataFrame(columns=column_names_final)

    # Read the labels once instead of once per transcript
    df_labels = pd.read_csv(label_filepath, index_col="Participant_ID")

    # Collect plain records and build the frame in one go: DataFrame.append
    # no longer exists in pandas 2.x, and row-wise appends copy the whole frame each time.
    rows = []
    for filename in csv_files:
        filepath = os.path.join(directory, filename)
        participant_no, transcript_concat = get_transcript(filepath)
        rows.append({
            "Participant_ID": participant_no,
            "Transcript": transcript_concat,
            "PHQ_Score": df_labels.loc[participant_no, "PHQ_Score"],
            "PHQ_Binary": df_labels.loc[participant_no, "PHQ_Binary"],
        })

    return pd.DataFrame(rows, columns=column_names_final)

In [8]:
# Compile every transcript, order the rows by participant and renumber the index from 0
df = (
    combine_transcripts(transcript_directory, label_filepath)
    .sort_values(by="Participant_ID")
    .reset_index(drop=True)
)

# index=False keeps the positional index out of the saved file
df.to_csv("../data/raw_compiled_transcripts.csv", index=False)

df.head()

Unnamed: 0,Participant_ID,Transcript,PHQ_Score,PHQ_Binary
0,300,good atlanta georgia um my parents are from he...,2,0
1,301,thank you mmm k i'm doing good thank you i'm f...,3,0
2,302,i'm fine how about yourself i'm from los ange...,4,0
3,303,okay how 'bout yourself here in california yea...,0,0
4,304,i'm doing good um from los angeles california ...,6,0
