# Transcript Data Processing
Takes in the raw `.csv` file of each participant's transcript from their interview with Ellie and cleans it up.

# Import data

## Y data

In [1]:
import pandas as pd 
import os

# Load the PHQ labels, indexed by participant ID so a row can be fetched with .loc
df_labels = pd.read_csv(
    "../data/Y_Data.csv",
    index_col="Participant_ID",
)
df_labels.head()

Unnamed: 0_level_0,PHQ_Binary,PHQ_Score
Participant_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
300,0,2
301,0,3
302,0,4
303,0,0
304,0,6


## X data

In [2]:
# Insert the file path to the transcript here
filepath_transcript = "300_TRANSCRIPT.csv"
# Keep only the file name; its leading characters are parsed as the participant number later
filename_transcript = os.path.split(filepath_transcript)[1]

df_transcript = pd.read_csv(filepath_transcript)
df_transcript.head()

Unnamed: 0,start_time,stop_time,speaker,value
0,36.588,39.668,Ellie,hi i'm ellie thanks for coming in today
1,39.888,43.378,Ellie,i was created to talk to people in a safe and ...
2,43.728,48.498,Ellie,think of me as a friend i don't judge i can't ...
3,49.188,52.388,Ellie,i'm here to learn about people and would love ...
4,52.658,58.958,Ellie,i'll ask a few questions to get us started and...


In [3]:
# Sanity check: confirm this participant's labels can be looked up in df_labels

# The first three characters of the file name are the participant number
participant_no = int(filename_transcript[:3])

# Fetch the participant's label row once, then read both fields from it
participant_labels = df_labels.loc[participant_no]
PHQ_Binary = participant_labels["PHQ_Binary"]
PHQ_Score = participant_labels["PHQ_Score"]

print(PHQ_Binary, PHQ_Score)

0 2


# Process Data
1. Drop `start_time` and `stop_time` columns
2. Drop rows that contain prompts by Ellie
3. Concatenate the words spoken by the participant into a single chunk of text

In [4]:
# Timing information is not needed for the text, so drop both timestamp columns
df_dropped_columns = df_transcript.drop(columns=['start_time', 'stop_time'])
# Drop every row spoken by Ellie (the interviewer). errors='ignore' keeps this
# from raising KeyError on a transcript that contains no Ellie rows.
df_dropped_ellie = df_dropped_columns.set_index('speaker').drop(index=['Ellie'], errors='ignore')
# Restore a 0..n-1 integer index, with 'speaker' back as a regular column
df_cleaned = df_dropped_ellie.reset_index()

In [5]:
# Concatenate participant's transcript into a single string

# Rows with no text are read by pandas as NaN; skip them so the join does not
# fail on a non-string value, then separate the utterances with single spaces.
transcript_concat = ' '.join(df_cleaned['value'].dropna().astype(str))

print(transcript_concat)

good atlanta georgia um my parents are from here um i love it i like the weather i like the opportunities um yes um it took a minute somewhat easy congestion that's it um i took up business and administration uh yeah i am here and there i'm on a break right now but i plan on going back in the uh next semester uh probably to open up my own business no um no specific reason i just don't travel a lot i'm pretty local once a year can you be a little bit more specific no answer i like reading books i enjoy i enjoy cooking um exercising is great i'm i'm i'm pretty good at it um yeah um probably about two weeks ago uh frustrated um i don't like bias um i don't like um when someone says they're gonna do something and they don't uh somewhat friendship i like to play sports i enjoy uh going out with friends and family playing games grandparents parents um yeah i mean they've always given me great advice they've always kept it real real close i would say going to college right after high school w

# Reconstruct data into a global dataframe

In [6]:
column_names_final = ["Participant_ID", "Transcript", "PHQ_Score", "PHQ_Binary"]

new_row = {"Participant_ID": participant_no, "Transcript": transcript_concat, "PHQ_Score": PHQ_Score, "PHQ_Binary": PHQ_Binary}
# DataFrame.append was removed in pandas 2.0, so build the one-row frame
# directly; `columns` fixes the column order.
df_final = pd.DataFrame([new_row], columns=column_names_final)

df_final.head()

Unnamed: 0,Participant_ID,Transcript,PHQ_Score,PHQ_Binary
0,300,good atlanta georgia um my parents are from he...,2,0


# Compile all transcripts

In [10]:
import pandas as pd 
import os
import ntpath

In [11]:
def get_transcript(file):
    """Read one transcript file and return the participant's speech as one string.

    Args:
        file: Path to a tab-separated transcript file with at least the
            columns ``speaker`` and ``value``. The first three characters of
            the file name must be the participant number
            (e.g. ``300_TRANSCRIPT.csv``).

    Returns:
        A tuple ``(participant_no, transcript_concat)``: the participant
        number as an ``int`` and every utterance whose speaker is not
        ``'Ellie'``, joined in file order with single spaces. The text is an
        empty string when there are no such utterances.

    Raises:
        ValueError: If the first three characters of the file name are not
            an integer.
    """
    # ntpath.basename splits on both '/' and '\\' path separators
    filename = ntpath.basename(file)
    participant_no = int(filename[0:3])

    df_transcript = pd.read_csv(file, sep='\t')

    # Select by boolean mask rather than dropping the 'Ellie' label, so a
    # transcript without any Ellie rows does not raise KeyError.
    not_ellie = df_transcript['speaker'] != 'Ellie'
    # Rows with no text are read as NaN; skip them so the join only sees strings.
    utterances = df_transcript.loc[not_ellie, 'value'].dropna().astype(str)
    transcript_concat = ' '.join(utterances)

    return participant_no, transcript_concat

def combine_transcripts(directory, label_file):
    """Compile every transcript in a directory into one labelled dataframe.

    Args:
        directory: Directory whose ``.csv`` files are all treated as
            transcripts and passed to ``get_transcript``.
        label_file: Path to the labels CSV. It must have a
            ``Participant_ID`` column (used as the index) and the columns
            ``PHQ_Score`` and ``PHQ_Binary``.

    Returns:
        A dataframe with the columns ``Participant_ID``, ``Transcript``,
        ``PHQ_Score`` and ``PHQ_Binary``, one row per transcript file, in
        sorted file-name order.

    Raises:
        KeyError: If a transcript's participant number is not present in
            the label file.
    """
    column_names_final = ["Participant_ID", "Transcript", "PHQ_Score", "PHQ_Binary"]

    # The labels do not change between transcripts, so read them once up front
    df_labels = pd.read_csv(label_file, index_col="Participant_ID")

    # Sorted so the row order does not depend on os.listdir's arbitrary order
    csv_files = sorted(name for name in os.listdir(directory) if name.endswith('.csv'))

    rows = []
    for filename in csv_files:
        file = os.path.join(directory, filename)
        participant_no, transcript_concat = get_transcript(file)
        participant_labels = df_labels.loc[participant_no]
        rows.append({
            "Participant_ID": participant_no,
            "Transcript": transcript_concat,
            "PHQ_Score": participant_labels["PHQ_Score"],
            "PHQ_Binary": participant_labels["PHQ_Binary"],
        })

    # DataFrame.append was removed in pandas 2.0; building the frame once from
    # the collected rows also avoids copying it on every iteration.
    return pd.DataFrame(rows, columns=column_names_final)

In [None]:
# Directory containing all the transcript csv files
transcript_directory = "./transcripts"
# CSV holding the PHQ labels for each participant
label_file = "Y_data.csv"

# Build the combined table, then save it without the dataframe's row index
df = combine_transcripts(transcript_directory, label_file)
df.to_csv("compiled_transcripts.csv", index=False)

In [None]:
# Preview the first rows of the compiled dataframe
df.head()