In [2]:
from itertools import combinations
import csv
import numpy as np
import pandas as pd
np.random.seed(15)

In [3]:
# Load data
transcripts = pd.read_csv("transcripts_topic.tsv", sep="\t")
scores_train = pd.read_csv("train_split_Depression_AVEC2017.csv")
scores_dev = pd.read_csv("dev_split_Depression_AVEC2017.csv")
scores_test = pd.read_csv("test_split_Depression_AVEC2017.csv")
scores = pd.concat([scores_train, scores_dev, scores_test])
scores = scores.set_index("Participant_ID")
scores = scores["PHQ8_Binary"]

In [6]:
transcripts.head()

Unnamed: 0,start_time,stop_time,speaker,value,topic,topic_value,sub_topic,participant,Gender
0,173.236,174.446,Participant,what do you mean i'm sorry,4.0,do you consider yourself an introvert,0.0,303,0
1,756.786,757.876,Participant,oh wow,1.0,how easy is it for you to get a good night's s...,0.0,303,0
2,759.366,761.846,Participant,i have my days um,1.0,how easy is it for you to get a good night's s...,0.0,303,0
3,816.806,821.326,Participant,what am i like irritated tired um lazy,1.0,what are you like when you don't sleep well,1.0,303,0
4,822.486,823.416,Participant,you know,1.0,what are you like when you don't sleep well,1.0,303,0


In [7]:
scores.head()

Participant_ID
303    0.0
304    0.0
305    0.0
310    0.0
312    0.0
Name: PHQ8_Binary, dtype: float64

### Manual dictionary for topic/subtopic special tokens for word embeddings

In [9]:
topic_to_subtopic_to_category = {
    0 : {
        0 : "did_recently",
        1 : "enjoy_travelling",
        3 : "family_relationship",
        4 : "do_for_fun",
        5 : "best_friend",
        6 : "ideal_weekend"
    },
    
    1 : {
        0 : "easy_sleep",
        1 : "sleep_badly"
    },
    
    2 : {
        0 : "happy_last_time",
        1 : "behaviour_changes",
        2 : "disturbing_thoughts",
        3 : "feel_lately"
    },
    
    3 : {
        0 : "any_regret",
        1 : "feel_guilty",
        2 : "most_proud",
    },
    
    4 : {
        0 : "introvert",
        1 : "shy_outgoing",
    },
    
    5 : {
        0 : "ptsd_diagnosed",
        1 : "depression_diagnosed",
        2 : "therapy_useful",
        
    },
    
    6 : {
        2 : "easy_parent"
    }
}

### Create pariticipant ID to text dictionary

In [10]:
# TODO can add preprocessing steps here
def preprocess(text):
    return text

In [14]:
participant_to_text = {}

prev_topic = ""
prev_subtopic = ""
prev_participant = -1

for idx, row in transcripts.iterrows():
    participant = row.participant
    topic = int(row.topic)
    subtopic = int(row.sub_topic)
    text = row.value
    
    if participant not in participant_to_text:
        # Create blank entry for new participant
        participant_to_text[participant] = ["", []]
    
    if participant == prev_participant and topic == prev_topic and subtopic == prev_subtopic:
        # If previous participant and topic+subtopic, don't pad special tokens
        proc_text = preprocess(text) + " "
        participant_to_text[participant][0] += proc_text
        participant_to_text[participant][1][-1] += proc_text
    else:
        # If different topic+subtopic, pad special token infront of text before appending to full text and topic-wise text
        proc_text = topic_to_subtopic_to_category[topic][subtopic] + " " + preprocess(text) + " "
        participant_to_text[participant][0] += proc_text
        participant_to_text[participant][1].append(proc_text)
        
    prev_participant = participant
    prev_topic = topic
    prev_subtopic = subtopic

In [15]:
# Example output
print("Full text (all topics): ")
print(participant_to_text[303][0])
print("\nTopic-wise text: ")
print(participant_to_text[303][1])

Full text (all topics): 
introvert what do you mean i'm sorry easy_sleep oh wow i have my days um sleep_badly what am i like irritated tired um lazy you know depression_diagnosed no best_friend i don't really have a best friend but a person that i deal with and i used to work with um she would probably tell you that i'm very um outgoing a go getter dependable responsible happy_last_time well i try to stay happy i'd rather be happy than sad my kids keep me going you know what i mean 

Topic-wise text: 
["introvert what do you mean i'm sorry ", 'easy_sleep oh wow i have my days um ', 'sleep_badly what am i like irritated tired um lazy you know ', 'depression_diagnosed no ', "best_friend i don't really have a best friend but a person that i deal with and i used to work with um she would probably tell you that i'm very um outgoing a go getter dependable responsible ", "happy_last_time well i try to stay happy i'd rather be happy than sad my kids keep me going you know what i mean "]


### Create Training and Test data
- Currently, use dev set data as Test data and training set data as Training data
- **IMPORTANT**: For training data, we are doing data augmentation as follows:
    - 

In [17]:
# Augmentation parameters 
# Here, 0-> non-depressed class, 1-> depressed class
# We treat the two classes differently (we do more augmentation for depressed class)
min_len = {0: 10, 1: 5}  # Minimum length of a transcript above which we can do augmentation 
aug_count = {0: 3, 1: 8} # Number of augmented transcripts to be created

data_train = {"Text": [], "Targets": []}
data_test = {"Text": [], "Targets": []}

for participant in participant_to_text:
    # Training data
    if participant in scores_train["Participant_ID"].values:
        # Add un-augmented transcript
        data_train["Text"].append(participant_to_text[participant][0])
        data_train["Targets"].append(scores[participant])
        
        # Data augmentation step (only for those transcripts which are longer than min_len)
        if len(participant_to_text[participant][1]) > min_len[scores[participant]]:
            # Generate aug_count integers, each of which is the length of the new transcript 
            # (each entry in t_len is in range min_len to size of current transcript)
            t_lens = np.random.randint(low=min_len[scores[participant]], 
                                       high=len(participant_to_text[participant][1]), 
                                       size=aug_count[scores[participant]])
            for t_len in t_lens:
                # Generate list of all combinations of topic texts of t_len
                combs = list(combinations(participant_to_text[participant][1], t_len))
                # Select a random combination
                t_comb = list(combs[np.random.randint(len(combs))])
                # Shuffle the topic texts in selected combination
                np.random.shuffle(t_comb)
                # Add augmented transcript
                data_train["Text"].append(" ".join(t_comb))
                data_train["Targets"].append(scores[participant])

    # Testing data
    elif participant in scores_dev["Participant_ID"].values:
        data_test["Text"].append(participant_to_text[participant][0])
        data_test["Targets"].append(scores[participant])

In [18]:
# Check class balance in training data
np.unique(data_train["Targets"], return_counts=True)

(array([0., 1.]), array([272, 262]))

In [19]:
data_train["Text"][:10]

["introvert what do you mean i'm sorry easy_sleep oh wow i have my days um sleep_badly what am i like irritated tired um lazy you know depression_diagnosed no best_friend i don't really have a best friend but a person that i deal with and i used to work with um she would probably tell you that i'm very um outgoing a go getter dependable responsible happy_last_time well i try to stay happy i'd rather be happy than sad my kids keep me going you know what i mean ",
 "family_relationship very close even though i don't live with them i try to see them as much as i can introvert mm yes  enjoy_travelling um trying new things seeing new views of the world um trying the different type of foods um seeing how the government is and how they run the things out there i guess easy_sleep it's pretty good eh somewhat sleep_badly i'm tired <laughter> and i kind of fall asleep during class and whatnot depression_diagnosed no best_friend very friendly and funny talkative happy_last_time um last weekend i 

### Save training and test data

In [70]:
pd.DataFrame(data_train).to_csv("data_train.csv", index=False)
pd.DataFrame(data_test).to_csv("data_test.csv", index=False)

---

### Data analysis

### How many words are there in training and testing data?

In [72]:
train_text = " ".join(data_train["Text"])
train_text = np.unique(train_text.split(" "))
test_text = " ".join(data_test["Text"])
test_text = np.unique(test_text.split(" "))
len(train_text), len(test_text)

(1720, 823)

### How many words in testing data are not found in training data?

In [73]:
x = []
for w in test_text:
    if w not in train_text:
        x.append(w)
print(len(x))

219
