In [1]:
import os
import random
import re
import subprocess

import pandas as pd

In [2]:
# Per-stage generation budget. The stages are ordered transcript, synopsis,
# sentiment; the value at each position is passed as `max_new_tokens` when
# that stage is generated.
toks = [2000, 100, 200]

# Text that primes the assistant turn for each stage ({start} in the prompt
# template). The generation cell also splits the raw output on this marker to
# locate the generated item.
its = ["Transcript:['Ellie: ", "Synopsis:['The Participant", "Sentiment:['The Participant"]
# Stage names substituted for {item} in the prompt template.
items = ['transcript', 'synopsis', 'sentiment']
# Example outputs substituted for {form} in the prompt template.
forms = ["Transcript:['Ellie: how is life Participant: I dont know ok Ellie: i see, anything else ']",
         "Synopsis:['Synthetic synopsis here']",
         "Sentiment:['Detailed sentiment analysis here']"]

# Per-stage rule text substituted for {rule}. Only the transcript rule carries
# placeholders ({PHQ8_Score}, {dep}); the generation cell formats that one
# before inserting it into the template.
rules = ["""Transcript: The synthetic transcript should follow the adjusted depression score of {PHQ8_Score} where Participant now has {dep}.
       Given the input information, focus on making a proactive conversation between Ellie and Participant with coherent responses and a consistent storyline in the same style of the transcript.
       The conversation must have a good balance between Ellie's and Participant's dialogue.""",
        "Synopsis: The synthetic synopsis should succinctly capture the key concerns and topics discussed in the input transcript, providing insightful and reflective observations.",
        "Sentiment: Provide a detailed sentiment analysis of the synthetic synopsis, identifying and elaborating on the specific emotions expressed."
        ]
# Depression descriptions, looked up by `score // 5` in the generation cell
# (so index 0 covers scores 0-4, index 1 covers 5-9, and so on).
depression = [
    "good mental health and no or minimal depression",
    "mild but bearable depression",
    "moderate depression, anxiety, or slight stress",
    "moderately severe depression, anxiety, stress and concerning negative thoughts",
    "severe depression, anxiety, stress and alarming negative thoughts"
    ]
# Per-stage qualifier substituted for {add} in instruction 1 of the template.
adds = ["with a different story, background, and location from Participant's life events",
       "from the synthetic input transcript",
       "from the synthetic input transcript"]

# Prompt template filled once per stage with str.format(). Placeholders:
#   {item}, {add}, {rule}, {form}, {start} - per-stage entries from the lists above
#   {PHQ8_Score}, {dep}                    - target score and its depression description
#   {Transcript}                           - reference transcript (the real one for the
#                                            first stage, the synthetic one afterwards)
# The template ends with "Output {start}" so the model continues from the
# stage's opening marker.
# NOTE(review): the header/eot markers look like a Llama-3-style chat format -
# confirm against the model configured in the generate config.
# NOTE(review): the literal begins with a stray ">" and instruction 1 refers to
# the "input transcript above" although the transcript appears in the user turn
# below - confirm both are intentional before changing the string.
og_prompt = """>
    <|begin_of_text|><|start_header_id|>system<|end_header_id|>
    
    You are an intelligent data generation assistant tasked with creating a synthetic {item}.

    Instructions:
    1. Use the input transcript above only as a reference, but create a new {item} in fluent English {add} that matches a depression/PHQ8 score of {PHQ8_Score}.
    2. Follow this rule for the generated data:
       {rule} 
       Do not repeat any previous text. Do not cut to other prompts.
    
    Output the synthetic row in a compact JSON format on a single line without any whitespace, separators, special tokens, or special symbols. For example:
    {form}
    
    Don't copy the example or repeat yourself. Avoid any white space, line breaks, or incomprehensible text.<|eot_id|>
    
    <|start_header_id|>user<|end_header_id|>
    
    PHQ8 Score: {PHQ8_Score}
    Depression level: Participant has {dep}.
    Input Transcript: ['{Transcript}']
    <|eot_id|>
    
    <|start_header_id|>assistant<|end_header_id|>
    Output {start}"""

In [3]:
# Synthetic-data generation.
#
# For every training participant (minus an explicit exclusion list) this cell
# draws `sims` target PHQ8 scores from outside the participant's own 5-point
# score band, then runs a three-stage generation chain per target score:
# transcript -> synopsis -> sentiment. The synthetic transcript produced by the
# first stage replaces the real transcript as input to the later stages.
# A sample is kept only when all three stages yield parseable output.


def _run_generation(prompt, max_new_tokens, out_path):
    """Run `tune run generate` once, sending stdout and stderr to `out_path`.

    The prompt is passed as a single argv entry instead of being pasted into a
    shell command line, so quotes, `$` or backticks inside the transcript
    cannot break or alter the command. The exit status is not checked; the
    caller decides success by parsing the captured output.

    Raises:
        OSError: if the output file cannot be opened or `tune` cannot be
            started (for example, it is not on PATH).
    """
    out_dir = os.path.dirname(out_path)
    if out_dir:
        # Without the directory the capture file cannot be written and every
        # sample would be skipped.
        os.makedirs(out_dir, exist_ok=True)
    cmd = [
        "tune", "run", "generate",
        "--config", "daic_eval_chain_finetune.yml",
        f"prompt={prompt}",
        f"max_new_tokens={max_new_tokens}",
    ]
    with open(out_path, "wb") as log:
        subprocess.run(cmd, stdout=log, stderr=subprocess.STDOUT, check=False)


def _extract_generation(raw_text, start):
    """Pull the generated item out of the raw generation output.

    Args:
        raw_text: full text captured from the generation command.
        start: the stage's opening marker (an entry of `its`).

    Returns:
        The text after the last occurrence of `start`, cut at the first stop
        marker and at the first newline, closed with a single quote, and
        prefixed with the part of `start` that follows its "[".

    Raises:
        ValueError: if `start` never occurs in `raw_text`, or if the extracted
            text is blank.
    """
    pieces = raw_text.rstrip().split(start)
    if len(pieces) == 1:
        raise ValueError("Error in generating output!")
    text = pieces[-1].replace('"', "'").replace('`', "'")
    # Cut at the first special-token fragment or closing bracket.
    for stop in ["<|e", ">", "]", "}"]:
        text = text.split(stop)[0]
    # Reject output whose last four speaker turns carry no text at all.
    tail = re.split(r'Ellie:|Participant:', text.replace("'", ""))[-4:]
    if ''.join(tail).strip() == "":
        raise ValueError("Output drew a blank!")
    text = text.split("\n")[0]
    if not text:
        # A leading newline leaves nothing on the first line.
        raise ValueError("Output drew a blank!")
    if text[-1] != "'":
        text += "'"
    return start.split("[")[-1] + text


data = []
train = pd.read_csv("/data/DAIC_cleaned/train.csv")
sims = 3
pref = "\'PHQ8_Score\'"  # not used in this cell; kept in case another cell reads it
outtxt = "./DAIC_25tuned_instruct_finetune/output.txt"
# Local, seeded generator: score draws are reproducible across reruns without
# touching the global `random` state.
rng = random.Random(0)

for i, d in train.iterrows():
    pid, s = int(d["subject_id"]), int(d["PHQ8_Score"])
    if pid in [305, 364, 368, 380]:
        continue
    src = "/data/DAIC_Chat_Complete_Cleaned/%d_TRANSCRIPT.csv" % (pid,)
    trans = pd.read_csv(src).astype(str)
    # Flatten the dialogue into one "speaker: value speaker: value ..." string.
    trans["convo"] = trans["speaker"].str.cat(trans["value"], sep=": ")
    transcript = trans["convo"].str.cat(sep=' ')

    # Candidate targets are the scores 0..23 outside the participant's own
    # 5-point band.
    sgroup = s // 5
    own_band = set(range(sgroup * 5, (sgroup + 1) * 5))
    scores = rng.sample(sorted(set(range(24)) - own_band), sims)

    for score in scores:
        row = [pid, score]
        syntrans = ""
        dep = depression[score // 5]
        print(row)
        for it in range(len(its)):
            # Later stages read the synthetic transcript, not the real one.
            curtrans = transcript if syntrans == "" else syntrans
            tok = toks[it]
            item, rule, form = items[it], rules[it], forms[it]
            if it == 0:
                rule = rule.format(PHQ8_Score=score, dep=dep)
            start = its[it]
            pro = og_prompt.format(Transcript=curtrans, PHQ8_Score=score, add=adds[it],
                                   item=item, rule=rule, form=form, start=start, dep=dep)
            _run_generation(pro, tok, outtxt)
            try:
                with open(outtxt, 'r') as f:
                    fout = _extract_generation(f.read(), start)
            except Exception as exc:
                # Best effort: report why this sample failed and move on.
                # KeyboardInterrupt is deliberately not caught here.
                print("Generation of %s failed: %r" % (item, exc))
                break
            row.append(fout)
            if it == 0:
                syntrans = fout
        else:
            # Reached only when no stage broke out of the chain.
            print(i, row)
            data.append(row)
            continue
        print("Error occurred! Starting next one...")

[303, 10]
0 [303, 10, "'Ellie:  how can i help Participant: i just need someone to talk to Ellie: i'm here to listen Participant: thank you Ellie: what's been going on Participant: well everything i guess i'm just feeling a little overwhelmed lately Ellie: overwhelmed is a good word for it Participant: yeah,  just feeling like i'm stuck in a rut and i don't know how to get out of it Ellie:  stuck is a perfect way to put it Participant: yeah,  it's kinda like i'm just going through the motions of  life you know,  but not really enjoying it Ellie: i hear you Participant: yeah,  just feeling like i'm just trying to survive instead of really living Ellie: i understand Participant: yeah,  it's like everything is just a chore right now and Ellie: yeah Participant: i'm just trying to just make it through the day without losing my mind Ellie: i hear you Participant: yeah,  but i'm afraid it's gonna take a while for me to get out of this rut Participant: yeah,  gotta take my time Ellie: take yo

In [4]:
# Assemble the accepted samples into a table: one row per (participant,
# target score) pair, with the three generated texts as columns.
column_names = [
    "Participant_ID",
    "PHQ8_Score",
    "Transcript",
    "Synopsis",
    "Sentiment",
]
df = pd.DataFrame(data=data, columns=column_names)
df

Unnamed: 0,Participant_ID,PHQ8_Score,Transcript,Synopsis,Sentiment
0,303,10,'Ellie: how can i help Participant: i just ne...,'The Participant is struggling with feelings o...,'The Participant\'s sentiment expressed in thi...
1,303,16,'Ellie: hi how's it going Participant: it's ...,'The Participant is struggling with moderately...,'The Participant expresses a deep sense of neg...
2,303,23,'Ellie: hi Participant: hi Ellie: how's life ...,'The Participant appears to be drowning in a s...,'The Participant\'s emotional tone is overwhel...
3,304,0,'Ellie: you're doing really great looking bac...,"'The Participant's experiences, including grat...",'The Participant's language exudes a profound ...
4,304,3,'Ellie: Participant: Ellie: how's your broth...,'The Participant has a strong sense of indepen...,'The Participant presents a positive self-imag...
...,...,...,...,...,...
298,488,23,'Ellie: hi Participant: thanks Ellie: how are...,'The Participant's thoughts seem trapped in a ...,'The Participant\'s response is laced with a p...
299,488,19,'Ellie: how's life going Participant: it's t...,'The Participant's mental health has been stru...,'The Participant expresses a deeply rooted sen...
300,491,16,'Ellie: how's life? Participant: I don't know...,'The Participant's moderately severe depressio...,'The Participant is expressing a deep sense of...
301,491,10,'Ellie: hi Participant: hi Ellie: how's life ...,'The Participant struggles with depression and...,'The Participant shares a deep-seated sense of...


In [5]:
# Write the synthetic samples to disk, creating the target directory on first use.
dst = "DAIC_transcripts"
out_csv = f"{dst}/DAIC_transcript_finetune.csv"
os.makedirs(dst, exist_ok=True)
df.to_csv(out_csv, index=False)