# Import libararies and load data

In [20]:
### ALL NECESSARY LIBRARIES ###
import pandas as pd
import json
import warnings
warnings.simplefilter(action='ignore')

# for saving variables
import pickle
import os
import tiktoken

# needed for gpt
from openai import AzureOpenAI
from tenacity import retry, wait_random_exponential, stop_after_attempt

%run "./utilityFunctions.ipynb"

In [315]:
# import data
transcripts = pd.read_csv('EXTENDED_TRANSCRIPT_COMBINED.csv')

with open('e_daic_phq8_scores_lang.pkl', 'rb') as f:
    phq8_scores = pickle.load(f)

# rename column Participant_ID to ParticipantID
phq8_scores.rename(columns={'Participant_ID': 'ParticipantID'}, inplace=True)


In [317]:
merged_data = pd.merge(transcripts, phq8_scores, on='ParticipantID')
merged_data.head()
merged_data.to_csv('PHQ8_Transcript_Combined.csv', index=False)

In [334]:
new_merged_data = pd.merge(summary_df, phq8_scores, on='ParticipantID')


In [326]:
# Current column names in the DataFrame
current_columns = [
    "PHQ_Binary", "PHQ_Score", "PCL-C (PTSD)", "PTSD Severity", 
    "PHQ_8NoInterest", "PHQ_8Depressed", "PHQ_8Sleep", "PHQ_8Tired", 
    "PHQ_8Appetite", "PHQ_8Failure", "PHQ_8Concentrating", "PHQ_8Moving", 
    "PHQ_8Total", "PHQ8_Binary", "PHQ8_Score"
]

# Desired new column names
new_columns = [
    "PHQ8_Binary", "PHQ8_Score", "PHQ8_PTSD", "PHQ8_PTSD_Severity",
    "PHQ8_NoInterest", "PHQ8_Depressed", "PHQ8_Sleep", "PHQ8_Tired",
    "PHQ8_Appetite", "PHQ8_Failure", "PHQ8_Concentrating", "PHQ8_Moving",
    "PHQ8_Total", "PHQ8_Binary", "PHQ8_Score"
]

# Dictionary to map current column names to new column names
rename_dict = dict(zip(current_columns, new_columns))

# Rename the columns
new_merged_data.rename(columns=rename_dict, inplace=True)
new_merged_data.to_csv('PHQ8_synopsisAndSentiment_Combined.csv', index=False)

In [162]:
#Get current directory
current_directory = os.getcwd()
print("Current Working Directory:", current_directory)


Current Working Directory: /Users/Chenjunyu/Desktop/Work/Computational Vision and Learning Lab/Depression Detection


In [None]:
# For usage of counting tokens of input

# Initialize the tokenizer
encoding = tiktoken.get_encoding("cl100k_base")
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

# Your provided text
text = prompt

# Tokenize the text
tokens = encoding.encode(text)

# Print the number of tokens
print(len(tokens))

# Synopsis and sentiment section

In [164]:
prompt_template = """
As a professional psychologist, analyze the following transcript of a therapy session. Your objective is to generate two specific outputs:

1. A synopsis that succinctly captures the key concerns and topics discussed by the patient, providing insightful and reflective observations.
2. A detailed sentiment analysis of the patient's responses, identifying and elaborating on the specific emotions expressed.

Please format your output as a compact JSON object on a single line. The JSON should include two properties: 'Synopsis' and 'Sentiment', each containing the respective analyses. Avoid any extra spaces or line breaks within the JSON.

Example: "Synopsis":"Synopsis here", "Sentiment":"Sentiment here"

Transcript:
{Transcript}
"""





In [167]:
# Pricing details
PROMPT_COST_PER_1000_TOKENS = 0.005
COMPLETION_COST_PER_1000_TOKENS = 0.015

# Initialize the AzureOpenAI client with the specified endpoint and API key
client = AzureOpenAI(
    azure_endpoint="https://gptshuhaotest.openai.azure.com/",  # your Azure endpoint
    api_key="d0a5a4feefc34f71b685fc394d033b2c",  # your API key
    api_version="2024-02-01"
)

MODEL = "gpt-4o"  # change model here


# keep track of conversation
def add_message(role, content):
    conversation_history.append({"role": role, "content": content})
    
# clear conversation  
def clear_conversation():
    global conversation_history
    conversation_history = [
        {"role": "system", "content": "You are a professional psychologist who is very compassionate and empathetic"}
    ]


# @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(10))
def get_response(prompt):
    # Add the user's message to the conversation history.
    add_message("user", prompt)
    
    # Create a chat completion request to the OpenAI API.
    response = client.chat.completions.create(
        model=MODEL,     
        # The conversation history, including system, user, and assistant messages.
        messages=conversation_history, 
        temperature=0.5,  # Controls the randomness of the output. 
        max_tokens=2000,  # The maximum number of tokens to generate in the response.
        top_p=1,  # Controls the diversity via nucleus sampling; 1 means use all tokens.
        frequency_penalty=2,  # Penalizes new tokens based on their frequency in the text so far.
        presence_penalty=2,  # Penalizes new tokens based on whether they appear in the text so far.
        n=1,  # The number of completions to generate. Here, we are generating only one.
        response_format={ "type": "json_object" },
    )

    # Add the assistant's message to the conversation history.
    assistant_message = response.choices[0].message.content
    add_message("assistant", assistant_message)
    
    
    # Save usage and compute cost
    usage = response.usage
    computeCost(usage)

    # Return the assistant's message.
    return assistant_message


In [178]:
Summary = []

for index, row in merged_data.iterrows():    
    attempt_count = 0
    success = False
    while attempt_count < 10 and not success:
        attempt_count += 1
        # Format the prompt with the current row's conversation
        prompt = prompt_template.format(Transcript=row['Conversation'])
        try:
            # Get synthetic data response
            response = get_response(prompt)
            print(f"Participant ID {row['ParticipantID']}")
            print(f"Attempt {attempt_count}: {response}")  
            
            # Try to parse the response as JSON
            json_data = json.loads(response)
            json_data['ParticipantID'] = row['ParticipantID']  
            Summary.append(json_data)
            success = True
        
        except json.JSONDecodeError:
            # If JSON parsing fails, retry fetching the response
            print(f"Error: Response not loadable as JSON (Attempt {attempt_count}). Retrying...")
            clear_conversation()  
        
        except Exception as e:
            # Handle any other exceptions, including BadRequestError
            print(f"Error encountered: {e}. Skipping this prompt.")
            break  

    clear_conversation()

summary_df = pd.DataFrame(Summary)




In [172]:
# summary_df.to_csv('synopsisAndSentiment.csv', index=False)

In [173]:
summary_df.shape

(211, 3)

# Synthetic Data section

In [179]:
# Initialize conversation history e
conversation_history = [
    {"role": "system", "content": """
You are an intelligent job analyst assistant with extensive knowledge of various job sectors and divisions
"""

    }
]


In [180]:
# Define categories for each variable

categories = {
    "PHQ8_Binary": ["This patient has not been diagnosed with depression", "This patient has been diagnosed with depression"],
    "Gender": ["Gender is male", "Gender is female"],
    "PHQ8_NoInterest": [
        "Over the last 2 weeks, I was not at all bothered by having little interest or pleasure in doing things.",
        "Several days over the last 2 weeks, I was bothered by having little interest or pleasure in doing things.",
        "More than half the days over the last 2 weeks, I was bothered by having little interest or pleasure in doing things.",
        "Nearly every day over the last 2 weeks, I was bothered by having little interest or pleasure in doing things."
    ],
    "PHQ8_Depressed": [
        "Over the last 2 weeks, I was not at all bothered by feeling down, depressed, irritable, or hopeless.",
        "Several days over the last 2 weeks, I was bothered by feeling down, depressed, irritable, or hopeless.",
        "More than half the days over the last 2 weeks, I was bothered by feeling down, depressed, irritable, or hopeless.",
        "Nearly every day over the last 2 weeks, I was bothered by feeling down, depressed, irritable, or hopeless."
    ],
    "PHQ8_Sleep": [
        "Over the last 2 weeks, I was not at all bothered by having trouble falling or staying asleep, or sleeping too much.",
        "Several days over the last 2 weeks, I was bothered by having trouble falling or staying asleep, or sleeping too much.",
        "More than half the days over the last 2 weeks, I was bothered by having trouble falling or staying asleep, or sleeping too much.",
        "Nearly every day over the last 2 weeks, I was bothered by having trouble falling or staying asleep, or sleeping too much."
    ],
    "PHQ8_Tired": [
        "Over the last 2 weeks, I was not at all bothered by feeling tired or having little energy.",
        "Several days over the last 2 weeks, I was bothered by feeling tired or having little energy.",
        "More than half the days over the last 2 weeks, I was bothered by feeling tired or having little energy.",
        "Nearly every day over the last 2 weeks, I was bothered by feeling tired or having little energy."
    ],
    "PHQ8_Appetite": [
        "Over the last 2 weeks, I was not at all bothered by having poor appetite or overeating.",
        "Several days over the last 2 weeks, I was bothered by having poor appetite or overeating.",
        "More than half the days over the last 2 weeks, I was bothered by having poor appetite or overeating.",
        "Nearly every day over the last 2 weeks, I was bothered by having poor appetite or overeating."
    ],
    "PHQ8_Failure": [
        "Over the last 2 weeks, I was not at all bothered by feeling bad about myself - or that I am a failure or have let myself or my family down.",
        "Several days over the last 2 weeks, I was bothered by feeling bad about myself - or that I am a failure or have let myself or my family down.",
        "More than half the days over the last 2 weeks, I was bothered by feeling bad about myself - or that I am a failure or have let myself or my family down.",
        "Nearly every day over the last 2 weeks, I was bothered by feeling bad about myself - or that I am a failure or have let myself or my family down."
    ],
    "PHQ8_Concentrating": [
        "Over the last 2 weeks, I was not at all bothered by having trouble concentrating on things, such as reading the newspaper or watching television.",
        "Several days over the last 2 weeks, I was bothered by having trouble concentrating on things, such as reading the newspaper or watching television.",
        "More than half the days over the last 2 weeks, I was bothered by having trouble concentrating on things, such as reading the newspaper or watching television.",
        "Nearly every day over the last 2 weeks, I was bothered by having trouble concentrating on things, such as reading the newspaper or watching television."
    ],
    "PHQ8_Moving": [
        "Over the last 2 weeks, I was not at all bothered by moving or speaking so slowly that other people could have noticed. Or the opposite - being so fidgety or restless that I have been moving around a lot more than usual.",
        "Several days over the last 2 weeks, I was bothered by moving or speaking so slowly that other people could have noticed. Or the opposite - being so fidgety or restless that I have been moving around a lot more than usual.",
        "More than half the days over the last 2 weeks, I was bothered by moving or speaking so slowly that other people could have noticed. Or the opposite - being so fidgety or restless that I have been moving around a lot more than usual.",
        "Nearly every day over the last 2 weeks, I was bothered by moving or speaking so slowly that other people could have noticed. Or the opposite - being so fidgety or restless that I have been moving around a lot more than usual."
    ]
}


In [321]:
# Pricing details
PROMPT_COST_PER_1000_TOKENS = 0.005
COMPLETION_COST_PER_1000_TOKENS = 0.015

# Initialize the AzureOpenAI client with the specified endpoint and API key
client = AzureOpenAI(
    azure_endpoint="https://gptshuhaotest.openai.azure.com/",  # your Azure endpoint
    api_key="d0a5a4feefc34f71b685fc394d033b2c",  # your API key
    api_version="2024-02-01"
)

MODEL = "gpt-4o"  # change model here

# # Initialize conversation history with a system message
# conversation_history = [
#     {"role": "system", "content": "You are an expert on Animals"}
# ]

# keep track of conversation
def add_message(role, content):
    conversation_history.append({"role": role, "content": content})
    
# clear conversation  
def clear_conversation():
    global conversation_history
    conversation_history = [
        {"role": "system", "content": "You are an intelligent data generation assistant tasked with creating synthetic data for a patient questionnaire. You ensure that all words generated are properly spaced and punctuated"}
    ]

# @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(10))
def get_response(prompt):
    # Add the user's message to the conversation history.
    add_message("user", prompt)
    
    # Create a chat completion request to the OpenAI API.
    response = client.chat.completions.create(
        model=MODEL,     
        # The conversation history, including system, user, and assistant messages.
        messages=conversation_history, 
        temperature= 0,  # Controls the randomness of the output. 
        max_tokens=2000,  # The maximum number of tokens to generate in the response.
        top_p=0.1,  # Controls the diversity via nucleus sampling; 1 means use all tokens.
        frequency_penalty=2,  # Penalizes new tokens based on their frequency in the text so far.
        presence_penalty=2,  # Penalizes new tokens based on whether they appear in the text so far.
        n=1,  # The number of completions to generate. Here, we are generating only one.
        response_format={ "type": "json_object" },
    )

    # Add the assistant's message to the conversation history.
    assistant_message = response.choices[0].message.content
    add_message("assistant", assistant_message)
    
    # Save usage and compute cost
    usage = response.usage
    computeCost(usage)

    # Return the assistant's message.
    return assistant_message



## Single prompt

In [222]:
prompt_template = """
You are an intelligent data generation assistant tasked with creating synthetic data for a patient questionnaire. The provided input is a single row from a dataframe containing patient responses. Your goal is to generate a new synthetic row that mimics the sentiment and structure of the input data.

Here is the input information:
1. ParticipantID: {ParticipantID}
2. Synopsis: {Synopsis}
3. Sentiment: {Sentiment}
4. PHQ8_Binary: {PHQ8_Binary}
5. PHQ8_Score: {PHQ8_Score}
6. Gender: {Gender}
7. PHQ8_NoInterest: {PHQ8_NoInterest}
8. PHQ8_Depressed: {PHQ8_Depressed}
9. PHQ8_Sleep: {PHQ8_Sleep}
10. PHQ8_Tired: {PHQ8_Tired}
11. PHQ8_Appetite: {PHQ8_Appetite}
12. PHQ8_Failure: {PHQ8_Failure}
13. PHQ8_Concentrating: {PHQ8_Concentrating}
14. PHQ8_Moving: {PHQ8_Moving}

Instructions:
1. Imagine yourself as a patient who feels very similarly to the sentiment of the input data.
2. Use the same number of columns and ensure the data is consistent with the original schema.
3. Ensure the generated data for certain categories adheres to the following rule:

- PHQ8_Binary: one of ({PHQ8_Binary_Categories})
- PHQ8_Score: Total PHQ8 score is a number from 0-24
- Gender: one of({Gender_Categories})
- PHQ8_NoInterest: one of ({PHQ8_NoInterest_Categories})
- PHQ8_Depressed: one of ({PHQ8_Depressed_Categories})
- PHQ8_Sleep: one of ({PHQ8_Sleep_Categories})
- PHQ8_Tired: one of ({PHQ8_Tired_Categories})
- PHQ8_Appetite: one of ({PHQ8_Appetite_Categories})
- PHQ8_Failure: one of ({PHQ8_Failure_Categories})
- PHQ8_Concentrating: one of ({PHQ8_Concentrating_Categories})
- PHQ8_Moving: one of ({PHQ8_Moving_Categories})

Output the synthetic row in a compact JSON in a single-line without whitespaces for the 11 properties listed above:

1. As an example for PHQ8_Binary you should have: "PHQ8_Binary": "synthetic value"

Important: 
1. Ensure the generated data follows the same structural format and sentiment as the input row provided.
2. Don’t output hallucinations or garbage values.
"""


In [None]:
# This section uses a single prompt to return an entire row of synthetic data, have issues of hallucination and printing garbage values
synthetic_data = []

for index, row in merged_data.iterrows():
    # Get synthetic data response
    prompt = prompt_template.format(
    ParticipantID=merged_data['ParticipantID'][index],
    Conversation=merged_data['Conversation'][index],
    PHQ8_Binary=merged_data['PHQ8_Binary'][index],
    PHQ8_Score=merged_data['PHQ8_Score'][index],
    Gender=merged_data['Gender'][index],
    PHQ8_NoInterest=merged_data['PHQ8_NoInterest'][index],
    PHQ8_Depressed=merged_data['PHQ8_Depressed'][index],
    PHQ8_Sleep=merged_data['PHQ8_Sleep'][index],
    PHQ8_Tired=merged_data['PHQ8_Tired'][index],
    PHQ8_Appetite=merged_data['PHQ8_Appetite'][index],
    PHQ8_Failure=merged_data['PHQ8_Failure'][index],
    PHQ8_Concentrating=merged_data['PHQ8_Concentrating'][index],
    PHQ8_Moving=merged_data['PHQ8_Moving'][index],
    PHQ8_Binary_Categories=", ".join(categories["PHQ8_Binary"]),
    Gender_Categories=", ".join(categories["Gender"]),
    PHQ8_NoInterest_Categories=", ".join(categories["PHQ8_NoInterest"]),
    PHQ8_Depressed_Categories=", ".join(categories["PHQ8_Depressed"]),
    PHQ8_Sleep_Categories=", ".join(categories["PHQ8_Sleep"]),
    PHQ8_Tired_Categories=", ".join(categories["PHQ8_Tired"]),
    PHQ8_Appetite_Categories=", ".join(categories["PHQ8_Appetite"]),
    PHQ8_Failure_Categories=", ".join(categories["PHQ8_Failure"]),
    PHQ8_Concentrating_Categories=", ".join(categories["PHQ8_Concentrating"]),
    PHQ8_Moving_Categories=", ".join(categories["PHQ8_Moving"])
)
    response = get_response(prompt)
    
    clear_conversation()
    print(response)
    # if index == :
    break
    


## Multipe Prompts with prompt chaining

In [268]:
# Define the prompts for different parts of the questionnaire
prompt_templates = [
    """
    You are an intelligent data generation assistant tasked with creating synthetic data for a patient questionnaire. The provided input is a single row from a dataframe containing patient synopsis and sentiment. Your goal is to generate a new synthetic row that mimics the original structure of the input data.

    Here is the input information:
    - ParticipantID: {ParticipantID}
    - Synopsis: {Synopsis}
    - Sentiment: {Sentiment}

    Instructions:
    1. Imagine yourself as a patient who feels very similarly to the sentiment of the input data.
    2. Use the same number of columns and ensure the data is consistent with the original schema.
    3. Ensure the generated data for is grammatical and readable. 

    Output the synthetic row in a compact JSON in a single-line without whitespaces for the 2 properties listed above:

    1. As an example for ParticipantID you should have: "ParticipantID": "synthetic value"

    Important: 
    1. Ensure the generated data follows the same structural format and sentiment as the input row provided.
    2. Don’t output hallucinations or garbage values.
    """,

    """
    You are the same intelligent data generation assistant tasked with creating synthetic data for a patient questionnaire, based on the conversation you have mimicked earlier, continue to do the following:

    Here is the input information:
    - PHQ8_Binary: {PHQ8_Binary}
    - PHQ8_Score: {PHQ8_Score}
    - Gender: {Gender}

    Instructions:
    1. Imagine yourself as a patient who feels very similarly to the sentiment of the input data.
    2. Use the same number of columns and ensure the data is consistent with the original schema.
    3. Ensure the generated data for the variables adheres to the following rule:
    - PHQ8_Binary: one of ({PHQ8_Binary_Categories})
    - PHQ8_Score: Total PHQ8 score is a number from 0-24
    - Gender: one of ({Gender_Categories})

    Output the synthetic row in a compact JSON in a single-line without whitespaces for the 3 properties listed above:

    1. As an example for PHQ8_Binary you should have: "PHQ8_Binary": "synthetic value"

    Important: 
    1. Ensure the generated data follows the same structural format and sentiment as the input row provided.
    """,

    """
    You are still the intelligent data generation assistant tasked with creating synthetic data for a patient questionnaire, continues to do the task given the previous context:
    Here is the input information:
    - PHQ8_NoInterest: {PHQ8_NoInterest}
    - PHQ8_Depressed: {PHQ8_Depressed}
    - PHQ8_Sleep: {PHQ8_Sleep}
    - PHQ8_Tired: {PHQ8_Tired}

    Instructions:
    1. Imagine yourself as a patient who feels very similarly to the sentiment of the input data.
    2. Use the same number of columns and ensure the data is consistent with the original schema.
    3. Ensure the generated data for the categories adheres to the following rule:
    - PHQ8_NoInterest: one of ({PHQ8_NoInterest_Categories})
    - PHQ8_Depressed: one of ({PHQ8_Depressed_Categories})
    - PHQ8_Sleep: one of ({PHQ8_Sleep_Categories})
    - PHQ8_Tired: one of ({PHQ8_Tired_Categories})

    Output the synthetic row in a compact JSON in a single-line without whitespaces for the 4 properties listed above:

    1. As an example for PHQ8_NoInterest you should have: "PHQ8_NoInterest": "synthetic value"

    Important: 
    1. Ensure the generated data follows the same structural format and sentiment as the input row provided.
    """,

    """
    You are still the intelligent data generation assistant tasked with creating synthetic data for a patient questionnaire. Given the above context, continue to do the task:

    Here is the input information:
    - PHQ8_Appetite: {PHQ8_Appetite}
    - PHQ8_Failure: {PHQ8_Failure}
    - PHQ8_Concentrating: {PHQ8_Concentrating}
    - PHQ8_Moving: {PHQ8_Moving}

    Instructions:
    1. Imagine yourself as a patient who feels very similarly to the sentiment of the input data.
    2. Use the same number of columns and ensure the data is consistent with the original schema.
    3. Ensure the generated data for the categories adheres to the following rule:
    - PHQ8_Appetite: one of ({PHQ8_Appetite_Categories})
    - PHQ8_Failure: one of ({PHQ8_Failure_Categories})
    - PHQ8_Concentrating: one of ({PHQ8_Concentrating_Categories})
    - PHQ8_Moving: one of ({PHQ8_Moving_Categories})

    Output the synthetic row in a compact JSON in a single-line without whitespaces for the 4 properties listed above:

    1. As an example for PHQ8_Appetite you should have: "PHQ8_Appetite": "synthetic value"

    Important: 
    1. Ensure the generated data follows the same structural format and sentiment as the input row provided.
    """
]


In [242]:
synthetic_data = []
index=2
row = new_merged_data.loc[index]

for prompt_template in prompt_templates:
    prompt = prompt_template.format(
        ParticipantID=row['ParticipantID'],
        Sentiment=row['Sentiment'],
        Synopsis=row['Synopsis'],
        PHQ8_Binary=row['PHQ8_Binary'],
        PHQ8_Score=row['PHQ8_Score'],
        Gender=row['Gender'],
        PHQ8_NoInterest=row['PHQ8_NoInterest'],
        PHQ8_Depressed=row['PHQ8_Depressed'],
        PHQ8_Sleep=row['PHQ8_Sleep'],
        PHQ8_Tired=row['PHQ8_Tired'],
        PHQ8_Appetite=row['PHQ8_Appetite'],
        PHQ8_Failure=row['PHQ8_Failure'],
        PHQ8_Concentrating=row['PHQ8_Concentrating'],
        PHQ8_Moving=row['PHQ8_Moving'],
        PHQ8_Binary_Categories=", ".join(categories["PHQ8_Binary"]),
        Gender_Categories=", ".join(categories["Gender"]),
        PHQ8_NoInterest_Categories=", ".join(categories["PHQ8_NoInterest"]),
        PHQ8_Depressed_Categories=", ".join(categories["PHQ8_Depressed"]),
        PHQ8_Sleep_Categories=", ".join(categories["PHQ8_Sleep"]),
        PHQ8_Tired_Categories=", ".join(categories["PHQ8_Tired"]),
        PHQ8_Appetite_Categories=", ".join(categories["PHQ8_Appetite"]),
        PHQ8_Failure_Categories=", ".join(categories["PHQ8_Failure"]),
        PHQ8_Concentrating_Categories=", ".join(categories["PHQ8_Concentrating"]),
        PHQ8_Moving_Categories=", ".join(categories["PHQ8_Moving"])
    )
    response = get_response(prompt)
    print(f"response: {response}")  # More contextual printing
    # Parse the response as JSON
    json_data = json.loads(response)
    synthetic_data.append(json_data)
        
# Clear any final conversation contexts if necessary
clear_conversation()

# Load all synthetic data into a DataFrame
synthetic_data_df = pd.DataFrame(synthetic_data)
synthetic_data_df


Total cost for this conversation: $0.00446
response: {"ParticipantID": "303", "Synopsis": "The patient shares their personal history, touching on family relationships and a love for teaching young children. They convey feelings of sorrow due to the loss of their father three years ago and suggest experiencing depression since that event. The patient also talks about recent tensions with siblings, job uncertainty in New York, challenges in maintaining friendships and achieving life aspirations but finds happiness through hobbies like painting and spending time outdoors.", "Sentiment": "The patient's emotions range from nostalgic (fond memories of working with kids), grief-stricken (loss of father), frustrated (sibling conflicts; unstable employment situation), melancholic or possibly depressed ('not officially' diagnosed but feels down since father's death). There is some positivity when discussing supportive friends as well as enjoyment derived from creative activities."}
Total cost fo

Unnamed: 0,ParticipantID,Synopsis,Sentiment,PHQ8_Binary,PHQ8_Score,Gender,PHQ8_NoInterest,PHQ8_Depressed,PHQ8_Sleep,PHQ8_Tired,PHQ8_Appetite,PHQ8_Failure,PHQ8_Concentrating,PHQ8_Moving
0,303.0,"The patient shares their personal history, touching on family relationships and a love for teaching young children. They convey feelings of sorrow due to the loss of their father three years ago and suggest experiencing depression since that event. The patient also talks about recent tensions with siblings, job uncertainty in New York, challenges in maintaining friendships and achieving life aspirations but finds happiness through hobbies like painting and spending time outdoors.","The patient's emotions range from nostalgic (fond memories of working with kids), grief-stricken (loss of father), frustrated (sibling conflicts; unstable employment situation), melancholic or possibly depressed ('not officially' diagnosed but feels down since father's death). There is some positivity when discussing supportive friends as well as enjoyment derived from creative activities.",,,,,,,,,,,
1,,,,This patient has not been diagnosed with depression,Total PHQ8 score is 5.,Gender is male,,,,,,,,
2,,,,,,,"More than half the days over the last 2 weeks, I was bothered by having little interest or pleasure in doing things.","Several days over the last 2 weeks, I was bothered by feeling down, depressed, irritable, or hopeless.","Nearly every day over the last 2 weeks, I was bothered by having trouble falling or staying asleep, or sleeping too much.","Over the last 2 weeks,Iwasnotatallbotheredbyfeelingtiredorhavinglittleenergy.",,,,
3,,,,,,,,,,,"Several days over the last 2 weeks, I was bothered by having poor appetite or overeating.","Over the last 2 weeks, I was not at all bothered by feeling bad about myself - or that I am a failure or have let myself or my family down.","More than half the days over the last 2 weeks, I was bothered by having trouble concentrating on things, such as reading the newspaper or watching television.","Nearly every day overthelasttwoweeks,Iwasbotheredbymovingsoslowlythatotherpeoplecouldhavenoticed.Ortheopposite-beingsofidgetyorrestlessthatIhavebeenmovingaroundalotmorethanusual."



# Iterative looping

In [335]:
# Initialize the list to store the combined results for each row
combined_results = []

# Iterate over each row in the merged DataFrame
for index, row in new_merged_data.iterrows():
    if index==2:
        break
    # Initialize a dictionary to store the results of multiple prompts for the current row
    row_result = {}

    # Process each prompt template for the current row
    for prompt_template in prompt_templates:
        attempt_count = 0
        success = False
        response = None

        while attempt_count < 5 and not success:
            # Generate the prompt using the current row data
            prompt = prompt_template.format(
            ParticipantID=row['ParticipantID'],
            Sentiment=row['Sentiment'],
            Synopsis=row['Synopsis'],
            PHQ8_Binary=row['PHQ8_Binary'],
            PHQ8_Score=row['PHQ8_Score'],
            Gender=row['Gender'],
            PHQ8_NoInterest=row['PHQ8_NoInterest'],
            PHQ8_Depressed=row['PHQ8_Depressed'],
            PHQ8_Sleep=row['PHQ8_Sleep'],
            PHQ8_Tired=row['PHQ8_Tired'],
            PHQ8_Appetite=row['PHQ8_Appetite'],
            PHQ8_Failure=row['PHQ8_Failure'],
            PHQ8_Concentrating=row['PHQ8_Concentrating'],
            PHQ8_Moving=row['PHQ8_Moving'],
            PHQ8_Binary_Categories=", ".join(categories["PHQ8_Binary"]),
            Gender_Categories=", ".join(categories["Gender"]),
            PHQ8_NoInterest_Categories=", ".join(categories["PHQ8_NoInterest"]),
            PHQ8_Depressed_Categories=", ".join(categories["PHQ8_Depressed"]),
            PHQ8_Sleep_Categories=", ".join(categories["PHQ8_Sleep"]),
            PHQ8_Tired_Categories=", ".join(categories["PHQ8_Tired"]),
            PHQ8_Appetite_Categories=", ".join(categories["PHQ8_Appetite"]),
            PHQ8_Failure_Categories=", ".join(categories["PHQ8_Failure"]),
            PHQ8_Concentrating_Categories=", ".join(categories["PHQ8_Concentrating"]),
            PHQ8_Moving_Categories=", ".join(categories["PHQ8_Moving"])
        )
            
            # Fetch the response
            response = get_response(prompt)
            print(f"response: {response}")  # More contextual printing
            attempt_count += 1

            try:
                # Attempt to parse the JSON response
                json_data = json.loads(response)
                # Merge this prompt's response into the row result
                row_result.update(json_data)
                success = True
            except json.JSONDecodeError:
                # If JSON parsing fails, log and retry
                print(f"Failed to parse JSON from response for index {index} on attempt {attempt_count}. Retrying...")
                continue

        if not success:
            print(f"Unable to load JSON after 5 attempts for index {index}. Skipping this prompt.")
    clear_conversation()

    # After all prompts for a row, add the combined result for the row to the list
    combined_results.append(row_result)

# Convert the list of row results into a DataFrame
output_df = pd.DataFrame(combined_results)



In [None]:
output_df

## Separate Prompt

In [336]:
phq8_questions = [
                    "PHQ8_NoInterest:Over the last 2 weeks, how often have you been bothered by having little interest or pleasure in doing things?",
                    "PHQ8_Depressed:Over the last 2 weeks, how often have you been bothered by feeling down, depressed, or hopeless?",
                    "PHQ8_Sleep:Over the last 2 weeks, how often have you been bothered by having trouble falling or staying asleep, or sleeping too much?",
                    "PHQ8_Tired:Over the last 2 weeks, how often have you been bothered by feeling tired or having little energy?",
                    "PHQ8_Appetite:Over the last 2 weeks, how often have you been bothered by having poor appetite or overeating?",
                    "PHQ8_Failure:Over the last 2 weeks, how often have you been bothered by feeling bad about yourself - or that you are a failure or have let yourself or your family down?",
                    "PHQ8_Concentrating:Over the last 2 weeks, how often have you been bothered by having trouble concentrating on things, such as reading the newspaper or watching television?",
                    "PHQ8_Moving:Over the last 2 weeks, how often have you been bothered by moving or speaking so slowly that other people could have noticed. Or the opposite - being so fidgety or restless that you have been moving around a lot more than usual?"    
                ]

In [309]:
prompt_templates = [
    """
    As an intelligent data generation assistant, your task is to create synthetic data for a patient questionnaire. Below is a single row from a dataframe containing key patient details. Your objective is to generate a new synthetic row that mimics the original data structure and sentiment.

    Input details:
    - ParticipantID: {ParticipantID}
    - Synopsis: {Synopsis}
    - Sentiment: {Sentiment}

    Instructions:
    1. Imagine yourself as a patient with sentiments similar to the provided data.
    2. Maintain the original number of columns and adhere to the data schema.
    3. Generate data that is grammatically correct and coherent.

    Output the synthetic row in a compact JSON in a single-line without whitespaces for the 3 properties listed above:

    1. As an example for ParticipantID you should have: "ParticipantID": "synthetic value"

    Important: 
    - The generated data must maintain the same format and emotional tone as the provided input.
    - Avoid producing hallucinations or nonsensical values.
    """,
    """
    Based on the synthetic synopsis and sentiment you've created, put yourself in the shoes of the patient to generate the following synthetic data:

    Provided input:
    1. PHQ8_Binary: {PHQ8_Binary}
    2. PHQ8_Score: {PHQ8_Score}
    3. Gender: {Gender}
    4. PHQ8_NoInterest: {PHQ8_NoInterest}
    5. PHQ8_Depressed: {PHQ8_Depressed}
    6. PHQ8_Sleep: {PHQ8_Sleep}
    7. PHQ8_Tired: {PHQ8_Tired}
    8. PHQ8_Appetite: {PHQ8_Appetite}
    9. PHQ8_Failure: {PHQ8_Failure}
    10. PHQ8_Concentrating: {PHQ8_Concentrating}
    11. PHQ8_Moving: {PHQ8_Moving}

    Task:
    Generate synthetic responses for items 4 to 11, choosing from the following response options:
    - "Not at all"
    - "Several days"
    - "More than half the days"
    - "Nearly every day"

    Additionally, confirm these details:
    - PHQ8_Binary: Select one from ({PHQ8_Binary_Categories})
    - PHQ8_Score: Enter a total PHQ8 score between 0 and 24
    - Gender: Select one from ({Gender_Categories})

    Output the synthetic data as a compact JSON string in a single line without whitespaces, formatted as follows (where the key number refers to the corresponding question number above):
    Example: "4": "Not at all", "5": "Several days", etc.
    """
]


In [332]:
combined_results = []

for index, row in new_merged_data.iterrows():
    row_result = {}

    # Process each prompt template for the current row
    for prompt_template in prompt_templates:
        attempt_count = 0
        success = False
        response = None

        while attempt_count < 5 and not success:
            # Generate the prompt using the current row data
            prompt = prompt_template.format(
            ParticipantID=row['ParticipantID'],
            Sentiment=row['Sentiment'],
            Synopsis=row['Synopsis'],
            PHQ8_Binary=row['PHQ8_Binary'],
            PHQ8_Score=row['PHQ8_Score'],
            Gender=row['Gender'],
            PHQ8_NoInterest=row['PHQ8_NoInterest'],
            PHQ8_Depressed=row['PHQ8_Depressed'],
            PHQ8_Sleep=row['PHQ8_Sleep'],
            PHQ8_Tired=row['PHQ8_Tired'],
            PHQ8_Appetite=row['PHQ8_Appetite'],
            PHQ8_Failure=row['PHQ8_Failure'],
            PHQ8_Concentrating=row['PHQ8_Concentrating'],
            PHQ8_Moving=row['PHQ8_Moving'],
            PHQ8_Binary_Categories=", ".join(categories["PHQ8_Binary"]),
            Gender_Categories=", ".join(categories["Gender"]),
        )
            
            response = get_response(prompt)
            print(f"response: {response}")  # More contextual printing
            attempt_count += 1

            try:
                # Attempt to parse the JSON response
                json_data = json.loads(response)
                row_result.update(json_data)
                success = True
            except json.JSONDecodeError:
                # If JSON parsing fails, log and retry
                print(f"Failed to parse JSON from response for index {index} on attempt {attempt_count}. Retrying...")
                continue

        if not success:
            print(f"Unable to load JSON after 5 attempts for index {index}. Skipping this prompt.")
            
    clear_conversation()
    combined_results.append(row_result)

output_df = pd.DataFrame(combined_results)



In [264]:
{"PHQ8_Binary":"This patient has not been diagnosed with depression","PHQ8_Score":4,"Gender":"Gender is male",4:"Not at all",5:"Several days",6:"More than half the days",7:"Not at all",8:"Several days",9: " Not  At All" ,10 :" Several Days ",11 : " Nearly Every Day "}

{'corrected_sentence': 'Over the last 2 weeks, I was not at all bothered by feeling tired or having little energy.'}