In [4]:
### ALL NECESSARY LIBRARIES ###
import pandas as pd
import json
import warnings
import random
warnings.simplefilter(action='ignore')

# for saving variables
import pickle
import os
import tiktoken


# needed for gpt
from openai import AzureOpenAI
from tenacity import retry, wait_random_exponential, stop_after_attempt


%run "./utilityFunctions.ipynb"

In [19]:
# Load the per-participant synopsis/sentiment table that seeds the synthetic generation.
# The file was written with its index, which otherwise comes back as a spurious
# "Unnamed: 0" column; index_col=0 restores it as the index instead.
synopsisAndSentiment = pd.read_csv("synopsisAndSentiment.csv", index_col=0)
synopsisAndSentiment

Unnamed: 0,Synopsis,Sentiment,ParticipantID,PHQ8_Score
0,"The patient is originally from Atlanta, Georgi...",Positive emotions are expressed regarding love...,300,2
1,"The patient, originally from Los Angeles and c...",Patient exhibits mixed emotions: generally pos...,301,3
2,The patient discusses various aspects of their...,The patient's emotions range widely: nostalgia...,302,4
3,The patient discusses various aspects of their...,The patient's responses exhibit mixed emotions...,303,0
4,The patient is originally from Los Angeles and...,The patient's responses reveal mixed sentiment...,304,6
...,...,...,...,...
177,The patient discussed a variety of personal to...,The patient's responses indicate mixed emotion...,488,0
178,"The patient, originally from San Luis Obispo a...",The patient's responses convey a range of emot...,489,3
179,The patient discusses various aspects of their...,The overall sentiment expressed includes mixed...,490,2
180,The patient expresses feeling overwhelmed due ...,Emotions expressed include overwhelm (funeral ...,491,8


In [30]:
# Running chat transcript sent with every request. It starts with the system
# prompt only; the text is kept identical to the one clear_conversation()
# restores, so the first request and every request after a reset see the
# same system message (no stray leading/trailing newlines).
conversation_history = [
    {
        "role": "system",
        "content": "You are a professional psychologist who is very compassionate and empathetic",
    }
]

In [22]:
# Prompt template for one synthetic row. Filled in with str.format(), so the
# placeholders are {Synopsis}, {Sentiment} and {PHQ8_Score}; literal braces in
# the JSON example are escaped as {{ and }} so the rendered example is a valid
# JSON object (the original example omitted the enclosing braces).
syntheticSynopsisAndSentiment_prompt = """
You are an intelligent data generation assistant tasked with creating synthetic data for the synopsis and sentiment based on the input data.

Input Information:
1. Synopsis: {Synopsis}
2. Sentiment: {Sentiment}

Instructions:
1. Use the input synopsis and sentiment as a reference, but create a new synopsis with a different storyline, background, and locations that match a PHQ8 score of {PHQ8_Score}. Adjust the intensity of the sentiment to align with the PHQ8 score categories: 
   - 0-4: No or minimal depression
   - 5-9: Mild depression
   - 10-14: Moderate depression
   - 15-19: Moderately severe depression
   - 20-24: Severe depression
2. Maintain the same number of columns and ensure the data is consistent with the original schema.
3. Follow these rules for the generated data:
   - Synopsis: The synthetic synopsis should succinctly capture the key concerns and topics discussed by the patient, providing insightful and reflective observations.
   - Sentiment: Provide a detailed sentiment analysis of the synthetic synopsis, identifying and elaborating on the specific emotions expressed.

Output the synthetic row in a compact JSON format on a single line without whitespaces. For example:
{{"Synopsis":"Synthetic synopsis here","Sentiment":"Detailed sentiment analysis here","PHQ8_Score":{PHQ8_Score}}}
"""


In [23]:
# Pricing details (USD per 1000 tokens).
# NOTE(review): not referenced in this cell; presumably read by computeCost()
# from utilityFunctions.ipynb — confirm.
PROMPT_COST_PER_1000_TOKENS = 0.005
COMPLETION_COST_PER_1000_TOKENS = 0.015

# Initialize the AzureOpenAI client.
# The API key is read from the environment instead of being hard-coded in the
# notebook; set AZURE_OPENAI_API_KEY before running (a missing variable raises
# KeyError here rather than failing later on the first request).
# SECURITY: a key was previously committed in this cell and must be treated as
# leaked — rotate it in the Azure portal.
client = AzureOpenAI(
    azure_endpoint=os.environ.get(
        "AZURE_OPENAI_ENDPOINT", "https://gptshuhaotest.openai.azure.com/"
    ),  # your Azure endpoint
    api_key=os.environ["AZURE_OPENAI_API_KEY"],  # never paste the key here
    api_version="2024-02-01"
)

MODEL = "gpt-4o"  # change model here

# Record one chat turn in the shared transcript.
def add_message(role, content):
    """Append a ``{"role": ..., "content": ...}`` entry to ``conversation_history``."""
    entry = dict(role=role, content=content)
    conversation_history.append(entry)
    
# Start a fresh transcript.
def clear_conversation():
    """Rebind the global ``conversation_history`` to a new list holding only the system prompt."""
    global conversation_history
    system_turn = {
        "role": "system",
        "content": "You are a professional psychologist who is very compassionate and empathetic",
    }
    conversation_history = [system_turn]


# @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(10))
def get_response(prompt):
    """Send ``prompt`` as the next user turn and return the assistant's reply.

    The prompt is appended to the shared ``conversation_history``, the whole
    history is sent to the chat-completions endpoint in JSON mode, and the
    reply is appended to the history as the assistant turn.

    Args:
        prompt: Text of the user message.

    Returns:
        The ``content`` of the first (and only) completion choice.

    Raises:
        Any exception raised by the API client is propagated unchanged. In
        that case the user turn added by this call is removed again, so a
        failed request does not leave an unanswered message in the history
        (which would be re-sent, duplicated, by any retry).
    """
    # Add the user's message to the conversation history.
    add_message("user", prompt)

    try:
        # Create a chat completion request to the OpenAI API.
        response = client.chat.completions.create(
            model=MODEL,
            # The conversation history, including system, user, and assistant messages.
            messages=conversation_history,
            temperature=1,  # Controls the randomness of the output.
            max_tokens=2000,  # The maximum number of tokens to generate in the response.
            top_p=1,  # Controls the diversity via nucleus sampling; 1 means use all tokens.
            frequency_penalty=2,  # Penalizes new tokens based on their frequency in the text so far.
            presence_penalty=2,  # Penalizes new tokens based on whether they appear in the text so far.
            n=1,  # The number of completions to generate. Here, we are generating only one.
            response_format={"type": "json_object"},
        )
    except Exception:
        # Roll back the user turn appended above so the history stays balanced.
        if conversation_history and conversation_history[-1].get("role") == "user":
            conversation_history.pop()
        raise

    # Add the assistant's message to the conversation history.
    assistant_message = response.choices[0].message.content
    add_message("assistant", assistant_message)

    # Save usage and compute cost.
    # NOTE(review): computeCost is not defined in this notebook; presumably it
    # comes from utilityFunctions.ipynb — confirm.
    usage = response.usage
    computeCost(usage)

    # Return the assistant's message.
    return assistant_message

In [25]:
# Generation settings.
PROMPTS_PER_ROW = 3          # synthetic rows requested per source participant
MAX_ATTEMPTS = 10            # tries per synthetic row before giving up on it
PHQ8_MIN, PHQ8_MAX = 0, 24   # inclusive range the target PHQ-8 score is drawn from
REQUIRED_KEYS = ("Synopsis", "Sentiment")  # keys the model's JSON reply must contain

# Accumulates one dict per successfully generated synthetic row.
Synopsis = []

for index, row in synopsisAndSentiment.iterrows():
    for _ in range(PROMPTS_PER_ROW):
        # Target score for this synthetic row; drawn once and reused across retries.
        PHQ8_generated = random.randint(PHQ8_MIN, PHQ8_MAX)

        # The prompt does not change between attempts, so build it once.
        prompt = syntheticSynopsisAndSentiment_prompt.format(
            Synopsis=row["Synopsis"],
            PHQ8_Score=PHQ8_generated,
            Sentiment=row["Sentiment"],
        )

        success = False
        for attempt_count in range(1, MAX_ATTEMPTS + 1):
            try:
                # Get synthetic data response
                response = get_response(prompt)
                print(f"Participant ID {row['ParticipantID']}")
                print(f"Attempt {attempt_count}: {response}")

                # Try to parse the response as JSON
                json_data = json.loads(response)

                # Accept only a JSON object that carries every required key.
                if isinstance(json_data, dict) and all(key in json_data for key in REQUIRED_KEYS):
                    # Keep only the expected fields so stray keys in the model's
                    # reply cannot add extra, mostly-empty columns to the output.
                    Synopsis.append({
                        "Synopsis": json_data["Synopsis"],
                        "Sentiment": json_data["Sentiment"],
                        "ParticipantID": row["ParticipantID"],
                        # Use the score we asked for, not whatever the model echoed back.
                        "PHQ8_Score": PHQ8_generated,
                    })
                    success = True
                    break

                print("Error: Required keys are missing from the JSON data. Retrying...")
                clear_conversation()  # Retry from a clean transcript

            except json.JSONDecodeError:
                # If JSON parsing fails, retry fetching the response
                print(f"Error: Response not loadable as JSON (Attempt {attempt_count}). Retrying...")
                clear_conversation()  # Reset conversation if needed for next attempt

            except Exception as e:
                # Handle any other exceptions, including BadRequestError
                print(f"Error encountered: {e}. Skipping this prompt.")
                break

        if not success:
            # Make dropped rows visible instead of silently producing fewer rows.
            print(
                f"Warning: no synthetic row generated for Participant ID "
                f"{row['ParticipantID']} (PHQ8_Score {PHQ8_generated})."
            )

        # Each synthetic row is generated from a fresh conversation.
        clear_conversation()

# Convert the list of JSON objects to a DataFrame
Synopsis_df = pd.DataFrame(Synopsis)


Synopsis_df.to_csv('syntheticSynopsisAndSentiment.csv', index=False)

Synopsis_df