# Import libraries and load data

In [64]:
import os
import openai
from openai import OpenAI
from tenacity import retry, wait_random_exponential, stop_after_attempt
import pandas as pd
import json
import re
from openai import AzureOpenAI
import random
from tqdm import tqdm
import time

%run "./utilityFunctions.ipynb"

In [78]:
# Load the test, validation and training splits from CSV files located in the
# current working directory.
test = pd.read_csv("test.csv")
val = pd.read_csv("val.csv")
train = pd.read_csv("train.csv")
# Accumulator for subject IDs; it starts empty and is filled in place by
# extract_subject_ids() further down in this cell.
subject_ids = []

def extract_subject_ids(df, subject_ids_list):
    """Append every value of ``df['subject_id']`` to ``subject_ids_list``.

    Args:
        df: DataFrame that has a ``subject_id`` column.
        subject_ids_list: List that is extended in place, preserving row order.

    Returns:
        None. The result is delivered by mutating ``subject_ids_list``.
    """
    # Materialize the column first so a missing column fails before any mutation.
    column_values = df['subject_id'].tolist()
    for subject_id in column_values:
        subject_ids_list.append(subject_id)

# Fill `subject_ids` with the subject IDs of the training split only.
extract_subject_ids(train, subject_ids)

# Print the collected IDs so the contents of the split can be inspected.
print("List of subject_ids:")
print(subject_ids)

List of subject_ids:
[303, 304, 305, 310, 312, 313, 315, 316, 317, 318, 319, 320, 321, 322, 324, 325, 326, 327, 328, 330, 333, 336, 338, 339, 340, 341, 343, 344, 345, 347, 348, 350, 351, 352, 353, 355, 356, 357, 358, 360, 362, 363, 364, 366, 368, 369, 370, 371, 372, 374, 375, 376, 379, 380, 383, 385, 386, 391, 392, 393, 397, 400, 401, 402, 409, 412, 414, 415, 416, 419, 423, 425, 426, 427, 428, 429, 430, 433, 434, 437, 441, 443, 444, 445, 446, 447, 448, 449, 454, 455, 456, 457, 459, 463, 464, 468, 471, 473, 474, 475, 478, 479, 485, 486, 487, 488, 491]


In [81]:
# Export the training participants' synopses and sentiments as a JSON file.
# JSON is used because, per the original author's note, the OpenAI vector store
# does not accept CSV uploads.
file_name = 'synopsisAndSentiment.json'
synopsisAndSentiment = pd.read_csv("synopsisAndSentiment.csv")

# Keep only the rows whose ParticipantID belongs to the training split.
is_training_participant = synopsisAndSentiment['ParticipantID'].isin(subject_ids)
filtered_data = synopsisAndSentiment[is_training_participant]

# Restrict the export to the three columns that are uploaded as reference data.
extracted_data = filtered_data[['ParticipantID', 'Synopsis', 'Sentiment']]
data_list = extracted_data.to_dict(orient='records')

# ensure_ascii=False keeps non-ASCII characters readable in the output file.
json_data = json.dumps(data_list, ensure_ascii=False)

with open(file_name, 'w', encoding='utf-8') as json_file:
    json_file.write(json_data)


                                 EmotionsFromSentiment  ParticipantID  \
3    contentment,ambition,nostalgia,frustration,lov...            303   
4    positive,negative,regretful/reflective,betraye...            304   
5    frustration,hopefulness,nostalgia,anger,hurt,d...            305   
10   frustration,loneliness,nostalgia,anger managem...            310   
12      nostalgia,anxiety,contentment,mild frustration            312   
..                                                 ...            ...   
174  fatigue,appreciation,frustration,dislike,pasio...            485   
175  excitement,enjoyment,appreciation,uncertainty,...            486   
176  nostalgia,satisfaction,resilience,curiosity,ex...            487   
177  enjoyment,frustration,nostalgia,uncertainty,se...            488   
180  overwhelm,stress,nostalgia,ambition,frustratio...            491   

     PHQ8_Score                                          Sentiment  \
3             0  The patient's responses exhibit mixe

In [84]:
# Short role description for the data-generation assistant.
# NOTE(review): no other cell visible in this notebook references `descriptions`
# (the assistant is created with `prompt` as its instructions) — confirm whether
# this variable is still needed.
descriptions = """
 You are an intelligent data generation assistant tasked with creating synthetic data for synopsis and sentiment analysis by learning from the samples in the database
"""

In [85]:
# Instruction template sent to the assistant. `{PHQ8score}` is a str.format()
# placeholder that get_response() fills in with the sampled score.
# Because the text goes through str.format(), any literal braces added to it
# (for example around the JSON example) must be doubled ({{ }}) or formatting
# will fail.
prompt = """
Given a PHQ8 score of {PHQ8score}, where the PHQ8 score categories are:
   - 0-4: No or minimal depression
   - 5-9: Mild depression
   - 10-14: Moderate depression
   - 15-19: Moderately severe depression
   - 20-24: Severe depression

Use the synopses and sentiments of different participants in the database as references. Learn the various types of stories, ways of living, experiences of suffering, personal development, and emotions exhibited. Create a single synthetic synopsis and sentiment with different storylines and experiences that match the magnitude of the given PHQ8 score.

Format your output as a compact JSON object on a single line, including two properties: 'Synopsis' and 'Sentiment'. Avoid any extra spaces or line breaks within the JSON.

Example: "Synopsis":"Synopsis here", "Sentiment":"Sentiment here"
"""


In [86]:

# Build the Azure OpenAI client used by every cell below.
# Credentials are read from the environment so the API key never lives in the
# notebook or its saved outputs. Set AZURE_OPENAI_API_KEY before running; a
# missing key raises KeyError here instead of failing later with an auth error.
# NOTE: a key that was previously committed in this cell should be rotated.
client = AzureOpenAI(
    # The endpoint is not a secret; the environment may override the default.
    azure_endpoint=os.environ.get(
        "AZURE_OPENAI_ENDPOINT", "https://dalleshuhaotest.openai.azure.com/"
    ),
    api_version="2024-05-01-preview",
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
)

# Create an assistant with the file_search tool enabled.
# Every execution of this cell creates a new assistant; the ID printed below is
# what the hard-coded `assistantID` in a later cell has to match.
# Note that `prompt` is passed here as the raw template, i.e. with the
# `{PHQ8score}` placeholder still unfilled; get_response() supplies formatted
# instructions on each run.
assistant = client.beta.assistants.create(
    name="Synopsis and Sentiment Generation Assistant",
    instructions=prompt,
    model= "gpt-4o", # You must replace this value with the deployment name for your model.
    tools=[{"type": "file_search"}],
    
)

# Report the name and ID of the assistant that was just created.
print(f"Assistant created: {assistant.name}")
print(f"Assistant ID: {assistant.id}")

Assistant created: Synopsis and Sentiment Generation Assistant
Assistant ID: asst_Pv2CkfKV6UKAashLCCcprXzn


In [90]:
# Hard-coded IDs of an existing assistant and vector store.
# `assistantID` matches the "Assistant ID" shown in the creation cell's saved output.
# NOTE(review): `vectorStoreID` is presumably the ID of a vector store created in
# an earlier session — confirm it is the store the assistant is meant to search.
assistantID = 'asst_Pv2CkfKV6UKAashLCCcprXzn'
vectorStoreID = 'vs_wZ8xTW4zX6A3gxW5dnqleZE4'

In [87]:
# Record and print the kernel's current working directory; relative paths used
# elsewhere in this notebook (e.g. the CSV and JSON file names) resolve against it.
current_directory = os.getcwd()
print("Current Working Directory:", current_directory)

Current Working Directory: /Users/Chenjunyu/Desktop/Work/Computational Vision and Learning Lab/Depression Detection


In [88]:
# Create the vector store that will hold the reference synopses and sentiments.
vector_store = client.beta.vector_stores.create(name="Synopsis and Sentiment Database")

# Resolve the JSON export against the working directory (where the export cell
# wrote it) instead of a machine-specific absolute path, so the notebook also
# runs on other machines.
file_paths = [os.path.join(os.getcwd(), "synopsisAndSentiment.json")]

# Open each file in binary mode; the streams are handed to the upload cell.
file_streams = [open(path, "rb") for path in file_paths]

In [91]:
# Upload the files, add them to the vector store, and poll until the batch finishes.
# The upload targets the store created in the previous cell (vector_store.id),
# which is the same store that gets attached to the assistant. Using a
# hard-coded ID from an earlier session would leave the attached store empty
# after a fresh Restart & Run All.
try:
    file_batch = client.beta.vector_stores.file_batches.upload_and_poll(
        vector_store_id=vector_store.id, files=file_streams
    )
finally:
    # Release the file handles whether or not the upload succeeded. To upload
    # again, re-run the cell that opens `file_streams` first.
    for stream in file_streams:
        stream.close()

print(file_batch.status)
print(file_batch.file_counts)

completed
FileCounts(cancelled=0, completed=1, failed=0, in_progress=0, total=1)


In [None]:
# Attach the vector store to the assistant so that its file_search tool can
# query the uploaded file.
# NOTE(review): this updates the assistant identified by the hard-coded
# `assistantID`, not necessarily the `assistant` object created earlier —
# confirm the two refer to the same assistant.
assistant = client.beta.assistants.update(
  assistant_id=assistantID,
  tool_resources={"file_search": {"vector_store_ids": [vector_store.id]}},
)

# Playground section

In [93]:
# Start a fresh conversation thread for the manual experiments in this section.
thread = client.beta.threads.create()

In [45]:
# Post a generic user message to the thread; the actual task description is
# supplied through the `instructions` argument when the run is created.
message = client.beta.threads.messages.create(
  thread_id=thread.id,
  role="user",
  content= "Do what the instruction says",
    
)

In [46]:
# Fill the template's {PHQ8score} placeholder before sending it. Passing the raw
# template would show the model the literal text "{PHQ8score}" instead of a score.
playground_phq8_score = 12  # example score for manual testing; any value in 0-24

# Start a run on the playground thread and block until it reaches a final state.
run = client.beta.threads.runs.create_and_poll(
    thread_id=thread.id,
    assistant_id=assistantID,
    instructions=prompt.format(PHQ8score=playground_phq8_score),
)


In [49]:
# Show the thread's message page when the run finished; otherwise report the
# status the run ended in.
if run.status != 'completed':
    print(run.status)
else:
    messages = client.beta.threads.messages.list(thread_id=thread.id)
    print(messages)
    
    

SyncCursorPage[Message](data=[Message(id='msg_YeZiWvRB7LTmu4ngXpApI9ld', assistant_id='asst_h6RvQIqrZRgRXDeO0lJAUXVj', attachments=[], completed_at=None, content=[TextContentBlock(text=Text(annotations=[], value='```json\n{"Synopsis":"The patient moved to Los Angeles for better opportunities but has been struggling significantly with personal and professional difficulties. They describe keen interest in creative pursuits such as photography and acting but face financial instability causing stress and a sense of unfulfillment. Diagnosed with severe depression three years ago, they experience frequent sleep disturbances and anxiety attacks originating from past trauma. Despite therapy offering some relief, economic constraints prevent continued sessions, exacerbating feelings of hopelessness and isolation. Precious moments are often overshadowed by persistent sadness and regret regarding unachieved academic or career goals. Social interactions are limited to close family members and a su

In [None]:
# Collect the messages produced by this run and display the text of the first one.
messages = list(client.beta.threads.messages.list(thread_id=thread.id, run_id=run.id))

first_message = messages[0]
message_content = first_message.content[0].text.value
message_content

# Iterative pipeline

In [98]:
def get_response(client, assistantID, PHQ8score, prompt_template=None,
                 max_attempts=5, retry_delay=15):
    """Ask the assistant for one synthetic record matching ``PHQ8score``.

    A new thread is created, a generic user message is posted, and a run is
    started with the formatted instructions. If the run does not end in the
    ``"completed"`` state it is started again on the same thread, up to
    ``max_attempts`` extra times with ``retry_delay`` seconds between tries.

    Args:
        client: Client object exposing ``beta.threads`` (create, messages, runs).
        assistantID: ID of the assistant that executes the run.
        PHQ8score: Score substituted for ``{PHQ8score}`` in the template.
        prompt_template: Template containing a ``{PHQ8score}`` placeholder.
            Defaults to the notebook-level ``prompt``.
        max_attempts: Maximum number of re-runs after the first run.
        retry_delay: Seconds to sleep before each re-run.

    Returns:
        Tuple ``(thread, run, messages, message_content)``. ``message_content``
        is the text of the first message of the final run, or ``""`` when the
        run produced no message or the first message has no text content.
    """
    if prompt_template is None:
        # Fall back to the template defined in an earlier notebook cell.
        prompt_template = prompt
    # Format once; the same instructions are reused for every retry.
    instructions = prompt_template.format(PHQ8score=PHQ8score)

    # Each request gets its own thread so earlier generations cannot leak in.
    thread = client.beta.threads.create()
    client.beta.threads.messages.create(
        thread_id=thread.id,
        role="user",
        content="Do what the instruction says"
    )

    def _start_run():
        # Starts a run on the thread and blocks until the SDK stops polling.
        return client.beta.threads.runs.create_and_poll(
            thread_id=thread.id,
            assistant_id=assistantID,
            instructions=instructions
        )

    run = _start_run()
    attempt = 0
    while run.status != "completed" and attempt < max_attempts:
        print(f"Current status: {run.status}")
        attempt += 1
        time.sleep(retry_delay)  # Back off before starting another run
        run = _start_run()
    print(f"Current status: {run.status}")

    # Only the messages that belong to the final run are of interest.
    messages = list(client.beta.threads.messages.list(thread_id=thread.id, run_id=run.id))

    # Be defensive: a run may yield no message, an empty content list, or a
    # first content block that carries no text.
    message_content = ""
    if messages and messages[0].content:
        text = getattr(messages[0].content[0], "text", None)
        if text is not None:
            message_content = text.value
    print(message_content)

    return thread, run, messages, message_content




In [None]:

# Generate 308 synthetic records, each for a randomly drawn PHQ-8 score (0-24).
json_responses = []

for iteration in tqdm(range(308)):
    PHQ8score = random.randint(0, 24)

    # Up to 10 tries per score; stop at the first usable response.
    for attempt_count in range(1, 11):
        try:
            thread, run, messages, message_content = get_response(client, assistantID, PHQ8score)
            json_response = fix_json_response(message_content)

            # A usable response is a non-empty object carrying both required keys.
            has_required_keys = (
                json_response
                and 'Synopsis' in json_response
                and 'Sentiment' in json_response
            )
            if not has_required_keys:
                continue

            json_response['PHQ8_Score'] = PHQ8score
            json_responses.append(json_response)
            break

        except ValueError:
            # Response could not be used as JSON; try again for the same score.
            continue

        except Exception as e:
            # Any other error (including BadRequestError) abandons this score.
            print(f"Error encountered: {e}. Skipping this prompt.")
            break


df = pd.DataFrame(json_responses)
print(df)

In [96]:
# Display the table of generated records.
df

Unnamed: 0,Synopsis,Sentiment,PHQ8_Score
0,The patient moved from Pennsylvania to Los Ang...,The patient displays a mix of emotions includi...,16
1,"A participant living in Los Angeles, educated ...",The participant's emotional landscape is mostl...,3
2,The patient reflects on their daily life in Lo...,The patient's emotions range from contentment ...,4
3,The patient recently transitioned from Pennsyl...,The patient's emotions are predominantly negat...,24
4,The patient is originally from Florida but mov...,Contentment: Enjoyment of LA weather and cultu...,4
...,...,...,...
303,The patient expresses their experiences of mov...,The patient's emotions fluctuate between sadne...,16
304,"The patient discusses various life aspects, in...",Patient exhibits mixed emotions: contentment w...,16
305,The patient discusses a range of life experien...,Mixed emotions: sadness about romantic prospec...,6
306,The participant discusses life in Los Angeles ...,Sadness due to financial struggles and lack of...,3


In [97]:
# Persist the generated records to CSV; index=False omits the row index column.
df.to_csv('syntheticSynopsisAndSentimentFromAssistant.csv', index=False)