In [None]:
import openai
from dotenv import load_dotenv
import re
import os
import pandas as pd 
import time
from tqdm import tqdm
import json

In [None]:
# Change into the directory to pull the API keys from; that is not always the
# directory the notebook is being run in.
# NOTE(review): this absolute path is machine-specific — confirm/adjust before running elsewhere.
%cd "/Users/stevenmesquiti/Desktop/Working with Dani/My-climate-stories"

In [None]:
# Load the OpenAI API key. Keep the key in a .env file; never store it in this script.
from dotenv import load_dotenv, find_dotenv

env_file = find_dotenv()
_ = load_dotenv(env_file)
openai.api_key = os.environ.get('openai_api_key')

def get_completion(prompt, model="gpt-4-1106-preview"):
    """Send ``prompt`` as a single user message and return the reply text.

    Other models can be passed via ``model``; be aware of their costs and
    limitations. The request asks for a JSON-object response format.

    Parameters
    ----------
    prompt : str
        Text placed in the one user message of the chat request.
    model : str, optional
        Model name forwarded to ``openai.ChatCompletion.create``.

    Returns
    -------
    The ``"content"`` entry of the first choice's message.
    """
    request = dict(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        # Temperature 0 keeps the model's output as consistent as possible.
        temperature=0,
        response_format={"type": "json_object"},
    )
    completion = openai.ChatCompletion.create(**request)
    first_choice = completion.choices[0]
    return first_choice.message["content"]

In [None]:
# Load the harm-audit CSV and preview the first rows.
harm_audit_csv = "/Users/stevenmesquiti/Box Sync/CurrentProjects_Penn/LP2/within_person_intervention/data/study_2-harm_audit.csv"
df = pd.read_csv(harm_audit_csv)
df.head()

In [None]:
# Show the pID column as an array.
df["pID"].values

In [None]:
# Concatenate all text values into a single row grouped by 'pID'.
# Missing cells (NaN) are dropped and the rest coerced to str first: str.join
# raises TypeError on any non-string value, which would abort the whole groupby.
df_grouped = (
    df.groupby('pID')['text']
    .agg(lambda texts: ' '.join(texts.dropna().astype(str)))
    .reset_index()
    .rename(columns={'text': 'concatenated_text'})
)
df_grouped.head()


In [None]:
# One concatenated transcript per participant.
inputs = df_grouped["concatenated_text"]

In [None]:
# List of absolute words; the prompt asks for a frequency count of each of these.
word_list = [
    "Absolutely", "All", "Always", "Angry", "Commit",
    "Complete(ly)", "Constant(ly)", "Dead", "Definitely", "Entire",
    "Ever", "Every", "Everyone", "Everything", "Fight(ing)",
    "Fought", "Full", "Hopeless", "Hungry", "Life",
    "Must", "Never", "Nothing",
]

# Phrases offered to the model as examples of high priority risk indicators.
high_risk = [
    "Better off dead", "Cut myself", "End my life",
    "Hang myself", "Hung myself",
    "Commit abuse", "Commit murder", "Commit rape", "Commit suicide",
    "Die by suicide", "Harm myself", "Hurt myself",
    "Kill myself", "Killed myself", "Kills myself", "Killing", "Kill",
    "Never wake up", "Never woke up", "Nothing matters", "Plan to die",
    "Self-harm", "Self-injury", "Sewerslide", "Suicidal", "Suicide",
    "Take my life", "Takes my life", "To be dead", "Unalive",
    "Want death", "Want to die", "Wanted death", "Wanted to die",
    "Wants death", "Wants to die", "Want to",
    "Will die", "Wish dead",
]

# Words offered to the model as examples of the "medium priority" indicators
# (the prompt pairs these with a noun or pronoun).
low_risk = [
    "Abuse(d)", "Beat/Beaten", "Bleed",
    "Die", "Died", "Dies",
    "Choke(d)", "Cut", "Gun", "Hate", "Hit", "Knife",
    "Murder", "Hurt", "Kick(ed)", "Punch(ed)", "Rape(d)", "Shoot",
]



In [None]:
# Empty dictionary that will hold the raw model responses.
responses = dict()

# The helper defined next tries to catch the errors you may encounter with the API.
# Don't be surprised if you get rate limited very quickly; that's OK, just be patient with the script.
# This may take a few hours to run.

def get_response(prompt, max_retries=None):
    """Call ``get_completion(prompt)``, retrying on transient OpenAI API errors.

    Retries run in a loop rather than by recursion, so a long run of rate-limit
    errors cannot exhaust the interpreter's recursion limit.

    Parameters
    ----------
    prompt : str
        Prompt text forwarded unchanged to ``get_completion``.
    max_retries : int or None, optional
        Maximum number of retries after a retryable error. ``None`` (the
        default) keeps retrying until a call succeeds.

    Returns
    -------
    Whatever ``get_completion`` returns.

    Raises
    ------
    OSError
        Re-raised immediately; these errors are reported but not retried.
    openai.error.RateLimitError, openai.error.APIError,
    openai.error.ServiceUnavailableError, openai.error.Timeout
        Re-raised once ``max_retries`` retries have been used up.
    """
    retries_used = 0
    while True:
        try:
            return get_completion(prompt)
        except openai.error.RateLimitError as e:
            # Use the wait attached to the error when there is one, else 30 seconds.
            retry_time = getattr(e, 'retry_after', 30)
            notice = f"Rate limit exceeded. Retrying in {retry_time} seconds..."
            last_error = e
        except openai.error.APIError as e:
            retry_time = getattr(e, 'retry_after', 30)
            notice = f"API error occurred. Retrying in {retry_time} seconds..."
            last_error = e
        except openai.error.ServiceUnavailableError as e:
            retry_time = 10  # Adjust the retry time as needed
            notice = f"Service is unavailable. Retrying in {retry_time} seconds..."
            last_error = e
        except openai.error.Timeout as e:
            retry_time = 10  # Adjust the retry time as needed
            notice = f"Request timed out: {e}. Retrying in {retry_time} seconds..."
            last_error = e
        except OSError as e:
            # An OSError is never a tuple, so the old "retry" branch could not run;
            # the effective behaviour was always to re-raise. Say so and do not sleep.
            print(f"Connection error occurred: {e}. Not retrying.")
            raise

        # Reached only after one of the retryable errors above.
        if max_retries is not None and retries_used >= max_retries:
            raise last_error
        print(notice)
        time.sleep(retry_time)
        retries_used += 1

# The term lists are the same for every transcript, so join them once up front.
high_risk_examples = ", ".join(high_risk)
low_risk_examples = ", ".join(low_risk)
counted_words = ", ".join(word_list)

# Iterate over the transcript values directly (positional), so the loop does not
# depend on `inputs` carrying a 0..n-1 integer index.
for i, transcript in enumerate(tqdm(inputs)):
    # The "\\ " sequences reproduce the literal backslash + space the prompt has
    # always contained; written as "\ " they are invalid escape sequences.
    prompt = f"""
Your task is to review a transcript for the following qualities 

- high priority indicators of possible risk, that mention any of the following (note context and rule out statements with negators (e.g., “I never want to end my life.”). Here are examples {high_risk_examples}. If none of these are present please respond with Not present.\\ 
- medium priority indicators of possible risk; note any noun or pronoun paired with words in this list (e.g., “Hate my mother”) (note context and rule out any statements with negators (e.g., “I would never shoot someone.”) Here are examples {low_risk_examples}.  If none of these are present please respond with Not present. \\ 
- indicate the frequency of words in a piece of text . Answer only with a number to indicate the number of times a word occurs in a piece of text. Here are the words {counted_words} \\ 
-  Provide a 50 word or less summary of what they talked about in their transcripts

Output a single JSON object with information:
Presence of High priority indicators in variable called High Priority \\ 
Presence of Medium priority indicators in variable called Medium Priority \\ 
Each word as a variable. Answer only with a number indicate the number of times a word occurs in a piece of text \\  
Provide a 50 word or less summary of what they talked about in a Variable called Summary
      
Text: ```{transcript}```
"""
    response = get_response(prompt)
    responses[f"{i+1}"] = response  # Save the response under its 1-based position

In [None]:
# Print every stored response next to its key.
for response_key in responses:
    raw_reply = responses[response_key]
    print(f"{response_key}: {raw_reply}")

In [None]:
# Parsed replies, one entry per item in `responses`, in the same order.
all_responses = []

# Process responses and populate all_responses
for key, response_data in responses.items():
    # Keep only the span from the first '{' to the last '}', in case the reply
    # has text around the JSON object.
    start_index = response_data.find('{')
    end_index = response_data.rfind('}') + 1
    json_data = response_data[start_index:end_index]

    try:
        temp = json.loads(json_data)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON for response {key}: {e}")
        # Keep an empty placeholder instead of skipping: dropping the entry would
        # shift every later reply up by one position, so anything matched to
        # `all_responses` by position would be attributed to the wrong reply.
        all_responses.append({})
        continue

    all_responses.append(temp)

    print(f"Response: {key}")
    for key2 in temp:
        print(f"{key2}: value - {temp[key2]}")

In [None]:
summary = pd.DataFrame(all_responses)

# pIDs are attached by position, so the two tables must have the same number of
# rows; otherwise participants would silently receive someone else's results.
if len(summary) != len(df_grouped):
    raise ValueError(
        f"Cannot attach pIDs: {len(summary)} parsed responses "
        f"but {len(df_grouped)} participants."
    )

# Item assignment is required here: `summary.pID = ...` only sets a Python
# attribute and does not create a column, so pID would be missing from the table.
summary["pID"] = df_grouped["pID"].to_numpy()

# Preview the DataFrame
summary.head()

In [None]:
# Write the summary table to a CSV file.
summary_csv_path = '/Users/stevenmesquiti/Downloads/participant_harm_audit_summary_study2.csv'
summary.to_csv(summary_csv_path)