In [None]:
import os
import re
import json
import time
import random
import argparse
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm  # use this for general progress bars (works in console & notebooks)
from openai import OpenAI, RateLimitError, APIError, APITimeoutError

In [None]:
# Random seed. In the visible code it is referenced only by the commented-out
# df.sample(..., random_state=SEED) subsampling step.
SEED = 42

In [None]:
# Load environment variables from the .env file that holds the API key.
load_dotenv("/Users/sm9518/Desktop/Article-Summarizer/.env")
api_key = os.getenv("OPENAI_API_KEY")

# Fail fast when the key is missing; otherwise confirm that it was found.
if not api_key:
    raise ValueError("API Key not found.\nMake sure it is set in the .env file.")
print("API Key loaded successfully!\n:)")

In [None]:
# Create the OpenAI client with default settings. No arguments are passed, so the
# key is presumably read from the OPENAI_API_KEY environment variable — confirm.
client = OpenAI()
# Request the model list; as the cell's last expression, its result is displayed.
# NOTE(review): looks like a credentials/connectivity smoke test — confirm intent.
client.models.list()

In [None]:
# Load the well-being text data; the first CSV column is used as the index.
df = pd.read_csv(
    filepath_or_buffer='/Users/sm9518/Library/CloudStorage/Box-Box/LP2/well-being-prediction/data/Prolific_wellbeing-prediction-text-long.csv',
    index_col=0,
)
# Show the available column names.
df.columns

In [None]:
#df = df.sample(frac=frac, random_state=SEED)  # optional: randomly subsample rows (e.g. ~300 comments); define `frac` before enabling

In [None]:
# Run configuration: the model to query, the sampling temperature, and the text
# column whose entries will be rated.
#model="gpt-3.5-turbo-1106" # alternative model
model='gpt-4'
temperature=0 # sampling temperature passed to the chat-completion request
input_column = 'SWLS-Text' 
#input_column = 'Autonomy-Text'
# Collect the chosen column's values as a plain Python list (one entry per row of df).
# NOTE: the name `input` shadows Python's built-in input() from here on.
input = df[input_column].tolist()
def get_completion(prompt):
    """Send `prompt` as a single user message and return the reply text.

    Uses the module-level `client`, `model` and `temperature` settings.
    Returns the `message.content` of the first choice in the response.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def _retry_delay(error, default):
    """Return the number of seconds to wait before retrying after `error`.

    Looks for a `retry_after` attribute first, then for a `retry-after` header on
    `error.response` (where the v1 OpenAI client is understood to expose the
    server's hint — see the openai-python error-handling docs). Falls back to
    `default` when no usable, non-negative number is found.
    """
    delay = getattr(error, "retry_after", None)
    if delay is None:
        headers = getattr(getattr(error, "response", None), "headers", None)
        if headers is not None:
            delay = headers.get("retry-after")
    try:
        delay = float(delay)
    except (TypeError, ValueError):
        # Missing value, or an HTTP-date style header we do not parse.
        return default
    return delay if delay >= 0 else default


def get_response(prompt, max_retries=None):
    """Call `get_completion(prompt)`, pausing and retrying whenever it raises.

    Parameters
    ----------
    prompt : str
        Prompt text forwarded unchanged to `get_completion`.
    max_retries : int or None, optional
        Maximum number of retries after the first attempt. ``None`` (the
        default) retries indefinitely, which is the original behaviour.

    Returns
    -------
    Whatever `get_completion(prompt)` returns.

    Raises
    ------
    Exception
        The last error is re-raised once `max_retries` retries have been used.
        Never raised when `max_retries` is None.
    """
    attempt = 0
    # Iterative retry: the previous recursive form added a stack frame per retry.
    while True:
        try:
            return get_completion(prompt)
        except Exception as e:
            # Classify from most to least specific. APITimeoutError and
            # RateLimitError are both subclasses of APIError, so APIError must
            # be tested after them or the timeout branch can never be reached.
            if isinstance(e, RateLimitError):
                retry_time = _retry_delay(e, 30)
                message = f"Rate limit exceeded. Retrying in {retry_time} seconds..."
            elif isinstance(e, APITimeoutError):
                retry_time = 10
                message = f"Request timed out: {e}. Retrying in {retry_time} seconds..."
            elif isinstance(e, APIError):
                retry_time = 30
                message = f"API error occurred. Retrying in {retry_time} seconds..."
            else:
                retry_time = 10
                message = f"An error occurred: {e}. Retrying in {retry_time} seconds..."

            if max_retries is not None and attempt >= max_retries:
                raise
            attempt += 1
            print(message)
            time.sleep(retry_time)

# Container for the model's replies; the rating loop stores one entry per input,
# keyed by the input's 1-based position as a string.
responses = {}

# Sanity check on the inputs: total count versus number of distinct values.
print(f"Total number of inputs: {len(input)}")
print(f"Number of unique inputs: {len(set(input))}")


In [None]:

def _parse_json_response(raw):
    """Return the JSON value contained in `raw`, or `raw` itself if none parses.

    The reply is first parsed as-is. If that fails, the span from the first '{'
    to the last '}' is tried, which recovers objects wrapped in Markdown code
    fences or surrounded by extra prose. Non-string replies (e.g. None) are
    returned unchanged instead of raising.
    """
    try:
        return json.loads(raw)
    except (json.JSONDecodeError, TypeError):
        pass
    if isinstance(raw, str):
        match = re.search(r"\{.*\}", raw, flags=re.DOTALL)
        if match:
            try:
                return json.loads(match.group(0))
            except json.JSONDecodeError:
                pass
    # Nothing parseable: keep the raw reply so it can be inspected later.
    return raw


# Rate every text in `input`; results are stored in `responses` under the
# text's 1-based position (as a string).
for i, text in enumerate(tqdm(input)):
    prompt = f"""
    Your task is to evaluate the following piece of text along a 7-point scale (where 1 = not at all and 7 = a great deal) for each of the following well-being dimensions:

    Self-acceptance
    To what degree does the text reflect a positive attitude toward the self, including acknowledgment and acceptance of both strengths and weaknesses, and positive feelings about the past? 

    Positive relations with others
    To what degree does the text convey warm, satisfying, or trusting relationships with others, empathy, affection, or concern for others’ welfare?

    Autonomy
    To what degree does the text suggest self-determination, independence, resistance to social pressures, or behavior guided by personal standards?

    Environmental mastery
    To what degree does the text indicate competence in managing life and environment, including the ability to navigate external demands or create favorable contexts?

    Purpose in life
    To what degree does the text suggest a sense of direction, meaningful goals, or beliefs that give life purpose?

    Personal growth
    To what degree does the text reflect openness to new experiences, personal development, or a sense of realizing potential and evolving as a person?

    Satisfaction with life
    To what degree does the text express a global assessment of life satisfaction or overall contentment with one's life circumstances?

    Output your answer as a single JSON object with keys for each dimension (use these exact names: "Self_acceptance", "Positive_relations", "Autonomy", "Environmental_mastery", "Purpose_in_life", "Personal_growth","Satisfaction_with_life").

    Text: ```{text}```
    """
    response = get_response(prompt)
    # Store the parsed scores, or the raw reply when no JSON could be recovered.
    responses[f"{i+1}"] = _parse_json_response(response)

# After the loop, save the responses to a file
with open('gpt_responses.json', 'w', encoding='utf-8') as f:
    json.dump(responses, f, indent=2)

print("gpt_responses.json saved successfully.")

# Print the first 10 responses
for i in range(1, min(11, len(responses) + 1)):
    print(f"{i}: {responses.get(str(i), 'No response')}")


In [None]:
# One record per response, in the order the responses were stored. Keeping
# exactly one record per response preserves row-by-row alignment with the inputs.
all_responses = []

for key, temp in responses.items():
    print(f"Response: {key}")
    if not isinstance(temp, dict):
        # A reply that could not be parsed into a JSON object is stored as raw
        # text. Indexing it like a dict would raise, and mixing it into the
        # record list would break the DataFrame built from `all_responses`.
        # Use an empty record so this row simply has missing scores.
        print(f"Could not parse scores; raw reply - {temp!r}")
        all_responses.append({})
        continue

    all_responses.append(temp)
    for key2 in temp:
        print(f"{key2}: value - {temp[key2]}")


In [None]:
# Everything before the first '-' in the column name (e.g. 'SWLS' from 'SWLS-Text').
prefix = input_column.partition('-')[0]
prefix

In [None]:
# Map each JSON key returned by the model to a column name tagged with the
# source-text prefix and the model name.
column_map = {
    dimension: f"{prefix}_{label}_{model}"
    for dimension, label in (
        ("Self_acceptance", "SelfAcceptance"),
        ("Positive_relations", "PositiveRelations"),
        ("Autonomy", "Autonomy"),
        ("Environmental_mastery", "EnvironmentalMastery"),
        ("Purpose_in_life", "PurposeInLife"),
        ("Personal_growth", "PersonalGrowth"),
        ("Satisfaction_with_life", "SWLS"),
    )
}

# One row per response, with a fresh positional index and the renamed columns.
scores = pd.DataFrame(all_responses)
scores_aligned = scores.reset_index(drop=True).rename(columns=column_map)

# Give df the same positional index so the assignments below line up row by row.
df = df.reset_index(drop=True)
for col in scores_aligned.columns:
    # Non-numeric values become NaN.
    df[col] = pd.to_numeric(scores_aligned[col], errors='coerce')

df.head(10)

In [None]:
# Display df's column names; the score columns assigned to df in the previous
# step should now be listed.
df.columns

In [None]:
# GPT score columns, built with the same "<prefix>_<Dimension>_<model>" pattern
# used when the scores were attached to df. Deriving them from `prefix` and
# `model` keeps this cell in step with the run configuration, so no hand-edited
# list is needed per model or input column.
gpt_dimensions = [
    "SelfAcceptance",
    "PositiveRelations",
    "Autonomy",
    "EnvironmentalMastery",
    "PurposeInLife",
    "PersonalGrowth",
    "SWLS",
]
gpt_cols = [f"{prefix}_{dimension}_{model}" for dimension in gpt_dimensions]

# Well-being columns to compare the GPT scores against
pwb_cols = [
    'PWB autonomy',
    'PWB environmental_mastery',
    'PWB mean',
    'PWB personal_growth',
    'PWB positive_relations',
    'PWB purpose',
    'PWB self_acceptance',
    'SWLS mean'
]

# Subset the dataframe to relevant columns and drop rows with missing data
df_corr = df[gpt_cols + pwb_cols].dropna()

# Pairwise correlations: rows are GPT columns, columns are well-being columns
corr_full = df_corr[gpt_cols + pwb_cols].corr().loc[gpt_cols, pwb_cols]

# `corr_matrix` used to come from DataFrame.corrwith, which pairs columns by
# label. The GPT and well-being columns share no labels, so that result held
# only NaN. The name is kept and now points at the cross-correlation table.
corr_matrix = corr_full

print("Pairwise correlations between GPT and Well-being columns:")
print(corr_full)

In [None]:
# Write df (including the score columns added above) to CSV. The file name
# encodes the input column, model and temperature; index=True also writes df's
# index as the first column.
df.to_csv(f"/Users/sm9518/Desktop/LP2-wellbeing-prediction/data/GPT/WBP-{input_column}-GPT-{model}-{temperature}-scores.csv", index=True)