In [None]:
import json
import pickle
import pandas as pd
from tqdm import tqdm
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor, as_completed

In [None]:
# Credentials must not live in the notebook. With no `api_key` argument the
# client reads the key from the OPENAI_API_KEY environment variable.
client = OpenAI()

In [None]:
# Load and preprocess the DataFrame
# NOTE: unpickling can execute arbitrary code; only load pickles from a trusted source.
df = pd.read_pickle('subset_tokenized_exploded.pkl')
electoral_term = 17
# .copy() detaches the filtered rows from the loaded frame, so later cells can
# add or overwrite columns on `df` without triggering SettingWithCopyWarning.
df = df[df['electoral_term'] == electoral_term].copy()

In [None]:
# Add unique ID for each speech
# Build a new frame with `assign` instead of writing columns into the filtered
# slice; this avoids chained-assignment warnings and leaves the input untouched.
df = df.assign(
    # Running number (1-based) of each row within its original speech `id`
    split_id=lambda d: d.groupby('id').cumcount() + 1,
    # assign() evaluates keywords in order, so `split_id` is available here
    # and `d["id"]` is still the original speech id.
    id=lambda d: (
        d["electoral_term"].astype(str) + "_"
        + d["session"].astype(str) + "_"
        + d["id"].astype(str) + "_"
        + d["split_id"].astype(str)
    ),
)[['id', 'split_speeches']]

# One {'id': ..., 'split_speeches': ...} record per row
speech_list = df.to_dict(orient='records')

In [None]:
# Read System Prompt from .txt file
# An explicit encoding keeps the prompt text identical across platforms
# (the default encoding of open() depends on the OS locale).
with open('prompt.txt', 'r', encoding='utf-8') as prompt_file:
    system_prompt = prompt_file.read()

In [None]:
# List to store the response values
response_values = []


def _extract_label(response_text):
    """Parse the model's JSON reply and return its polarizing label.

    Args:
        response_text: Raw message content returned by the model.

    Returns:
        str: ``'0'`` or ``'1'``.

    Raises:
        ValueError: If a required key is missing or the label is neither
            ``'0'`` nor ``'1'``. Errors raised by ``json.loads`` propagate
            unchanged.
    """
    response_json = json.loads(response_text)

    # Convert keys to lowercase and remove leading/trailing whitespaces
    response_json = {k.lower().strip(): v for k, v in response_json.items()}
    if 'speech_id' not in response_json or 'polarizing' not in response_json:
        raise ValueError("Invalid response format.")

    # Reject anything that is not a clean binary label so it is retried here
    # instead of breaking the integer conversion further down the notebook.
    label = str(response_json['polarizing']).strip()
    if label not in ('0', '1'):
        raise ValueError(f"Invalid polarizing value: {label!r}")
    return label


# Function to process a single speech
def process_speech(speech):
    """Classify one speech as polarizing or not with the fine-tuned chat model.

    Sends the speech text and its ID to the chat-completions endpoint (using
    the module-level ``client`` and ``system_prompt``) and parses the JSON
    reply. The request is attempted at most twice; every failure is printed.

    Args:
        speech: Mapping with the keys ``'id'`` (speech identifier) and
            ``'split_speeches'`` (the text to classify).

    Returns:
        dict: ``{"speech_id": <id>, "polarizing": <label>}`` where
        ``speech_id`` is always the ID passed in (never the value echoed by
        the model) and ``label`` is ``"0"`` or ``"1"`` from the model, or
        ``"2"`` when both attempts failed.
    """
    act_speech = speech['split_speeches']
    act_id = speech['id']
    prompt = f"ID: {act_id}\nSpeech: \"{act_speech}\""
    max_attempts = 2

    for attempt in range(1, max_attempts + 1):
        try:
            response = client.chat.completions.create(
                model="ft:gpt-4o-mini-2024-07-18:political-polarization-thesis:4o-mini-finetune-test-1:ARKbzyeS",
                messages=[
                    {
                        "role": "system",
                        "content": system_prompt
                    },
                    {"role": "user", "content": prompt}
                ],
                temperature=1,
                max_tokens=2048,
                response_format={
                    "type": "json_schema",
                    "json_schema": {
                        "name": "speech_data",
                        "schema": {
                            "type": "object",
                            "required": [
                                "speech_id",
                                "polarizing"
                            ],
                            "properties": {
                                "speech_id": {
                                    "type": "string",
                                    "description": "Unique identifier for the speech."
                                },
                                "polarizing": {
                                    "type": "string",
                                    "description": "Indicates if the speech is polarizing, represented as '0' or '1'."
                                }
                            },
                            "additionalProperties": False
                        },
                        "strict": True
                    }
                }
            )
            label = _extract_label(response.choices[0].message.content)
            # Key the result by the ID we sent: a mistyped ID echoed by the
            # model would otherwise fail to merge or attach to another speech.
            return {"speech_id": act_id, "polarizing": label}
        except Exception as e:
            # Best effort: any API or parsing failure is logged and retried once.
            print(f"Error for speech {act_id}: {e}")
            if attempt < max_attempts:
                print(f"Retrying for speech {act_id}...")

    # All attempts failed: mark the speech with the error code "2".
    print(f"Skipping speech {act_id}...")
    return {"speech_id": act_id, "polarizing": "2"}

In [None]:
# Process in parallel
# Start from an empty list so re-running this cell does not append a second
# copy of every result (duplicate IDs would multiply rows in the later merge).
response_values = []
with ThreadPoolExecutor(max_workers=6) as executor:
    # Map each future to its speech ID so a failure can be traced to a speech
    future_to_id = {
        executor.submit(process_speech, speech): speech['id']
        for speech in speech_list
    }
    for future in tqdm(as_completed(future_to_id), total=len(future_to_id)):
        try:
            response_values.append(future.result())
        except Exception as e:
            # No result is stored for this speech; it has no match in the merge
            # and ends up with the error code via the later fillna step.
            print(f"Error for speech {future_to_id[future]}: {e}")

In [None]:
# Persist the raw model responses to disk
with open('response_values.pkl', 'wb') as pickle_file:
    pickle.dump(response_values, pickle_file)


In [None]:
# Convert results to DataFrame
# Columns are renamed so the frame can be joined on 'id' and the label column
# carries the model name.
response_df = pd.DataFrame(response_values).rename(
    columns={'speech_id': 'id', 'polarizing': 'polarizing_4o_mini'}
)


In [None]:
# Merge the original DataFrame with the response DataFrame
# Left join: every speech row is kept, rows without a response get NaN.
df = df.merge(
    right=response_df,
    how='left',
    on='id',
)

In [None]:
# fill nan and convert to int
# Coerce instead of casting directly: a label that is not numeric becomes NaN
# rather than raising, and anything missing or not 0/1 is mapped to the error
# code 2 so it is counted as an error below.
labels = pd.to_numeric(df['polarizing_4o_mini'], errors='coerce')
df['polarizing_4o_mini'] = labels.where(labels.isin([0, 1]), 2).astype(int)

In [None]:
# Report how many speeches carry the error code 2
error_rows = df.loc[df['polarizing_4o_mini'] == 2]
print("Amount of Errors: " + str(len(error_rows)))

In [None]:
# Report the number of speeches per predicted class
non_polarizing_rows = df.loc[df['polarizing_4o_mini'] == 0]
polarizing_rows = df.loc[df['polarizing_4o_mini'] == 1]
# label 0: non-polarizing
print("Amount of Non-Polarizing Speeches: " + str(len(non_polarizing_rows)))
# label 1: polarizing
print("Amount of Polarizing Speeches: " + str(len(polarizing_rows)))

In [None]:
# Save the results
suffix = 'final'
# Shared file stem, e.g. response_4o_mini_<suffix>_<electoral_term>
output_stem = f'response_4o_mini_{suffix}_{electoral_term}'
df.to_pickle(output_stem + '.pkl')
df.to_excel(output_stem + '.xlsx', index=False)