In [None]:
import json
import numpy as np
import pandas as pd
import openai
import tqdm
import transformers

from datasets import load_dataset

In [None]:
# Credentials live in local dot-files next to the notebook rather than in the
# notebook itself. The contents are stripped because such files normally end
# with a newline (or other stray whitespace), which would otherwise be passed
# on verbatim as part of the key / organization id.
with open('.openai_key', 'r') as f:
    openai_key = f.read().strip()

with open('.openai_org', 'r') as f:
    openai_org = f.read().strip()

In [None]:
from openai import OpenAI

# Single client object reused for every chat-completion request in this
# notebook; authenticated with the key and organization loaded above.
client = OpenAI(api_key=openai_key, organization=openai_org)

In [None]:
# Dataset identifiers handed to `datasets.load_dataset`: the first supplies the
# labelled example sentences, the second the queries to be rewritten.
dataset_train = "TextCEsInFinance/fomc-communication"
dataset_test = "TextCEsInFinance/fomc-communication-counterfactual"

# Only the 'train' split of the first dataset and the 'test' split of the
# second are used; both are converted to pandas DataFrames.
train = load_dataset(dataset_train)['train'].to_pandas()
test = load_dataset(dataset_test)['test'].to_pandas()

In [None]:
# One slice of the training frame per value of `text_label`.
train_hawkish, train_dovish, train_neutral = (
    train[train['text_label'] == stance]
    for stance in ('hawkish', 'dovish', 'neutral')
)

In [None]:
# Class names in integer-id order; the position in this list is the class id.
classes = ['DOVISH', 'HAWKISH', 'NEUTRAL']
# Integer class id -> class name, derived from the list so the two cannot drift.
class_map = dict(enumerate(classes))

In [None]:
# Model identifier of the classifier used to label the generated sentences.
classifier_fomc = "gtfintechlab/FOMC-RoBERTa"

# No task is passed explicitly, so the pipeline type is left for transformers
# to work out from the model.
# NOTE(review): device=0 presumably pins the model to the first GPU, in which
# case this cell fails on a CPU-only machine — confirm against the
# transformers `pipeline` documentation.
pipe = transformers.pipeline(model=classifier_fomc, device=0)

In [None]:
def _numbered_examples(class_name, n_examples, random_state=None):
    """Render ``n_examples`` random training sentences of one class as a numbered list.

    Rows are drawn from the module-level ``train`` frame where ``text_label``
    equals ``class_name`` lower-cased. Every item becomes one line of the form
    ``<k>. <text>`` (k counts from 1) followed by a newline, including the last.
    """
    sampled = train[train['text_label'] == class_name.lower()].sample(
        n=n_examples, random_state=random_state
    )['text']
    return ''.join(f'{i}. {text}\n' for i, text in enumerate(sampled, start=1))


def generate_prompt(fact, n_examples=10, random_state=None):
    """Build the few-shot prompt asking for a counterfactual of one query sentence.

    Parameters
    ----------
    fact : mapping or pandas.Series
        Must provide ``'label'`` and ``'target'`` (both keys of ``class_map``)
        and ``'text'`` (the query sentence).
    n_examples : int, default 10
        Number of training sentences shown for the source class and, separately,
        for the target class. ``DataFrame.sample`` raises ``ValueError`` if a
        class has fewer rows than this.
    random_state : optional
        Forwarded to ``DataFrame.sample`` for both draws. ``None`` (default)
        keeps the previous unseeded behaviour. Note that a fixed integer seed
        makes every call pick the same examples for a given class.

    Returns
    -------
    str
        The prompt, ending with ``"<TARGET> COUNTERFACTUAL:"`` so that the model
        completes it with the rewritten sentence.
    """
    label = class_map[fact['label']]
    target = class_map[fact['target']]

    example_label = _numbered_examples(label, n_examples, random_state)
    example_target = _numbered_examples(target, n_examples, random_state)

    # The template text, including the four leading spaces on its inner lines,
    # is deliberately left exactly as it was so that prompts stay comparable
    # with earlier runs.
    prompt = f"""A classification Machine Learning model classifies texts into three classes: {', '.join(classes)}. Your task is to transform a QUERY sentence that was classified as {label} into a COUNTERFACTUAL that should be classified as {target}. You can replace, remove or add words, but you should keep the amount of changes to minimum, only performing up to 5 changes. You can use the EXAMPLE {label} and EXAMPLE {target} sentences as examples how sentences belonging to those classes might look like. You should generate only one COUNTERFACTUAL sentence.

    EXAMPLE {label}:
    {example_label}

    EXAMPLE {target}:
    {example_target}

    {label} QUERY: {fact['text']}

    {target} COUNTERFACTUAL:"""

    return prompt

In [None]:
def classify(sentences):
    """Run the module-level ``pipe`` classifier over ``sentences``.

    Inputs are processed in batches of 8 with ``truncation="only_first"``.
    Each prediction dict produced by the pipeline is returned with its
    ``'label'`` replaced, in place, by the integer that follows the first
    underscore of the original label string; the remaining keys are untouched.
    """
    def _with_int_label(prediction):
        # Assumes labels look like "<prefix>_<int>"; anything else raises here.
        class_id = prediction['label'].split("_")[1]
        prediction['label'] = int(class_id)
        return prediction

    return [
        _with_int_label(prediction)
        for prediction in pipe(sentences, batch_size=8, truncation="only_first")
    ]

In [None]:
results = []
succ = 0

for _, row in tqdm.tqdm(test.iterrows(), total=test.shape[0]):
    # One request per query; n=5 asks for five completions of the same prompt.
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": generate_prompt(row)}],
        n=5,
    )
    candidate_texts = [choice.message.content for choice in response.choices]

    # Re-classify every candidate and keep its predicted class id and score.
    counterfactuals = [
        {'text': text, 'label': pred['label'], 'score': pred['score']}
        for text, pred in zip(candidate_texts, classify(candidate_texts))
    ]

    # A query counts as a success when at least one candidate lands on the
    # requested target class.
    flipped = any(cf['label'] == row['target'] for cf in counterfactuals)
    succ += int(flipped)

    results.append({
        'index': row['index'],
        'text': row['text'],
        'label': row['label'],
        'text_label': row['text_label'],
        'target': row['target'],
        'success': flipped,
        'counterfactuals': counterfactuals,
    })

# Fraction of queries with at least one successful counterfactual.
print(succ / len(results))
        
    

In [None]:
# For every query that was flipped, show the original sentence next to the
# successful counterfactual the classifier scored highest.
for res in results:
    hits = [ce for ce in res['counterfactuals'] if ce['label'] == res['target']]
    if not hits:
        continue

    # max() keeps the earliest candidate on ties, like a stable descending sort.
    best = max(hits, key=lambda ce: ce['score'])

    print(f"{class_map[res['label']]}: {res['text']}")
    print(f"{class_map[best['label']]}: {best['text']}\n")

In [None]:
def np_encoder(object):
    """``default`` hook for ``json.dumps`` that converts NumPy values.

    NumPy scalars are turned into the matching plain Python scalar via
    ``.item()`` and NumPy arrays into nested lists via ``.tolist()``.

    Raises
    ------
    TypeError
        For any other type. Previously the function fell through and returned
        ``None``, which made ``json`` silently write ``null`` for objects it
        could not serialize.
    """
    # The parameter name shadows the builtin but is kept for compatibility.
    if isinstance(object, np.generic):
        return object.item()
    if isinstance(object, np.ndarray):
        return object.tolist()
    raise TypeError(
        f"Object of type {type(object).__name__} is not JSON serializable"
    )

In [None]:
# Persist every query together with all of its candidate counterfactuals;
# np_encoder converts the NumPy scalars taken from the DataFrame rows.
with open('gpt-4o_results.json', 'w', encoding='utf-8') as file:
    json.dump(results, file, default=np_encoder, indent=4)