In [3]:
from datasets import load_dataset
from polyjuice import Polyjuice

import json
import tqdm
import transformers

import numpy as np
import pandas as pd

In [None]:
# Model id of the classifier; passed to transformers.pipeline further down.
classifier_fomc = "gtfintechlab/FOMC-RoBERTa"

# Dataset id handed to datasets.load_dataset below.
dataset_id = "TextCEsInFinance/fomc-communication-counterfactual"

# Load every available split of the dataset.
# NOTE(review): `encoding` is forwarded to the underlying dataset builder —
# confirm the builder for this dataset actually accepts it.
dataset = load_dataset(dataset_id, encoding='utf-8')

In [7]:
# Convert the 'test' split to a DataFrame and keep only the four columns
# selected below.
# NOTE(review): `low = 100` is commented out and unused in this cell; it looks
# like a leftover from an earlier experiment — confirm and delete.
# NOTE(review): the following cell rebinds `test` from diff.csv, so this frame
# is discarded when both cells are run in order.
# low = 100
test = dataset['test'].to_pandas()[['text', 'label', 'text_label', 'target']]

In [4]:
# Rebind `test` to the rows stored in diff.csv (this replaces the frame built
# from dataset['test'] in the previous cell).
# NOTE(review): the perturbation loop reads the 'text', 'label' and 'target'
# columns — confirm diff.csv provides all three.
test = pd.read_csv('diff.csv', encoding='utf-8')

In [None]:
# Spot-check: display the row at position 8 of `test`.
test.iloc[8]

In [6]:
# Accumulator filled by the perturbation loop in the next cell. The loop only
# appends, so re-run this cell before re-running the loop; otherwise every
# sentence ends up in `results` twice.
results = []

In [None]:
# Instantiate the Polyjuice wrapper (CUDA requested via is_cuda=True).
pj = Polyjuice(model_path="uw-hai/polyjuice", is_cuda=True)

for position in tqdm.trange(len(test)):
    # Source sentence plus the metadata that is carried through unchanged.
    record = test.iloc[position]
    sentence = record['text']
    wanted_target = record['target']

    # A single call produces the perturbations. On its first invocation the
    # wrapper loads the models it needs (e.g. the generator and the
    # perplexity filter), so the first iteration is noticeably slower.
    generated = pj.perturb(sentence, num_beams=4, num_perturbations=5)

    entry = {
        'text': record['text'],
        'label': record['label'],
        'counterfactuals': generated,
        'target': wanted_target,
    }
    results.append(entry)

In [8]:
def np_encoder(object):
    """``default`` hook for ``json.dumps`` that converts NumPy values.

    ``json`` calls this hook for every object it cannot serialise natively.

    Args:
        object: The value ``json`` could not encode. (The name shadows the
            builtin; it is kept so existing keyword callers keep working.)

    Returns:
        The plain-Python equivalent: ``.item()`` for NumPy scalars and
        ``.tolist()`` for NumPy arrays.

    Raises:
        TypeError: If ``object`` is not a NumPy scalar or array. Without this
            the hook would return ``None`` and the value would be written to
            the JSON output as ``null`` without any warning.
    """
    if isinstance(object, np.generic):
        return object.item()
    if isinstance(object, np.ndarray):
        return object.tolist()
    raise TypeError(
        f"Object of type {type(object).__name__} is not JSON serializable"
    )

# Persist the raw perturbation results; NumPy scalars go through np_encoder.
with open('results_diff.json', 'w') as file:
    json.dump(results, file, default=np_encoder, indent=4)

In [11]:
# Build the classification pipeline for the model named in `classifier_fomc`;
# `classify` below calls it.
# NOTE(review): device=0 presumably pins the model to the first CUDA device,
# so this cell needs a GPU — confirm.
pipe = transformers.pipeline(model=classifier_fomc, device=0)

In [12]:
def classify(sentences):
    """Run the module-level ``pipe`` over ``sentences`` and normalise labels.

    Each prediction's ``'label'`` is expected to look like ``<prefix>_<id>``;
    the part after the first underscore is converted to ``int`` and written
    back into the same prediction dict.

    Args:
        sentences: Whatever ``pipe`` accepts as input (the notebook passes
            both a single string and a list of strings).

    Returns:
        list: The prediction dicts yielded by ``pipe``, in order, each with
        an integer ``'label'``.
    """
    predictions = list(pipe(sentences, batch_size=8, truncation="only_first"))
    for prediction in predictions:
        class_id = prediction['label'].split("_")[1]
        prediction['label'] = int(class_id)
    return predictions

In [13]:
def classify_results(results):
    """Attach classifier output to every result and its counterfactuals.

    For each entry this sets ``'classification'`` to ``classify(entry['text'])``
    and replaces ``'counterfactuals'`` with a list of
    ``{'text', 'label', 'score'}`` dicts sorted by descending score.

    Only the outer list is copied. The entry dicts are shared with the
    caller, so the caller's entries are updated in place as well. Because of
    that, the function accepts counterfactuals that are already in the
    classified dict form (their ``'text'`` is re-classified), which makes
    calling it twice on the same list safe.

    Args:
        results: List of dicts with at least ``'text'`` and
            ``'counterfactuals'`` (a list of strings, or of dicts produced by
            a previous call).

    Returns:
        list: A new list holding the same, now updated, entry dicts.
    """
    results = results.copy()
    for result in results:
        result['classification'] = classify(result['text'])

        # Recover the plain sentence from entries classified by an earlier
        # call; otherwise a re-run would pass dicts to the classifier and
        # nest them under 'text'.
        ce_texts = [
            ce['text'] if isinstance(ce, dict) else ce
            for ce in result['counterfactuals']
        ]

        # Skip the classifier call when no counterfactual was generated.
        ces_classified = classify(ce_texts) if ce_texts else []

        new_ces = [
            {'text': x, 'label': y['label'], 'score': y['score']}
            for x, y in zip(ce_texts, ces_classified)
        ]
        new_ces = sorted(new_ces, key=lambda x: x['score'], reverse=True)
        result['counterfactuals'] = new_ces
    return results

In [14]:
# Classify every sentence and its counterfactuals.
# classify_results copies only the outer list, so the dicts inside `results`
# are updated in place: after this cell `results` and `cl_results` hold the
# same entry objects.
cl_results = classify_results(results)

In [15]:
# Persist the classified results; NumPy scalars go through np_encoder.
with open('results_classified_diff.json', 'w') as file:
    json.dump(cl_results, file, default=np_encoder, indent=4)

In [33]:
def _load_json(path):
    """Return the parsed contents of the JSON file at ``path``."""
    with open(path, 'r') as handle:
        return json.load(handle)


# Unclassified results from the two previously saved files, concatenated.
old_results = _load_json('results.json')
old_results_1 = _load_json('results_1.json')
old_results = old_results + old_results_1

# Classified results from the two previously saved files, concatenated.
old_results_classified = _load_json('results_classified.json')
old_results_classified_1 = _load_json('results_classified_1.json')
old_results_classified = old_results_classified + old_results_classified_1

In [None]:
# Replace entries of `old_results_classified` with the freshly classified
# entry that has the same source sentence.
#
# The lookup reads `cl_results` (the classified output) instead of `results`,
# so the merge no longer depends on classify_results having mutated `results`
# in place. Indexing by text replaces the nested scan over both lists.
fresh_by_text = {}
for result in cl_results:
    fresh_by_text.setdefault(result['text'], []).append(result)

n_matches = 0
for i, old_result in enumerate(old_results_classified):
    matches = fresh_by_text.get(old_result['text'], [])
    if matches:
        # Every matching pair is counted; when several fresh entries share a
        # text, the last one wins.
        n_matches += len(matches)
        old_results_classified[i] = matches[-1]

# Number of (old, fresh) pairs with identical text.
n_matches

In [None]:
# Display the merged list for inspection.
# NOTE(review): this renders every entry; consider slicing (e.g. [:5]) to keep
# the saved notebook output small.
old_results_classified

In [39]:
# Persist the merged classified results; NumPy scalars go through np_encoder.
with open('results_new.json', 'w') as file:
    json.dump(old_results_classified, file, default=np_encoder, indent=4)