Vanilla evaluation for single-label classification: a prediction is counted as correct if the predicted label matches any of the gold (target) labels for that sample.

In [44]:
import json
from typing import Dict, List, Optional, Tuple

In [6]:
# Lower-case fallacy name -> integer class id.
# Ids are assigned from the position in the tuple below, starting at 1, so the
# order of the names is significant. Id 0 is never assigned by this map.
LABEL_MAP = {
    fallacy_name: class_id
    for class_id, fallacy_name in enumerate(
        (
            'abusive ad hominem',
            'ad populum',
            'appeal to false authority',
            'appeal to nature',
            'appeal to tradition',
            'guilt by association',
            'tu quoque',
            'causal oversimplification',
            'circular reasoning',
            'equivocation',
            'false analogy',
            'false causality',
            'false dilemma',
            'hasty generalization',
            'slippery slope',
            'straw man',
            'fallacy of division',
            'appeal to positive emotion',
            'appeal to anger',
            'appeal to fear',
            'appeal to pity',
            'appeal to ridicule',
            'appeal to worse problem',
        ),
        start=1,
    )
}

In [72]:
def _label_to_id(label, label_map) -> int:
    """Return the class id of *label* in *label_map*, or 0 if it is not a known name.

    Matching is case-insensitive and ignores surrounding whitespace. Anything
    that is not a string (e.g. a JSON ``null``) is treated as unknown.
    """
    if not isinstance(label, str):
        return 0
    return label_map.get(label.strip().lower(), 0)


def extract_fallacies(json_predictions_path: str,
                      json_golds_path: str,
                      label_map: Optional[Dict[str, int]] = None
                      ) -> Tuple[List[int], List[List[int]]]:
    """Read predicted and gold fallacy labels and convert them to class ids.

    Args:
        json_predictions_path: Path to the predictions file. It is parsed line
            by line, one JSON value per line. For every non-empty value, the
            first element is expected to be an object, and the second value of
            that object (in key order) is taken as the predicted label.
        json_golds_path: Path to the gold file: a single JSON array of objects.
            The first value of each object is a list of single-key objects
            whose key is the fallacy name.
        label_map: Mapping from lower-case fallacy name to class id. Defaults
            to the module-level ``LABEL_MAP``.

    Returns:
        A pair ``(predicted_fallacies, gold_fallacies)``: one class id per
        prediction, and one list of class ids per gold sample. Labels that are
        not in *label_map*, empty predictions and samples without gold
        annotations are encoded as 0 (``[0]`` for gold samples).
    """
    if label_map is None:
        label_map = LABEL_MAP

    with open(json_predictions_path, 'r', encoding='utf-8') as file:
        # Blank lines (typically a trailing newline) are not valid JSON and
        # would make json.loads raise, so they are skipped. A blank line in the
        # middle of the file therefore shortens the result instead of crashing.
        predictions = [json.loads(line) for line in file if line.strip()]

    predicted_fallacies = [
        _label_to_id(list(d[0].values())[1], label_map) if d else 0
        for d in predictions
    ]

    with open(json_golds_path, 'r', encoding='utf-8') as j:
        golds = json.load(j)

    gold_fallacies = []
    for d in golds:
        # Only the first value of the sample object is inspected.
        annotations = list(d.values())[0]
        if not annotations:
            gold_fallacies.append([0])
            continue
        gold_fallacies.append(
            [_label_to_id(next(iter(f), None), label_map) for f in annotations]
        )

    return predicted_fallacies, gold_fallacies

In [77]:
def calculate_precision_recall_f1(predictions: List[int],
                                  golden_labels: List[List[int]]
                                  ) -> Tuple[float, float, float]:
    """Compute, print and return precision, recall and F1 for single-label predictions.

    A prediction counts as a true positive when it appears anywhere in the
    sample's list of gold labels. Otherwise it counts as one false positive
    and every gold label of that sample counts as a false negative.

    Args:
        predictions: One predicted class id per sample.
        golden_labels: One list of acceptable class ids per sample, aligned
            with *predictions*.

    Returns:
        ``(precision, recall, f1_score)``. Each value is 0.0 when its
        denominator would be zero (e.g. for empty input).

    Raises:
        ValueError: If *predictions* and *golden_labels* differ in length.
    """
    if len(predictions) != len(golden_labels):
        # Raise instead of returning the message: a returned string is easy to
        # overlook and leaves the caller without any metrics.
        raise ValueError("predictions and golden labels are not of the same length")

    TP, FP, FN = 0, 0, 0

    for pred, gold in zip(predictions, golden_labels):
        if pred in gold:
            TP += 1
        else:
            FP += 1
            FN += len(gold)

    precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1_score:.4f}")

    return precision, recall, f1_score

## Run the evaluation

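Example structure of example_preds.json file. Note that `extract_fallacies` parses the predictions file line by line (one JSON value per line) and reads the label from the second value of the first element of each line, so the predictions file actually passed to it must follow that line-delimited layout rather than the single multi-line array shown here: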


```
[
    {"TITLE: There is a difference between a'smurf' and an'alt'. Please learn it and stop using them interchangeably. POST: Someone once told me they have an 'alt' cause their main account was too high of rank to play with their friends. It's exactly the same as smurfing.":[{"false analogy": [0,12]},{"Appeal TO Fear":[12,29]}]},
    {"America is the best place to live, because it's better than any other country.": [{"circular reasoning": [0,78]}]}
]
```

Example structure of example_golds.json file:


```
[
    {"TITLE: There is a difference between a'smurf' and an'alt'. Please learn it and stop using them interchangeably. POST: Someone once told me they have an 'alt' cause their main account was too high of rank to play with their friends. It's exactly the same as smurfing.":[{"appeal to fear":[12,29]}]},
    {"America is the best place to live, because it's better than any other country.": [{"Circular Reasoning": [0,78]}]}
]
```



In [75]:
# Load the predicted class ids and the per-sample gold class ids from disk.
# NOTE(review): both paths are hard-coded; adjust them for your environment.
predicted_fallacies, gold_fallacies = extract_fallacies('/content/llama3_sg.json', '/content/golden_dataset.json')

In [78]:
# Score the loaded predictions against the gold labels; precision, recall and
# F1 are printed by the function.
calculate_precision_recall_f1(predicted_fallacies, gold_fallacies)

Precision: 0.4750
Recall: 0.3480
F1-score: 0.4017
