## Golden dataset preprocessing

In [51]:
from typing import Any, List, Set, Tuple, Union, Dict
import pandas as pd
import json
import ast
import pprint

In [4]:
# Read the annotated gold-standard CSV and preview its first rows.
# NOTE(review): the path is absolute (looks like a Colab working directory);
# adjust it when running the notebook elsewhere.
gold_dataset_path = "/content/Gold_Standard_Dataset.csv"
gold_dataset = pd.read_csv(gold_dataset_path)
gold_dataset.head(n=3)

Unnamed: 0,text,labels,comments,sentences_with_labels
0,TITLE: Endless Ledge Skip Campaign for Alts PO...,"[[155, 588, 'slippery slope']]","['Slippery slope: P1 = poster, A = why not jus...","{""TITLE: Endless Ledge Skip Campaign for Alts ..."
1,"Two of my best friends are really introverted,...","[[84, 145, 'hasty generalization']]","[""Based on two people only, you can't draw gen...","{""Two of my best friends are really introverte..."
2,TITLE: There is a difference between a'smurf' ...,"[[118, 265, 'false analogy']]","['False Analogy: X: Having an alt , Y: smurfin...","{""TITLE: There is a difference between a'smurf..."


In [42]:
def convert_to_json(df:pd.DataFrame, add_fallacies:bool=True) -> List[Dict]:
    """Convert the annotated frame into a list of JSON-serialisable records.

    Each record is a single-key dict mapping a row's ``text`` to a list of
    fallacy annotations. Every annotation is itself a single-key dict of the
    form ``{fallacy_name: [start, end]}``, built from the
    ``[start, end, fallacy_name]`` entries stored in the ``labels`` column.

    Args:
        df: Frame with a ``text`` column and, when ``add_fallacies`` is true,
            a ``labels`` column. A ``labels`` cell may be the string repr of a
            list (as read from CSV), an already-parsed list/tuple, or a
            missing value (NaN/None), which is treated as "no annotations".
        add_fallacies: When true, attach the parsed annotations to each text;
            when false, map every text to an empty list.

    Returns:
        One ``{text: annotations}`` dict per row, in row order.

    Raises:
        ValueError: If a ``labels`` cell is neither a string, a list/tuple,
            nor a missing value. ``ast.literal_eval`` errors (``ValueError`` /
            ``SyntaxError``) for malformed label strings propagate unchanged.
    """
    if not add_fallacies:
        # Only the texts are needed here; the ``labels`` column is never read.
        return [{text: []} for text in df['text']]

    result = []
    # Iterate the two columns directly instead of ``iterrows()``, which builds
    # a Series per row and yields an index this function never used.
    for text, raw_labels in zip(df['text'], df['labels']):
        if isinstance(raw_labels, str):
            # literal_eval only evaluates Python literals, never arbitrary code.
            spans = ast.literal_eval(raw_labels)
        elif isinstance(raw_labels, (list, tuple)):
            spans = raw_labels  # already parsed upstream
        elif pd.isna(raw_labels):
            spans = []  # missing cell: no annotations for this text
        else:
            raise ValueError(
                f"Unsupported labels value for text {text!r}: {raw_labels!r}"
            )
        fallacies = [{span[2]: [span[0], span[1]]} for span in spans]
        result.append({text: fallacies})
    return result

In [56]:
# Build the records with their fallacy annotations and save them as JSON.
result_json = convert_to_json(gold_dataset, add_fallacies=True)
with open('golden_dataset.json', mode='w') as golden_file:
    golden_file.write(json.dumps(result_json))

In [59]:
# Spot-check the converted record at index 3.
result_json[3]

{"TITLE: Discussion Thread (Part 3): 2020 Presidential Race Democratic Debates - Post Debate | Night 2 POST: Joe Biden will lose to Trump if he is the nominee. The Democratic party clearly has not learned the right lesson from Hillary Clinton's miserable failure. NOBODY WANTS ESTABLISHMENT POLITICIANS ANYMORE. NOBODY LIKES THE STATUS QUO. Like Jesus Christ you think they would learn. POST: The status quo in America is that its the best its ever been. We live in one of the best societies in the best times that humans have ever experienced.\n": [{'guilt by association': [107,
    261]},
  {'causal oversimplification': [107, 338]},
  {'ad populum': [158, 338]},
  {'nothing': [158, 338]},
  {'circular reasoning': [391, 542]}]}

## Collect texts for prompting

In [60]:
# Build the records with empty annotation lists (texts only) and save them as JSON.
result_json = convert_to_json(gold_dataset, add_fallacies=False)
with open('test_dataset.json', mode='w') as test_file:
    test_file.write(json.dumps(result_json))

In [61]:
# Spot-check the converted record at index 3.
result_json[3]

{"TITLE: Discussion Thread (Part 3): 2020 Presidential Race Democratic Debates - Post Debate | Night 2 POST: Joe Biden will lose to Trump if he is the nominee. The Democratic party clearly has not learned the right lesson from Hillary Clinton's miserable failure. NOBODY WANTS ESTABLISHMENT POLITICIANS ANYMORE. NOBODY LIKES THE STATUS QUO. Like Jesus Christ you think they would learn. POST: The status quo in America is that its the best its ever been. We live in one of the best societies in the best times that humans have ever experienced.\n": []}