### Calculating spans and converting to evaluation format

In [1]:
def calc_span(paragraph, sentence):
    """Locate a predicted sentence inside ``paragraph``.

    Args:
        paragraph: The text to search in.
        sentence: Either a dict holding the sentence under the key ``'text'``
            or the sentence itself as a string.

    Returns:
        ``[start, end]`` character offsets (``end`` exclusive) of the first
        occurrence of the sentence in ``paragraph``. ``[0, 0]`` is returned
        when the sentence is missing, is not a string, is not found, or
        ``sentence`` has an unsupported type.
    """
    if isinstance(sentence, dict):
        text = sentence.get('text')
    elif isinstance(sentence, str):
        # Bind the searched text explicitly: this branch must not depend on
        # a variable named ``text`` existing outside the function.
        text = sentence
    else:
        return [0, 0]

    # A missing 'text' key (None) or any non-string value cannot be located.
    if not isinstance(text, str):
        return [0, 0]

    # Search once; find() returns -1 when the text does not occur.
    start = paragraph.find(text)
    if start == -1:
        return [0, 0]
    return [start, start + len(text)]

                 

In [91]:
def convert_to_eval_format(paragraph, prediction):
    """Wrap a single prediction in the evaluation structure.

    Args:
        paragraph: The text the prediction refers to; used as the outer key.
        prediction: Prediction passed to ``calc_span``; its ``'class'`` entry
            becomes the inner key.

    Returns:
        ``{paragraph: {prediction['class']: span}}`` when ``calc_span`` yields
        a list, otherwise the string ``"invalid"``.
    """
    located = calc_span(paragraph, prediction)
    # Guard clause: anything other than a list span is reported as invalid.
    if not isinstance(located, list):
        return "invalid"
    label = prediction['class']
    return {paragraph: {label: located}}

In [2]:
def convert_list_to_eval_format(paragraph, predictions):
    """Wrap several predictions for one paragraph in the evaluation structure.

    Args:
        paragraph: The text the predictions refer to; used as the outer key.
        predictions: Iterable of predictions, each passed to ``calc_span`` and
            indexed with ``'class'``.

    Returns:
        ``{paragraph: [{class: span}, ...]}`` with one single-entry dict per
        prediction, in input order.
    """
    def _labelled_span(item):
        # Compute the span first, then look up the label.
        located = calc_span(paragraph, item)
        return {item['class']: located}

    return {paragraph: list(map(_labelled_span, predictions))}

In [8]:
# Install pandas (imported as `pd` further down in this notebook).
%pip install pandas

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [33]:
import json

# Load the gold-standard dataset: one JSON object per line (JSON Lines).
# The encoding is fixed to UTF-8 so the result does not depend on the
# platform's default codec, and blank lines (e.g. a trailing newline at the
# end of the file) are skipped instead of crashing json.loads.
with open('../Data/gold_standard_dataset.jsonl', 'r', encoding='utf8') as file:
    data = [json.loads(line) for line in file if line.strip()]

In [57]:
# Quick check on one record: take the 'text' field of the second entry in `data`.
text = data[1]['text']
# example prediction from LLM
pred = {"text": "That leads to me believe that most cat lovers are really shy.", "class": "hasty generalization"}
# Character offsets of the predicted sentence within the text.
print(calc_span(text, pred))
# The same prediction wrapped as {text: {class: span}}.
print(convert_to_eval_format(text, pred))

[84, 145]
{'Two of my best friends are really introverted, shy people, and they both have cats. That leads to me believe that most cat lovers are really shy.\n': {'hasty generalization': [84, 145]}}


-------------------------------------------------
### Formatting

In [3]:
# Imports
import pandas as pd
import json
import re

In [4]:
# Read file
# Load the results CSV into a DataFrame; later cells read its 'output' column
# and write a 'json_string' column.
input_filename = 'results_mistral_mc.csv'
df = pd.read_csv(input_filename)

In [5]:
def extract_json_string(model_output):
    """
    Extracts valid JSON strings from the text within square brackets [].
    Only includes JSON objects that have a non-empty 'class' field.

    Args:
        model_output: Raw text to scan for ``[...]`` chunks.

    Returns:
        A JSON string encoding the list of kept objects, or ``"[]"`` when
        nothing usable is found or ``model_output`` cannot be processed
        (for example, when it is not a string).
    """
    try:
        # Find all substrings within square brackets. The match is non-greedy,
        # so a ']' inside the text ends the chunk early; such chunks fail to
        # parse and are reported below.
        json_strings = re.findall(r'\[.*?\]', model_output, re.DOTALL)

        valid_json_objects = []

        for json_str in json_strings:
            try:
                # Well-formed JSON is parsed untouched, so legitimately
                # escaped quotes inside string values survive.
                json_objects = json.loads(json_str)
            except json.JSONDecodeError:
                # Fallback: strip backslashes in front of quotes, for output
                # that arrived with every quote escaped.
                cleaned = json_str.replace('\\"', '\"').replace("\\'", "'")
                try:
                    json_objects = json.loads(cleaned)
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON: {e} in string: {cleaned}")
                    continue

            for obj in json_objects:
                # Skip non-dict items (numbers, strings, nested lists) rather
                # than letting .get() raise and discard everything collected.
                if isinstance(obj, dict) and obj.get("class"):
                    valid_json_objects.append(obj)

        # Convert the list of valid JSON objects back to a JSON string
        if valid_json_objects:
            return json.dumps(valid_json_objects)

        return "[]"
    except Exception as e:
        # Best-effort by design: this runs per row, so one bad row must not
        # abort the whole run.
        print(f"Error extracting JSON: {e}")
        return "[]"


Error decoding JSON: Unterminated string starting at: line 1 column 11 (char 10) in string: [{"text": "It's a mistake. [for]
Error decoding JSON: Unterminated string starting at: line 1 column 11 (char 10) in string: [{"text": "It's a mistake. [for]
Error decoding JSON: Expecting ',' delimiter: line 1 column 37 (char 36) in string: [{"text": "We don't need to do the "slippery slope" thing either.", "class": "Slippery Slope"}]
Error decoding JSON: Expecting ',' delimiter: line 2 column 62 (char 169) in string: [{"text": "There are many of us and we can't all be wrong.", "class": "Appeal to Popularity (Ad Populum)"},
 {"text": "You are doing a dis-service to your country being"politically correct" in regard to Islam and your children will suffer because of it.", "class": "Guilt by Association"}]
Error decoding JSON: Expecting ',' delimiter: line 2 column 239 (char 349) in string: [{"text": "So he's saying liberals are better educated than conservatives. We agree.", "class": "No fallacy"}

In [None]:
# Run extract_json_string on every value of the 'output' column and store the
# resulting JSON strings in the 'json_string' column.
df['json_string'] = df['output'].apply(extract_json_string)

In [114]:
def extract_fallacy_class(model_output):
    """
    Extracts fallacy classes from the model_output string.

    Every ``{...}`` group on a single line is treated as one class name.
    Names are trimmed of surrounding whitespace; empty names and
    "no fallacy" (in any letter case) are dropped.

    Args:
        model_output: Raw text to scan for ``{...}`` groups.

    Returns:
        A JSON string encoding a list of ``{"class": name}`` objects, or
        ``"[]"`` when no class remains or ``model_output`` cannot be
        processed (for example, when it is not a string).
    """
    try:
        # Find all substrings within curly braces {}
        matches = re.findall(r'\{(.*?)\}', model_output)

        valid_classes = []
        for match in matches:
            # Trim first so "{ No fallacy }" is filtered like "{No fallacy}"
            # and "{}" does not produce an empty class.
            cls = match.strip()
            if cls and cls.lower() != "no fallacy":
                valid_classes.append(cls)

        # Create a list of dictionaries with the extracted classes
        if valid_classes:
            return json.dumps([{"class": cls} for cls in valid_classes])

        return "[]"
    except Exception as e:
        # Best-effort by design: this runs per row, so one bad row must not
        # abort the whole run.
        print(f"Error extracting fallacy class: {e}")
        return "[]"

In [115]:
# Run extract_fallacy_class on every value of the 'output' column and store the
# resulting JSON strings in the 'json_string' column.
# NOTE(review): the extract_json_string cell assigns to this same column, so
# whichever of the two cells ran last determines its contents — confirm which
# one is intended for this results file.
df['json_string'] = df['output'].apply(extract_fallacy_class)

In [6]:
# Write the DataFrame to CSV without the index column; the "Convert .csv to
# eval .json" section reads a file with this same name.
df.to_csv('results_mistral_mc_eval.csv', index=False)

## Convert .csv to eval .json

In [7]:
#convert csv results to eval json
import csv

# Read CSV file
file_path = "results_mistral_mc_eval.csv"

# Use a context manager so the file handle is closed as soon as all rows are
# in memory. newline='' lets the csv module handle line endings itself.
with open(file_path, mode='r', newline='', encoding='utf8') as file:
    csv_reader = csv.reader(file, delimiter=',')

    # skip header (the None default tolerates a completely empty file)
    next(csv_reader, None)
    # Read the rest of the rows
    data = [row for row in csv_reader]

### Multi-label

In [92]:
import json

# Build the multi-label evaluation records, one per CSV row.
# Assumes column 0 holds the paragraph text and column 2 the JSON-encoded
# prediction list — TODO confirm against the CSV header.
input = [row[0] for row in data]
predictions = [json.loads(row[2]) for row in data]
eval_data = []
for paragraph, row_predictions in zip(input, predictions):
    formatted = convert_list_to_eval_format(paragraph, row_predictions)
    eval_data.append(formatted)

In [93]:
# Derive the output name from the CSV name and write all records as one
# indented JSON document.
file_name = file_path.replace('.csv', '.json')
with open(f'../Data/formatted_results/eval_{file_name}', 'w', encoding='utf8') as out:
    serialized = json.dumps(eval_data, indent=2)
    out.write(serialized)


### Multi-class

In [8]:
import json
# Build the multi-class evaluation records, one per CSV row.
# Assumes column 0 holds the paragraph text and column 2 the JSON-encoded
# prediction list — TODO confirm against the CSV header.
input = [row[0] for row in data]
predictions = [json.loads(row[2]) for row in data]
eval_data = []
for paragraph, prediction in zip(input, predictions):
    # Only the first predicted class is kept; rows with no prediction get 0.
    label = prediction[0]['class'] if prediction else 0
    eval_data.append({"text": paragraph, "class": label})


In [10]:
# Derive the output name from the CSV name and write one JSON object per
# line (each record followed by a newline).
file_name = file_path.replace('.csv', '.json')
with open(f'../Data/formatted_results/gkp_eval_{file_name}', 'w', encoding='utf8') as out:
    out.writelines(json.dumps(record) + '\n' for record in eval_data)