### Conversion utilities

This notebook contains conversion utilities for pre- and post-processing evaluation sets as needed.

In [1]:
# imports

import pickle
import pandas as pd

In [2]:
files = ["adjective_phrases_inconclusive",
         "adjective_phrases_negative",
         "adjective_phrases_positive",
         "comparisons_inconclusive",
         "noun_phrases_positive",
         "possessives_positive",
         "verb_objects_inconclusive",
         "verb_objects_negative",
         "verb_objects_positive",
         "verb_subjects_inconclusive",
         "verb_subjects_negative",
         "verb_subjects_positive"
        ]

files_retest = ["noun_phrases_positive-retest"]


#### Convert initial .txt files to .csv

The input txt files were manually compiled and annotated.
The output csv files are used as input for the AnthroScore evaluation in experiment 1, and for creating the AtypicalAnimacy flavored evaluation sets (+ .pkl) in experiment 2.

In [3]:
import pandas as pd

for file in files:
    column_names = ['id', 'sentence', 'AI phrase', 'mask', 'AI entity', 'anthro component', 'score']
    df = pd.read_csv(f'../data/evaluation_sentences_txt/{file}.txt', sep='\t', header=None, names=column_names,index_col=False)
    df.to_csv(f'../data/evaluation_sentences_csv/{file}.csv', index=False)  # comma is the default delimiter

In [5]:
import pandas as pd

column_names = ['id', 'sentence', 'AI phrase', 'mask', 'AI entity', 'anthro component', 'score']
df = pd.read_csv('../experiment_2/noun_phrases_positive-retest.txt', sep='\t', header=None, names=column_names,index_col=False)
df.to_csv('../experiment_2/noun_phrases_positive-retest.csv', index=False)  # comma is the default delimiter

#### Check if Pia and Jelke's IAA sentences exist in the evaluation sets after revision

In [6]:
import csv
import re

def normalized(string):
    return re.sub(r'\s+', ' ', string.strip())
    

def get_ids(filename):
    
    with open(f"../data/evaluation_sentences_csv/{filename}.csv","r") as infile:

        ids = []

        header = infile.readline()
        reader = csv.reader(infile)
        
        for row in reader:
            sentence_id = normalized(row[0])
            ids.append(sentence_id)

        return ids


all_ids = []
for file in files:
    ids = get_ids(file)
    all_ids.extend(ids)

raters = ['Pia','Jelke']

def check_ids(rater,ids):

    with open(f"../data/IAA/IAA_evaluation_set_{rater}.csv","r") as infile:

        row_counter = 0
        id_counter = 0

        header = infile.readline()
        reader = csv.reader(infile)
        
        for row in reader:
            row_counter += 1
            sentence_id = normalized(row[0])
            if sentence_id in ids:
                id_counter += 1
            if sentence_id not in ids:
                print(f"did not find {sentence_id} in ids")


    print(f"found {id_counter} ids out of {row_counter} sentences in {rater}'s evaluation set")
                

for rater in raters:
    check_ids(rater,all_ids)

found 42 ids out of 42 sentences in Pia's evaluation set
found 42 ids out of 42 sentences in Jelke's evaluation set


#### Convert .csv files to .txt for further revision as necessary

In [7]:
for file in files:
    df = pd.read_csv(f'../experiment_2/anthroscore/predictions/csv/{file}.csv')  # comma is default
    df.to_csv(
        f'../experiment_2/anthroscore/predictions/txt/{file}.txt', # assumes /txt dir exists
        sep='\t',
        index=False,
        header=False
    )

OSError: Cannot save file into a non-existent directory: '../experiment_2/anthroscore/predictions/txt'

#### Convert processed .txt files back to .csv

The function below allows for converting txt files to csv and dropping specified columns.

In [3]:
import pandas as pd

def txt_to_csv(input_file,output_file,column_names,columns_to_drop):

    df = pd.read_csv(input_file, sep='\t', header=None, names=column_names)

    df = df.drop(columns=columns_to_drop)

    df.to_csv(output_file, index=False)
    print(f"Saved to {output_file}")

AA_column_names = ['id','previous_sentence','current_sentence','masked_sentence','next_sentence','AI_phrase',
                                'masked_phrase','AI_entity','component','target_expression','expectations','target_sentence',
                                 'predicted','scores','predicted_tokens','predictions']
AA_columns_to_drop = ['target_expression', 'target_sentence', 'predicted', 'scores', 'predicted_tokens']

anthroscore_column_names = ['id','sentence','masked_sentence','AI_phrase','mask','AI_entity',
                            'component','original_term','original_noun','expectations','predictions']
anthroscore_columns_to_drop = ['original_term','original_noun']

for file in files:
    input_file_path = f"../experiment_1/anthroscore/predictions/txt/{file}.txt"
    output_file_path = f"../experiment_1/anthroscore/predictions/csv/{file}.csv"
    txt_to_csv(input_file_path,output_file_path,anthroscore_column_names,anthroscore_columns_to_drop)

Saved to ../experiment_1/anthroscore/predictions/csv/adjective_phrases_inconclusive.csv
Saved to ../experiment_1/anthroscore/predictions/csv/adjective_phrases_negative.csv
Saved to ../experiment_1/anthroscore/predictions/csv/adjective_phrases_positive.csv
Saved to ../experiment_1/anthroscore/predictions/csv/comparisons_inconclusive.csv
Saved to ../experiment_1/anthroscore/predictions/csv/noun_phrases_positive.csv
Saved to ../experiment_1/anthroscore/predictions/csv/possessives_positive.csv
Saved to ../experiment_1/anthroscore/predictions/csv/verb_objects_inconclusive.csv
Saved to ../experiment_1/anthroscore/predictions/csv/verb_objects_negative.csv
Saved to ../experiment_1/anthroscore/predictions/csv/verb_objects_positive.csv
Saved to ../experiment_1/anthroscore/predictions/csv/verb_subjects_inconclusive.csv
Saved to ../experiment_1/anthroscore/predictions/csv/verb_subjects_negative.csv
Saved to ../experiment_1/anthroscore/predictions/csv/verb_subjects_positive.csv


#### Convert .csv files to .pkl (for the AtypicalAnimacy evaluation)

In [8]:
def csv_to_AA_pkl(csv_path,pkl_path,rename_map):
    """
    This function takes a .csv file, renames columns and saves as a .pkl file.
    
      :param csv_path: csv_path (str): Path to input CSV.
      :type csv_path: string
      :param pkl_path: pkl_path (str): Path for output PKL.
      :type pkl_path: string
      :param rename_map: dictionary that maps old column names to new column names
      :type rename_map: dict
    """
    df = pd.read_csv(csv_path)
    df = df.rename(columns=rename_map)

    df.to_pickle(pkl_path, compression=None)
    print(f"Saved to {pkl_path}")

column_map = {'id':'Id',
              'Previous Sentence':'prevSentence',
              'Current Sentence':'currentSentence',
              'Masked Sentence':'maskedSentence',
              'Next Sentence':'nextSentence',
              'AI Phrase':'AIPhrase',
              'Suggested Mask':'suggestedMask',
              'AI Entity':'AIEntity',
              'Anthropomorphic Component':'anthroComponent',
              'Target Expression':'targetExpression',
              'Animated':'animated'
             }

for filename in files_retest:    
    csv_path = f"../experiment_2/AtypicalAnimacy/expectations/csv/{filename}.csv"
    pkl_path = f"../experiment_2/AtypicalAnimacy/expectations/pkl/{filename}.pkl"
    csv_to_AA_pkl(csv_path,pkl_path,column_map)

Saved to ../experiment_2/AtypicalAnimacy/expectations/pkl/noun_phrases_positive-retest.pkl
