# Data preparation

The following code is used to prepare the positive interpretations dataset for the experiment.

In [None]:
import csv
from pathlib import Path

import pandas as pd

import utils.data_preparation as utils

### 1. Read and convert CoNLL files

First, we read the CoNLL-2011 annotated files and do the following:

- get the original sentence identifiers (not per part, but per file)
- merge all data into one dataframe

In [None]:
# Load the dev, test and train portions of the gold CoNLL-2011 data
df_dev = utils.get_all_tokens_conll("../data/conll2011-gold/dev_gold_conll")
df_test = utils.get_all_tokens_conll("../data/conll2011-gold/test_gold_conll")
df_train = utils.get_all_tokens_conll("../data/conll2011-gold/train_gold_conll")

# Stack the three frames (dev, test, train) and select the columns in the
# order used by df_train, so the merged frame keeps the original column order
df_conll = pd.concat([df_dev, df_test, df_train])[df_train.columns]
df_conll.head()

In [None]:
# Cache the merged CoNLL tokens on disk.
# If the cache file is missing, it is created from the df_conll built in the
# previous cell, so a fresh "Restart & Run All" works without manual steps.
# The frame is then always (re)loaded from the file, so df_conll goes through
# the same TSV round trip whether or not the cache already existed.
conll_cache = Path("../data/conll2011-gold/all_gold_conll")
if not conll_cache.exists():
    df_conll.to_csv(conll_cache, sep="\t", index=False)
df_conll = pd.read_csv(conll_cache, sep="\t")

### 2. Convert annotations

Then, we convert the original annotations file for the positive interpretations to a format that is easier to work with and that is extended with some additional information:

- use tab as separator (instead of #)
- split the verb & role information into separate columns
- add original sentence identifiers of OntoNotes
- convert scores into classes (ternary, stored as `class_tertiary`, and binary, stored as `class_binary`)

In [None]:
# Load the '#'-separated annotation file; column names are supplied
# explicitly through `names`
ann_file = "../data/NAACL2016-Annotations/Annotations-SemanticRoles.csv"
columns = [
    "file_id", "part_id", "sent_id_part",
    "predicate", "verb", "role", "negation",
    "positive_interpretation", "label",
]
df = pd.read_csv(
    ann_file,
    sep="#",
    quoting=csv.QUOTE_NONE,  # no special treatment of quote characters
    names=columns,
    index_col=False,
)

# Discard rows whose label is "invalid" (a single instance, per the original note)
df = df.loc[df["label"] != "invalid"]

In [None]:
# Run the annotations through the three conversion helpers, in order.
# Going by the helper names and the section notes: original sentence ids are
# looked up using df_conll, the verb/role fields are split into separate
# columns, and the scores are mapped to classes.
df = (
    df
    .pipe(utils.find_original_sent_ids, df_conll)
    .pipe(utils.rewrite_verb_and_role_features)
    .pipe(utils.categorize_scores)
)

# Fixed column order for the output: identifiers first, then verb features,
# role features, and finally the label with its derived classes
columns = [
    'file_id', 'sent_id_file', 'part_id', 'sent_id_part',
    'predicate', 'negation', 'positive_interpretation',
    'verb_wf', 'verb_pos', 'verb_span', 'verb_label', 'verb_tokens',
    'role_head_wf', 'role_head_pos', 'role_span', 'role_label', 'role_tokens',
    'label', 'class_tertiary', 'class_binary',
]
df = df[columns]

df.head()

In [None]:
# Save the converted annotations as a tab-separated file, without the row index
tsv_file = "../data/NAACL2016-Annotations/Annotations-SemanticRoles.tsv"
df.to_csv(path_or_buf=tsv_file, index=False, sep="\t")

### 3. Split into train/test sets

Finally, we split the data into train and test sets. **Note:** the split is random, so every run generates a new train/test split and overwrites the files written below.

In [None]:
# Load the converted, tab-separated annotations from disk
tsv_file = "../data/NAACL2016-Annotations/Annotations-SemanticRoles.tsv"
df = pd.read_csv(filepath_or_buffer=tsv_file, sep="\t")

In [None]:
# Randomly split the annotations into train (80%) and test (20%).
# No seed is passed here, so each run produces a different split.
df_train, df_test = utils.split_train_test(df, test_ratio=0.2, to_shuffle=True)

# Tag every row with the split it belongs to. `assign` returns new frames
# instead of adding a column in place, which avoids SettingWithCopyWarning
# should the helper return slices of `df`.
df_train = df_train.assign(dataset="train")
df_test = df_test.assign(dataset="test")

# Combined frame: train rows first, then test rows
df_all = pd.concat([df_train, df_test])
print(len(df_train), len(df_test), len(df_all))

In [None]:
# Save the test, train and combined frames as tab-separated files (no row index)
for split_frame, out_path in (
    (df_test, "../data/test.tsv"),
    (df_train, "../data/train.tsv"),
    (df_all, "../data/all.tsv"),
):
    split_frame.to_csv(out_path, sep="\t", index=False)