# Create annotation data
This notebook reads in the original data file that was provided by the client. It then selects the English customer feedback and creates the data files for annotation. These annotation files include:
* a .csv file containing just 50 sentences to check the agreement between annotators on a smaller batch
* three .csv files containing around 1000 different sentences for each annotator (the sentences differ between annotators in order to create a bigger dataset for the task)


*note: at this stage the output is hidden because it shows private data.*

In [1]:
import pandas as pd
import spacy

In [None]:
# Load the complete dataset provided by the client (semicolon-separated, UTF-8 encoded .csv).
df = pd.read_csv(
    'path/to/airlineDataset.csv',
    sep=';',
    encoding='utf-8',
)
# Preview the first rows; the rendered output is hidden in the shared notebook (private data).
df.head()

In [None]:
# Keep only the rows whose 'QST_language' value is exactly 'EN'.
is_english = df['QST_language'].eq('EN')
en_df = df.loc[is_english]
en_df.head()

In [4]:
# Split every English feedback text into sentences.
# The pipeline is loaded without the parser and the "senter" pipe is enabled,
# so that `doc.sents` is available for sentence segmentation.
nlp = spacy.load("en_core_web_sm", exclude=["parser"])
nlp.enable_pipe("senter")

# Three parallel lists, one entry per output row:
ids = []        # feedback id ('' on separator rows)
feedbacks = []  # sentence text ('' on separator rows)
counter = []    # running sentence id ('' on separator rows)
n = 0           # number of sentences collected so far

for feedback_id, raw_text in zip(en_df['QST_identifier'], en_df['Open_Answer']):
    # NOTE(review): str() presumably guards against non-string values such as NaN;
    # if so, a missing answer becomes the literal sentence "nan" — confirm.
    for sentence in nlp(str(raw_text)).sents:
        sentence_text = sentence.text
        if len(sentence_text) <= 1:
            # drop fragments of at most one character
            continue
        n += 1
        ids.append(feedback_id)
        feedbacks.append(sentence_text)
        counter.append(n)

    # An empty row separates consecutive feedbacks in the generated annotation file.
    ids.append('')
    feedbacks.append('')
    counter.append('')

    # The original dataset is very large, so stop once more than 13000 rows exist.
    # The check runs after a whole feedback and counts separator rows too,
    # so the final length can end slightly above 13000.
    if len(feedbacks) > 13000:
        break

print(len(ids), len(feedbacks), len(counter))

13003 13003 13003


In [None]:
# Print (sentence id, feedback id, sentence) row by row, stopping right after
# the row whose sentence id is 54. Separator rows carry '' and never match.
for row in zip(counter, ids, feedbacks):
    print(*row)
    if row[0] == 54:
        break

## Create and save the dataset for the calculation of the IAA with 50 sentences.

In [6]:
# create a data file with the first 50 sentences
d = {'Sentence_ID' : counter[:68], 'Feedback_ID' : ids[:68], 'Sentence': feedbacks[:68], 'Aspect_Category' : '', 'Sentiment' : '', 'Aspect_Term' : ''}
an_data = pd.DataFrame(data=d)

In [None]:
# display the agreement-check batch built from the first 68 rows
an_data

In [None]:
# Save the agreement-check batch as a semicolon-separated .csv file, without the index column.
an_data.to_csv(
    'path/to/fifty_sents.csv',
    sep=';',
    index=False,
)

## Create a .csv file for each annotator with around 1000 sentences to be annotated

*note: when extracting the sentences, we use a row count higher than 1000 because the empty separator lines are also included.*

In [None]:
# Rows 69..1375 (1307 rows, blank separator rows included) go to Annotator 1.
# NOTE(review): the slice starts at 69, so row 68 lands in no output file —
# presumably it is a blank separator row; confirm.
annotator1_rows = slice(69, 1376)
d1 = {
    'Sentence_ID': counter[annotator1_rows],
    'Feedback_ID': ids[annotator1_rows],
    'Sentence': feedbacks[annotator1_rows],
    # annotation columns start out empty (the scalar '' is broadcast to every row)
    'Aspect_Category': '',
    'Sentiment': '',
    'Aspect_Term': '',
}
an_data1 = pd.DataFrame(data=d1)
an_data1

In [10]:
# Dataset for Annotator 1 (around 1000 sentences), saved as a semicolon-separated .csv without the index.
an_data1.to_csv(
    'path/to/annotator1.csv',
    sep=';',
    index=False,
)

In [None]:
# Same procedure for the second annotator: rows 1377..2676 (1300 rows, separators included).
# NOTE(review): the slice starts at 1377, so row 1376 lands in no output file —
# presumably it is a blank separator row; confirm.
annotator2_rows = slice(1377, 2677)
d2 = {
    'Sentence_ID': counter[annotator2_rows],
    'Feedback_ID': ids[annotator2_rows],
    'Sentence': feedbacks[annotator2_rows],
    # annotation columns start out empty (the scalar '' is broadcast to every row)
    'Aspect_Category': '',
    'Sentiment': '',
    'Aspect_Term': '',
}
an_data2 = pd.DataFrame(data=d2)
an_data2

In [12]:
# Dataset for Annotator 2, saved as a semicolon-separated .csv without the index.
an_data2.to_csv(
    'path/to/annotator2.csv',
    sep=';',
    index=False,
)

In [None]:
# Rows 2678..3984 (1307 rows, blank separator rows included) go to Annotator 3.
# NOTE(review): the slice starts at 2678, so row 2677 lands in no output file —
# presumably it is a blank separator row; confirm.
annotator3_rows = slice(2678, 3985)
d3 = {
    'Sentence_ID': counter[annotator3_rows],
    'Feedback_ID': ids[annotator3_rows],
    'Sentence': feedbacks[annotator3_rows],
    # annotation columns start out empty (the scalar '' is broadcast to every row)
    'Aspect_Category': '',
    'Sentiment': '',
    'Aspect_Term': '',
}
an_data3 = pd.DataFrame(data=d3)
an_data3

In [14]:
# Dataset for Annotator 3, saved as a semicolon-separated .csv without the index.
an_data3.to_csv(
    'path/to/annotator3.csv',
    sep=';',
    index=False,
)

# End of the notebook.