# Create overview of data

Goal:

* Create an overview table of all texts with the following info:
    * document title
    * document id (create unique id)
    * language
    * event type id
    * incident id
    * annotation status (manual, automatic, not)
* Sort and store files using identifiers instead of document titles
* Write conversion scripts from naf to:
    * conll
    * json (see what is convenient - look at hugging face input formats)
    
    
**Step 1**

Downloaded all repositories mentioned in our [data overview](https://docs.google.com/document/d/1uH1MawK5HVh1SkrD9qe-7A-hDfFAT-TYbkucIckPVGQ/edit?usp=sharing) (excluding Sam's Google drive updates) and stored in data/releases-and-repos. 


**Step 2**

Sorted all repositories so that data are stored in the same dir structure: 

* unstructured/[lang]/docs.naf
* unstructured/... (if present)

Copied data to releases-and-repos-sorted following the structure outlined above. Observations:

* typical_frames: no new data in it (downloaded from DFNDataReleases) --> skipped
* HDD_analysis: also downloaded from DFNDataReleases but some files found in test --> added
* dfn-data-cleaning: contains two products:
    * data-headlines --> integrated as dfn-data-cleaning-headlines-labeled
    * data-headlines-unlabeled --> integrated as dfn-data-cleaning-headlines-unlabeled



In [1]:
import os
from lxml import etree as et
import csv

In [2]:
%pwd

'/Users/piasommerauer/Code/DutchFrameNetData/scr'

lxml.etree.XMLSyntaxError

In [3]:
# collect the path of every entry in the sorted data folder

data_dir = '../data/releases-and-repos-sorted'

# entries whose name starts with '.DS' (e.g. macOS .DS_Store) are left out
all_paths = [
    f'{data_dir}/{entry}'
    for entry in os.listdir(data_dir)
    if not entry.startswith('.DS')
]

In [6]:
all_paths

['../data/releases-and-repos-sorted/v1',
 '../data/releases-and-repos-sorted/dfn-data-cleaning-headlines-unlabeled',
 '../data/releases-and-repos-sorted/v2.1',
 '../data/releases-and-repos-sorted/v1.1',
 '../data/releases-and-repos-sorted/DFNDataReleases',
 '../data/releases-and-repos-sorted/DFN_annotations',
 '../data/releases-and-repos-sorted/HDD_analysis',
 '../data/releases-and-repos-sorted/v1.2',
 '../data/releases-and-repos-sorted/dfn-data-cleaning-headlines-labeled']

In [8]:
# check if annotated

def get_annotation_status(path):
    """Summarise the SRL annotation state of a single NAF file.

    Args:
        path: Location of the NAF (XML) document to inspect.

    Returns:
        A tuple ``(annotation_mode, timestamp, ann_time_dict)``:

        * ``annotation_mode`` -- set holding the ``status`` attribute of
          every ``predicate`` found in the ``srl`` layer (``None`` for a
          predicate without that attribute), ``{'none'}`` when the file has
          no ``srl`` layer, or ``{'xml syntax error'}`` when the file cannot
          be parsed.
        * ``timestamp`` -- ``creationtime`` attribute of
          ``nafHeader/fileDesc`` (``None`` if the attribute is absent), or
          ``'-'`` when the file cannot be parsed or has no ``fileDesc``.
        * ``ann_time_dict`` -- maps the ``name`` of each ``lp`` entry of the
          first ``linguisticProcessors`` header with ``layer="srl"`` to its
          ``endTimestamp``; empty when there is no such header.
    """
    ann_time_dict = dict()

    # Only parsing can raise XMLSyntaxError, so nothing else sits in the try.
    try:
        tree = et.parse(path)
    except et.XMLSyntaxError:
        return {'xml syntax error'}, '-', ann_time_dict

    root = tree.getroot()

    # A header without fileDesc used to crash with AttributeError; fall back
    # to the same '-' placeholder that is used for unparsable files.
    file_desc = root.find('nafHeader/fileDesc')
    timestamp = '-' if file_desc is None else file_desc.get('creationtime')

    # get annotators and timestamps from the srl processing header
    srl_headers = [
        el for el in root.findall('nafHeader/linguisticProcessors')
        if el.get('layer') == 'srl'
    ]
    if srl_headers:
        for an in srl_headers[0].findall('lp'):
            ann_time_dict[an.get('name')] = an.get('endTimestamp')

    srl = root.find('srl')
    if srl is not None:
        annotation_mode = {pred.get('status') for pred in srl.findall('predicate')}
    else:
        annotation_mode = {'none'}

    return annotation_mode, timestamp, ann_time_dict

    
# Quick manual check of get_annotation_status on one entry of all_paths.
# NOTE(review): all_paths is filled with f'{data_dir}/{subdir}' entries, i.e.
# directory-level paths rather than individual .naf files -- confirm this is
# the intended input for the check.
test = all_paths[5]
get_annotation_status(test)    

({'xml syntax error'}, '-', {})

In [9]:
def get_most_recent_annotation(ann_time_dict):
    """Pick the annotator with the latest end timestamp.

    Args:
        ann_time_dict: Mapping of annotator name to end-timestamp string.
            A value may be ``None`` when no timestamp was recorded; such
            entries are ignored.

    Returns:
        A tuple ``(most_recent, latest_name)``: the greatest timestamp and
        the first annotator (in dict order) that carries it, or
        ``('-', '-')`` when the mapping holds no usable timestamp.
    """
    # Entries without a timestamp cannot be ranked, and mixing None with
    # strings inside max() would raise TypeError.
    dated = {name: ts for name, ts in ann_time_dict.items() if ts is not None}
    if not dated:
        return '-', '-'

    # Timestamps are compared as plain strings -- assumes they all share one
    # lexicographically sortable format; TODO confirm against the NAF files.
    # max() returns the first key among ties, matching dict order.
    latest_name = max(dated, key=dated.get)
    return dated[latest_name], latest_name

In [10]:
# get quick overview: one row (dict) per .naf file found under
# <release>/unstructured/<lang>/ for every release in all_paths

doc_names = []
full_paths = []
langs = ['en', 'nl']

for path in all_paths:
    path_l = path.split('/')
    # the last path component names the release / repository
    source = path_l[-1]
    for lang in langs:
        path_lang = f'{path}/unstructured/{lang}/'
        if not os.path.isdir(path_lang):
            continue
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # rows of the overview reproducible between runs
        for path_text in sorted(os.listdir(path_lang)):
            if not path_text.endswith('.naf'):
                continue
            full_path = f'{path_lang}{path_text}'
            full_paths.append(full_path)
            ann_mode, timestamp, name_time_dict = get_annotation_status(full_path)
            most_recent, latest_name = get_most_recent_annotation(name_time_dict)
            # NOTE(review): title (file name up to the first '.') is computed
            # but text_title below stores the full file name -- confirm which
            # of the two is wanted in the overview.
            title = path_text.split('.')[0].strip()
            d = {
                'release': source,
                'lang': lang,
                'text_title': path_text,
                # ann_mode is a set: sort it so mixed statuses always give
                # the same label, and stringify so a missing status (None)
                # does not break the join
                'annotation_mode': '-'.join(sorted(str(mode) for mode in ann_mode)),
                'most_recent_annotation': most_recent,
                'most_recent_annotator': latest_name,
                'annotators': ' '.join(name_time_dict.keys()),
                'creationtime': timestamp,
            }
            doc_names.append(d)

In [11]:
# write the overview rows to ../data/overview.csv, using the keys of the
# first row as column names (all rows are built with the same keys)
header = doc_names[0].keys()
print(header)
# newline='' lets the csv module control line endings itself (otherwise
# extra blank lines can appear on some platforms); utf-8 keeps non-ASCII
# file names intact regardless of the platform default encoding
with open('../data/overview.csv', 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=header, delimiter=',')
    writer.writeheader()
    writer.writerows(doc_names)

dict_keys(['release', 'lang', 'text_title', 'annotation_mode', 'most_recent_annotation', 'most_recent_annotator', 'annotators', 'creationtime'])
