# Create unique identifiers and remove duplicates


* Find file with the most annotations
* Assign unique id 
* Store in cleaned dataset


In [60]:
import csv
import os
from collections import defaultdict
import uuid

In [71]:
with open('../data/overview.csv') as infile:
    data = list(csv.DictReader(infile, delimiter = ','))

In [72]:
article_data = defaultdict(list)

for d in data:
    title = (d['text_title'], d['lang'])
    article_data[title].append(d)

In [73]:
print(len(data))
print(len(article_data))

37729
28642


In [74]:
for title, d in article_data.items():
    if len(d) > 1:
        break

In [30]:
title

('Ultra Music Festival.naf', 'en')

In [75]:
d

[{'release': 'v1',
  'lang': 'en',
  'text_title': 'Ultra Music Festival.naf',
  'annotation_mode': 'system',
  'most_recent_annotation': '2020-05-28T21:22:31UTC',
  'most_recent_annotator': 'open-sesame',
  'annotators': 'open-sesame',
  'creationtime': '2019-07-20T00:00:00UTC'},
 {'release': 'dfn-data-cleaning-headlines-unlabeled',
  'lang': 'en',
  'text_title': 'Ultra Music Festival.naf',
  'annotation_mode': 'system',
  'most_recent_annotation': '2021-01-08T09:25:43UTC',
  'most_recent_annotator': 'open-sesame',
  'annotators': 'open-sesame',
  'creationtime': '1-01-01T00:00:00UTC'},
 {'release': 'DFNDataReleases',
  'lang': 'en',
  'text_title': 'Ultra Music Festival.naf',
  'annotation_mode': 'system',
  'most_recent_annotation': '2021-01-08T09:25:43UTC',
  'most_recent_annotator': 'open-sesame',
  'annotators': 'open-sesame',
  'creationtime': '1-01-01T00:00:00UTC'},
 {'release': 'DFN_annotations',
  'lang': 'en',
  'text_title': 'Ultra Music Festival.naf',
  'annotation_mode':

In [76]:

def find_most_annotated(article_data):
    
    times = defaultdict(list)
    
    for n, d in enumerate(article_data):
        time = d['most_recent_annotation']
        # mode = d['annotation_mode']
        # release = d['release']
        times[time].append(d)
        
    sorted_times = sorted(times, reverse = True)

    most_recent = sorted_times[0]
    d_most_recent = times[most_recent]
    if len(d_most_recent) == 1:
        selected_most_recent = d_most_recent[0]
    else:
        for d in d_most_recent:
            release = d['release']
            mode = d['annotation_mode']
            if release.startswith('dfn-data-cleaning-headlines') and mode == 'manual':
                selected_most_recent = d
                break
            elif release.startswith('dfn-data-cleaning-headlines') and mode == 'system':
                selected_most_recent = d
                break
            else:
                selected_most_recent = None
        if selected_most_recent is None:
            selected_most_recent = d_most_recent[0]

    target_release = selected_most_recent['release']
    
    # extend with other releases
    
    other_releases = [d['release'] for d in article_data if d['release'] != target_release]
    selected_most_recent['other releases'] = ' '.join(other_releases)
    
    # add unique id
    uid = uuid.uuid4()
    selected_most_recent['unique_id'] = str(uid)
    return selected_most_recent
        

find_most_annotated(d)

{'release': 'dfn-data-cleaning-headlines-unlabeled',
 'lang': 'en',
 'text_title': 'Ultra Music Festival.naf',
 'annotation_mode': 'system',
 'most_recent_annotation': '2021-01-08T09:25:43UTC',
 'most_recent_annotator': 'open-sesame',
 'annotators': 'open-sesame',
 'creationtime': '1-01-01T00:00:00UTC',
 'other releases': 'v1 DFNDataReleases DFN_annotations',
 'unique_id': '5e0a916c-8f25-4058-af46-353edeed8710'}

In [77]:
# store in file

data_unique = []

for article_title, article_d in article_data.items():
    most_recent_d = find_most_annotated(article_d)
    data_unique.append(most_recent_d)

In [78]:
header = data_unique[0].keys()
with open('../data/overivew-unique.csv', 'w') as outfile:
    writer = csv.DictWriter(outfile, fieldnames = header, delimiter = ',')
    writer.writeheader()
    for d in data_unique:
        writer.writerow(d)

In [79]:
print(len(data_unique))

28642


In [83]:
# store original files in one directory

for d in data_unique:
    release = d['release']
    title = d['text_title']
    lang = d['lang']
    uid = d['unique_id']
    path = f'../data/releases-and-repos-sorted/{release}/unstructured/{lang}/{title}'
    new_uuid_path = f'../data/data-unique-ids/unstructured/{uid}.naf'
    
    with open(path) as infile:
    break

../data/releases-and-repos-sorted/v1/unstructured/en/1866 Swiss federal election.naf
