# Check coverage

* duplicates
* structured information
* text type (primary or secondary)
* copy broken files to a separate directory for manual clean-up
* extract language from file


In [1]:
import csv
import json
import requests
from requests import ConnectionError
from tqdm import tqdm

In [2]:
# Read the overview table into a list of row dicts (one dict per CSV row).
# newline='' is what the csv module expects of the file object; without it,
# newlines embedded in quoted fields are not interpreted correctly.
with open('../data/overview-unique-ids-structured.csv', newline = '') as infile:
    data = list(csv.DictReader(infile, delimiter = ','))

# Check all incidents and add missing incident ids

In [3]:
# Titles (with the last four characters, i.e. the file extension, cut off)
# of all rows whose incident id does not start with 'Q'.
texts_without_q = {
    row['text_title'][:-4]
    for row in data
    if not row['inc_q'].startswith('Q')
}

# Every row is either known or unknown, so one count determines the other.
inc_known = sum(1 for row in data if row['inc_q'].startswith('Q'))
inc_unknown = len(data) - inc_known

print(inc_known, inc_unknown)

24842 3800


In [4]:
def _read_json(path):
    """Return the parsed content of the JSON file at `path`."""
    with open(path) as infile:
        return json.load(infile)


# Incident -> document indices of the individual releases and repositories.
inc2doc_v1 = _read_json('../data/releases-and-repos-sorted/v1/structured/inc2doc_index.json')
inc2doc_dfn = _read_json('../data/releases-and-repos-sorted/DFNDataReleases/structured/inc2lang2doc_index.json')
inc2doc_annotations = _read_json('../data/releases-and-repos-sorted/DFN_annotations/structured/inc2lang2doc_index.json')
inc2doc_v11 = _read_json('../data/releases-and-repos-sorted/v1.1/structured/inc2str_index.json')

# Structured data of release v1 together with its reference texts.
struc_unstruc = _read_json('../data/releases-and-repos-sorted/v1/structured/structured_and_unstructured.json')
    
    

In [5]:
def text_inc_check(inc2doc, texts_without_q):
    """Find the incident id for texts that do not have one yet.

    Parameters
    ----------
    inc2doc : dict
        Index keyed by incident id. Each value is either a list of
        'lang/title' strings or a dict mapping a language to a list of titles.
    texts_without_q : collection of str
        Titles to look for in the index.

    Returns
    -------
    dict
        Maps every title from `texts_without_q` that occurs in `inc2doc` to
        the incident id it is listed under. If a title is listed under
        several incidents, the one iterated last wins.
    """
    text_q_inc_dict = dict()
    for q_inc, lang_texts in inc2doc.items():
        if isinstance(lang_texts, list):
            titles = []
            for lang_text in lang_texts:
                # Split on the first '/' only: a title may itself contain a
                # slash, and unpacking a plain split('/') would then raise.
                lang, sep, text = lang_text.partition('/')
                if not sep:
                    # No language prefix at all: treat the entry as the title.
                    text = lang_text
                titles.append(text)
        else:
            titles = [text for texts in lang_texts.values() for text in texts]

        for text in titles:
            if text in texts_without_q:
                text_q_inc_dict[text] = q_inc
    print('done')
    return text_q_inc_dict

In [6]:
# Look up the titles without an incident id in the v1 index.
text_inc = text_inc_check(inc2doc = inc2doc_v1, texts_without_q = texts_without_q)
print(len(text_inc))

done
0


In [7]:
# Look up the titles in the DFNDataReleases index and merge the hits into text_inc.
text_inc2 = text_inc_check(inc2doc = inc2doc_dfn, texts_without_q = texts_without_q)
text_inc.update(text_inc2)
print(len(text_inc))

done
0


In [8]:
# Look up the titles in the DFN_annotations index and merge the hits into text_inc.
text_inc3 = text_inc_check(inc2doc = inc2doc_annotations, texts_without_q = texts_without_q)
text_inc.update(text_inc3)
print(len(text_inc))

done
0


In [9]:
# Look up the titles in the v1.1 index and merge the hits into text_inc.
text_inc4 = text_inc_check(inc2doc = inc2doc_v11, texts_without_q = texts_without_q)
text_inc.update(text_inc4)
print(len(text_inc))

done
0


In [10]:
# Walk over every reference text of every incident in the v1 structured data
# and record, per title, the incident it belongs to and its URL.
title_url = dict()
for q_inc, ref_dict in struc_unstruc.items():
    for texts in ref_dict['reference_texts'].values():
        for text in texts:
            title = text['title']
            text_inc[title] = q_inc
            title_url[title] = text['url']

In [11]:
# Number of title -> incident id mappings collected in text_inc so far.
print(len(text_inc))

25961


In [12]:
# Write the collected incident ids back into the rows. The title is the
# 'text_title' value without its last four characters.
for row in data:
    title = row['text_title'][:-4]
    if title not in text_inc:
        continue
    row['inc_q'] = text_inc[title]

In [13]:
inc_known, inc_unknown = 0, 0
texts_without_q = set()

# Recount rows with / without an incident id and attach the URL of the
# text where one was collected ('-' marks a row without a known URL).
for row in data:
    title = row['text_title'][:-4]

    if not row['inc_q'].startswith('Q'):
        inc_unknown += 1
        texts_without_q.add(title)
    else:
        inc_known += 1

    row['wikipedia_url_text'] = title_url.get(title, '-')

print(inc_known, inc_unknown)

24842 3800


In [14]:
# Write the updated rows back to the overview file (this overwrites the input).
# The column order is taken from the first row.
header = data[0].keys()
# newline='' is required for files handed to csv writers; otherwise each row
# ends in '\r\r\n' on platforms that translate '\n' to '\r\n'.
with open('../data/overview-unique-ids-structured.csv', 'w', newline = '') as outfile:
    writer = csv.DictWriter(outfile, fieldnames = header, delimiter = ',')
    writer.writeheader()
    writer.writerows(data)

## Check all types and add missing type ids


In [15]:
data_dir = '../data/releases-and-repos-sorted'


def _load_index(path):
    """Return the parsed content of the JSON file at `path`."""
    with open(path) as infile:
        return json.load(infile)


# Structured data of v1 plus the type -> incident index of each source.
struc_unstruc = _load_index(f'{data_dir}/v1/structured/structured_and_unstructured.json')
type2inc_v1 = _load_index(f'{data_dir}/v1/structured/type2inc_index.json')
type2inc_dfn = _load_index(f'{data_dir}/DFNDataReleases/structured/type2inc_index.json')
type2inc_ann = _load_index(f'{data_dir}/DFN_annotations/structured/type2inc_index.json')

In [16]:
# Pairwise equality of the three type -> incident indices, one result per line.
print(
    type2inc_v1 == type2inc_dfn,
    type2inc_v1 == type2inc_ann,
    type2inc_dfn == type2inc_ann,
    sep = '\n',
)

False
False
False


In [17]:
# to fill 
inc_type_dict = dict()   # incident id -> type id
type_url_dict = dict()   # type id -> full event type URL

# First source: the event type stored with each incident in the v1 data.
# The type id is the last path segment of the event type URL.
for q_inc, ref_dict in struc_unstruc.items():
    type_url = ref_dict['event_type']
    q_type = type_url.split('/')[-1]
    inc_type_dict[q_inc] = q_type
    type_url_dict[q_type] = type_url

print(len(inc_type_dict))

# Then the type -> incidents indices, in this order; an index processed
# later overwrites the type an earlier source assigned to the same incident.
for type2inc in (type2inc_v1, type2inc_dfn, type2inc_ann):
    for q_type, incs in type2inc.items():
        inc_type_dict.update(dict.fromkeys(incs, q_type))
    print(len(inc_type_dict))

19979
20908
21213
21213


In [18]:
for row in data:
    inc_q = row['inc_q']

    # Take the type from the collected mapping when the incident is in it;
    # otherwise the row keeps the type it already has.
    row['type_q'] = inc_type_dict.get(inc_q, row['type_q'])

    # Prefer the URL seen in the structured data, else build the entity URL.
    type_q = row['type_q']
    row['type_url'] = type_url_dict.get(type_q, f'http://www.wikidata.org/entity/{type_q}')

    row['incident_url'] = f'http://www.wikidata.org/entity/{inc_q}'
            

In [19]:
# Write the rows, now including type ids and URLs, back to the overview file.
# The column order is taken from the first row.
header = data[0].keys()
# newline='' is required for files handed to csv writers; otherwise each row
# ends in '\r\r\n' on platforms that translate '\n' to '\r\n'.
with open('../data/overview-unique-ids-structured.csv', 'w', newline = '') as outfile:
    writer = csv.DictWriter(outfile, fieldnames = header, delimiter = ',')
    writer.writeheader()
    writer.writerows(data)

## Add human-readable type and incident information

In [20]:
# Print every column of the second row as "name value" for a quick look.
for column, value in data[1].items():
    print(column, value)

release v1
lang en
text_title 2000 Wokingham District Council election.naf
annotation_mode system
most_recent_annotation 2020-05-28T21:22:31UTC
most_recent_annotator open-sesame
annotators open-sesame
creationtime 2019-07-20T00:00:00UTC
other releases -
unique_id 84c5d927-de62-4c1f-b82a-28f0404fbfd8
type_q Q15966540
inc_q Q8029342
wikipedia_url_text https://en.wikipedia.org/wiki/2000_Wokingham_District_Council_election
type_url http://www.wikidata.org/entity/Q15966540
incident_url http://www.wikidata.org/entity/Q8029342
type_name local election
inc_name 2000 Wokingham District Council election
valid_naf TRUE


In [None]:
# http://www.wikidata.org/entity/Q15966540
# http://www.wikidata.org/entity/Q8029342


# http://www.wikidata.org/entity/Q15061650

In [81]:
# 
# URLs of incidents that have an id but whose name is still the '-' placeholder.
inc_urls = {
    row['incident_url']
    for row in data
    if row['inc_q'].startswith('Q') and row['inc_name'] == '-'
}

# URLs of all types that have an id.
type_urls = {
    row['type_url']
    for row in data
    if row['type_q'].startswith('Q')
}

print(len(type_urls), len(inc_urls))

35 1522


In [38]:
# type_names = dict()
# for type_url in tqdm(type_urls):
#     q = type_url.split('/')[-1]
#     r = requests.get(type_url)
#     status = r.status_code
#     if status == 200:
#         json  = r.json()['entities']
#         if q in json:
#             type_name_en = json[q]['labels']['en']['value']
#             type_names[q] = type_name_en
#         else:
#             print(q, 'not found')
#     else:
#         print(status)
#         print('too slow')

In [184]:
# `type_names` is only created by the Wikidata type lookup, which is currently
# commented out. Fall back to an empty mapping so this cell does not raise a
# NameError when the notebook is run on a fresh kernel.
try:
    type_names
except NameError:
    type_names = dict()

for d in data:
    type_q = d['type_q']
    # Use a freshly looked-up label when there is one; otherwise keep the name
    # already stored in the row instead of overwriting it with the '-'
    # placeholder (the incident names are handled the same way).
    d['type_name'] = type_names.get(type_q, d.get('type_name') or '-')

In [185]:
# Write the rows, now including type names, back to the overview file.
# The column order is taken from the first row.
header = data[0].keys()
# newline='' is required for files handed to csv writers; otherwise each row
# ends in '\r\r\n' on platforms that translate '\n' to '\r\n'.
with open('../data/overview-unique-ids-structured.csv', 'w', newline = '') as outfile:
    writer = csv.DictWriter(outfile, fieldnames = header, delimiter = ',')
    writer.writeheader()
    writer.writerows(data)

In [82]:
# Fetch the English label of every incident in inc_urls.
# Result: inc_names maps incident id -> English label.
inc_names = dict()
for inc_url in tqdm(inc_urls):
    # The incident id is the last path segment of the URL.
    q = inc_url.split('/')[-1]
    try:
        # The timeout keeps a single stalled request from blocking the run.
        r = requests.get(inc_url, timeout = 30)
    except requests.RequestException:
        # Base class of ConnectionError and of the timeout errors.
        print('connection error')
        continue

    status = r.status_code
    if status != 200:
        print(status)
        print('too slow')
        continue

    # Deliberately not named `json`: that would rebind the imported json
    # module and break every later json.load() call in this kernel.
    entities = r.json()['entities']
    if q not in entities:
        print(q, 'not found')
        continue

    # An entity without an English label must not abort the whole loop.
    label_en = entities[q].get('labels', {}).get('en')
    if label_en is None:
        print(q, 'no English label')
        continue

    inc_name_en = label_en['value']
    inc_names[q] = inc_name_en

  0%|                                          | 1/1522 [00:00<15:54,  1.59it/s]

404
too slow


  0%|                                          | 2/1522 [00:01<13:16,  1.91it/s]

Q48796779 not found


  0%|                                          | 3/1522 [00:01<12:09,  2.08it/s]

404
too slow


  0%|                                          | 4/1522 [00:02<13:45,  1.84it/s]

Q57526331 not found


  0%|▏                                         | 5/1522 [00:02<12:37,  2.00it/s]

404
too slow


  0%|▏                                         | 6/1522 [00:03<15:37,  1.62it/s]

Q4948963 not found


  0%|▏                                         | 7/1522 [00:04<15:43,  1.60it/s]

Q16971755 not found


  1%|▏                                         | 8/1522 [00:04<15:58,  1.58it/s]

404
too slow


  1%|▏                                         | 9/1522 [00:05<16:18,  1.55it/s]

404
too slow


  1%|▎                                        | 10/1522 [00:05<14:45,  1.71it/s]

Q25006560 not found


  1%|▎                                        | 11/1522 [00:06<13:47,  1.83it/s]

Q2624708 not found


  1%|▎                                        | 12/1522 [00:06<14:22,  1.75it/s]

Q4870812 not found


  1%|▎                                        | 13/1522 [00:07<14:35,  1.72it/s]

Q4948965 not found


  1%|▍                                        | 14/1522 [00:07<13:46,  1.82it/s]

Q18640034 not found


  1%|▍                                        | 15/1522 [00:08<13:48,  1.82it/s]

Q22704310 not found


  1%|▍                                        | 16/1522 [00:09<15:03,  1.67it/s]

404
too slow


  1%|▍                                        | 17/1522 [00:10<16:36,  1.51it/s]

404
too slow


  1%|▍                                        | 18/1522 [00:10<16:05,  1.56it/s]

404
too slow


  1%|▌                                        | 19/1522 [00:12<23:09,  1.08it/s]

404
too slow


  1%|▌                                        | 20/1522 [00:13<23:47,  1.05it/s]

404
too slow


  1%|▌                                        | 21/1522 [00:13<19:48,  1.26it/s]

404
too slow


  1%|▌                                        | 22/1522 [00:14<18:20,  1.36it/s]

404
too slow


  2%|▌                                        | 23/1522 [00:14<17:29,  1.43it/s]

Q16154083 not found


  2%|▋                                        | 24/1522 [00:15<20:22,  1.23it/s]

Q48449867 not found


  2%|▋                                        | 25/1522 [00:16<17:42,  1.41it/s]

Q4017350 not found


  2%|▋                                        | 26/1522 [00:17<17:12,  1.45it/s]

Q17515467 not found


  2%|▋                                        | 27/1522 [00:17<18:35,  1.34it/s]

404
too slow


  2%|▊                                        | 28/1522 [00:18<19:14,  1.29it/s]

404
too slow


  2%|▊                                        | 29/1522 [00:19<19:21,  1.29it/s]

404
too slow


  2%|▊                                        | 30/1522 [00:20<18:11,  1.37it/s]

404
too slow


  2%|▊                                        | 31/1522 [00:21<20:06,  1.24it/s]

404
too slow


  2%|▊                                        | 32/1522 [00:21<19:30,  1.27it/s]

404
too slow


  2%|▉                                        | 33/1522 [00:22<16:52,  1.47it/s]

Q4584442 not found


  2%|▉                                        | 34/1522 [00:22<15:07,  1.64it/s]

Q19576776 not found


  2%|▉                                        | 35/1522 [00:23<15:05,  1.64it/s]

404
too slow


  2%|▉                                        | 36/1522 [00:23<13:58,  1.77it/s]

Q18127182 not found


  2%|▉                                        | 37/1522 [00:24<12:56,  1.91it/s]

404
too slow


  2%|█                                        | 38/1522 [00:24<12:19,  2.01it/s]

Q15060533 not found


  3%|█                                        | 39/1522 [00:25<13:04,  1.89it/s]

404
too slow


  3%|█                                        | 40/1522 [00:25<12:19,  2.01it/s]

404
too slow


  3%|█                                        | 41/1522 [00:26<13:31,  1.82it/s]

Q4709225 not found


  3%|█▏                                       | 42/1522 [00:27<13:59,  1.76it/s]

404
too slow


  3%|█▏                                       | 43/1522 [00:27<12:52,  1.91it/s]

404
too slow


  3%|█▏                                       | 44/1522 [00:27<12:09,  2.03it/s]

404
too slow


  3%|█▏                                       | 45/1522 [00:28<13:17,  1.85it/s]

Q48836038 not found


  3%|█▏                                       | 46/1522 [00:29<14:29,  1.70it/s]

Q19962617 not found


  3%|█▎                                       | 47/1522 [00:30<16:17,  1.51it/s]

Q47487878 not found


  3%|█▎                                       | 48/1522 [00:30<14:30,  1.69it/s]

404
too slow


  3%|█▎                                       | 49/1522 [00:31<15:11,  1.62it/s]

Q18127153 not found


  3%|█▎                                       | 50/1522 [00:31<15:13,  1.61it/s]

Q48532728 not found


  3%|█▎                                       | 51/1522 [00:32<15:34,  1.57it/s]

Q62612551 not found


  3%|█▍                                       | 52/1522 [00:33<15:10,  1.61it/s]

404
too slow


  3%|█▍                                       | 53/1522 [00:34<22:50,  1.07it/s]

404
too slow


  8%|███▏                                    | 121/1522 [01:19<11:41,  2.00it/s]

Q55873777 not found


 18%|███████                                 | 270/1522 [02:53<18:45,  1.11it/s]

404
too slow


 18%|███████▏                                | 274/1522 [02:56<15:44,  1.32it/s]

404
too slow


 21%|████████▎                               | 318/1522 [03:29<15:50,  1.27it/s]

Q6826029 not found


 22%|████████▌                               | 328/1522 [03:36<13:24,  1.48it/s]

Q16243575 not found


 25%|█████████▊                              | 375/1522 [04:08<11:07,  1.72it/s]

404
too slow


 29%|███████████▌                            | 438/1522 [04:51<13:49,  1.31it/s]

Q56042819 not found


 34%|█████████████▌                          | 518/1522 [05:38<10:12,  1.64it/s]

Q48724051 not found


 36%|██████████████▎                         | 546/1522 [05:55<08:37,  1.88it/s]

Q20312333 not found


 36%|██████████████▍                         | 549/1522 [05:57<10:30,  1.54it/s]

Q48844546 not found


 48%|███████████████████▏                    | 731/1522 [07:50<06:11,  2.13it/s]

404
too slow


 52%|████████████████████▋                   | 786/1522 [08:22<06:25,  1.91it/s]

404
too slow


 54%|█████████████████████▊                  | 829/1522 [08:47<08:03,  1.43it/s]

Q4709211 not found


 62%|████████████████████████▊               | 944/1522 [10:06<05:22,  1.79it/s]

Q30681486 not found


 63%|█████████████████████████▏              | 957/1522 [10:15<07:08,  1.32it/s]

404
too slow


 64%|█████████████████████████▌              | 974/1522 [10:28<08:39,  1.06it/s]

404
too slow


 64%|█████████████████████████▋              | 976/1522 [10:30<08:32,  1.07it/s]

Q16243580 not found


 79%|██████████████████████████████▋        | 1197/1522 [12:54<04:50,  1.12it/s]

Q7982148 not found


 82%|███████████████████████████████▉       | 1246/1522 [13:27<03:17,  1.40it/s]

Q21511542 not found


 82%|████████████████████████████████       | 1252/1522 [13:32<03:27,  1.30it/s]

Q28172437 not found


 93%|████████████████████████████████████▏  | 1412/1522 [15:19<01:09,  1.58it/s]

Q4585775 not found


100%|███████████████████████████████████████| 1522/1522 [16:33<00:00,  1.53it/s]


In [83]:
# Number of incidents for which an English label was stored in inc_names.
print(len(inc_names))

1448


In [None]:
# 15571

In [84]:

# Store the fetched labels in the rows; a row whose incident has no fetched
# label keeps whatever 'inc_name' it already has.
for row in data:
    if row['inc_q'] in inc_names:
        row['inc_name'] = inc_names[row['inc_q']]

        

In [85]:
# 2509/19267 
# run again tomorrow

# Write the rows, now including incident names, back to the overview file.
# The column order is taken from the first row.
header = data[0].keys()
# newline='' is required for files handed to csv writers; otherwise each row
# ends in '\r\r\n' on platforms that translate '\n' to '\r\n'.
with open('../data/overview-unique-ids-structured.csv', 'w', newline = '') as outfile:
    writer = csv.DictWriter(outfile, fieldnames = header, delimiter = ',')
    writer.writeheader()
    writer.writerows(data)

# Write broken nafs to separate directory

In [3]:
import os
from lxml import etree as et
from lxml.etree import XMLSyntaxError

In [4]:
# Reload the overview table as a list of row dicts.
# newline='' is what the csv module expects of the file object; without it,
# newlines embedded in quoted fields are not interpreted correctly.
with open('../data/overview-unique-ids-structured.csv', newline = '') as infile:
    data = list(csv.DictReader(infile, delimiter = ','))

In [5]:
# Directory for the broken NAF files (presumably the copy target; nothing in
# this cell writes to it).
path_dir = '../data/data-unique-ids-final/broken-nafs'
# Directory creation is disabled. NOTE(review): confirm the directory exists
# before anything is copied into it.
#os.mkdir(path_dir)

In [25]:
# Paths of the flagged files that really fail to parse as XML. The list was
# created but never filled before, so the outcome existed only as printed text.
broken_nafs = []

path_dir_naf =  '../data/data-unique-ids-final/unstructured'

for d in data:
    # Only re-check the rows the overview marks as invalid.
    if d['valid_naf'] != 'FALSE':
        continue

    print(d['release'], d['text_title'])
    uid = d['unique_id']
    print(uid)
    # The files are stored under their unique id, not under their title.
    path_naf = f'{path_dir_naf}/{uid}.naf'

    try:
        tree = et.parse(path_naf)
    except XMLSyntaxError:
        broken_nafs.append(path_naf)
        print('File is NOT a valid naf')
    else:
        print('File is a valid naf!')
    print()
   

v1.1 Danforth shooter ‘was afraid he was going to hurt people’.naf
b4b6c894-8f0b-4ce3-b898-971d98f84289
File is a valid naf!

v1.1 Abrupt End To Defense In Rail Case.naf
1681bf80-8d22-44db-92aa-5bc9ad729ebc
File is a valid naf!

v1.1 After Train Killings, Worry About Backlash.naf
fa949687-7d49-4c7c-aa85-29d60918284f
File is a valid naf!

v1.1 Teen’s downward spiral ends in gunfire, death.naf
eedde8d0-e5cc-4183-babe-7018f7a6b570
File is a valid naf!

v1.1 Police evacuate Wis. neighborhood near shooting.naf
0c423f11-7f19-4df2-af7f-4ea21fb76f31
File is a valid naf!

v1.1 101 Calif. massacre law firm closing.naf
d4d7b7a8-f717-4198-b6be-4cab612d9fb1
File is a valid naf!

v1.1 A timeline of the Fredericton shooting that killed 4 people, including 2 officers.naf
96a89271-cfa9-43b5-be09-68e79826c588
File is a valid naf!

v1.1 Wilkinsburg Mass Shooting Suspect Robert Thomas Out Of Jail After Having Case Dismissed.naf
4fa77a7b-ebbb-40f5-a3b3-d75c93f259dc
File is a valid naf!

v1.1 Hesston shoote

In [26]:
# fix file

# Load the raw text of one NAF file from the unstructured directory.
test_path = '/'.join([path_dir_naf, '508f1047-9908-4750-aeb2-4eb86316d921.naf'])

with open(test_path) as infile:
    naf_txt = infile.read()

In [12]:
#tree = et.parse(test_path)

In [27]:
naf_list = naf_txt.split('\n')

# Cut the text at every '=======', '>>>>>>> Stashed changes' and
# '<<<<<<< Updated upstream' marker, keeping the fragments in file order.
naf_parts = [
    piece
    for block in naf_txt.split('=======')
    for sub_block in block.split('>>>>>>> Stashed changes')
    for piece in sub_block.split('<<<<<<< Updated upstream')
]

# Write each fragment to its own numbered file in the working directory.
for n, part in enumerate(naf_parts):
    with open(f'nafpart{n}.txt', 'w') as outfile:
        outfile.write(part)
    

# for n, part in enumerate(naf_parts):
#     print('PART', n)
#     print(len(part.split('\n')))
#     print('start')
#     print(part[:100])
#     print()
#     print('end')
#     print(part[-100:])
#     print()
#     print('-------------')
#     print()
        

In [92]:
# Test naf: parse the file at test_path and print its root tag.
# NOTE(review): the recorded output shows a different path than the one
# assigned in the "fix file" cell, so test_path was changed between runs.
print(test_path)

# Raises XMLSyntaxError if the file is not well-formed XML.
tree = et.parse(test_path)

root = tree.getroot()
print(root.tag)


../data/data-unique-ids-final/unstructured/1681bf80-8d22-44db-92aa-5bc9ad729ebc.naf
NAF
