# Setup
### Imports

In [1]:
import sys
sys.path.append('../')
del sys

%reload_ext autoreload
%autoreload 2

from toolbox.parsers import standard_parser, add_annotations_arguments
from database_creation.annotation_task import AnnotationTask
from preprocess_annotations import filter_annotations
from os.path import join as path_join

### Parameters

In [2]:
# Build the project's parser and let `add_annotations_arguments` extend it
# (presumably with the annotation options such as `annotations_path` and
# `exclude_pilot`, which are read from `args` further down - TODO confirm).
ap = standard_parser()
add_annotations_arguments(ap)
# Parse an explicit argument list instead of the kernel's own command line;
# "--root .." points the root at the parent directory of the notebook folder.
args = ap.parse_args(["--root", ".."])

### Load the annotations data (and first preprocessing step)

In [3]:
# Build the annotation task on top of the saved results. Only `silent` and
# `results_path` receive real values; every other constructor option is passed
# explicitly as None (presumably they only matter when a task is created rather
# than analysed - TODO confirm against AnnotationTask).
annotation_task = AnnotationTask(
    silent=args.silent,
    results_path=path_join(args.root, args.annotations_path),
    years=None,
    max_tuple_size=None,
    short=None,
    short_size=None,
    random=None,
    debug=None,
    random_seed=None,
    save=None,
    corpus_path=None,
)

# Load the queries and annotations; pilot batches are skipped when requested.
annotation_task.process_task(exclude_pilot=args.exclude_pilot)

# Shortcuts used by the rest of the notebook.
queries, annotations = annotation_task.queries, annotation_task.annotations

Processing the modeling task...
Computing the annotated queries...
Initial length of queries: 0.
Object loaded from ../results/annotation_task/annotations/v2_0/task/queries_short.pkl.
Object loaded from ../results/annotation_task/annotations/v2_1/task/queries.pkl.
Object loaded from ../results/annotation_task/annotations/v2_2/task/queries.pkl.
Final length of queries: 61056.
Done. Elapsed time: 1s.

Computing the annotations...
Initial length of annotations: 0.
batch_00 loaded from annotations/v2_0/results/batch_00_complete.csv
Correcting "n this article, Nevada and Ohio are discussed. The two American states..." to " The two American states..."
Correcting "In this article, California and Oregon are discussed. The two neighboring states..." to " The two neighboring states..."
Correcting "In this article, California and Oregon are discussed. The two West Coast states..." to " The two West Coast states..."
batch_01 loaded from annotations/v2_0/results/batch_01_complete.csv
Discarding "Th

We discarded 23 aggregation annotations in the first step.

In [11]:
# Debug peek: report how many ids are annotated, then print every annotation
# stored under the first id (nothing is printed when `annotations` is empty).
# NOTE(review): the saved output of this cell ends in
# "TypeError: 'Annotation' object is not iterable"; the cause is not visible in
# this notebook - re-run on a fresh kernel, then fix or delete the cell.
print(len(annotations))
for annotation in next(iter(annotations.values()), []):
    print(annotation)

1718


TypeError: 'Annotation' object is not iterable

In [14]:
from numpy import mean

# Number of entities of the query behind each annotated id.
entity_counts = [len(queries[id_].entities) for id_ in annotations]

# Smallest, average and largest entity-set size, one per line.
for statistic in (min, mean, max):
    print(statistic(entity_counts))

2
2.4
6


### Number of annotators (before the 2nd and 3rd preprocessing steps)

In [4]:
# Distinct worker ids over every annotation of every id, then their count.
ids = {
    annotation.worker_id
    for annotation_list in annotations.values()
    for annotation in annotation_list
}

print(len(ids))

63


### 2nd and 3rd preprocessing steps

In [5]:
# Run the remaining filters (the log printed by this cell reports one on the
# number of assignments and one on the number of answers) and rebind
# `annotations` to the result.
# NOTE(review): the input name is overwritten, so re-running this cell feeds
# already-filtered data back into the filter; restart the kernel before re-running.
annotations = filter_annotations(annotations, args=args)

Filtering the annotations; annotations answered: 4993, n/a: 1306...
Number of workers discarded: 21
First filter done (number of assignments); annotations answered: 4963, n/a: 1299...
Second filter done (number of answers); annotations answered: 4675, n/a 453.



### Number of annotators (after the 2nd and 3rd preprocessing steps)

In [6]:
# Distinct worker ids still present in `annotations` at this point of the
# notebook (i.e. after `filter_annotations` has been applied), then their count.
ids = {
    annotation.worker_id
    for annotation_list in annotations.values()
    for annotation in annotation_list
}

print(len(ids))

42


### Remaining data

In [5]:
from collections import defaultdict

# Keep, in place, only the annotations whose `preprocessed_answers` is non-empty
# and drop the ids that end up with no annotation at all. Iterating over a
# snapshot of the keys allows deleting from `annotations` in the same pass.
for id_ in list(annotations):
    answered = [annotation for annotation in annotations[id_] if annotation.preprocessed_answers]
    if answered:
        annotations[id_] = answered
    else:
        del annotations[id_]

# Remaining annotations with / without preprocessed answers.
# NOTE(review): `length2` can only be 0 here, since the lists were just reduced
# to annotations with answers; neither value is read in the visible cells.
length1 = sum(1
              for annotation_list in annotations.values()
              for annotation in annotation_list
              if annotation.preprocessed_answers)
length2 = sum(1
              for annotation_list in annotations.values()
              for annotation in annotation_list
              if not annotation.preprocessed_answers)

# Entity sets and aggregations, stored both under 'all' and under the entity
# type of the corresponding query.
detailed_aggreg = defaultdict(list)
detailed_entities = defaultdict(list)
for id_, annotation_list in annotations.items():
    query = queries[id_]
    type_ = query.entities_type_

    # One canonical string per entity set: sorted entities joined by ', '.
    entities = ', '.join(sorted(query.entities))
    for key in ('all', type_):
        detailed_entities[key].append(entities)

    for annotation in annotation_list:
        for aggregation in annotation.preprocessed_answers:
            for key in ('all', type_):
                detailed_aggreg[key].append(aggregation)

### Table data

In [6]:
# Figures for the statistics table. The literals are hard-coded: 4993, 1306,
# 4963 and 4675 are the counts printed by `filter_annotations`, and 23 is the
# number of aggregation annotations discarded in the first preprocessing step.
# NOTE(review): they must be updated by hand whenever the data or filters change.
initial_aggreg_instances = 2100  # NOTE(review): origin not visible in this notebook - confirm
initial_annotations = 4993 + 1306  # answered + n/a before the filters
initial_na = 1306 - 23  # presumably the 23 first-step discards are logged as n/a - TODO confirm
first_filter_discarded_answered_annotations = 23
second_filter_discarded_answered_annotations = 4993 - 4963
third_filter_discarded_answered_annotations = 4963 - 4675
final_aggregation_annotations = 4675

# Consistency check: removing the n/a annotations and everything the three
# filters discarded from the initial total must leave the final count.
discarded_answered_annotations = sum((
    first_filter_discarded_answered_annotations,
    second_filter_discarded_answered_annotations,
    third_filter_discarded_answered_annotations,
))
assert initial_annotations - initial_na - discarded_answered_annotations == final_aggregation_annotations

print("Initial number of aggreg. instances: %i" % initial_aggreg_instances)
print("Initial number of annotations: %i" % initial_annotations)
print()
print("Initial number of n/a annotations: %i" % initial_na)
print("First filter discarded aggregation annotations: %i" % first_filter_discarded_answered_annotations)
# NOTE(review): "annotaions" in the next two labels is a typo, kept unchanged
# so the printed table stays identical.
print("Second filter discarded aggregation annotaions: %i" % second_filter_discarded_answered_annotations)
print("Third filter discarded aggregation annotaions: %i" % third_filter_discarded_answered_annotations)
print("Final number of aggreg instances: %i" % len(annotations))
print("Final number of (aggregation) annotations: %i" % final_aggregation_annotations)
print()

# Total vs. distinct entity sets, for 'all' and for each entity type.
for type_, entity_sets in detailed_entities.items():
    print("Entities sets (tot./unique) %s: %i/%i" % (type_, len(entity_sets), len(set(entity_sets))))
print()

# Total vs. distinct aggregations, for 'all' and for each entity type.
for type_, aggregations in detailed_aggreg.items():
    print("Aggregations (tot./unique) %s: %i/%i" % (type_, len(aggregations), len(set(aggregations))))

Initial number of aggreg. instances: 2100
Initial number of annotations: 6299

Initial number of n/a annotations: 1283
First filter discarded aggregation annotations: 23
Second filter discarded aggregation annotaions: 30
Third filter discarded aggregation annotaions: 288
Final number of aggreg instances: 1718
Final number of (aggregation) annotations: 4675

Entities sets (tot./unique) all: 1718/1336
Entities sets (tot./unique) location: 629/412
Entities sets (tot./unique) person: 941/801
Entities sets (tot./unique) org: 148/123

Aggregations (tot./unique) all: 5397/1681
Aggregations (tot./unique) location: 2041/505
Aggregations (tot./unique) person: 2900/951
Aggregations (tot./unique) org: 456/239
