# Generate Reasoning Data

Combine the information gathered through the neural network and construct the knowledge graph. 

Extract some statistics of the resulting data and generate the setup for manual information enrichment.

## Extract Information from Gold Data

Test the knowledge graph construction on the hand-annotated SoSci gold standard.

In [None]:
%load_ext autoreload
%autoreload 2

import json
import csv

from collections import Counter
from os import listdir, mkdir
from os.path import join, exists
from util.doc_info import get_doc_dict

In [None]:
def _write_article(doc_id, candidates):
    """Build the entry for one document and store it as JSON.

    The entry is produced by ``get_doc_dict`` from the document id and the
    software mentions collected for it, and is written to
    ``data/sosci_reasoning/<doc_id>.json``.
    """
    article_info = get_doc_dict(doc_id, candidates)
    with open('data/sosci_reasoning/' + doc_id + '.json', 'w') as out_file:
        json.dump(article_info, out_file, indent=4)


if not exists('data/sosci_reasoning'):
    mkdir('data/sosci_reasoning')

# State of the BIO scan: the document being read, the software mention that
# is currently being assembled, and all mentions found in the document so far.
current_doc = ''
current_candidate = ''
current_candidates = []
with open('data/sosci_bio.txt', 'r') as bio_file:
    for line in bio_file:
        is_doc_start = line.startswith('-DOCSTART-')
        if is_doc_start or line in ['\n', '\t\n']:
            # Blank separator lines and document headers both terminate a
            # mention that is still open.
            if current_candidate:
                current_candidates.append(current_candidate)
            current_candidate = ''
            if is_doc_start:
                # A new header closes the previous document (if any).
                if current_doc:
                    _write_article(current_doc, current_candidates)
                current_candidates = []
                current_doc = line.split(':')[1].rstrip('\n')
            continue

        token, annotation = line.split('\t')
        annotation = annotation.rstrip('\n')
        if annotation == 'B-software':
            # A new mention begins; a directly preceding one ends here.
            if current_candidate:
                current_candidates.append(current_candidate)
            current_candidate = token
        elif annotation == 'I-software':
            if current_candidate:
                current_candidate += ' {}'.format(token)  # extend
            else:
                # Continuation tag without a preceding begin tag.
                print("This is not allowed to happen.")
        elif annotation == 'O':
            if current_candidate:
                current_candidates.append(current_candidate)
            current_candidate = ''

# The file does not end with another -DOCSTART- line, so the last document
# (and a mention that is still open at end of file) has to be flushed here;
# otherwise it would never be written.
if current_candidate:
    current_candidates.append(current_candidate)
    current_candidate = ''
if current_doc:
    _write_article(current_doc, current_candidates)

Next we combine the information from all separate files.

In [None]:
# Collect the per-document entries first; each file in the directory is
# expected to contain one JSON graph entry.
sosci_dir = 'data/sosci_reasoning'
graph = []
for entry_name in listdir(sosci_dir):
    with open(join(sosci_dir, entry_name), 'r') as entry_file:
        graph.append(json.load(entry_file))

# Prefix-to-namespace mapping stored under "@context", followed by the
# collected entries under "@graph".
software_kg = {
    "@context": {
        "xsd": "http://www.w3.org/2001/XMLSchema#",
        "base": "http://data.gesis.org/softwarekg/",
        "schema": "http://schema.org/",
        "swo": "http://www.ebi.ac.uk/swo/swo.owl#",
        "owl": "http://www.w3.org/2002/07/owl#",
        "dct": "http://purl.org/dc/elements/1.1/",
        "nif": "http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#",
    },
    "@graph": graph,
}

with open('data/software_kg_sosci.json', 'w') as kg:
    json.dump(software_kg, kg, indent=2)

Now we can get the plain software names in order to analyze how often they appear. 

In [None]:
# Count how often each software name occurs over all article nodes of the
# gold-standard knowledge graph.
c = Counter()
with open('data/software_kg_sosci.json', 'r') as json_file:
    kg = json.load(json_file)
for article_node in kg['@graph']:
    for software_node in article_node['http://data.gesis.org/softwarekg/software']:
        software_name = software_node['http://schema.org/name']
        c[software_name] += 1

# Write the names ranked by frequency. The csv module requires files to be
# opened with newline='' so that it controls the line endings itself
# (otherwise every row is followed by a blank line on Windows).
with open('data/software_counted_list_sosci.csv', 'w', newline='') as csv_file:
    fieldnames = ['name', 'count']
    software_writer = csv.writer(csv_file, delimiter=',', quotechar='"')
    software_writer.writerow(fieldnames)
    software_writer.writerows(c.most_common())

## Extract Information from the Silver Standard

We can of course look at the same information based on the silver standard. However, the results are not as interesting because the data is only suggestively labeled. 
Therefore, the construction of a knowledge graph is also less interesting.

In [None]:
silver_standard_data = 'data/pos_silver_samples_cor_data.txt'
silver_standard_labels = 'data/pos_silver_samples_cor_labels.txt'
if not exists('data/silver_standard_reasoning'):
    mkdir('data/silver_standard_reasoning')

# c counts every assembled software mention; error_count counts I-software
# tags that are not preceded by a B-software tag; counter is the number of
# line pairs processed.
c = Counter()
error_count = 0
counter = 0
with open(silver_standard_data, 'r') as data_file, open(silver_standard_labels, 'r') as labels_file:
    # Line i of the data file holds whitespace-separated tokens, line i of
    # the labels file the corresponding tags. Reading stops as soon as
    # either file is exhausted.
    for data_line, labels_line in zip(data_file, labels_file):
        counter += 1
        current_candidate = ''
        # zip() pairs tokens with tags up to the shorter of the two and
        # simply yields nothing for a blank line, where the previous
        # pop(0)-based loop raised an IndexError.
        for token, annotation in zip(data_line.split(), labels_line.split()):
            if annotation == 'B-software':
                # A new mention begins; a directly preceding one ends here.
                if current_candidate:
                    c[current_candidate] += 1
                current_candidate = token
            elif annotation == 'I-software':
                if current_candidate:
                    current_candidate += ' {}'.format(token)  # extend
                else:
                    error_count += 1
            elif annotation == 'O':
                if current_candidate:
                    c[current_candidate] += 1
                current_candidate = ''
        # A mention never spans two lines: close whatever is still open.
        if current_candidate:
            c[current_candidate] += 1
        current_candidate = ''
print("Errors: {}".format(error_count))

# newline='' lets the csv module control line endings (avoids blank rows on
# Windows).
with open('data/software_counted_list_silver_standard.csv', 'w', newline='') as csv_file:
    fieldnames = ['name', 'count']
    software_writer = csv.writer(csv_file, delimiter=',', quotechar='"')
    software_writer.writerow(fieldnames)
    software_writer.writerows(c.most_common())

# Extract Information from Reasoning Data

Large scale information extraction from all data in our reasoning set. 
Here we combine all reasoning outputs our model has created into a knowledge graph and extract the first statistics. 

In [None]:
# Directory containing the reasoning output files of the production model.
reasoning_location = 'data/reasoning_output_production_model/'
# listdir() returns the names in arbitrary order. Sorting makes the
# processing order, and with it the order of equally frequent names in the
# ranked statistics, reproducible between runs and machines.
reasoning_files = sorted(listdir(reasoning_location))

In [None]:
# c_total counts every extracted mention; c_relative counts each software
# name at most once per file, i.e. the number of files it occurs in.
c_total = Counter()
c_relative = Counter()
empty_file_num = 0
for file in reasoning_files:
    # Only the JSON parsing is guarded, so that unrelated errors (e.g. a
    # missing key) are not mistaken for an empty file.
    try:
        with open(join(reasoning_location, file), 'r') as json_file:
            data = json.load(json_file)
    except json.JSONDecodeError:
        print("Empty file {}".format(file))
        empty_file_num += 1
        continue
    software_in_article = [
        software['http://schema.org/name']
        for software in data['http://data.gesis.org/softwarekg/software']
    ]
    c_total.update(software_in_article)
    # De-duplicate while keeping first-seen order. A set would make the
    # order of equally frequent names depend on string hash randomisation.
    # The result is converted to a list because Counter.update() treats a
    # mapping as counts, not as an iterable of elements.
    c_relative.update(list(dict.fromkeys(software_in_article)))
print("Totally extracted software: {}".format(sum(c_total.values())))
print("Relative per paper extracted software: {}".format(sum(c_relative.values())))
print("{} empty files in total".format(empty_file_num))

# newline='' lets the csv module control line endings (avoids blank rows on
# Windows).
with open('software_reasoning_production_model.csv', 'w', newline='') as total_file, \
        open('software_reasoning_production_model_relative.csv', 'w', newline='') as relative_file:
    fieldnames = ['name', 'count']
    software_writer_total = csv.writer(total_file, delimiter=',', quotechar='"')
    software_writer_total.writerow(fieldnames)
    software_writer_total.writerows(c_total.most_common())
    software_writer_relative = csv.writer(relative_file, delimiter=',', quotechar='"')
    software_writer_relative.writerow(fieldnames)
    software_writer_relative.writerows(c_relative.most_common())

If there is an error in the following code, it is likely that an empty file was generated during prediction. 
To not overlook errors, we manually check how many files are affected and why that happened: `find path_to_reasoning_files -empty (-delete)`. 
In the current final run 3 files came up empty and were ignored:

./10.1371_journal.pone.0069554.json

./10.1371_journal.pone.0069504.json

./10.1371_journal.pmed.1001418.json

In [None]:
# Prefix-to-namespace mapping stored under "@context"; the collected
# entries are added under "@graph" once all files have been read.
software_kg = {
  "@context": {
    "xsd": "http://www.w3.org/2001/XMLSchema#",
    "base": "http://data.gesis.org/softwarekg/",
    "schema": "http://schema.org/",
    "swo": "http://www.ebi.ac.uk/swo/swo.owl#",
    "owl": "http://www.w3.org/2002/07/owl#",
    "dct": "http://purl.org/dc/elements/1.1/",
    "nif": "http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#"
    }
}
graph = []
# Files that could not be parsed as JSON (typically empty output files).
# They are reported and left out instead of aborting the whole merge.
skipped_files = []
for file in listdir('data/reasoning_output_production_model'):
    with open('data/reasoning_output_production_model/'+file, 'r') as json_file:
        try:
            graph_entry = json.load(json_file)
        except json.JSONDecodeError:
            print("Skipping empty or invalid JSON file {}".format(file))
            skipped_files.append(file)
            continue
    graph.append(graph_entry)
if skipped_files:
    print("{} files skipped in total".format(len(skipped_files)))
software_kg['@graph'] = graph
with open('data/software_kg_production_model.json', 'w') as kg:
    json.dump(software_kg, kg, indent=2)

In [None]:
# Display the (name, count) pairs of Counter `c`, most frequent first.
# `c` is whichever counter was assigned last; when the notebook is run from
# top to bottom that is the silver-standard counter.
c.most_common()