# Add Knowledge to the Graph

In this notebook the knowledge graph is enriched with links to other knowledge bases and with manually gathered information.

In [None]:
import pandas as pd
# Table used for linking papers; the linking cell further down reads its
# `doi` and `paper` columns. Compression is inferred from the ".gz" suffix.
df = pd.read_csv("ma_papers.csv.gz", compression="infer")

In [None]:
import json
import gzip

# Read the gzip-compressed JSON document holding the knowledge graph; the
# cells below work on the list stored under its '@graph' key.
with gzip.open('../data/software_kg_production_model.json.gz', 'rb') as kg_file:
    kg = json.loads(kg_file.read())

Add a link to the MA Graph for every paper whose DOI is listed in `ma_papers.csv.gz`:

In [None]:
# Build a DOI -> [MA paper id, ...] index once instead of re-scanning the
# whole table for every node of the graph. Ids keep the order in which they
# appear in `df`, so the generated lists match a row-by-row filter on `doi`.
ma_ids_by_doi = {}
for doi_value, ma_id in zip(df['doi'], df['paper']):
    ma_ids_by_doi.setdefault(doi_value, []).append(ma_id)

# Every node gets a (possibly empty) schema.org/sameAs list with one
# reference per matching row of the MA table.
for paper in kg['@graph']:
    doi = paper["http://schema.org/identifier"]
    paper["http://schema.org/sameAs"] = [
        {"@id": str(ma_id), "@type": "@id"}
        for ma_id in ma_ids_by_doi.get(doi, [])
    ]

Load the list with the manually added information:

In [None]:
# Manually curated software table. Each row receives a synthetic IRI that is
# built from the row's position in the file (0, 1, 2, ...).
linked_entities = pd.read_csv("../data/software_reasoning_final_production_model.csv")

software_iri_template = "http://data.gesis.org/softwarekg/software/{}"
linked_entities["id"] = list(
    map(software_iri_template.format, range(len(linked_entities)))
)

Now all additional information is added to each paper. 
The software entries found per paper are replaced by 'mentions', while for each distinct software name a new node is generated in the graph; each mention is connected to its software node via the node's ID. 

In [None]:
# Rewrite every ScholarlyArticle node in place:
#   * the publisher string becomes a schema.org/Organization node,
#   * every author affiliation string becomes an Organization node,
#   * the 'headings' entry is dropped,
#   * the per-paper software list is replaced by schema.org/mentions entries
#     that point to the IRI of the linked software (linked_entities["id"]).
# NOTE: the cell mutates `kg` and is not idempotent (publisher/affiliations
# would be wrapped a second time), so run it once per freshly loaded graph.
software_key = 'http://data.gesis.org/softwarekg/software'
headings_key = 'http://data.gesis.org/softwarekg/headings'

for paper in kg['@graph']:
    if paper['@type'] != "http://schema.org/ScholarlyArticle":
        # no paper, maybe software
        continue
    # create empty list of mentions
    paper['http://schema.org/mentions'] = []
    doi = paper['http://schema.org/identifier']

    # TODO
    publisher_name = paper['http://schema.org/publisher']
    paper["http://schema.org/publisher"] = [{
          "@id": "http://data.gesis.org/softwarekg/"+doi+"/publisher/0",
          "@type": "http://schema.org/Organization",
          "http://schema.org/name": publisher_name
        }]
    for i_author, author in enumerate(paper['http://schema.org/author']):
        author['http://schema.org/affiliation'] = [
            {
                "@id": "http://data.gesis.org/softwarekg/{}/author/{}/affiliation/{}".format(doi, i_author, i_affiliation),
                "@type": "http://schema.org/Organization",
                "http://schema.org/name": affiliation
            }
            for i_affiliation, affiliation in enumerate(author['http://schema.org/affiliation'])
        ]

    paper.pop(headings_key, None)

    # A paper without a software list simply ends up with no mentions.
    for i_software, software in enumerate(paper.get(software_key, [])):
        # for each software in paper
        software_name = software['http://schema.org/name']
        rows = linked_entities.loc[linked_entities['name'] == software_name]
        if len(rows) == 0:
            print("Software '{}' not found in linking list".format(software_name))
            continue
        if len(rows) > 1:
            print("More than one software found for '{}'".format(software_name))
        linked_name = rows["linked_name"].iloc[0]
        linked_softwares = linked_entities.loc[linked_entities['linked_name'] == linked_name]
        if len(linked_softwares) == 0:
            # Happens when linked_name is missing (NaN never compares equal
            # to itself). Skip the mention: there is no node to point to and
            # .iloc[0] below would raise an IndexError.
            print("did not find linked name for {}".format(linked_name))
            print(rows)
            continue

        # IRI of the first row carrying this linked name; the software node
        # for the linked name is created from that same row.
        l_name = linked_softwares["id"].iloc[0]
        # The mention index is the position in the original software list,
        # so skipped entries leave gaps in the numbering.
        paper['http://schema.org/mentions'].append({
            "@id": "http://data.gesis.org/softwarekg/{}/mention/{}".format(doi, i_software),
            "@type": "http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#String",
            "http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#isString": software_name,
            "http://data.gesis.org/softwarekg/software": [
                {
                    "@id": l_name,
                    "@type": "@id"}
            ],
        })
    # The raw software list is superseded by the mentions built above.
    paper.pop(software_key, None)
        

Next, we collect the additional information for each of the distinct software entities.

In [None]:
import numpy as np

def add_if_exists(entry, name, value):
    """Store ``value`` under ``entry[name]`` unless it is a missing value.

    Parameters
    ----------
    entry : dict
        Node that is updated in place.
    name : str
        Key (property IRI) to set.
    value : object
        Value to store. Scalar missing values (``None``, ``NaN``, ``pd.NA``,
        ``NaT``) are skipped; containers such as lists are always stored.

    Returns
    -------
    bool
        ``True`` if the value was stored, ``False`` if it was skipped.
    """
    # pd.isnull() works element-wise on list-likes and returns an array whose
    # truth value is ambiguous (an error for several elements and, on recent
    # NumPy versions, for empty arrays too), so only scalars are null-checked.
    if pd.api.types.is_scalar(value) and pd.isnull(value):
        return False
    entry[name] = value
    return True

def add_sameAs_if_exists(entry, value):
    """Append an ``@id`` reference for ``value`` to the list ``entry``.

    ``entry`` is the sameAs list of a node (not the node itself). Missing
    values as detected by ``pd.isnull`` are ignored.

    Returns ``True`` if a reference was appended, ``False`` otherwise.
    """
    if not pd.isnull(value):
        entry.append({"@id": value, "@type": "@id"})
        return True
    return False

# Build one schema.org/SoftwareApplication node per distinct linked name.
# The first row carrying a linked name supplies the node IRI and all
# attributes; later rows with the same linked name are ignored.
software_list = []
unique_names = set()  # linked names already turned into a node
for _, row in linked_entities.iterrows():
    if row['linked_name'] in unique_names:
        continue
    unique_names.add(row["linked_name"])
    entry = {
        "@id" : row["id"],
        "@type" : "http://schema.org/SoftwareApplication",
        "http://schema.org/name" : row["linked_name"],
    }

    if not pd.isnull(row['Manufacturer']):
        # The publisher IRI is nested below the software node's own IRI.
        entry["http://schema.org/publisher"] = [{
          "@id": "{}/publisher/0".format(row["id"]),
          "@type": "http://schema.org/Organization",
          "http://schema.org/name": row['Manufacturer']
        }]

    add_if_exists(entry, "http://schema.org/url",row["URL"])
    add_if_exists(entry, "http://data.gesis.org/softwarekg/freeAvailable",row["Free"])
    add_if_exists(entry, "http://data.gesis.org/softwarekg/sourceAvailable",row["Source Available"])
    add_if_exists(entry, "http://schema.org/license",row['Licence'])

    # Always present, possibly empty. Assigned directly: routing an empty
    # list through a pd.isnull() check relies on the truth value of an empty
    # array, which recent NumPy versions reject.
    same_as = []
    entry["http://schema.org/sameAs"] = same_as
    for column in ('SWO_ID', 'Wikidata', 'Wikipedia', 'DBpedia'):
        add_sameAs_if_exists(same_as, row[column])

    software_list.append(entry)

Actually inserting the new software nodes into the graph:

In [None]:
# Append the software nodes to the graph in place. Running this cell more
# than once adds the same nodes again.
kg['@graph'] += software_list

In [None]:
# Serialize the enriched graph as indented JSON.
# NOTE(review): the output goes to "data/" relative to the working directory,
# while the inputs above are read from "../data/" - confirm this is intended.
with open("data/software_kg_production.json", 'w') as out_file:
    out_file.write(json.dumps(kg, indent=2))