In [1]:
import pandas as pd #for handling csv and csv contents
from rdflib import Graph, Literal, RDF, URIRef, Namespace #basic RDF handling
from rdflib.namespace import FOAF , XSD, RDFS, DCTERMS #most common namespaces
import urllib.parse #for parsing strings to URI's
import numpy as np
from array import array
import codecs
import ordf
from anytree import Node, RenderTree, PreOrderIter
from anytree.exporter import JsonExporter, DictExporter
from anytree.importer import DictImporter
from fuzzywuzzy import fuzz


Define Namespaces for the Data

In [9]:
g = Graph()
pmid = Namespace('http://example.org/pmid/')
title = Namespace('http://example.org/title/')
year = Namespace('http://example.org/year/')
volume = Namespace('http://example.org/volume/')
keyword_merged = Namespace('http://example.org/keyword_merged/')
subfield = Namespace('http://example.org/subfield/')
aimethod = Namespace('http://example.org/ai_method')
disease = Namespace('http://example.org/disease')
research = Namespace('http://example.org/research_type')
benefits = Namespace('http://example.org/benefits')


Upload the data files and merge new examples with diseases

In [21]:
new_example = pd.read_json('example_new.json')
#diseases = pd.read_csv('/Users/maida/Desktop/GitHub/AI-Team-Project/RDF/papers_with_disease_n_abbr_list.csv')

def change_PMID(str_id):
  return int(str_id.replace('PMID:',''))

#diseases['PMID'] = diseases['PMID'].apply(change_PMID)
#diseases.head(9000)
#new_example.head(36)


merge the 2 dataframes

In [22]:
#result = pd.merge(new_example, diseases, how='outer', on=['Title','Authors','Year','Doi','PMID','Abstract','Keywords_Abstract','Keywords_MeSH','Keywords_Merged'])
#result.drop(['FullTextLinks_y'], axis=1)
#result.drop_duplicates()
#result.head()


Open and use hierarchy file 

In [5]:
with codecs.open("/Users/maida/Desktop/GitHub/AI-Team-Project/RDF/acm_hierarchy_restructure.txt", "r", encoding="utf8") as f_in:
    
    hierarchy = f_in.read().split("\n")
    nodes = list()
    lvl = 0
    
    for term in hierarchy:
        values = term.split(" | ")
        lvl = values[0].count("\t")
        values[0] = values[0].replace("\t","")
        values[-1] = values[-1].replace("\r","")
        #print(str(lvl) + " " + str(values[0]))
        kws = set()
        if(values[-1] == ""):
            keyword = values[0].replace("-"," ").lower()
            if(keyword[-1] == "s"):
                keyword = keyword[0:-1]
            kws.add(keyword)
        else:
            kws_list = values[-1].split(",")
            for kw in kws_list:
                kws.add(kw)
        node = Node(values[0], keywords = kws, level = lvl, count = 0) 
        nodes.append(node)
        #print(values[0])
    index1 = 0
    for node in nodes:
        children = list()
        index2 = index1
        while(index2 < len(nodes)-1):
            index2 += 1
            if(nodes[index2].level == node.level+1):
                children.append(nodes[index2])
            elif(nodes[index2].level < node.level):
                break
        if(children):
            nodes[index1].children = children
        index1 += 1
    
    def listfromtree(self, prev_data = []):
        if self.children:
            child_data = []
            for child in self.children:
                child_data.extend(child.tree2list(prev_data + self._data))
                return child_data
        else:
            return [prev_data + self._data]
    
f_in.close()
#f_in.json.dump()

In [6]:
for node in nodes:
    if node.children:
        out = node.name + ": "
        for child in node.children:
            out += child.name + ", "
        print("-------------------------------------------------------\n" + out)


-------------------------------------------------------
Artificial intelligence: Natural language processing, Logic, knowledge representation and reasoning, Searching, planning and scheduling, Distributed artificial intelligence, Computer vision, Machine learning, Modeling and simulation, Mathematical algorithms and methods, Algorithmic game theory and mechanism design, Information retrieval and data mining, Human-centered computing, Embedded and cyber-physical systems, Healthcare and medicine specific applications, 
-------------------------------------------------------
Natural language processing: Syntactic analysis, Semantic analysis, Discourse, dialogue and pragmatics, Speech recognition |, 
-------------------------------------------------------
Syntactic analysis: Grammar induction, Stemming, Lemmatization, Morphological segmentation, Part-of-speech tagging, Syntactic Parsing, Sentence boundary disambiguation, Word segmentation, Terminology extraction, 
-------------------------

Create and Serialize the Schema

In [15]:
for index, row in new_example.iterrows():
    
    authors = str(row['Authors']).split(";")
    for a in authors[:-1]:
        g.add((URIRef(pmid+str(row['PMID'])), DCTERMS['creator'], Literal(a, datatype=XSD.string) ))
        
    keywords = str(row['Keywords_Merged']).split(";")
    for key in keywords:
        g.add((URIRef(pmid+str(row['PMID'])), URIRef(keyword_merged), Literal(key, datatype=XSD.string) ))
    
    fullTexts = str(row['FullTextLinks']).split(";")
    for text in fullTexts:
        g.add((URIRef(pmid+str(row['PMID'])), RDFS.comment, Literal(text, datatype=XSD.string)))
        
    related = str(row['FullTextLinks']).split(";")
    for a in related:
        g.add((URIRef(pmid+str(row['PMID'])), RDFS.seeAlso, Literal(a, datatype=XSD.string)))
    
    g.add((URIRef(pmid+str(row['PMID'])),RDF.type, Literal(row['PMID'])))
    g.add((URIRef(pmid+str(row['PMID'])), DCTERMS['identifier'], Literal(row['PMID']))) 
    g.add((URIRef(pmid+str(row['PMID'])), DCTERMS['identifier'],Literal(row['Doi'], datatype=XSD.integer) ))
    g.add((URIRef(pmid+str(row['PMID'])), DCTERMS['publisher'], Literal(row['Journal'], datatype=XSD.string) ))
    g.add((URIRef(pmid+str(row['PMID'])), DCTERMS['abstract'], Literal(row['Abstract'], datatype=XSD.string) ))
    g.add((URIRef(pmid+str(row['PMID'])), URIRef(disease), Literal(row['Diseases'])))
    #for m in str(row['Meaning']):
        #print (meaning)
        #g.add((URIRef(disease)), RDFS.comment, Literal(m))
        
    g.add((URIRef(pmid+str(row['PMID'])), URIRef(benefits), Literal(row['Benefits'])))
    g.add((URIRef(pmid+str(row['PMID'])), URIRef(volume), Literal(row['Volume'])))
    g.add((URIRef(pmid+str(row['PMID'])), URIRef(subfield), Literal(row['Subfields'])))
    g.add((URIRef(pmid+str(row['PMID'])), URIRef(research), Literal(row['Research_Type'])))

    methods = str(row['Methods']).split('/')
    for node in nodes:
        for x in methods:
            if node.children:
                out = node.name
                ratio_h = fuzz.token_set_ratio(x, out) #levenstein distance
                if ratio_h>=99: 
                    g.add((URIRef(pmid+str(row['PMID'])), URIRef(aimethod), Literal(out, datatype=XSD.string)))
                for child in node.children:
                    ratio = fuzz.token_set_ratio(x, child.name)
                    if ratio>= 95:
                        out += child.name + ", "
                    g.add((URIRef(aimethod), RDFS.subClassOf, Literal(out,datatype=XSD.string)))
            else: g.add((URIRef(aimethod), RDF.type, Literal(x,datatype=XSD.string)))
            
            #print("-------------------------------------------------------\n" + out)
            #print(methods)
            

   
    
    ##add the rdf for diseases
    

In [20]:
NotebookApp.iopub_data_rate_limit=1.0e10

NameError: name 'NotebookApp' is not defined

In [16]:
print(g.serialize(format='turtle').decode('UTF-8'))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

