# Knowledge Graph Builder

This notebook is a short example of building knowledge graphs in Python from the abstracts of scientific articles.

In this example, we use abstracts about depression to build a "depression knowledge graph". Why would we want a depression knowledge graph? Because we can answer questions about depression with it, and it "knows" everything we feed into it, which could be millions of articles.

## Setup

Download and import required packages.

In [61]:
#@title
!pip install networkx
!pip install spacy



Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [59]:




#@title
# Data manipulation
import pandas as pd
import random

#API tools
import requests
import numpy as np

#NLP
import spacy


# Plotting
import networkx as nx
import matplotlib.pyplot as plt

## Explore

PLOS is clearly the best journal publisher. They provide an open API to query all their articles.

In [31]:
# Query the PLOS search API for articles whose title contains "depression",
# asking for the id and abstract fields and at most 100 rows.
# The timeout keeps the cell from hanging forever on an unresponsive server,
# and raise_for_status() reports an HTTP error here instead of letting it
# surface later as a confusing JSON/KeyError failure.
resp = requests.get('http://api.plos.org/search?q=title:"depression"&fl=id, abstract&start=1&rows=100', timeout=30)
resp.raise_for_status()


In [33]:
# Print the decoded JSON body of the response so its structure can be inspected.
print(resp.json())



In [36]:
# The matching documents are nested under response -> docs in the JSON body.
articles = resp.json()['response']['docs']

In [41]:
# Take the first entry of each article's 'abstract' list.
# Articles whose 'abstract' field is missing or an empty list are skipped
# rather than raising KeyError/IndexError and aborting the whole cell.
abstracts_arr = [
    article['abstract'][0]
    for article in articles
    if article.get('abstract')
]

In [44]:
# Concatenate all abstracts into one space-separated string.
abstracts_corpus = ' '.join(abstracts_arr)

## Using spaCy for Building Relations and the Knowledge Graph



This code was adapted from a Medium article by Marius Borcan (officially the GOAT).




In [62]:
import spacy
from spacy.lang.en import English
import networkx as nx
import matplotlib.pyplot as plt


def getSentences(text):
    """Split ``text`` into a list of sentence strings.

    A fresh ``English()`` pipeline is created on every call and given the
    'sentencizer' pipe; each sentence span's text is returned with its
    surrounding whitespace stripped.
    """
    pipeline = English()
    pipeline.add_pipe('sentencizer')

    sentences = []
    for span in pipeline(text).sents:
        sentences.append(span.text.strip())
    return sentences

def printToken(token):
    """Debug helper: print ``token``'s text and dependency label.

    The line written to stdout has the form ``<text> -> <dep_>``.
    """
    print(f"{token.text} -> {token.dep_}")

def appendChunk(original, chunk):
    """Return ``original`` and ``chunk`` joined by exactly one space.

    Nothing is trimmed: an empty ``original`` produces a leading space.
    """
    return ' '.join((original, chunk))

def isRelationCandidate(token):
    """Tell whether ``token`` should contribute to the relation text.

    Returns True when ``token.dep_`` contains any of the markers below as a
    substring, so a marker also matches longer labels that include it.
    """
    label = token.dep_
    for marker in ("ROOT", "adj", "attr", "agent", "amod"):
        if marker in label:
            return True
    return False

def isConstructionCandidate(token):
    """Tell whether ``token`` is a compound/modifier-style token.

    Returns True when ``token.dep_`` contains any of the markers below as a
    substring (for instance "mod" also matches "amod").
    """
    label = token.dep_
    for marker in ("compound", "prep", "conj", "mod"):
        if marker in label:
            return True
    return False

def processSubjectObjectPairs(tokens):
    """Reduce one parsed sentence to a (subject, relation, object) triple.

    Args:
        tokens: iterable of token-like objects exposing ``dep_``, ``text``
            and ``lemma_`` string attributes.

    Returns:
        A 3-tuple of strings ``(subject, relation, object)``. Each element is
        the space-joined text (lemma, for the relation) of the matching
        tokens in iteration order, stripped of outer whitespace; an element
        is '' when no token matched.
    """
    # Parts are gathered in lists and joined once at the end instead of
    # growing strings by repeated concatenation. The local for the object
    # part is named so that it does not shadow the builtin ``object``.
    subject_parts = []
    relation_parts = []
    object_parts = []

    # Deliberately no per-token or per-triple printing here: the original
    # author's comments warn against it.
    for token in tokens:
        dep = token.dep_
        # Punctuation contributes to no part of the triple.
        if "punct" in dep:
            continue
        if isRelationCandidate(token):
            relation_parts.append(token.lemma_)
        # Substring tests: any label containing "subj" / "obj" qualifies, and
        # a single token may be counted for both.
        if "subj" in dep:
            subject_parts.append(token.text)
        if "obj" in dep:
            object_parts.append(token.text)

    # NOTE(review): earlier versions kept "construction" buffers intended to
    # prefix the subject/object with compound/modifier tokens, but the
    # buffers were only appended to when already non-empty, so they always
    # stayed '' and never affected the result. They are omitted here; the
    # returned triple is unchanged. Restoring the intended behaviour would
    # change the graph and should be a deliberate decision.
    return (
        ' '.join(subject_parts).strip(),
        ' '.join(relation_parts).strip(),
        ' '.join(object_parts).strip(),
    )

def processSentence(sentence):
    """Parse ``sentence`` and return its (subject, relation, object) triple.

    Relies on the module-level name ``nlp_model`` being bound before the
    first call; the parsed result is handed to processSubjectObjectPairs.
    """
    return processSubjectObjectPairs(nlp_model(sentence))

def printGraph(triples):
    """Draw part of the triple graph and save the picture as ``graph.png``.

    Every element of a triple becomes a node; the subject-relation and
    relation-object pairs are linked by undirected edges. Only a fixed slice
    of the nodes is drawn (see below), each labelled with its own name.
    """
    graph = nx.Graph()
    for triple in triples:
        head, link, tail = triple[0], triple[1], triple[2]
        graph.add_nodes_from((head, link, tail))
        graph.add_edges_from(((head, link), (link, tail)))

    # The whole graph is far too large to draw, so only the nodes at
    # positions 1 through 99 of the node list are kept. This is a fixed
    # slice, not a random sample.
    shown = list(graph.nodes)[1:100]
    sub = graph.subgraph(shown)

    layout = nx.spring_layout(sub)
    plt.figure(1, figsize=(200, 80), dpi=60)
    nx.draw(
        sub, layout,
        labels={name: name for name in sub.nodes()},
        node_color='seagreen', node_size=500, alpha=0.9,
        edge_color='black', width=1, linewidths=1,
    )
    plt.axis('off')
    plt.savefig("graph.png", dpi=100)
    


if __name__ == "__main__":
    # Driver: split the corpus into sentences, turn each sentence into a
    # triple, then plot the resulting graph.
    sentences = getSentences(abstracts_corpus)

    # processSentence reads this as a module-level name, so it has to be
    # bound before any triple is built.
    nlp_model = spacy.load('en_core_web_sm')

    triples = [processSentence(sentence) for sentence in sentences]

    printGraph(triples)


KeyboardInterrupt: ignored