# Exploring the Entity Extraction System

In [2]:
import difflib
import os
import pickle
import re

import nltk
import numpy as np
import pandas as pd
from neo4j.v1 import GraphDatabase, basic_auth

# Neo4j driver and client.
# Connection settings are taken from the environment when present so that
# credentials do not have to live in the notebook; the fallbacks are the
# original local-development values, so an unconfigured run behaves as before.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "naturalmed")

driver = GraphDatabase.driver(NEO4J_URI, auth=basic_auth(NEO4J_USER, NEO4J_PASSWORD))
session = driver.session()

In [1]:
# Node labels whose entries are loaded into the entity table. Each label is
# interpolated into a `MATCH (n:<label>)` query, so the spelling has to match
# the labels stored in the graph.
ENTITY_LIST = [
    'Medicine', 'Disease', 'Food',
    'Context', 'HerbSuplement', 'LaboratoryTest',
    'Pharmacokinetics', 'ScientificName', 'Synonymous',
]

# Question templates as (regex, question type) pairs, tried in list order by
# `compile_question` using `re.match`. Every capture group has to start with an
# uppercase ASCII letter (`[A-Z]`), which is what keeps the plain "What is X?"
# template from swallowing "What is the relation between ...".
#
# The patterns are raw strings: `\?` is not a valid escape sequence in an
# ordinary string literal and triggers a warning on recent Python versions.
QUESTIONS = [
    # What Is Question
    (r'What is ([A-Z].*)\?', 'WHAT_IS'),
    # Simple Relation Question
    (r'What is the relation between ([A-Z].*) and ([A-Z].*)\?', 'SIMPLE_RELATION'),
    (r'How are ([A-Z].*) and ([A-Z].*) related\?', 'SIMPLE_RELATION'),
    (r'Is ([A-Z].*) related to ([A-Z].*)\?', 'SIMPLE_RELATION')
]

In [3]:
# Collect every entity value together with the node label it was read from.
entities, types = [], []

for label in ENTITY_LIST:
    # Medicine nodes are read through their `name` property; every other
    # label is read through `id`.
    key = 'id' if label != 'Medicine' else 'name'
    records = session.run("MATCH (n:%s) RETURN n.%s;" % (label, key))

    for record in records:
        entities.append(record.values()[0])
        types.append(label)

# Two aligned columns: the entity value and the label it belongs to.
columns = {'entity': pd.Series(entities), 'type': pd.Series(types)}
dframe = pd.DataFrame(columns)

# Keep a pickled copy of the table on disk.
with open("../Dumps/entities.pickle", "wb") as handle:
    pickle.dump(dframe, handle)

In [4]:
dframe.head()

Unnamed: 0,entity,type
0,Abscess Root,Medicine
1,Abuta,Medicine
2,Acacia,Medicine
3,Acacia Rigidula,Medicine
4,Acai,Medicine


In [5]:
dframe.count()

entity    24076
type      24076
dtype: int64

In [4]:
def comp_str(arr, string):
    """Lazily score how closely each element of ``arr`` matches ``string``.

    Each element is converted with ``str`` and compared against ``string``
    with a single reused ``difflib.SequenceMatcher``. One ``ratio()`` value
    (a float between 0 and 1) is yielded per element, in input order.
    """
    matcher = difflib.SequenceMatcher(None)

    for candidate in map(str, arr):
        # The element is sequence 1 and the query string is sequence 2.
        matcher.set_seqs(candidate, string)
        yield matcher.ratio()

In [21]:
def compile_question(string):
    """Match ``string`` against the known question templates.

    The (pattern, type) pairs in ``QUESTIONS`` are tried in order with
    ``re.match``. For the first pattern that matches, a dict is returned with
    the keys ``'type'`` (the template's type), ``'question'`` (the input
    string, unchanged) and ``'entities'`` (the captured groups as a list).
    Returns ``None`` when no template matches.
    """
    for pattern, kind in QUESTIONS:
        found = re.match(pattern, string)
        if not found:
            continue
        return {
            'type': kind,
            'question': string,
            'entities': list(found.groups()),
        }
    return None

In [11]:
compile_question("What is Marijuana?")

{'entities': ['Marijuana'],
 'question': 'What is Marijuana?',
 'type': 'WHAT_IS'}

In [12]:
compile_question("What is the relation between Marijuana and Depression?")

{'entities': ['Marijuana', 'Depression'],
 'question': 'What is the relation between Marijuana and Depression?',
 'type': 'SIMPLE_RELATION'}

In [13]:
def score_entity(dframe, entity):
    """Return the five rows of the entity table that best match ``entity``.

    ``entity`` is title-cased and compared against every value of the
    ``'entity'`` column with ``comp_str``. The scores are attached as a
    ``'confidence'`` column on a copy of the table, so the frame passed in
    (the shared entity table) is left untouched between calls.

    Parameters:
        dframe: DataFrame with at least an ``'entity'`` column.
        entity: the string to look up.

    Returns:
        A new DataFrame holding the (at most) five highest-scoring rows,
        sorted by ``'confidence'`` in descending order.
    """
    confidence = list(comp_str(dframe['entity'], entity.title()))
    # `assign` returns a new frame instead of writing a column into the input.
    scored = dframe.assign(confidence=confidence)
    return scored.sort_values(by='confidence', ascending=False).head(n=5)

In [33]:
def s_question(question):
    """Print the question type and the best entity matches for ``question``.

    The question is parsed with ``compile_question``; for every entity it
    captured, the top candidates from ``score_entity`` (scored against the
    module-level ``dframe`` table) are printed. When the question matches no
    known template a short notice is printed instead and nothing is scored.
    """
    res = compile_question(question)
    if res is None:
        # compile_question returns None for unrecognised questions; indexing
        # into it would fail with a TypeError.
        print('Question not recognized: {}'.format(question))
        return

    print('Question Type: {}'.format(res['type']))
    for entity in res['entities']:
        print("Entity: {}\n".format(entity))
        print(score_entity(dframe, entity))
        print("\n")

## Examples

In [34]:
s_question("What is Açai?")

Question Type: WHAT_IS
Entity: Açai

       entity        type  confidence
4        Acai    Medicine    0.750000
6230     Açaï  Synonymous    0.750000
6238    Assai  Synonymous    0.666667
13287   Alasi  Synonymous    0.666667
10375  Arandi  Synonymous    0.600000




In [35]:
s_question("What is Blue Bells?")

Question Type: WHAT_IS
Entity: Blue Bells

            entity        type  confidence
6188    Blue Bells  Synonymous    1.000000
18609    Blue Balm  Synonymous    0.736842
20833   Blue Curls  Synonymous    0.700000
14896     Hen Bell  Synonymous    0.666667
12842  Blue Mallee  Synonymous    0.666667




In [36]:
s_question("What is the relation between Marijuana and Depression?")

Question Type: SIMPLE_RELATION
Entity: Marijuana

          entity           type  confidence
743    Marijuana       Medicine    1.000000
2840   Marijuana  HerbSuplement    1.000000
17494  Mariguana     Synonymous    0.888889
17495  Marihuana     Synonymous    0.888889
7330     Maranta     Synonymous    0.750000


Entity: Depression

                    entity        type  confidence
1232            Depression     Disease    1.000000
1964     Mental Depression     Disease    0.740741
23251              Cresson  Synonymous    0.705882
1763   Atypical Depression     Disease    0.689655
11344             Espresso  Synonymous    0.666667




## Exploring a Simple Implementation of the Question Parsing Function

In [104]:
def question(question):
    """Parse a question and, for "What is X?" questions, fetch X's description.

    The question is parsed with ``compile_question``. For a ``'WHAT_IS'``
    question the first captured entity is scored against the module-level
    ``dframe`` table with ``score_entity``; the best candidate is then looked
    up in Neo4j through the module-level ``session``.

    Returns:
        ``None`` when the question matches no known template. Otherwise the
        dict from ``compile_question`` extended with a ``'response'`` dict:

        * ``response['data']`` - for ``'WHAT_IS'`` questions holds
          ``'selected_entity'`` (the best candidate) and ``'description'``
          (the node's ``description`` property, or ``None`` when no node was
          returned); empty for other question types or when there are no
          candidates.
        * ``response['matching']`` - only for ``'WHAT_IS'`` questions: the
          candidate rows as dicts with ``'entity'``, ``'type'`` and
          ``'confidence'``.
    """
    res = compile_question(question)
    if res is None:
        # Unrecognised question: nothing to look up.
        return None

    res['response'] = {}

    obj = {}

    if res['type'] == 'WHAT_IS':
        score = score_entity(dframe, res['entities'][0])
        matching = []

        for position in range(len(score)):
            row = score.iloc[position]
            matching.append({
                'entity': row['entity'],
                'type': row['type'],
                'confidence': row['confidence']
            })

        res['response']['matching'] = matching

        # An empty entity table yields no candidates; skip the lookup then.
        if matching:
            first = matching[0]
            obj['selected_entity'] = first

            # Medicine nodes are identified by `name`, all other labels by `id`.
            prop = 'name' if first['type'] == 'Medicine' else 'id'

            # Labels and property keys cannot be query parameters, so they are
            # formatted in (both come from the entity table). The entity value
            # is sent as a parameter so that names containing quotes, such as
            # "St. John's Wort", cannot break or alter the statement.
            query = "MATCH (n:%s {%s: $value}) RETURN n" % (first['type'], prop)
            data = session.run(query, {'value': first['entity']}).single()

            if data is None:
                obj['description'] = None
            else:
                node = data.values()[0]
                obj['description'] = node.get('description')

    res['response']['data'] = obj

    return res

In [114]:
def consume(question):
    """Pretty-print a result produced by ``question``.

    Prints, in this order: the description stored under
    ``response['data']['description']``, the selected entity from
    ``response['data']['selected_entity']``, and one line per candidate in
    ``response['matching']``.
    """
    response = question['response']
    details = response['data']
    print("Answer: {}\n".format(details['description']))

    print("Selected Entity: \n")
    print(details['selected_entity'])
    print("\n")

    print("Entity Confidence Table: \n")
    for candidate in response['matching']:
        print(candidate)

In [115]:
res = question("What is Açai?")
consume(res)

Answer: Açaí (acai) is a berry grown on the açaí palm tree (Euterpe oleracea), which is native to tropical Central and South America and grows mainly in floodplains and swamps. It produces small flowers that are brown to purple in color.

Selected Entity: 

{'entity': 'Acai', 'type': 'Medicine', 'confidence': 0.75}


Entity Confidence Table: 

{'entity': 'Acai', 'type': 'Medicine', 'confidence': 0.75}
{'entity': 'Açaï', 'type': 'Synonymous', 'confidence': 0.75}
{'entity': 'Assai', 'type': 'Synonymous', 'confidence': 0.66666666666666663}
{'entity': 'Alasi', 'type': 'Synonymous', 'confidence': 0.66666666666666663}
{'entity': 'Arandi', 'type': 'Synonymous', 'confidence': 0.59999999999999998}


In [137]:
#def ngram_it(text, n=3):
#    """ Returns a list of ngrams of sizes 1 to n
#    """
#    ngrams = []
#    tokens = nltk.word_tokenize(text)
#    
#    for i in range(1, n + 1):
#        ngrams += list(map(" ".join, nltk.ngrams(tokens, i)))
#    
#    return ngrams

In [None]:
# Close the Neo4j Session
#session.close()