# Exploring the Entity Extraction System

In [251]:
import difflib
import os
import pickle
import re

import nltk
import numpy as np
import pandas as pd
from neo4j.v1 import GraphDatabase, basic_auth

# Neo4j driver and client.
# Connection settings are read from the environment so that credentials do not
# have to live in the notebook. The fallbacks are the original local values,
# so the cell behaves as before when the variables are unset.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "naturalmed")

driver = GraphDatabase.driver(NEO4J_URI, auth=basic_auth(NEO4J_USER, NEO4J_PASSWORD))
session = driver.session()

In [225]:
# Node labels whose members are loaded from the graph into the entity table;
# each string is interpolated verbatim into a `MATCH (n:<label>)` query.
# NOTE(review): 'HerbSuplement' (single "p") presumably mirrors the label as it
# is stored in the database -- confirm before correcting the spelling here.
ENTITY_LIST = [
    'Medicine',
    'Disease',
    'Food',
    'Context',
    'HerbSuplement',
    'LaboratoryTest',
    'Pharmacokinetics'
]

# Regular expressions for the supported question shapes; each capture group is
# one entity mention. The pattern is a raw string so that the regex escape `\?`
# is not parsed as an (invalid) Python string escape sequence.
QUESTIONS = [
    r'What is ([A-Z].*?)\?'
]

In [264]:
# Collect one (value, label) pair for every node of each configured label.
# 'Medicine' nodes are read through their `name` property; every other label
# is read through `id`.
entities = []
types = []

for label in ENTITY_LIST:
    key = 'id' if label != 'Medicine' else 'name'
    cypher = "MATCH (n:%s) RETURN n.%s;" % (label, key)

    for record in session.run(cypher):
        # Each record carries a single returned value: the requested property.
        entities.append(record.values()[0])
        types.append(label)

# The two parallel lists become the two columns of the entity table.
dframe = pd.DataFrame({
    'entity': pd.Series(entities),
    'type': pd.Series(types)
})

# Persist the table to disk.
with open("../Dumps/entities.pickle", "wb") as dump_file:
    pickle.dump(dframe, dump_file)

In [261]:
# Preview the first rows of the entity table.
dframe.head()

Unnamed: 0,entity,type
0,Abscess Root,Medicine
1,Abuta,Medicine
2,Acacia,Medicine
3,Acacia Rigidula,Medicine
4,Acai,Medicine


In [262]:
# Number of non-null values in each column.
dframe.count()

entity    3364
type      3364
dtype: int64

In [155]:
def comp_str(arr, string):
    """Lazily score every element of ``arr`` against ``string``.

    Each element is converted with ``str()`` and compared to ``string`` using
    ``difflib.SequenceMatcher.ratio()``; the element is the first sequence and
    ``string`` the second.

    Yields:
        One float between 0 and 1 per element, in input order
        (1.0 means the two strings are identical).
    """
    matcher = difflib.SequenceMatcher(None)

    for candidate in arr:
        # set_seqs sets the first and then the second sequence in one call.
        matcher.set_seqs(str(candidate), string)
        yield matcher.ratio()

In [169]:
# Work on a copy: plain assignment would only bind a second name to the same
# object, so adding scratch columns to `test_dframe` would also alter `dframe`.
test_dframe = dframe.copy()

In [171]:
# Score every entity name against the accented query "Açai" and store the
# result as a new `confidence` column.
test_dframe['confidence'] = list(comp_str(test_dframe['entity'], "Açai"))

In [178]:
# Five best-scoring candidates, highest confidence first.
test_dframe.sort_values(by='confidence', ascending=False).head(n=5)

Unnamed: 0,entity,type,confidence
4,Acai,Medicine,0.75
2,Acacia,Medicine,0.6
1,Abuta,Medicine,0.444444
13,Caffeine,Food,0.333333
3,Acacia Rigidula,Medicine,0.315789


In [233]:
def compile_question(string):
    """Match ``string`` against the known question patterns.

    The patterns in ``QUESTIONS`` are tried in order with ``re.match``, which
    anchors at the start of ``string``.

    Returns:
        The capture groups of the first matching pattern as a list, or
        ``None`` when no pattern matches.
    """
    attempts = (re.match(pattern, string) for pattern in QUESTIONS)
    # Generators are lazy, so patterns after the first hit are never tried.
    hit = next((found for found in attempts if found), None)
    if hit is None:
        return None
    return list(hit.groups())

In [234]:
# Check that the pattern captures an entity name containing a non-ASCII letter.
compile_question("What is Açai?")

['Açai']

In [246]:
def score_entity(dframe, entity, n=5):
    """Rank the rows of the entity table by similarity to ``entity``.

    Args:
        dframe: DataFrame with an ``entity`` column of candidate names.
        entity: The string to look up.
        n: Number of top-ranked rows to return (defaults to 5, the value
            that was previously fixed).

    Returns:
        A new DataFrame with a ``confidence`` column, holding the ``n`` rows
        with the highest confidence in descending order.
    """
    # `assign` builds a new frame, so the caller's table is not modified;
    # a direct column assignment here would add `confidence` to the shared frame.
    scored = dframe.assign(confidence=list(comp_str(dframe['entity'], entity)))
    return scored.sort_values(by='confidence', ascending=False).head(n=n)

In [269]:
# End-to-end check: extract the entity from a question, then rank candidates.
# compile_question returns None when nothing matches, so `e[0]` assumes a match.
e = compile_question("What is Marijuana?")
score_entity(dframe, e[0])

Unnamed: 0,entity,type,confidence
743,Marijuana,Medicine,1.0
2840,Marijuana,HerbSuplement,1.0
1834,Malaria,Disease,0.625
771,Moringa,Medicine,0.625
247,Carlina,Medicine,0.625


In [137]:
#def ngram_it(text, n=3):
#    """ Returns a list of ngrams of sizes 1 to n
#    """
#    ngrams = []
#    tokens = nltk.word_tokenize(text)
#    
#    for i in range(1, n + 1):
#        ngrams += list(map(" ".join, nltk.ngrams(tokens, i)))
#    
#    return ngrams

In [None]:
# Close the Neo4j Session
#session.close()