# Entity relations
A simple example of extracting relations between phrases and entities using
spaCy's named entity recognizer and the dependency parse. Here, we extract
money and currency values (entities labelled as MONEY) and then check the
dependency tree to find the noun phrase they are referring to – for example:

`$9.4 million --> Net income.`

In [1]:
import spacy
import pandas as pd

In [2]:
# Sample financial sentences used throughout the notebook. Each one mentions
# at least one monetary amount together with the thing it quantifies.
TEXTS = [
    "Net income was $9.4 million compared to the prior year of $2.7 million.",
    "Revenue exceeded twelve billion dollars, with a loss of $1b.",
]

In [4]:
# Load the large English pipeline once for the exploratory cells below.
nlp = spacy.load('en_core_web_lg')

# Join the sentences with a single space. Concatenating them back to back
# would produce "...$2.7 million.Revenue exceeded..." and glue the end of one
# sentence onto the start of the next before the text reaches the parser.
t = ' '.join(TEXTS)
doc = nlp(t)

In [5]:
# List every named entity found in the parsed document with its label,
# numbering from 1 and leaving a blank line after each entry.
for position, ent in enumerate(doc.ents, start=1):
    heading = 'Entity {}:'.format(position)
    print(heading, ent, '-', ent.label_)
    print('')

Entity 1: $9.4 million - MONEY

Entity 2: the prior year - DATE

Entity 3: $2.7 million - MONEY

Entity 4: twelve billion dollars - MONEY

Entity 5: 1b - MONEY



In [None]:
# Debugging aid: display the class of the object returned by nlp(...).
type(doc)

In [6]:
# Verbatim text and coarse part-of-speech tag of every token in the document.
token_text = [tok.orth_ for tok in doc]
token_pos = [tok.pos_ for tok in doc]

# Both lists come from the same document, so they line up row by row.
pd.DataFrame({'token_text': token_text,
              'part_of_speech': token_pos})

Unnamed: 0,token_text,part_of_speech
0,Net,ADJ
1,income,NOUN
2,was,VERB
3,$,SYM
4,9.4,NUM
5,million,NUM
6,compared,VERB
7,to,ADP
8,the,DET
9,prior,ADJ


In [7]:
# NOTE(review): this cell repeats the preceding token/part-of-speech cell
# and yields the same table; it can probably be dropped.
token_text, token_pos = [], []
for tok in doc:
    # Gather the verbatim text and the coarse POS tag in a single pass.
    token_text.append(tok.orth_)
    token_pos.append(tok.pos_)

pos_rows = list(zip(token_text, token_pos))
pd.DataFrame(pos_rows, columns=['token_text', 'part_of_speech'])

Unnamed: 0,token_text,part_of_speech
0,Net,ADJ
1,income,NOUN
2,was,VERB
3,$,SYM
4,9.4,NUM
5,million,NUM
6,compared,VERB
7,to,ADP
8,the,DET
9,prior,ADJ


In [8]:
# Lemma and orthographic shape of every token in the document.
token_lemma = [tok.lemma_ for tok in doc]
token_shape = [tok.shape_ for tok in doc]

# token_text is reused from the earlier part-of-speech cell.
lemma_rows = list(zip(token_text, token_lemma, token_shape))
pd.DataFrame(lemma_rows,
             columns=['token_text', 'token_lemma', 'token_shape'])

Unnamed: 0,token_text,token_lemma,token_shape
0,Net,net,Xxx
1,income,income,xxxx
2,was,be,xxx
3,$,$,$
4,9.4,9.4,d.d
5,million,million,xxxx
6,compared,compare,xxxx
7,to,to,xx
8,the,the,xxx
9,prior,prior,xxxx


In [9]:
# One row of lexical attributes per token: its text, its log probability and
# five boolean flags.
token_attributes = [(token.orth_,
                     token.prob,
                     token.is_stop,
                     token.is_punct,
                     token.is_space,
                     token.like_num,
                     token.is_oov)
                    for token in doc]

df = pd.DataFrame(token_attributes,
                  columns=['text',
                           'log_probability',
                           'stop?',
                           'punctuation?',
                           'whitespace?',
                           'number?',
                           'out of vocab.?'])

# Show the boolean flags as 'Yes' / '' so the table is easier to scan.
# Series.map is applied per column instead of DataFrame.applymap, which is
# deprecated in recent pandas releases. Each column is replaced as a whole
# rather than written through .loc, so no strings are forced into an
# existing boolean column in place.
for column in df.loc[:, 'stop?':'out of vocab.?'].columns:
    df[column] = df[column].map(lambda flag: u'Yes' if flag else u'')

df

Unnamed: 0,text,log_probability,stop?,punctuation?,whitespace?,number?,out of vocab.?
0,Net,-12.989803,,,,,
1,income,-9.906891,,,,,
2,was,-5.25232,,,,,
3,$,-7.450107,,,,,
4,9.4,-15.661046,,,,Yes,
5,million,-9.529832,,,,Yes,
6,compared,-9.563892,,,,,
7,to,-3.856022,,,,,
8,the,-3.528767,,,,,
9,prior,-10.569441,,,,,


In [10]:
def main(model='en_core_web_lg'):
    """Parse every sentence in TEXTS and print the currency relations found.

    Args:
        model: Name of the spaCy model to load.

    For each relation one tab-separated line is printed: the related phrase
    (left-aligned, padded to 10 characters), the entity type of the amount,
    and the amount text.
    """
    pipeline = spacy.load(model)
    print("Loaded model '%s'" % model)
    print("Processing %d texts" % len(TEXTS))

    row_template = '{:<10}\t{}\t{}'
    # Lazy generator: each text is parsed right before its relations are
    # extracted, one document at a time.
    parsed_docs = (pipeline(sentence) for sentence in TEXTS)
    for parsed in parsed_docs:
        for phrase, amount in extract_currency_relations(parsed):
            print(row_template.format(phrase.text, amount.ent_type_, amount.text))

In [11]:
def _filter_overlapping_spans(spans):
    """Return non-overlapping spans in document order, preferring longer ones.

    When two spans share a token the longer one is kept; for equal lengths
    the one that starts earlier (or appears first in `spans`) wins.
    """
    by_length = sorted(spans,
                       key=lambda span: (span.end - span.start, -span.start),
                       reverse=True)
    kept = []
    taken = set()
    for span in by_length:
        covered = range(span.start, span.end)
        if taken.isdisjoint(covered):
            kept.append(span)
            taken.update(covered)
    return sorted(kept, key=lambda span: span.start)


def extract_currency_relations(doc):
    """Pair every MONEY token in `doc` with the phrase it refers to.

    Entities and noun chunks are first merged into single tokens (this
    modifies `doc` in place). Each MONEY token is then resolved through the
    dependency parse:

    * attribute or direct object ('attr', 'dobj'): paired with the first
      'nsubj' token to the left of its head;
    * object of a preposition ('pobj' whose head is a 'prep'): paired with
      the head of that preposition.

    Args:
        doc: A parsed spaCy document.

    Returns:
        A list of (related_token, money_token) tuples in document order.
    """
    # Entities and noun chunks frequently cover the same tokens, and merging
    # overlapping spans is rejected by the retokenizer, so drop overlaps first.
    spans = _filter_overlapping_spans(list(doc.ents) + list(doc.noun_chunks))

    if hasattr(doc, 'retokenize'):
        # Span.merge() is deprecated; the retokenizer applies all merges in
        # one step when the context manager exits.
        with doc.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span)
    else:
        # Older spaCy releases without Doc.retokenize.
        for span in spans:
            span.merge()

    relations = []
    for money in (token for token in doc if token.ent_type_ == 'MONEY'):
        if money.dep_ in ('attr', 'dobj'):
            subjects = [w for w in money.head.lefts if w.dep_ == 'nsubj']
            if subjects:
                relations.append((subjects[0], money))
        elif money.dep_ == 'pobj' and money.head.dep_ == 'prep':
            relations.append((money.head.head, money))
    return relations

In [12]:
# Show the raw input sentences that main() iterates over.
print(TEXTS)

['Net income was $9.4 million compared to the prior year of $2.7 million.', 'Revenue exceeded twelve billion dollars, with a loss of $1b.']


In [13]:
# Run the extraction over every sentence in TEXTS with the default model.
main()

Loaded model 'en_core_web_lg'
Processing 2 texts
Net income	MONEY	$9.4 million
the prior year	MONEY	$2.7 million
Revenue   	MONEY	twelve billion dollars
a loss    	MONEY	1b
