# Understanding the task

First, a simple example:

In [1]:
information = "When Harry met Sally"
words = information.split()
# The verb sits between the two participants, so find it once and read
# the words immediately to its left and right.
verb_at = words.index('met')
print(f"Participant1 = {words[verb_at - 1]}")
print("Action = met")
print(f"Participant2 = {words[verb_at + 1]}")

Participant1 = Harry
Action = met
Participant2 = Sally


Represent the action and its participants as tuples. Then you can easily extract the answers to queries:

In [1]:
# Every event is stored as a (participant1, action, participant2) triple.
meetings = [
    ('Boris Johnson', 'meets with', 'the Queen'),
    ('Donald Trump', 'meets with', 'his cabinet'),
    ('administration', 'meets with', 'tech giants'),
    ('the Queen', 'meets with', 'the Prime Minister'),
    ('Donald Trump', 'meets with', 'Finnish President'),
]
# Whom does Donald Trump meet? Keep the right-hand participant of every
# triple in which he is the left-hand one.
query = [
    partner
    for (initiator, _action, partner) in meetings
    if initiator == 'Donald Trump'
]
print(query)

['his cabinet', 'Finnish President']


As "meeting" is a mutual action, a participant may appear on the right or on the left – make sure both cases are covered:

In [2]:
# "Meeting" is mutual, so look for the Queen on either side of the triple:
# first the partners listed to her right, then those listed to her left.
met_as_first = [right for (left, _action, right) in meetings if left == 'the Queen']
met_as_second = [left for (left, _action, right) in meetings if right == 'the Queen']
query = met_as_first + met_as_second
print(query)

['the Prime Minister', 'Boris Johnson']


# Natural Language Processing with spaCy

## Part-of-speech tagging

Run the `nlp` pipeline on some input text:

In [3]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("On Friday board members meet with senior managers " +
          "to discuss future development of the company.")

# Header row followed by one row of string-valued attributes per token.
header = ["Word", "Position", "Lowercase", "Lemma", "POS", "Alphanumeric", "Stopword"]
rows = [header] + [
    [token.text, str(token.i), token.lower_, token.lemma_,
     token.pos_, str(token.is_alpha), str(token.is_stop)]
    for token in doc
]

# Pad every column to the width of its longest cell so the table lines up.
column_widths = [max(map(len, column)) for column in zip(*rows)]
for row in rows:
    print(''.join(f' {cell:{width}} ' for cell, width in zip(row, column_widths)))


 Word         Position  Lowercase    Lemma        POS    Alphanumeric  Stopword 
 On           0         on           on           ADP    True          True     
 Friday       1         friday       Friday       PROPN  True          False    
 board        2         board        board        NOUN   True          False    
 members      3         members      member       NOUN   True          False    
 meet         4         meet         meet         VERB   True          False    
 with         5         with         with         ADP    True          True     
 senior       6         senior       senior       ADJ    True          False    
 managers     7         managers     manager      NOUN   True          False    
 to           8         to           to           PART   True          True     
 discuss      9         discuss      discuss      VERB   True          False    
 future       10        future       future       ADJ    True          False    
 development  11        deve

Use a more challenging text, for example "Jabberwocky":

In [4]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Beware the Jabberwock, my son! The jaws that bite, the claws that catch! " +
          "Beware the Jubjub bird, and shun The frumious Bandersnatch!")

# Start with the header, then add one row of token attributes per word.
rows = [["Word", "Position", "Lowercase", "Lemma", "POS", "Alphanumeric", "Stopword"]]
rows.extend(
    [tok.text, str(tok.i), tok.lower_, tok.lemma_,
     tok.pos_, str(tok.is_alpha), str(tok.is_stop)]
    for tok in doc
)

# Each column is as wide as its longest entry; cells are left-aligned and
# wrapped in a single space on both sides.
column_widths = [len(max(column, key=len)) for column in zip(*rows)]
for row in rows:
    padded_cells = [cell.ljust(width) for cell, width in zip(row, column_widths)]
    print(''.join(' ' + cell + ' ' for cell in padded_cells))

 Word          Position  Lowercase     Lemma         POS    Alphanumeric  Stopword 
 Beware        0         beware        beware        VERB   True          False    
 the           1         the           the           DET    True          True     
 Jabberwock    2         jabberwock    Jabberwock    PROPN  True          False    
 ,             3         ,             ,             PUNCT  False         False    
 my            4         my            my            PRON   True          True     
 son           5         son           son           NOUN   True          False    
 !             6         !             !             PUNCT  False         False    
 The           7         the           the           DET    True          True     
 jaws          8         jaws          jaw           NOUN   True          False    
 that          9         that          that          DET    True          True     
 bite          10        bite          bite          VERB   True          Fa

## Parsing

Identify all noun phrases (groups of words that include a noun and all related words). These will be good candidates for the participants of the event:

In [5]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("On Friday, board members meet with senior managers " +
          "to discuss future development of the company.")

# One tab-separated line per noun phrase: the phrase, its root word, the
# root's dependency label, and the word the root attaches to.
for chunk in doc.noun_chunks:
    root = chunk.root
    print(chunk.text, root.text, root.dep_, root.head.text, sep='\t')

Friday	Friday	pobj	On
board members	members	nsubj	meet
senior managers	managers	pobj	with
future development	development	dobj	discuss
the company	company	pobj	of


Here is how you can visualize the dependencies – call `displacy` and store the output in a file:

In [6]:
from spacy import displacy
from pathlib import Path

# Render the dependency parse of `doc`; the result is written out as text
# below, so it is requested as markup rather than shown inline.
svg = displacy.render(doc, style='dep', jupyter=False)
# Name the file after the sentence's words, skipping punctuation tokens.
file_name = '-'.join([w.text for w in doc if not w.is_punct]) + ".svg"
output_path = Path(file_name)
# write_text opens, writes and closes the file in a single call, so no
# file handle is left open. It returns the number of characters written,
# which the notebook displays as the cell's output.
output_path.write_text(svg, encoding="utf-8")

12233

Get all the dependencies for all the words in the sentence:

In [7]:
# For every word: its text, dependency label, head word, the head's
# part of speech, and the list of words that depend on it.
for word in doc:
    dependents = list(word.children)
    print(word.text, word.dep_, word.head.text, word.head.pos_, dependents)

On prep meet VERB [Friday]
Friday pobj On ADP []
, punct meet VERB []
board compound members NOUN []
members nsubj meet VERB [board]
meet ROOT meet VERB [On, ,, members, with, discuss, .]
with prep meet VERB [managers]
senior amod managers NOUN []
managers pobj with ADP [senior]
to aux discuss VERB []
discuss advcl meet VERB [to, development]
future amod development NOUN []
development dobj discuss VERB [future, of]
of prep development NOUN [company]
the det company NOUN []
company pobj of ADP [the]
. punct meet VERB []


Now let's iterate through the words and only identify the participants of the action when the action is expressed with "meet":

In [8]:
# Defaults so the summary below still prints (with empty fields) when the
# sentence contains no root verb "meet".
action = participant1 = participant2 = ""
for token in doc:
    # Only the main verb of the sentence counts, and only if it is "meet".
    if token.lemma_=="meet" and token.pos_=="VERB" and token.dep_=="ROOT":
        action = token.text
        participant1 = ""
        participant2 = ""
        for child1 in token.children:
            if child1.dep_=="nsubj":
                # Subject: its direct dependents followed by the head word.
                # Joining one combined list avoids a leading space when the
                # head word has no dependents.
                participant1 = " ".join([attr.text for attr in child1.children]
                                        + [child1.text])
            elif child1.text=="with":
                # "meet with X": the preposition becomes part of the action
                # and the noun below it is the second participant.
                action += " " + child1.text
                for child2 in child1.children:
                    if child2.pos_ == "NOUN":
                        participant2 = " ".join([attr.text for attr in child2.children]
                                                + [child2.text])
print (f"Participant1 = {participant1}")
print (f"Action = {action}")
print (f"Participant2 = {participant2}")

Participant1 = board members
Action = meet with
Participant2 = senior managers


Use various sentences and improve the code so that it can deal with different formats of the expression:

In [9]:
# Test inputs: three ways of phrasing a meeting plus one sentence that
# does not mention a meeting at all.
sentences = [
    ("On Friday, board members meet with senior managers "
     "to discuss future development of the company."),
    "Boris Johnson met with the Queen last week.",
    "Donald Trump meets the Queen at Buckingham Palace.",
    ("The two leaders also posed for photographs and "
     "the President talked to reporters."),
]

def _phrase(head):
    """Return the texts of ``head``'s direct dependents followed by ``head``'s own text, space-separated."""
    # Joining a single combined list avoids a leading space when the head
    # word has no dependents.
    return " ".join([attr.text for attr in head.children] + [head.text])


def extract_information(doc):
    """Print the participants of a "meet" action found in ``doc``.

    Looks for a token whose lemma is "meet", whose part of speech is VERB
    and whose dependency label is ROOT. Among that token's children:

    * an ``nsubj`` child becomes Participant1;
    * a child with the text "with" is appended to the action, and a NOUN or
      PROPN below it becomes Participant2;
    * a ``dobj`` child that is a NOUN or PROPN becomes Participant2.

    Each participant is printed as its direct dependents followed by the
    word itself. All three fields are printed as empty strings when no
    matching verb is found.

    Args:
        doc: an iterable of tokens exposing ``text``, ``lemma_``, ``pos_``,
            ``dep_`` and ``children`` (the notebook passes the result of
            ``nlp(sentence)``).

    Returns:
        None. The result is written to standard output.
    """
    action = ""
    participant1 = ""
    participant2 = ""
    for token in doc:
        if token.lemma_ == "meet" and token.pos_ == "VERB" and token.dep_ == "ROOT":
            action = token.text
            for child1 in token.children:
                if child1.dep_ == "nsubj":
                    participant1 = _phrase(child1)
                elif child1.text == "with":
                    # "meet with X": the preposition is part of the action.
                    action += " " + child1.text
                    for child2 in child1.children:
                        if child2.pos_ in ("NOUN", "PROPN"):
                            participant2 = _phrase(child2)
                elif child1.dep_ == "dobj" and child1.pos_ in ("NOUN", "PROPN"):
                    # "meet X": the direct object is the second participant.
                    participant2 = _phrase(child1)
    print (f"Participant1 = {participant1}")
    print (f"Action = {action}")
    print (f"Participant2 = {participant2}")

# Parse each sentence and report its participants. The parse is handed
# straight to extract_information instead of being bound to `doc`, so the
# notebook-level `doc` that the earlier cells iterate over is not
# overwritten and those cells still give the same result when re-run.
for sent in sentences:
    print(f"\nSentence = {sent}")
    extract_information(nlp(sent))


Sentence = On Friday, board members meet with senior managers to discuss future development of the company.
Participant1 = board members
Action = meet with
Participant2 = senior managers

Sentence = Boris Johnson met with the Queen last week.
Participant1 = Boris Johnson
Action = met with
Participant2 = the Queen

Sentence = Donald Trump meets the Queen at Buckingham Palace.
Participant1 = Donald Trump
Action = meets
Participant2 = the Queen

Sentence = The two leaders also posed for photographs and the President talked to reporters.
Participant1 = 
Action = 
Participant2 = 
