In [1]:
'''
! pip install inflect
! python -m spacy download en_core_web_md
! pip install textacy
! pip install neuralcoref
'''

'\n! pip install inflect\n! python -m spacy download en_core_web_md\n! pip install textacy\n! pip install neuralcoref\n'

## Counting nouns – plural and singular nouns

* Determine whether a noun is plural or singular
* Turn plural nouns into singular nouns and vice versa

In [2]:
import sys
sys.path.append('..')
from Chapter01.pos_tagging import pos_tag_nltk

In [3]:
import nltk
from nltk.stem import WordNetLemmatizer
import inflect

In [4]:
with open('../Chapter01/sherlock_holmes_1.txt', 'r', encoding="utf-8") as f:
    text = f.read()

In [5]:
text = text.replace('\n', ' ')

In [6]:
text

'To Sherlock Holmes she is always _the_ woman. I have seldom heard him mention her under any other name. In his eyes she eclipses and predominates the whole of her sex. It was not that he felt any emotion akin to love for Irene Adler. All emotions, and that one particularly, were abhorrent to his cold, precise but admirably balanced mind. He was, I take it, the most perfect reasoning and observing machine that the world has seen, but as a lover he would have placed himself in a false position. He never spoke of the softer passions, save with a gibe and a sneer. They were admirable things for the observer—excellent for drawing the veil from men’s motives and actions. But for the trained reasoner to admit such intrusions into his own delicate and finely adjusted temperament was to introduce a distracting factor which might throw a doubt upon all his mental results. Grit in a sensitive instrument, or a crack in one of his own high-power lenses, would not be more disturbing than a strong e

In [7]:
words_with_pos = pos_tag_nltk(text)
words_with_pos[:5]

[('To', 'TO'),
 ('Sherlock', 'NNP'),
 ('Holmes', 'NNP'),
 ('she', 'PRP'),
 ('is', 'VBZ')]

In [8]:
def get_nouns(words_with_pos):
    """Keep only the tagged words whose POS tag is a common-noun tag.

    Args:
        words_with_pos: iterable of (word, tag) pairs, as produced by the
            NLTK tagger used earlier in the notebook.

    Returns:
        A list of the pairs whose tag is "NN" or "NNS", in input order.
        Pairs carrying any other tag (e.g. "NNP") are dropped.
    """
    noun_tags = ("NN", "NNS")
    return list(filter(lambda pair: pair[1] in noun_tags, words_with_pos))

In [9]:
nouns = get_nouns(words_with_pos)
print(nouns)

[('woman', 'NN'), ('name', 'NN'), ('eyes', 'NNS'), ('whole', 'NN'), ('sex', 'NN'), ('emotion', 'NN'), ('akin', 'NN'), ('emotions', 'NNS'), ('cold', 'NN'), ('precise', 'NN'), ('mind', 'NN'), ('reasoning', 'NN'), ('machine', 'NN'), ('world', 'NN'), ('lover', 'NN'), ('position', 'NN'), ('passions', 'NNS'), ('gibe', 'NN'), ('sneer', 'NN'), ('things', 'NNS'), ('observer—excellent', 'NN'), ('veil', 'NN'), ('men', 'NNS'), ('motives', 'NNS'), ('actions', 'NNS'), ('reasoner', 'NN'), ('intrusions', 'NNS'), ('delicate', 'NN'), ('temperament', 'NN'), ('distracting', 'NN'), ('factor', 'NN'), ('doubt', 'NN'), ('results', 'NNS'), ('instrument', 'NN'), ('crack', 'NN'), ('high-power', 'NN'), ('lenses', 'NNS'), ('emotion', 'NN'), ('nature', 'NN'), ('woman', 'NN'), ('woman', 'NN'), ('memory', 'NN')]


In [10]:
def is_plural_nltk(noun_info):
    """Tell whether a tagged noun is plural, based on its POS tag alone.

    Args:
        noun_info: a (word, tag) pair.

    Returns:
        True when the tag is "NNS", False otherwise.
    """
    return noun_info[1] == "NNS"

In [11]:
is_plural_nltk(nouns[2])

True

In [12]:
def is_plural_wn(noun):
    """Tell whether a noun is plural by comparing it with its WordNet lemma.

    Args:
        noun: the noun as a string.

    Returns:
        True when lemmatizing the word as a noun yields a different string,
        False when the lemma equals the input.
    """
    wnl = WordNetLemmatizer()
    lemma = wnl.lemmatize(noun, 'n')
    # Compare by value: `is not` tests object identity, which for strings
    # depends on whether the lemmatizer happens to hand back the very same
    # object and is not a reliable equality check.
    return noun != lemma

In [13]:
is_plural_wn('women')

True

In [14]:
def get_plural(singular_noun):
    """Return the plural form of a noun as produced by inflect.

    Args:
        singular_noun: the word to pluralize.

    Returns:
        Whatever `inflect.engine().plural` returns for the word.
    """
    engine = inflect.engine()
    plural_form = engine.plural(singular_noun)
    return plural_form

In [15]:
get_plural('run')

'runs'

In [16]:
def get_singular(plural_noun):
    """Return the singular form of a noun, or the noun itself as a fallback.

    Args:
        plural_noun: the word to singularize.

    Returns:
        The result of inflect's `singular_noun` when it is truthy; otherwise
        the input word unchanged (presumably inflect reports "not a plural"
        with a falsy value - confirm against the inflect docs).
    """
    singular = inflect.engine().singular_noun(plural_noun)
    return singular or plural_noun

In [17]:
get_singular('emotions')

'emotion'

In [18]:
def plurals_wn(words_with_pos):
    """Flip the grammatical number of every tagged noun.

    Each word judged plural by `is_plural_wn` is replaced by its singular
    form; every other word is replaced by its plural form.

    Args:
        words_with_pos: iterable of (word, tag) pairs; only the word is used.

    Returns:
        A list of converted words, one per input pair, in input order.
    """
    return [
        get_singular(entry[0]) if is_plural_wn(entry[0]) else get_plural(entry[0])
        for entry in words_with_pos
    ]

In [19]:
other_nouns_wn = plurals_wn(nouns)
other_nouns_wn

['women',
 'names',
 'eye',
 'wholes',
 'sexes',
 'emotions',
 'akins',
 'emotion',
 'colds',
 'precises',
 'minds',
 'reasonings',
 'machines',
 'worlds',
 'lovers',
 'positions',
 'passion',
 'gibes',
 'sneers',
 'thing',
 'observer—excellents',
 'veils',
 'mens',
 'motive',
 'action',
 'reasoners',
 'intrusion',
 'delicates',
 'temperaments',
 'distractings',
 'factors',
 'doubts',
 'result',
 'instruments',
 'cracks',
 'high-powers',
 'lens',
 'emotions',
 'natures',
 'women',
 'women',
 'memories']

## Getting the dependency parse

* A dependency parse is a tool that shows dependencies in a sentence. 

* For example, in the sentence The cat wore a hat, the root of the sentence is the verb, wore, and both the subject, the cat, and the object, a hat, are dependents. 

In [20]:
import spacy

In [21]:
sentence = 'I have seldom heard him mention her under any other name.'

In [22]:
nlp = spacy.load('en_core_web_sm')

In [23]:
doc = nlp(sentence)

In [24]:
# One line per token: its text, its dependency label, and spaCy's gloss for that label.
for token in doc:
    label = token.dep_
    print(token.text, label, spacy.explain(label), sep=' \t ')

I 	 nsubj 	 nominal subject
have 	 aux 	 auxiliary
seldom 	 advmod 	 adverbial modifier
heard 	 ROOT 	 None
him 	 nsubj 	 nominal subject
mention 	 ccomp 	 clausal complement
her 	 dobj 	 direct object
under 	 prep 	 prepositional modifier
any 	 det 	 determiner
other 	 amod 	 adjectival modifier
name 	 pobj 	 object of preposition
. 	 punct 	 punctuation


* ROOT is the main word that all the other words depend on, usually the verb.


* 
---

* To explore the dependency parse structure, we can use the attributes of the Token class, in particular its ancestors and children attributes.


* We can get the **tokens that this token depends on** and the **tokens that depend on it**, respectively.

In [25]:
# For every token, show the chain of tokens it depends on (closest head first).
for token in doc:
    ancestor = [t.text for t in token.ancestors]
    print(token.text, ancestor, sep='\n')

I
['heard']
have
['heard']
seldom
['heard']
heard
[]
him
['mention', 'heard']
mention
['heard']
her
['mention', 'heard']
under
['mention', 'heard']
any
['name', 'under', 'mention', 'heard']
other
['name', 'under', 'mention', 'heard']
name
['under', 'mention', 'heard']
.
['heard']


In [26]:
# For every token, show the tokens that depend on it directly.
for token in doc:
    children = [t.text for t in token.children]
    print(token.text, children, sep='\n')

I
[]
have
[]
seldom
[]
heard
['I', 'have', 'seldom', 'mention', '.']
him
[]
mention
['him', 'her', 'under']
her
[]
under
['name']
any
[]
other
[]
name
['any', 'other']
.
[]


In [27]:
# For every token, show its whole subtree: the token plus everything below it.
for token in doc:
    subtree = [t.text for t in token.subtree]
    print(token.text, subtree, sep='\n')

I
['I']
have
['have']
seldom
['seldom']
heard
['I', 'have', 'seldom', 'heard', 'him', 'mention', 'her', 'under', 'any', 'other', 'name', '.']
him
['him']
mention
['him', 'mention', 'her', 'under', 'any', 'other', 'name']
her
['her']
under
['under', 'any', 'other', 'name']
any
['any']
other
['other']
name
['any', 'other', 'name']
.
['.']


In [28]:
from spacy import displacy
displacy.render(doc,jupyter=True)

* https://www.analyticsvidhya.com/blog/2020/07/part-of-speechpos-tagging-dependency-parsing-and-constituency-parsing-in-nlp/

## Splitting sentences into clauses

In [29]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [30]:
sentence = "He eats cheese, but he won't eat ice cream."

In [31]:
doc = nlp(sentence)

In [32]:
# Print the dependency parse information; it will help us determine how to split the sentence into clauses.

# Tab-separated row per token: text, index, POS, dependency label, ancestors, children.
for token in doc:
    ancestors = [t.text for t in token.ancestors]
    children = [t.text for t in token.children]
    row = (token.text, token.i, token.pos_, token.dep_, ancestors, children)
    print(*row, sep=" \t ")

He 	 0 	 PRON 	 nsubj 	 ['eats'] 	 []
eats 	 1 	 VERB 	 ROOT 	 [] 	 ['He', 'cheese', ',', 'but', 'eat']
cheese 	 2 	 NOUN 	 dobj 	 ['eats'] 	 []
, 	 3 	 PUNCT 	 punct 	 ['eats'] 	 []
but 	 4 	 CCONJ 	 cc 	 ['eats'] 	 []
he 	 5 	 PRON 	 nsubj 	 ['eat', 'eats'] 	 []
wo 	 6 	 AUX 	 aux 	 ['eat', 'eats'] 	 []
n't 	 7 	 PART 	 neg 	 ['eat', 'eats'] 	 []
eat 	 8 	 VERB 	 conj 	 ['eats'] 	 ['he', 'wo', "n't", 'cream', '.']
ice 	 9 	 NOUN 	 compound 	 ['cream', 'eat', 'eats'] 	 []
cream 	 10 	 NOUN 	 dobj 	 ['eat', 'eats'] 	 ['ice']
. 	 11 	 PUNCT 	 punct 	 ['eat', 'eats'] 	 []


In [33]:
# returns the token that has a dependency tag of ROOT

def find_root_of_sentence(doc):
    """Return the token whose dependency label is 'ROOT'.

    Args:
        doc: an iterable of tokens exposing a `dep_` attribute.

    Returns:
        The last token labelled 'ROOT' when several are present, or None
        when there is none.
    """
    roots = [candidate for candidate in doc if candidate.dep_ == 'ROOT']
    return roots[-1] if roots else None

In [34]:
root_token = find_root_of_sentence(doc)
root_token

eats

In [35]:
# We can now use the following function to find the other verbs in the sentence:

def find_other_verbs(doc, root_token):
    """Collect the verbs that hang directly off the sentence root.

    A token qualifies when its POS is 'VERB' and its only ancestor is
    `root_token`.

    Args:
        doc: an iterable of tokens exposing `pos_` and `ancestors`.
        root_token: the root token of the sentence.

    Returns:
        A list of the qualifying tokens in document order.
    """
    tokens_with_chain = ((tok, list(tok.ancestors)) for tok in doc)
    return [
        tok
        for tok, chain in tokens_with_chain
        if tok.pos_ == 'VERB' and len(chain) == 1 and chain[0] == root_token
    ]

In [36]:
root_token

eats

In [37]:
other_verbs = find_other_verbs(doc, root_token)
other_verbs

[eat]

In [38]:
# We will use the following function to find the token spans for each verb:
# find the beginning and ending index for the verb.

def get_clause_token_span_for_verb(verb, doc, all_verbs):
    """Find the lowest and highest token index among a verb's non-verb children.

    Only direct children of `verb` are inspected, and children that are
    themselves in `all_verbs` are ignored. The verb's own index is not
    considered.

    Args:
        verb: the token whose clause boundaries are wanted.
        doc: the document; only its length is used, as the initial lower bound.
        all_verbs: the verbs that each own a clause.

    Returns:
        A (first_index, last_index) tuple. With no eligible children this is
        (len(doc), 0).
    """
    positions = [kid.i for kid in verb.children if kid not in all_verbs]
    # Seed each bound with the original sentinel so an empty child list
    # still yields (len(doc), 0).
    first_token_index = min([len(doc), *positions])
    last_token_index = max([0, *positions])
    return (first_token_index, last_token_index)


In [39]:
# We will put together all the verbs in one array and process each using the preceding function. 
# This will return a tuple of start and end indices for each verb's clause:

# Root verb first, then the verbs attached directly to it.
all_verbs = [root_token, *other_verbs]

# One (first_index, last_index) tuple per verb, in the same order as all_verbs.
token_spans = [
    get_clause_token_span_for_verb(clause_verb, doc, all_verbs)
    for clause_verb in all_verbs
]
token_spans

[(0, 4), (5, 11)]

In [40]:
# Using the start and end indices, we can now put together token spans for each clause. 
# We sort the sentence_clauses list at the end so that the clauses are in the order they appear in the sentence:

# Slice one span per clause. doc[start:end] stops before the token at `end`,
# and index pairs with start >= end produce no clause.
sentence_clauses = [
    doc[start:end]
    for start, end in token_spans
    if start < end
]

# Put the clauses back in sentence order, keyed on where each one begins.
sentence_clauses.sort(key=lambda clause: clause.start)
print(sentence_clauses)

[He eats cheese,, he won't eat ice cream]


In [41]:
clauses_text = [clause.text for clause in sentence_clauses]
print(clauses_text)

['He eats cheese,', "he won't eat ice cream"]


# Extracting noun chunks

In [42]:
import spacy
from Chapter01.dividing_into_sentences import read_text_file

In [43]:
text = read_text_file("../Chapter01/sherlock_holmes_1.txt")

In [44]:
nlp = spacy.load('en_core_web_md')
doc = nlp(text)

In [45]:
# print out the noun chunks

for noun_chunk in doc.noun_chunks:
    print(noun_chunk.text)

Sherlock Holmes
she
the_ woman
I
him
her
any other name
his eyes
she
the whole
her sex
It
he
any emotion
Irene Adler
All emotions
his cold, precise but admirably balanced mind
He
I
it
the most perfect reasoning
observing machine
the world
a lover
he
himself
a
false position
He
the softer passions
a sneer
They
admirable things
the observer
the veil
men’s motives
actions
the trained
reasoner
such intrusions
his own delicate and finely
adjusted temperament
a distracting factor
a doubt
all his mental results
Grit
a sensitive
instrument
a crack
his own high-power lenses
a strong emotion
a nature
his
but one woman
him
that woman
the late Irene
Adler
dubious and questionable memory


In [46]:
sentence = "All emotions, and that one particularly, were abhorrent to his cold, precise but admirably balanced mind."

In [47]:
# nlp = spacy.load('en_core_web_md')

In [48]:
doc = nlp(sentence)

In [49]:
for noun_chunk in doc.noun_chunks:
    print(noun_chunk)

All emotions
his cold, precise but admirably balanced mind


In [50]:
# basic properties of noun chunks are its start and end offsets

for noun_chunk in doc.noun_chunks:
    print(noun_chunk.text, '\t', noun_chunk.start, '\t', noun_chunk.end)

All emotions 	 0 	 2
his cold, precise but admirably balanced mind 	 11 	 19


In [51]:
# print out the sentence that contains each noun chunk

for noun_chunk in doc.noun_chunks:
    print(noun_chunk.text, '\t', noun_chunk.sent)

All emotions 	 All emotions, and that one particularly, were abhorrent to his cold, precise but admirably balanced mind.
his cold, precise but admirably balanced mind 	 All emotions, and that one particularly, were abhorrent to his cold, precise but admirably balanced mind.


In [52]:
#  any noun chunk includes a root, which is the token that all other tokens depend on. 
# In a noun phrase, that is the noun:

for noun_chunk in doc.noun_chunks:
    print(noun_chunk.text, '\t', noun_chunk.root.text)


All emotions 	 emotions
his cold, precise but admirably balanced mind 	 mind


In [53]:
# semantic similarity of different texts. 

other_span = "emotions"
other_doc = nlp(other_span)

In [54]:
for noun_chunk in doc.noun_chunks:
    print(noun_chunk.similarity(other_doc))

0.8876554549427152
0.5102475977383759


# Extracting entities and relations

In [55]:
import spacy
import textacy
from Chapter02.split_into_clauses_3 import find_root_of_sentence

In [56]:
nlp = spacy.load('en_core_web_sm')

In [57]:
sentences = ["All living things are made of cells.", "Cells have organelles."]

In [58]:
verb_patterns = [[{"POS":"AUX"}, {"POS":"VERB"}, {"POS":"ADP"}], [{"POS":"AUX"}],[{"POS":"VERB"}]]

In [59]:
# The contains_root function checks if a verb phrase contains the root of the sentence:

def contains_root(verb_phrase, root):
    """Tell whether the root token lies inside a verb phrase.

    Args:
        verb_phrase: a span exposing `start` (inclusive) and `end`
            (exclusive) token indices.
        root: a token exposing its index as `i`.

    Returns:
        True when `root.i` falls within the span, False otherwise.
    """
    return verb_phrase.start <= root.i < verb_phrase.end

In [60]:
# The get_verb_phrases function gets the verb phrases from a spaCy Doc object:

def get_verb_phrases(doc):
    """Return the verb-pattern matches that include the sentence root.

    The document is matched against the module-level `verb_patterns` with
    textacy's token matcher; matches that do not cover the root token are
    discarded.

    Args:
        doc: a spaCy Doc.

    Returns:
        A list of matching spans, in the order the matcher produced them.
    """
    sentence_root = find_root_of_sentence(doc)
    candidates = list(textacy.extract.matches.token_matches(doc, verb_patterns))
    return [span for span in candidates if contains_root(span, sentence_root)]
    

In [61]:
# The longer_verb_phrase function finds the longest verb phrase:

def longer_verb_phrase(verb_phrases):
    """Return the longest verb phrase from a collection.

    Args:
        verb_phrases: iterable of phrases supporting `len()`.

    Returns:
        The phrase with the greatest length; on a tie the earliest one wins.
        None when the collection is empty or holds only zero-length phrases.
    """
    longest_length = 0
    longest_verb_phrase = None
    for verb_phrase in verb_phrases:
        phrase_length = len(verb_phrase)
        if phrase_length > longest_length:
            # Track the running maximum; without this update every non-empty
            # phrase would replace the previous one and the last would win.
            longest_length = phrase_length
            longest_verb_phrase = verb_phrase
    return longest_verb_phrase

In [62]:
# The find_noun_phrase function will look for noun phrases 
# either on the left- or right-hand side of the main verb phrase:

def find_noun_phrase(verb_phrase, noun_phrases, side):
    """Return the first noun phrase on the requested side of a verb phrase.

    Sides are decided by comparing `start` indices: "left" means the noun
    phrase starts before the verb phrase, "right" means it starts after.

    Args:
        verb_phrase: span exposing `start`.
        noun_phrases: iterable of spans exposing `start`; it is consumed only
            up to and including the first match.
        side: "left" or "right"; any other value matches nothing.

    Returns:
        The first matching noun phrase, or None when there is none.
    """
    def on_requested_side(candidate):
        if side == "left":
            return candidate.start < verb_phrase.start
        if side == "right":
            return candidate.start > verb_phrase.start
        return False

    return next((np for np in noun_phrases if on_requested_side(np)), None)

In [63]:
def find_triplet(sentence):
    """Extract a (left noun phrase, verb phrase, right noun phrase) triplet.

    The sentence is parsed with the module-level `nlp` pipeline. The main
    verb phrase is the one containing the sentence root (the longest one when
    several match), and the noun phrases are the first noun chunks found on
    either side of it.

    Args:
        sentence: the sentence text.

    Returns:
        A 3-tuple (left_noun_phrase, verb_phrase, right_noun_phrase). A noun
        phrase slot is None when no chunk exists on that side. All three
        slots are None when no verb phrase containing the root is found.
    """
    doc = nlp(sentence)
    verb_phrases = get_verb_phrases(doc)
    if not verb_phrases:
        # No anchor to search around; avoid indexing into an empty list.
        return (None, None, None)

    # noun_chunks is iterated twice below, so materialize it: a one-shot
    # iterator exhausted by the left-hand search would hide every chunk from
    # the right-hand search.
    noun_phrases = list(doc.noun_chunks)

    if len(verb_phrases) > 1:
        verb_phrase = longer_verb_phrase(list(verb_phrases))
    else:
        verb_phrase = verb_phrases[0]

    left_noun_phrase = find_noun_phrase(verb_phrase, noun_phrases, "left")
    right_noun_phrase = find_noun_phrase(verb_phrase, noun_phrases, "right")
    return (left_noun_phrase, verb_phrase, right_noun_phrase)

In [64]:
# Print each sentence's (left noun phrase, verb phrase, right noun phrase) on one tab-separated line.
for sentence in sentences:
    (left_np, vp, right_np) = triplet = find_triplet(sentence)
    print(*triplet, sep=" \t ")

All living things 	 are made of 	 cells
Cells 	 have 	 organelles


## Extracting subjects and objects of the sentence

In [12]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [13]:
sentences=["The big black cat stared at the small dog.",
           "Jane watched her brother in the evenings.",
          "Laura gave Sam a very interesting book."]

In [14]:
# Function that finds the first token with "subj" in its dependency tag and returns the span covering that token's subtree

def get_subject_phrase(doc):
    """Return the span covering the subtree of the first subject token.

    A token counts as a subject when its dependency label contains "subj".

    Args:
        doc: a sliceable sequence of tokens exposing `dep_`, `subtree`
            and `i`.

    Returns:
        `doc[start:end]` spanning the subject's subtree, or None when no
        token qualifies.
    """
    subject = next((tok for tok in doc if "subj" in tok.dep_), None)
    if subject is None:
        return None
    covered = list(subject.subtree)
    return doc[covered[0].i:covered[-1].i + 1]

In [15]:
# direct object function. If the sentence does not have a direct object, it will return None

def get_object_phrase(doc):
    """Return the span covering the subtree of the first direct-object token.

    A token counts as a direct object when its dependency label contains
    "dobj".

    Args:
        doc: a sliceable sequence of tokens exposing `dep_`, `subtree`
            and `i`.

    Returns:
        `doc[start:end]` spanning the object's subtree, or None when the
        sentence has no direct object.
    """
    for candidate in doc:
        if "dobj" in candidate.dep_:
            break
    else:
        # Loop ran to completion without finding a direct object.
        return None
    covered = list(candidate.subtree)
    first, last = covered[0].i, covered[-1].i
    return doc[first:last + 1]

In [16]:
# For each sentence, print its subject phrase, its direct-object phrase, and a separator line.
for sentence in sentences:
    doc = nlp(sentence)
    subject_phrase, object_phrase = get_subject_phrase(doc), get_object_phrase(doc)
    print(subject_phrase, object_phrase, '************', sep='\n')
    

The big black cat
None
************
Jane
her brother
************
Laura
a very interesting book
************


In [17]:
# The dative object function checks the tokens for the dative tag. It returns None if there are no dative objects:

def get_dative_phrase(doc):
    """Return the span covering the subtree of the first dative token.

    A token counts as dative when its dependency label contains "dative".

    Args:
        doc: a sliceable sequence of tokens exposing `dep_`, `subtree`
            and `i`.

    Returns:
        `doc[start:end]` spanning the dative token's subtree, or None when
        there is no dative object.
    """
    for tok in doc:
        if "dative" not in tok.dep_:
            continue
        members = list(tok.subtree)
        return doc[members[0].i:members[-1].i + 1]
    return None

In [18]:
# the prepositional object function. It returns a list of objects of prepositions, but will be empty if there are none:

def get_prepositional_phrase_objs(doc):
    """Collect the spans of every object of a preposition in the document.

    A token counts as a prepositional object when its dependency label
    contains "pobj"; each contributes the span covering its subtree.

    Args:
        doc: a sliceable sequence of tokens exposing `dep_`, `subtree`
            and `i`.

    Returns:
        A list of `doc[start:end]` spans in document order; empty when there
        are no prepositional objects.
    """
    def subtree_span(tok):
        members = list(tok.subtree)
        return doc[members[0].i:members[-1].i + 1]

    return [subtree_span(tok) for tok in doc if "pobj" in tok.dep_]

In [19]:
# For each sentence, print its dative phrase, its list of prepositional objects, and a separator line.
for sentence in sentences:
    doc = nlp(sentence)
    dative_object_phrase, prepositional_object_phrase = (
        get_dative_phrase(doc),
        get_prepositional_phrase_objs(doc),
    )
    print(dative_object_phrase, prepositional_object_phrase, '************', sep='\n')

None
[the small dog]
************
None
[the evenings]
************
Sam
[]
************


## Finding references – anaphora resolution

* When we work on problems of extracting entities and relations from text,


* we are faced with real text, and many of our entities might end up being extracted as pronouns, such as she or him. 


* In order to tackle this issue, we need to perform anaphora resolution, or the process of substituting the pronouns with their referents.

In [11]:
import spacy
import neuralcoref

In [12]:
# ! python -m spacy download en_core_web_sm

In [13]:
nlp = spacy.load('en_core_web_sm')

In [14]:
neuralcoref.add_to_pipe(nlp)

<spacy.lang.en.English at 0x7fb1717d1910>

In [15]:
# Build the passage from adjacent string literals. A backslash continuation
# inside a single literal would pull the next source line's indentation into
# the text and leave no single space after "tracks.".
text = (
    "Earlier this year, Olga appeared on a new song. "
    "She was featured on one of the tracks. "
    "The singer is assuring that her next album will be worth the wait."
)

In [16]:
doc = nlp(text)
print(doc._.coref_resolved)

Earlier this year, Olga appeared on a new song. Olga was featured on one of the tracks.        Olga is assuring that Olga next album will be worth the wait.


In [17]:
text = "Deepika has a dog. She loves him. The movie star has always been fond of animals."

In [18]:
doc = nlp(text)
print(doc._.coref_resolved)

Deepika has a dog. Deepika loves Deepika. Deepika has always been fond of animals.


In [20]:
nlp = spacy.load('en_core_web_sm')
neuralcoref.add_to_pipe(nlp, conv_dict={'Deepika': ['woman']})

<spacy.lang.en.English at 0x7fb176400f50>

In [21]:
doc = nlp(text)
print(doc._.coref_resolved)

Deepika has a dog. Deepika loves a dog. Deepika has always been fond of animals.
