In [10]:
import spacy
from spacy.lang.en import English
import pandas as pd

In [11]:
nlp = spacy.load("en_core_web_sm")

In [12]:
text = "The rain in Spain falls mainly on the plain."
doc = nlp(text)

In [13]:
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.is_stop)

The the DET True
rain rain NOUN False
in in ADP True
Spain Spain PROPN False
falls fall VERB False
mainly mainly ADV False
on on ADP True
the the DET True
plain plain NOUN False
. . PUNCT False


In [14]:
cols = ("text", "lemma", "POS", "explain", "stopword")
rows = []
for t in doc:
    row = [t.text, t.lemma_, t.pos_, spacy.explain(t.pos_), t.is_stop]
    rows.append(row)
df = pd.DataFrame(rows, columns=cols)
print(df)

     text   lemma    POS      explain  stopword
0     The     the    DET   determiner      True
1    rain    rain   NOUN         noun     False
2      in      in    ADP   adposition      True
3   Spain   Spain  PROPN  proper noun     False
4   falls    fall   VERB         verb     False
5  mainly  mainly    ADV       adverb     False
6      on      on    ADP   adposition      True
7     the     the    DET   determiner      True
8   plain   plain   NOUN         noun     False
9       .       .  PUNCT  punctuation     False


#### 
For each word in that sentence spaCy has created a token, to show:
raw text
lemma – a root form of the word
part of speech
a flag for whether the word is a stopword

### Visualize the parse tree for this sentence

#### This visualization shows the predictions from the loaded spaCy model. 
#### When you load a model, like en_core_web_sm, you load a pipeline of models that spaCy runs on your behalf. 
#### One of these models is called the "tagger," and it predicts linguistic features for all of the tokens. 
#### The tagger is the model that indicates that "spaCy" is a proper noun (PROPN) in the sentence above, and that "handles" is a verb (VERB). The visualization also shows the syntactic dependencies between the tokens.
#### Another model in the spaCy pipeline, the "parser", is responsible for predicting this grammatical structure.

In [15]:
from spacy import displacy
displacy.render(doc, style="dep")

### Handling multiple sentences 

In [16]:
text = "We were all out at the zoo one day, I was doing some acting, walking on the railing of the gorilla exhibit. I fell in.Everyone screamed and Tommy jumped in after me, forgetting that he had blueberries in his front pocket. The gorillas just went wild."
doc = nlp(text)
for sent in doc.sents:
    print(">", sent)

> We were all out at the zoo one day, I was doing some acting, walking on the railing of the gorilla exhibit.
> I fell in.
> Everyone screamed and Tommy jumped in after me, forgetting that he had blueberries in his front pocket.
> The gorillas just went wild.


#### spaCy doesn't carve the text stream into little pieces. So each sentence is a span with a start and an end index into the document array 

In [17]:
for sent in doc.sents:
    print(">", sent.start, sent.end)

> 0 25
> 25 29
> 29 48
> 48 54


In [18]:
## Pull out tokens for one sentence
doc[48:54]

The gorillas just went wild.

In [19]:
##index into a specific token
token = doc[51]
print(token.text, token.lemma_, token.pos_)

## the lemma for the word went is go

went go VERB


### Getting texts using Beautiful Soup

In [20]:
import sys
import warnings
warnings.filterwarnings("ignore")

In [22]:
from bs4 import BeautifulSoup
import requests
import traceback

In [23]:
##find and extract text from <p> tags
def get_text (url):
    buf = []
    try:
        soup = BeautifulSoup(requests.get(url).text, "html.parser")
        for p in soup.find_all("p"):
            buf.append(p.get_text())
        return "".join(buf)
    except:
        print(traceback.format_exc())
        sys.exit(-1)

In [27]:
lic = {}
lic["mit"] = nlp(get_text("https://opensource.org/licenses/MIT"))
lic["asl"] = nlp(get_text("https://opensource.org/licenses/Apache-20"))
lic["bsd"] = nlp(get_text("https://opensource.org/licenses/BSD-3-Clause"))

for sent in lic["bsd"].sents:
    print(">", sent)

> SPDX short identifier: BSD-3-Clause Note: This license has also been called the "New BSD License" or "Modified BSD License".
> See also the 2-clause BSD License.
> Copyright <YEAR> <COPYRIGHT HOLDER>Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:1.
> Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.3.
> Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
> THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
> CONTRIBUTORS
> "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING

#### Comparing the similarity metrics in the text found from open sources above

In [28]:
pairs = [
    ["mit", "asl"],
    ["asl", "bsd"],
    ["bsd", "mit"]]

for a, b in pairs:
    print(a, b, lic[a].similarity(lic[b]))

mit asl 0.7416972794190073
asl bsd 0.7637987327062771
bsd mit 0.9773236055944917


In [29]:
## BSD and MIT appaear to be almost similar

### Natural Language Understanding

In [30]:
## get noun checks 
text = "Steve Jobs and Steve Wozniak incorporated Apple Computer on January 3, 1977, in Cupertino, California."
doc = nlp(text)

for chunk in doc.noun_chunks:
    print(chunk.text)

Steve Jobs
Steve Wozniak
Apple Computer
January
Cupertino
California


In [31]:
## get named entities (proper nouns)

for ent in doc.ents:
    print(ent.text, ent.label_)

Steve Jobs PERSON
Steve Wozniak PERSON
Apple Computer ORG
January 3, 1977 DATE
Cupertino GPE
California GPE


#### Visualizing the named entities

In [32]:
displacy.render(doc, style="ent")

In [34]:
### Knowledge graphs

In [35]:
import nltk
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dvellanki\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [43]:
## Customizing the pipeline
from spacy_wordnet.wordnet_annotator import WordnetAnnotator
print("before", nlp.pipe_names)
if "WordnetAnnotator" not in nlp.pipe_names:
    nlp.add_pipe("spacy_wordnet", after="tagger")
    print("after", nlp.pipe_names)

before ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
after ['tok2vec', 'tagger', 'spacy_wordnet', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [None]:
## Click through the results online in a WordNet search to find the meanings related to the word withdraw

In [44]:
token = nlp("withdraw")[0]
token._.wordnet.synsets()

[Synset('withdraw.v.01'),
 Synset('retire.v.02'),
 Synset('disengage.v.01'),
 Synset('recall.v.07'),
 Synset('swallow.v.05'),
 Synset('seclude.v.01'),
 Synset('adjourn.v.02'),
 Synset('bow_out.v.02'),
 Synset('withdraw.v.09'),
 Synset('retire.v.08'),
 Synset('retreat.v.04'),
 Synset('remove.v.01')]

In [45]:
token._.wordnet.lemmas()

[Lemma('withdraw.v.01.withdraw'),
 Lemma('withdraw.v.01.retreat'),
 Lemma('withdraw.v.01.pull_away'),
 Lemma('withdraw.v.01.draw_back'),
 Lemma('withdraw.v.01.recede'),
 Lemma('withdraw.v.01.pull_back'),
 Lemma('withdraw.v.01.retire'),
 Lemma('withdraw.v.01.move_back'),
 Lemma('retire.v.02.retire'),
 Lemma('retire.v.02.withdraw'),
 Lemma('disengage.v.01.disengage'),
 Lemma('disengage.v.01.withdraw'),
 Lemma('recall.v.07.recall'),
 Lemma('recall.v.07.call_in'),
 Lemma('recall.v.07.call_back'),
 Lemma('recall.v.07.withdraw'),
 Lemma('swallow.v.05.swallow'),
 Lemma('swallow.v.05.take_back'),
 Lemma('swallow.v.05.unsay'),
 Lemma('swallow.v.05.withdraw'),
 Lemma('seclude.v.01.seclude'),
 Lemma('seclude.v.01.sequester'),
 Lemma('seclude.v.01.sequestrate'),
 Lemma('seclude.v.01.withdraw'),
 Lemma('adjourn.v.02.adjourn'),
 Lemma('adjourn.v.02.withdraw'),
 Lemma('adjourn.v.02.retire'),
 Lemma('bow_out.v.02.bow_out'),
 Lemma('bow_out.v.02.withdraw'),
 Lemma('withdraw.v.09.withdraw'),
 Lemma('wit

In [46]:
token._.wordnet.wordnet_domains()

['astronomy',
 'school',
 'telegraphy',
 'industry',
 'psychology',
 'ethnology',
 'ethnology',
 'administration',
 'school',
 'finance',
 'economy',
 'exchange',
 'banking',
 'commerce',
 'medicine',
 'ethnology',
 'university',
 'school',
 'buildings',
 'factotum',
 'agriculture',
 'mechanics',
 'gastronomy',
 'meteorology',
 'physics',
 'basketball',
 'anatomy',
 'skiing',
 'nautical',
 'engineering',
 'racing',
 'home',
 'drawing',
 'dentistry',
 'ethnology',
 'mathematics',
 'furniture',
 'animal_husbandry',
 'industry',
 'economy',
 'body_care',
 'chemistry',
 'medicine',
 'surgery',
 'vehicles',
 'transport',
 'atomic_physic',
 'archaeology',
 'hydraulics',
 'oceanography',
 'golf',
 'sculpture',
 'earth',
 'applied_science',
 'artisanship']

#### NLU results that are within Finance and Banking 

In [47]:
domains = ["finance", "banking"]
sentence = nlp("I want to withdraw 5,000 euros.")

enriched_sent = []

for token in sentence:
    # get synsets within the desired domains
    synsets = token._.wordnet.wordnet_synsets_for_domain(domains)

    if synsets:
        lemmas_for_synset = []

        for s in synsets:
        # get synset variants and add to the enriched sentence
            lemmas_for_synset.extend(s.lemma_names())
            enriched_sent.append("({})".format("|".join(set(lemmas_for_synset))))
    else:
        enriched_sent.append(token.text)
print(" ".join(enriched_sent))

I (deprivation|neediness|want|privation) (deprivation|want|lack|neediness|privation|deficiency) (deprivation|want|lack|need|require|neediness|privation|deficiency) to (draw|draw_off|take_out|withdraw) 5,000 euros .
