In [1]:
"""
Spacy(2015)-Open source NLP for python using maximally efficient algorithms
-for many tasks, Spacy has only one algorithm available, the 
most-efficient-currently-available one
-This makes Spacy generally much more efficient than alternatives for COMMON tasks
"""
import spacy
import pandas as pd

In [11]:
"""
The nlp() function from Spacy automatically takes raw text and processes its data
includes: tokenizer -> tagger -> parser -> ner

tags can extract individual data from these models, e.g.
.text : word literal e.g. Alibaba
.lemma_ : base form, lemmatization, lowercase of word e.g. alibaba
.pos_ : simple part-of-speech tag e.g. PROPN, prop noun
.tag_ : detailed part-of-speech tag e.g. NNP, proper singular noun
.shape_ : the word shape - capitalization, punctuation, digits e.g. XxXxX
.is_alpha : is the token an alphanumeric character? e.g. true/false
.is_stop : is the token part of a stop list, i.e. the most common words of the language? e.g. true/false
"""
#create a text model in english called "nlp"
nlp=spacy.load('en_core_web_sm')
#Display your model's pipeline; the basic pipeline consists of tagging, parsing, and ner
print(nlp.pipeline)
print("\n")
print(nlp.pipe_names)

[('tagger', <spacy.pipeline.Tagger object at 0x7fe296f38ed0>), ('parser', <spacy.pipeline.DependencyParser object at 0x7fe287534710>), ('ner', <spacy.pipeline.EntityRecognizer object at 0x7fe287521650>)]


['tagger', 'parser', 'ner']


In [18]:
#creates a document around the "nlp" model of the given text
#Spacy also builds a companion "vocab" object for vocabulary
#u here stands for "unicode string" to be passed into spacy's nlp function.
doc = nlp(u'"We are welcoming a new age." Jack Ma of Alibaba says China will enter a \
new age of rural revitalization in 2023. Alibaba can be reached at 555-555-5555; it is \
now worth $14 billion.')

In [20]:
"""
Spacy goes through several steps before returning processed text:
1. split on whitespace
2. separate prefix characters, like the ",(,$,¿ characters
3. separate infix words again like contractions, hyphenated words, -,--,/,... characters
4. separate suffix characters, like ",km,),.,! characters
5. separate exception characters like usage in let's and U.S.

Note that this doesn't include lemmatization (finding base etymological forms of words) or 
Named Entity Recognition (NER), the identification of names/orgs/places/values 
"""
tokens=[] #tokens are the components of raw text, split by some delimiter character like space
for token in doc: #load data, 1 token per row, from each word's Spacy details
    tokens.append([token.text, #string literal
                 token.pos_, #simple Part-Of-Speech, e.g. PROPN=proper noun
                 token.tag_, #detailed Part-Of-Speech, e.g. NNP=Noun, proper singular
                 token.dep_, #dependency(ies)
                 token.shape_, #graphical shape of the token & chartypes
                 token.is_alpha, #is it an alphanumeric character?
                 token.is_stop, #is it a stop-word, or a word not requiring tagging?
                 token.is_sent_start]) #is it the first word of a sentence?
index=range(0,len(tokens)) #we have .text for the string literal, so title rows by word index
columns=[".text", ".pos_", ".tag_", ".dep_", ".shape_", ".is_alpha", ".is_stop", "is_sent_start"]
table=pd.DataFrame(tokens,index,columns)
table #display the DataFrame

Unnamed: 0,.text,.pos_,.tag_,.dep_,.shape_,.is_alpha,.is_stop,is_sent_start
0,"""",PUNCT,``,punct,"""",False,False,
1,We,PRON,PRP,nsubj,Xx,True,False,
2,are,VERB,VBP,aux,xxx,True,True,
3,welcoming,VERB,VBG,ROOT,xxxx,True,False,
4,a,DET,DT,det,x,True,True,
5,new,ADJ,JJ,amod,xxx,True,False,
6,age,NOUN,NN,dobj,xxx,True,False,
7,.,PUNCT,.,punct,.,False,False,
8,"""",PUNCT,'',punct,"""",False,False,
9,Jack,PROPN,NNP,compound,Xxxx,True,False,True


In [16]:
#easily index into different words in the given document
print(f"Document type: {type(doc)}")
print(doc[0].pos_)
print(doc[3].dep_)
quote=doc[0:9] #document index does NOT allow value reassignment via doc[index]=newValue
print(f"Selected quote:{quote}")
print(f"Quote data type: {type(quote)}")
print("\nSentences:")
for sentence in doc.sents: #separate document into sentences
    print(sentence)

Document type: <class 'spacy.tokens.doc.Doc'>
PUNCT
ROOT
Selected quote:"We are welcoming a new age."
Quote data type: <class 'spacy.tokens.span.Span'>

Sentences:
"We are welcoming a new age."
Jack Ma of Alibaba says China will enter a new age of rural revitalization in 2023.
Alibaba can be reached at 555-555-5555; it is now worth $14 billion.


In [17]:
nlp=spacy.load('en_core_web_sm')

mystring='"We\'re here and we won\'t dilly-dally, email hi@site.com or visit at \
http://site.com! Every 5km run is matched by a $10.50 donation to LAUSD from Apple. \
The City of Los Angeles stands firmly behind its Department of Public Works. Our \
fundraising goal is $8 million dollars by the second quarter of financial year 2023."'

doc=nlp(mystring)
print(f"# of characters: {len(mystring)}")
print(f"# of words: {len(doc)}")
print(f"Vocab size of language library: {len(doc.vocab)} \n")
for token in doc:
    print(token.text,end=' | ')

# of characters: 318
# of words: 67
Vocab size of language library: 57852 

" | We | 're | here | and | we | wo | n't | dilly | - | dally | , | email | hi@site.com | or | visit | at | http://site.com | ! | Every | 5 | km | run | is | matched | by | a | $ | 10.50 | donation | to | LAUSD | from | Apple | . | The | City | of | Los | Angeles | stands | firmly | behind | its | Department | of | Public | Works | . | Our | fundraising | goal | is | $ | 8 | million | dollars | by | the | second | quarter | of | financial | year | 2023 | . | " | 

In [111]:
entities=[] #separate named entites, noun-descriptor phrases
for entity in doc.ents:
    entities.append([entity, entity.label_, spacy.explain(entity.label_)])
index=range(0,len(entities))
columns=["entity","label","explanation"]
table2=pd.DataFrame(entities,index,columns)    
table2

Unnamed: 0,entity,label,explanation
0,(hi@site.com),ORG,"Companies, agencies, institutions, etc."
1,"(5, km)",QUANTITY,"Measurements, as of weight or distance"
2,(10.50),MONEY,"Monetary values, including unit"
3,(LAUSD),ORG,"Companies, agencies, institutions, etc."
4,(Apple),ORG,"Companies, agencies, institutions, etc."
5,"(The, City, of, Los, Angeles)",GPE,"Countries, cities, states"
6,"(Department, of, Public, Works)",ORG,"Companies, agencies, institutions, etc."
7,"($, 8, million, dollars)",MONEY,"Monetary values, including unit"
8,"(the, second, quarter, of, financial, year, 2023)",DATE,Absolute or relative dates or periods


In [113]:
chunks=[] #separate noun chunks, non-entity noun phrases which are proximate.
for chunk in doc.noun_chunks: #it's clear that the use cases here are narrower.
    print(chunk)

We
we
email
Every 5km run
a $10.50 donation
LAUSD
Apple
The City
Los Angeles
its Department
Public Works
Our fundraising goal
$8 million dollars
the second quarter
financial year


In [114]:
from spacy import displacy

In [116]:
doc=nlp(u"Our fundraising goal is $8 million dollars by second quarter financial year 2023.")
displacy.render(doc,style='dep',jupyter=True,options={'distance':110})

In [117]:
displacy.render(doc,style='ent',jupyter=True) #display document with entities highlighted

In [None]:
displacy.serve(doc,style='ent') #serve the displacy display to a new port


[93m    Serving on port 5000...[0m
    Using the 'ent' visualizer



127.0.0.1 - - [18/Jul/2023 13:51:26] "GET / HTTP/1.1" 200 1255
127.0.0.1 - - [18/Jul/2023 13:51:26] "GET /favicon.ico HTTP/1.1" 200 1255
