# spaCy!

`pip install spacy` or, for GPU support, `pip install -U 'spacy[cuda-autodetect]'`

`python -m spacy download en_core_web_sm`   — small pipeline, trained using a CNN

`python -m spacy download en_core_web_md`   — medium pipeline; includes word embeddings (GloVe); trained using a CNN

`python -m spacy download en_core_web_trf`  — every component is trained using a transformer

In [1]:
import spacy
# Display the installed spaCy version so readers can match the environment.
spacy.__version__

  hasattr(torch, "has_mps")
  and torch.has_mps  # type: ignore[attr-defined]


'3.5.1'

## 1. Basics

### 1.1 Intro

In [2]:
# Load the small English pipeline; `nlp` is reused by the following cells.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Sample input: two sentences, the second one ending with a place name.
text = 'Chaky is currently teaching NLP on youtube\'s channels!  Hopefully the students can gain some knowledge Thailand'

In [4]:
# Run the pipeline on the sample text to obtain the annotated document.
doc = nlp(text)

In [5]:
# Check which type the pipeline returns.
type(doc)

spacy.tokens.doc.Doc

In [6]:
# Iterating over the document yields its tokens one by one.
# NOTE: the loop variable `token` stays bound to the last token after the
# loop finishes, and the cells below rely on that leftover value.
for token in doc:
    print(token)

Chaky
is
currently
teaching
NLP
on
youtube
's
channels
!
 
Hopefully
the
students
can
gain
some
knowledge
Thailand


In [7]:
# `token` is the leftover loop variable from the previous cell, i.e. the last token.
token

Thailand

In [8]:
# Walk through the sentences the pipeline found in the document.
for sent in doc.sents:
    print(sent)

Chaky is currently teaching NLP on youtube's channels!  
Hopefully the students can gain some knowledge Thailand


In [9]:
token.ent_type_  # named-entity label of the token; 'GPE' = geopolitical entity

'GPE'

In [10]:
# Ask spaCy for a human-readable description of the label.
spacy.explain('GPE')

'Countries, cities, states'

In [11]:
token.ent_iob_  # IOB tag of the token; 'B' marks the beginning of an entity

'B'

In [12]:
# Part-of-speech tag of the token ('PROPN' in the recorded run).
token.pos_

'PROPN'

In [13]:
# Dependency relation label of the token.
token.dep_

'npadvmod'

In [14]:
# Head token that this token attaches to ("gain" in the recorded run).
token.head

gain

In [15]:
# `doc.sents` is wrapped in list() so the first sentence can be picked by index.
sentence1 = list(doc.sents)[0]

In [16]:
# Display the first sentence.
sentence1

Chaky is currently teaching NLP on youtube's channels!  

In [17]:
from spacy import displacy

# Visualize the first sentence with displaCy's "ent" (named-entity) style.
displacy.render(sentence1, style="ent")

### 1.2 Word Vectors

In [18]:
# Switch to the medium English pipeline; this rebinds `nlp` for the cells below.
nlp = spacy.load("en_core_web_md")

In [19]:
# Short example sentence for the word-vector demo.
text = "Chaky likes to eat sushi"

In [20]:
# Re-parse the new text with the medium pipeline.
doc = nlp(text)

In [21]:
# Take the first sentence of the parsed text.
sentence = list(doc.sents)[0]

In [22]:
# First token of the sentence.
sentence[0]

Chaky

In [23]:
# NOTE(review): the original note attributes these embeddings to GloVe —
# confirm against the model card of the installed en_core_web_md version.
len(sentence[0].vector)  # length of the token's word vector (300 in the recorded run)

300

### 1.3 Similarity

In [24]:
# New document for the vocabulary / string-store examples.
doc = nlp("I love coffee.")

In [25]:
# Look up the hash value that the vocabulary's string store holds for "coffee".
nlp.vocab.strings['coffee']

3197928453018144401

In [26]:
# The lookup also works in reverse: hash value -> string.
nlp.vocab.strings[3197928453018144401]

'coffee'

In [27]:
# Hash of "dog"; used as the key into the vectors table in the next cell.
integer = nlp.vocab.strings['dog']

In [28]:
# Fetch the word vector stored under that hash.
vector = nlp.vocab.vectors[integer]

In [29]:
# Show only the first five components instead of the whole vector.
vector[:5]

array([  1.233 ,   4.2963,  -7.9738, -10.121 ,   1.8207], dtype=float32)

In [30]:
import numpy as np

# Query the vectors table for the 10 entries most similar to the "dog" vector.
# The vector is wrapped in a list so the query is passed as a 2-D array.
close_words = nlp.vocab.vectors.most_similar(np.asarray([vector]), n=10)

In [31]:
# The result is a tuple of three arrays: hash keys, integer indices
# (presumably rows of the vectors table — confirm) and similarity scores.
close_words

(array([[ 7918624946109788756,  4969328240109515165,  4560869431627726864,
         17429802345416193488,  6017664905485703127, 14534804554944721111,
           173986088034745168, 15668852121853073894, 11567120971096873637,
         15872191516786115817]], dtype=uint64),
 array([[ 1147,  2545,  3201,  9003,  3828, 18829,  5845, 11580,  7045,
         18612]], dtype=int32),
 array([[1.    , 0.8334, 0.8221, 0.8108, 0.7856, 0.7195, 0.685 , 0.6328,
         0.6148, 0.5966]], dtype=float32))

In [32]:
# Convert the fifth returned key back into its string form.
nlp.vocab.strings[close_words[0][0][4]]

'pet(s'

### 1.4 Doc and span similarity

In [33]:
# Two documents on a similar topic (food preferences), compared in the next cell.
doc1 = nlp("Chaky likes french fries")
doc2 = nlp("Tom likes sweet potatoes and nuggets")

In [34]:
# Similarity score between the two documents.
doc1.similarity(doc2)

0.6210958701891666

In [35]:
# A document on an unrelated topic, for contrast.
doc3 = nlp("I play football yesterday")

In [36]:
# In the recorded run this score is lower than the doc1/doc2 score.
doc1.similarity(doc3)

0.3184322755818882

In [37]:
# Slice tokens 2-3 of doc1 to get the span "french fries".
span1 = doc1[2:4]
span1

french fries

In [38]:
# Slice tokens 3-4 of doc2.
# NOTE(review): this slice yields "potatoes and"; `doc2[2:4]` ("sweet potatoes")
# was probably intended — confirm and re-run the similarity cell if changed.
span2 = doc2[3:5]
span2

potatoes and

In [39]:
# Similarity can also be computed between two spans.
span1.similarity(span2)

0.5160608887672424

## 2. Entity Ruler

In [40]:
import spacy
# Start again from the small English pipeline for the entity-ruler examples.
nlp = spacy.load('en_core_web_sm')

In [41]:
# analysis = nlp.analyze_pipes(pretty=True)

In [42]:
# Rule to apply: tag the exact string "Rangsit" with the label LOC.
patterns = [{"label": 'LOC', "pattern": "Rangsit"}]

# Insert the entity ruler ahead of the pipeline's 'ner' component,
# then register the rule with it.
ruler = nlp.add_pipe('entity_ruler', before='ner')
ruler.add_patterns(patterns)

In [43]:
# analysis = nlp.analyze_pipes(pretty=True)

In [44]:
# Sentence containing "Rangsit", the string targeted by the ruler pattern.
text = "AIT is at Rangsit."
doc = nlp(text)

In [45]:
# List every detected entity together with its label.
for ent in doc.ents:
    print(ent.text, ent.label_)

AIT ORG
Rangsit LOC


### 2.1 More patterns!!!

In [46]:
import spacy

# Example text containing a phone number written as "(ddd) ddd-dddd".
text = "My phone number is (555) 666-5555"

# Start from a blank English pipeline rather than a trained one.
nlp = spacy.blank("en")

In [47]:
# analysis = nlp.analyze_pipes(pretty=True)
# analysis

In [48]:
# Add the entity ruler to the blank pipeline; its patterns are registered in the next cell.
ruler = nlp.add_pipe('entity_ruler')

In [49]:
# Token-by-token description of a number written like "(555) 666-5555".
patterns = [
    {
        "label": "PHONE NUMBER",
        "pattern": [
            {"ORTH": "("},
            {"SHAPE": "ddd"},
            {"ORTH": ")"},
            {"SHAPE": "ddd"},
            {"ORTH": "-", "OP": "?"},  # the dash is optional
            {"SHAPE": "dddd"},
        ],
    }
]

ruler.add_patterns(patterns)

In [50]:
# Process the phone-number text with the ruler-only pipeline.
doc = nlp(text)

In [51]:
# Show what the ruler tagged.
for ent in doc.ents:
    print(ent.text, ent.label_)

(555) 666-5555 PHONE NUMBER


### 2.2 Matcher

In [52]:
from spacy.matcher import Matcher

In [53]:
# Reload the small English pipeline for the Matcher examples.
nlp = spacy.load("en_core_web_sm")

In [54]:
# The matcher is created on the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

In [55]:
# Single-token pattern using the LIKE_EMAIL token flag, registered as "EMAIL".
pattern = [{"LIKE_EMAIL": True}]
matcher.add("EMAIL", [pattern])

# Run the matcher over a sentence that contains an e-mail address.
doc = nlp("My email is chaklam@ait.asia.")
matches = matcher(doc)

In [56]:
# Each match is a (match_id, start, end) tuple; start/end are token indices into `doc`.
matches

[(17587345535198158200, 3, 4)]

In [57]:
# The match id is a hash; looking it up in the vocab recovers the rule name.
nlp.vocab[matches[0][0]].text

'EMAIL'

In [58]:
# Read the Wikipedia excerpt used by the matcher examples below.
# The encoding is passed explicitly because the text contains non-ASCII
# characters (e.g. the en dash in "1929 – April") and the platform default
# encoding is not UTF-8 everywhere. Assumes the file is stored as UTF-8.
with open('data/wiki_king.txt', "r", encoding="utf-8") as f:
    text = f.read()

text

'Martin Luther King Jr. (born Michael King Jr.; January 15, 1929 – April 4, 1968) was an American Baptist minister and activist who became the most visible spokesman and leader in the American civil rights movement from 1955 until his assassination in 1968. King advanced civil rights through nonviolence and civil disobedience, inspired by his Christian beliefs and the nonviolent activism of Mahatma Gandhi. He was the son of early civil rights activist and minister Martin Luther King Sr.\n\nKing participated in and led marches for blacks\' right to vote, desegregation, labor rights, and other basic civil rights.[1] King led the 1955 Montgomery bus boycott and later became the first president of the Southern Christian Leadership Conference (SCLC). As president of the SCLC, he led the unsuccessful Albany Movement in Albany, Georgia, and helped organize some of the nonviolent 1963 protests in Birmingham, Alabama. King helped organize the 1963 March on Washington, where he delivered his fam

In [59]:
# Fresh small English pipeline for the proper-noun matching examples.
nlp = spacy.load("en_core_web_sm")

In [60]:
# New matcher, so the earlier "EMAIL" rule is not carried over.
matcher = Matcher(nlp.vocab)

In [61]:
# Match every single token whose part-of-speech tag is PROPN (proper noun).
pattern = [{"POS": "PROPN"}]
matcher.add("PROPER_NOUN", [pattern])

In [62]:
# Parse the Wikipedia excerpt and collect all proper-noun matches.
doc = nlp(text)
matches = matcher(doc)

In [63]:
# Show the first ten matches next to the text they cover;
# match[1] and match[2] are the start and end token indices.
for match in matches[:10]:
    print(match, doc[match[1]:match[2]])

(451313080118390996, 0, 1) Martin
(451313080118390996, 1, 2) Luther
(451313080118390996, 2, 3) King
(451313080118390996, 3, 4) Jr.
(451313080118390996, 6, 7) Michael
(451313080118390996, 7, 8) King
(451313080118390996, 8, 9) Jr.
(451313080118390996, 10, 11) January
(451313080118390996, 15, 16) April
(451313080118390996, 23, 24) Baptist


In [64]:
# Rebuild the matcher so multi-token names come back as one match:
# "OP": "+" allows one or more consecutive proper nouns, and greedy="LONGEST"
# asks the matcher to keep the longest candidate among overlapping ones.
matcher = Matcher(nlp.vocab)
pattern = [{"POS": "PROPN", "OP": "+"}]
matcher.add("PROPER_NOUN", [pattern], greedy="LONGEST")

In [65]:
doc = nlp(text)
# Collect the matches ordered by their start token index, i.e. in document order.
matches = sorted(matcher(doc), key=lambda m: m[1])

In [66]:
# First ten matches in document order, with the text span each one covers.
for match in matches[:10]:
    print(match, doc[match[1]:match[2]])

(451313080118390996, 0, 4) Martin Luther King Jr.
(451313080118390996, 6, 9) Michael King Jr.
(451313080118390996, 10, 11) January
(451313080118390996, 15, 16) April
(451313080118390996, 23, 24) Baptist
(451313080118390996, 49, 50) King
(451313080118390996, 69, 71) Mahatma Gandhi
(451313080118390996, 83, 88) Martin Luther King Sr.
(451313080118390996, 89, 90) King
(451313080118390996, 113, 114) King


### 2.3 Regex - regular expression

In [67]:
import spacy

text = "This is a sample number 5555555."

# Blank English pipeline with the entity ruler as the only added component.
nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")

# Token-level regex on the token text.
# - Raw string: in a normal string literal "\d" is an invalid escape sequence
#   that newer Python versions warn about.
# - Anchored with ^...$: the token must be exactly seven digits. Without the
#   anchors any token that merely contains seven digits would also be labelled.
patterns = [
    {"label": "PHONE NUMBER", "pattern": [{"TEXT": {"REGEX": r"^\d{7}$"}}]}
]

ruler.add_patterns(patterns)

doc = nlp(text)

# Print each entity found by the ruler together with its label.
for ent in doc.ents:
    print(ent.text, ent.label_)

5555555 PHONE NUMBER
