# SpaCy!

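- A huge library.
- In the default (non-transformer) pipelines, every component (e.g., the POS tagger and NER) uses a CNN.
- The #1 library for NLP (alternatives include NLTK and gensim).
- HuggingFace is another option (mostly for deep learning).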

    `pip install spacy` or, for GPU support, `pip install -U 'spacy[cuda-autodetect]'`

    `python -m spacy download en_core_web_sm`   #trained using cnn

    `python -m spacy download en_core_web_md`   #has word embeddings (GloVe); trained using cnn

    `python -m spacy download en_core_web_trf`  #everything is trained using transformer

In [1]:
import spacy
# Show the installed spaCy version; the recorded outputs in this notebook came from it.
spacy.__version__

2023-02-02 09:16:02.599371: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


'3.3.2'

## 1. Basics

In [2]:
# Load the small English pipeline. `nlp` is the object that turns raw text
# into a parsed Doc, using the trained model that ships with the pipeline.

nlp = spacy.load('en_core_web_sm')

In [10]:
# Two-sentence example input (note the double space between the sentences).
text = (
    'Thailand really like to eat naan and masala.  '
    'He also likes to eat sushi.'
)

In [11]:
# Run the pipeline over the text; the result holds the tokens and their annotations.
doc = nlp(text)

In [12]:
# Check what kind of object the pipeline returned.
type(doc)

spacy.tokens.doc.Doc

In [14]:
# A Doc is already tokenized: indexing it returns a single token, so there is
# no need to loop and `break` just to get the first one.
# The name `tokens` is kept (even though it is one token) because the
# following cells inspect this variable's attributes.
tokens = doc[0]
print(tokens)

Thailand


In [15]:
# `tokens` was bound in the previous cell: it is the first token of the doc.
tokens

Thailand

In [8]:
# The Doc also carries sentence boundaries: print each sentence on its own line.
for sentence in doc.sents:
    print(sentence)

Thailand really like to eat naan and masala.
 He also likes to eat sushi.


In [16]:
# Same first token again; the cells below inspect its attributes one by one.
tokens

Thailand

In [17]:
tokens.ent_type #entity type as an integer ID (use ent_type_ for the readable label)

384

In [18]:
tokens.ent_type_ #entity type as a string label; 'GPE' = geopolitical entity

'GPE'

In [19]:
# Ask spaCy for a human-readable description of a label.
spacy.explain('GPE')

'Countries, cities, states'

In [20]:
tokens.ent_iob_  #IOB tag of the token; 'B' = beginning of an entity

'B'

In [21]:
tokens.pos_  #coarse part-of-speech tag; 'PROPN' = proper noun

'PROPN'

In [22]:
# Dependency label linking this token to its head ('nsubj' = nominal subject).
tokens.dep_

'nsubj'

In [23]:
# The token's syntactic head, i.e. its parent in the dependency parse.
tokens.head

like

In [25]:
# Materialize the sentences into a list and keep the first one.
sentence1 = list(doc.sents)[0]

In [26]:
# Display the first sentence.
sentence1

Thailand really like to eat naan and masala.

In [28]:
from spacy import displacy  #displaying stuffs
# Draw the dependency parse of the first sentence.
displacy.render(sentence1, style="dep")

In [29]:
# Highlight the named entities found in the same sentence.
displacy.render(sentence1, style="ent")

## 2. Word Vectors

In [30]:
# Switch to the medium English pipeline for the word-vector examples.
# NOTE: this rebinds `nlp`; every cell from here on uses the md pipeline.
nlp = spacy.load("en_core_web_md")

In [31]:
# Shorter example sentence for the vector demo (rebinds `text`).
text = "Chaky likes to eat sushi."

In [32]:
# Parse the new text with the md pipeline (rebinds `doc`).
doc = nlp(text)

In [33]:
# Keep the first (here, only) sentence of the doc.
sentence = list(doc.sents)[0]

In [34]:
# Indexing a sentence gives a token; position 1 is the second word.
sentence[1]

likes

In [36]:
len(sentence[1].vector)  #dimensionality of the token's word vector --> 300

300

## 3. Similarity

In [37]:
#before similarity, let's talk about nlp.vocab.strings,
#which maps strings to integer hashes and back
doc = nlp("I love coffee.")

In [38]:
nlp.vocab.strings['coffee']  #string --> integer hash value

3197928453018144401

In [39]:
# The lookup also works in reverse: integer hash --> string.
nlp.vocab.strings[3197928453018144401]

'coffee'

In [40]:
#first convert the string 'dog' into its integer hash
integer = nlp.vocab.strings['dog']
integer

7562983679033046312

In [43]:
#look up the word vector stored under that hash
vector = nlp.vocab.vectors[integer]
vector[:5] #first 5 components of the vector for 'dog'

array([-0.72483 ,  0.42538 ,  0.025489, -0.39807 ,  0.037463],
      dtype=float32)

In [44]:
import numpy as np

# most_similar takes a 2-D batch of query vectors, so add a leading axis to
# turn the single vector into a one-row batch. The result is a 3-tuple of
# arrays: keys (hashes), row indices in the vector table, and scores.
close_words = nlp.vocab.vectors.most_similar(np.expand_dims(vector, axis=0), n=10)
close_words

(array([[13192779106523156987,  4476338517347267351, 14199852958745354380,
          3615545391617869586,  6740239789784345073,  9120157979859245900,
          6189118356939658504, 17686863692678987895,  8330890959751529634,
          4295179733490603801]], dtype=uint64),
 array([[9980, 9979, 9981, 4791, 4792, 4793, 7916, 7918, 7917,  451]],
       dtype=int32),
 array([[1.    , 1.    , 1.    , 0.7044, 0.7044, 0.7044, 0.6588, 0.6588,
         0.6588, 0.6366]], dtype=float32))

In [47]:
# The first array holds the keys: one row per query, 10 neighbours each.
close_words[0].shape

(1, 10)

In [52]:
# Translate the first returned key (query 0, rank 0) back into its string.
nlp.vocab.strings[close_words[0][0][0]]

'puppies'

## 4. Doc and span similarity

In [55]:
# Parse two short example sentences to compare against each other.
doc1, doc2 = map(
    nlp,
    ("Chaky likes french fries", "Tonson likes sweet potato nuggets"),
)

In [56]:
# Compare the two docs as wholes.
doc1.similarity(doc2)  #higher means more similar

0.8036982338461257

In [59]:
#hierarchy: doc ---> sents ---> span ---> tokens

#do span similarity: slicing a doc gives a span (here tokens 2 and 3)
span1 = doc1[2:4]
span1

french fries

In [60]:
# doc2 ("Tonson likes sweet potato nuggets") has five tokens, so the food
# phrase is tokens 2..4. The previous stop index of 6 was past the end of the
# doc and only worked because slice bounds are clamped to the doc length.
span2 = doc2[2:5]
span2

sweet potato nuggets

In [61]:
# Compare just the two food phrases instead of the whole docs.
span1.similarity(span2)

0.6052150726318359