# Spacy Demo01

In [1]:
# import libraries
!PIP install spacy
import spacy



Collecting spacy
  Downloading spacy-3.2.3-cp38-cp38-win_amd64.whl (11.6 MB)
Collecting wasabi<1.1.0,>=0.8.1
  Downloading wasabi-0.9.0-py3-none-any.whl (25 kB)
Collecting murmurhash<1.1.0,>=0.28.0
  Downloading murmurhash-1.0.6-cp38-cp38-win_amd64.whl (21 kB)
Collecting typer<0.5.0,>=0.3.0
  Downloading typer-0.4.0-py3-none-any.whl (27 kB)
Collecting thinc<8.1.0,>=8.0.12
  Downloading thinc-8.0.15-cp38-cp38-win_amd64.whl (1.0 MB)
Collecting pathy>=0.3.5
  Downloading pathy-0.6.1-py3-none-any.whl (42 kB)
Collecting catalogue<2.1.0,>=2.0.6
  Downloading catalogue-2.0.7-py3-none-any.whl (17 kB)
Collecting langcodes<4.0.0,>=3.2.0
  Downloading langcodes-3.3.0-py3-none-any.whl (181 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0
  Downloading spacy_loggers-1.0.1-py3-none-any.whl (7.0 kB)
Collecting cymem<2.1.0,>=2.0.2
  Downloading cymem-2.0.6-cp38-cp38-win_amd64.whl (36 kB)
Collecting blis<0.8.0,>=0.4.0
  Downloading blis-0.7.7-cp38-cp38-win_amd64.whl (6.6 MB)
Collecting pydantic!=1.8,!=1.8.1,

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# download english language model
# There are 3 models: small, medoum, large
# The small model takes around 4 secs to load
# The large model takes around 3 mins
# Note that Some functions works only in medoium or large models
# %time !python -m spacy download en
%time !python -m spacy download en_core_web_lg

In [118]:
!python -m spacy link --force en_core_web_lg en

[38;5;2m✔ Linking successful[0m
/Users/ahmedfattah/opt/anaconda3/lib/python3.7/site-packages/en_core_web_lg -->
/Users/ahmedfattah/opt/anaconda3/lib/python3.7/site-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [119]:
# intantiate nlp class for english
nlp = spacy.load("en")

In [140]:
# tokenisation
# doc = nlp('Drinking a glass of wine is good for your wellbeing!')
doc = nlp('''Such an analysis can reveal features that are not easily visible from the variation in the individual genes and can lead to a picture of expression that is more biologically transparent and accessible to interpretation
''')
for token in doc:
    print(f"token:{token}\t tag:{token.tag_}\t\tPOS:{token.pos_}\t\t text:'{token.text}' \tlemma:{token.lemma_}\t ")

token:Such	 tag:PDT		POS:DET		 text:'Such' 	lemma:such	 
token:an	 tag:DT		POS:DET		 text:'an' 	lemma:an	 
token:analysis	 tag:NN		POS:NOUN		 text:'analysis' 	lemma:analysis	 
token:can	 tag:MD		POS:VERB		 text:'can' 	lemma:can	 
token:reveal	 tag:VB		POS:VERB		 text:'reveal' 	lemma:reveal	 
token:features	 tag:NNS		POS:NOUN		 text:'features' 	lemma:feature	 
token:that	 tag:WDT		POS:DET		 text:'that' 	lemma:that	 
token:are	 tag:VBP		POS:AUX		 text:'are' 	lemma:be	 
token:not	 tag:RB		POS:PART		 text:'not' 	lemma:not	 
token:easily	 tag:RB		POS:ADV		 text:'easily' 	lemma:easily	 
token:visible	 tag:JJ		POS:ADJ		 text:'visible' 	lemma:visible	 
token:from	 tag:IN		POS:ADP		 text:'from' 	lemma:from	 
token:the	 tag:DT		POS:DET		 text:'the' 	lemma:the	 
token:variation	 tag:NN		POS:NOUN		 text:'variation' 	lemma:variation	 
token:in	 tag:IN		POS:ADP		 text:'in' 	lemma:in	 
token:the	 tag:DT		POS:DET		 text:'the' 	lemma:the	 
token:individual	 tag:JJ		POS:ADJ		 text:'individual' 	lemma:in

In [102]:
# Named Entity Recognition
# doc = nlp("He was born in Canberra, Australia in 14/1/1974")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for ent in doc.ents:
    print(f"Entity: {ent.text} \t\t type:{ent.label_}")

Entity: Apple 		 type:ORG
Entity: U.K. 		 type:GPE
Entity: $1 billion 		 type:MONEY


In [58]:
# Display tag alongside the text
from spacy import displacy
 
doc = nlp('I just bought 2 shares at 9 a.m. because the stock went up 30% in just 2 days according to the WSJ')
displacy.render(doc, style='ent', jupyter=True)

In [59]:
# noun-phrase chunking
doc = nlp("Wall Street Journal just published an interesting piece on crypto currencies")
for chunk in doc.noun_chunks:
    print(f"Text:{chunk.text},\t label:{chunk.label_},\t root:{chunk.root.text}")

Text:Wall Street Journal,	 label:NP,	 root:Journal
Text:an interesting piece,	 label:NP,	 root:piece
Text:crypto currencies,	 label:NP,	 root:currencies


In [60]:
# grammer dependency tree parsing and visualisation
from spacy import displacy
 
doc = nlp('Wall Street Journal just published an interesting piece on crypto currencies')
displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})

In [84]:
# Rule-base matcher
# import spacy Matcher
from spacy.matcher import Matcher
# create a matcher
matcher = Matcher(nlp.vocab)
# define a function to extract full name
def extract_full_name(text: str):
    pattern = [{'POS': 'PROPN'}, {'POS': 'PROPN'}]
    matcher.add('FULL_NAME', None, pattern)
    doc = nlp(text)
    names = []
    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        names.append(span.text)
    return names

In [85]:
# Find full name in sentence
full_names = extract_full_name("I met John Richardson almost a year after Daniel Zhang married Lucy Khan")
print(f"Full names: {full_names}")

Full names: ['John Richardson', 'Daniel Zhang', 'Lucy Khan']


In [90]:
# Removing stop words
from spacy.lang.en.stop_words import STOP_WORDS
text = """He determined to drop his litigation with the monastry, and relinguish his claims to the wood-cuting and 
fishery rihgts at once. He was the more ready to do this becuase the rights had become much less valuable, and he had 
indeed the vaguest idea where the wood and river in question were."""
filtered_text =[] 
nlp_text = nlp(text)
token_list = []
for token in nlp_text:
    token_list.append(token.text)
for word in token_list:
    lexeme = nlp.vocab[word]
    if lexeme.is_stop == False:
        filtered_text.append(word)
print(f"Token list: \n{token_list}")
print(f"\nFiltered text: \n{filtered_text}")

Token list: 
['He', 'determined', 'to', 'drop', 'his', 'litigation', 'with', 'the', 'monastry', ',', 'and', 'relinguish', 'his', 'claims', 'to', 'the', 'wood', '-', 'cuting', 'and', '\n', 'fishery', 'rihgts', 'at', 'once', '.', 'He', 'was', 'the', 'more', 'ready', 'to', 'do', 'this', 'becuase', 'the', 'rights', 'had', 'become', 'much', 'less', 'valuable', ',', 'and', 'he', 'had', '\n', 'indeed', 'the', 'vaguest', 'idea', 'where', 'the', 'wood', 'and', 'river', 'in', 'question', 'were', '.']

Filtered text: 
['determined', 'drop', 'litigation', 'monastry', ',', 'relinguish', 'claims', 'wood', '-', 'cuting', '\n', 'fishery', 'rihgts', '.', 'ready', 'becuase', 'rights', 'valuable', ',', '\n', 'vaguest', 'idea', 'wood', 'river', 'question', '.']


In [129]:
# word vectors
tokens = nlp("dog cat banana afskfsd")
for token in tokens:
    print(f"Token:\t{token.text}, has vector:\t{token.has_vector}, token.vector_norm, token.is_oov")
print(f"\nToken 1: {tokens[0]}\n Vector:\n{tokens[0].vector}")

Token:	dog, has vector:	True, token.vector_norm, token.is_oov
Token:	cat, has vector:	True, token.vector_norm, token.is_oov
Token:	banana, has vector:	True, token.vector_norm, token.is_oov
Token:	afskfsd, has vector:	False, token.vector_norm, token.is_oov

Token 1: dog
 Vector:
[-4.0176e-01  3.7057e-01  2.1281e-02 -3.4125e-01  4.9538e-02  2.9440e-01
 -1.7376e-01 -2.7982e-01  6.7622e-02  2.1693e+00 -6.2691e-01  2.9106e-01
 -6.7270e-01  2.3319e-01 -3.4264e-01  1.8311e-01  5.0226e-01  1.0689e+00
  1.4698e-01 -4.5230e-01 -4.1827e-01 -1.5967e-01  2.6748e-01 -4.8867e-01
  3.6462e-01 -4.3403e-02 -2.4474e-01 -4.1752e-01  8.9088e-02 -2.5552e-01
 -5.5695e-01  1.2243e-01 -8.3526e-02  5.5095e-01  3.6410e-01  1.5361e-01
  5.5738e-01 -9.0702e-01 -4.9098e-02  3.8580e-01  3.8000e-01  1.4425e-01
 -2.7221e-01 -3.7016e-01 -1.2904e-01 -1.5085e-01 -3.8076e-01  4.9583e-02
  1.2755e-01 -8.2788e-02  1.4339e-01  3.2537e-01  2.7226e-01  4.3632e-01
 -3.1769e-01  7.9405e-01  2.6529e-01  1.0135e-01 -3.3279e-01  4.

In [99]:
# Word similarity
tokens = nlp("dog cat banana apple")
for token1 in tokens:
    for token2 in tokens:
        print(f"Token 1:\t{token1.text},\t token 2:{token2.text},\t similarity:{token1.similarity(token2)}")

Token 1:	dog,	 token 2:dog,	 similarity:1.0
Token 1:	dog,	 token 2:cat,	 similarity:0.5080135464668274
Token 1:	dog,	 token 2:banana,	 similarity:0.3817077875137329
Token 1:	dog,	 token 2:apple,	 similarity:0.24964354932308197
Token 1:	cat,	 token 2:dog,	 similarity:0.5080135464668274
Token 1:	cat,	 token 2:cat,	 similarity:1.0
Token 1:	cat,	 token 2:banana,	 similarity:0.5098239183425903
Token 1:	cat,	 token 2:apple,	 similarity:0.27301254868507385
Token 1:	banana,	 token 2:dog,	 similarity:0.3817077875137329
Token 1:	banana,	 token 2:cat,	 similarity:0.5098239183425903
Token 1:	banana,	 token 2:banana,	 similarity:1.0
Token 1:	banana,	 token 2:apple,	 similarity:0.4156864285469055
Token 1:	apple,	 token 2:dog,	 similarity:0.24964354932308197
Token 1:	apple,	 token 2:cat,	 similarity:0.27301254868507385
Token 1:	apple,	 token 2:banana,	 similarity:0.4156864285469055
Token 1:	apple,	 token 2:apple,	 similarity:1.0


In [104]:
# Sentiment analys with textBlob
!pip install textBlob
from textblob import TextBlob



In [136]:
# Detect sentiment of a text
# text = "Textblob is amazingly simple to use. What great fun!"
text="I am so happy with my progress in the data science course"
textBlob = TextBlob(text)
print(f"{textBlob.sentiment}")

Sentiment(polarity=0.8, subjectivity=1.0)
