# Spacy Demo01

In [None]:
# import libraries
!pip install spacy
import spacy

In [None]:
# Download the English language model.
# There are 3 models: small, medium, large.
# The small model takes around 4 secs to load.
# The large model takes around 3 mins.
# Note that some functions work only in the medium or large models;
# the command below fetches the medium one (en_core_web_md).

%time !python -m spacy download en_core_web_md

In [None]:
# Instantiate the nlp object for English by loading the en_core_web_md model
nlp = spacy.load("en_core_web_md")

In [None]:
# Tokenisation: run the text through the pipeline and print, for every token,
# its tag, part of speech, raw text and lemma.
# Alternative input to try:
#   'Drinking a glass of wine is good for your wellbeing!'
doc = nlp(
    "Such an analysis can reveal features that are not easily visible from the "
    "variation in the individual genes and can lead to a picture of expression "
    "that is more biologically transparent and accessible to interpretation\n"
)
for token in doc:
    print(f"token:{token}\t tag:{token.tag_}\t\tPOS:{token.pos_}\t\t text:'{token.text}' \tlemma:{token.lemma_}\t ")

In [None]:
# Named Entity Recognition: print each entity found in the sentence together
# with its entity type label.
# Alternative sentence to try:
#   "He was born in Canberra, Australia in 14/1/1974"
ner_sentence = "Apple is looking at buying U.K. startup for $1 billion"
doc = nlp(ner_sentence)
for entity in doc.ents:
    print(f"Entity: {entity.text} \t\t type:{entity.label_}")

In [None]:
# Display the entity tags alongside the text (displaCy 'ent' style)
from spacy import displacy

doc = nlp(
    'I just bought 2 shares at 9 a.m. because the stock went up 30% '
    'in just 2 days according to the WSJ'
)
displacy.render(doc, jupyter=True, style='ent')

In [None]:
# Noun-phrase chunking: print every noun chunk with its label and the text of
# its root token.
headline = "Wall Street Journal just published an interesting piece on crypto currencies"
doc = nlp(headline)
for noun_chunk in doc.noun_chunks:
    root_word = noun_chunk.root.text
    print(f"Text:{noun_chunk.text},\t label:{noun_chunk.label_},\t root:{root_word}")

In [None]:
# Grammar dependency parsing, visualised as a tree (displaCy 'dep' style)
from spacy import displacy

render_options = {'distance': 90}
doc = nlp('Wall Street Journal just published an interesting piece on crypto currencies')
displacy.render(doc, style='dep', options=render_options, jupyter=True)

In [None]:
# Rule-based matcher
from spacy.matcher import Matcher

# Create the matcher once and register the full-name rule up front, so that
# calling extract_full_name repeatedly does not keep re-adding the same rule.
# The rule is two consecutive proper nouns. Patterns are passed as a list of
# patterns, which is the form spaCy v3 requires (the older
# `matcher.add(key, None, pattern)` call raises an error there).
matcher = Matcher(nlp.vocab)
matcher.add('FULL_NAME', [[{'POS': 'PROPN'}, {'POS': 'PROPN'}]])


def extract_full_name(text: str):
    """Extract candidate full names from ``text``.

    A candidate is any span the module-level ``matcher`` reports for the
    'FULL_NAME' rule, i.e. two adjacent tokens both tagged as proper nouns.

    Args:
        text: The raw text to analyse.

    Returns:
        A list with the text of every matched span, in the order returned by
        the matcher. The list is empty when nothing matches.
    """
    doc = nlp(text)
    return [doc[start:end].text for _match_id, start, end in matcher(doc)]

In [None]:
# Find the full names mentioned in a sample sentence
sentence = "I met John Richardson almost a year after Daniel Zhang married Lucy Khan"
full_names = extract_full_name(sentence)
print(f"Full names: {full_names}")

In [None]:
# Removing stop words
from spacy.lang.en.stop_words import STOP_WORDS
text = """He determined to drop his litigation with the monastry, and relinguish his claims to the wood-cuting and 
fishery rihgts at once. He was the more ready to do this becuase the rights had become much less valuable, and he had 
indeed the vaguest idea where the wood and river in question were."""
# Tokenise the text, then keep only the words whose vocabulary entry is not
# flagged as a stop word.
nlp_text = nlp(text)
token_list = [token.text for token in nlp_text]
filtered_text = [word for word in token_list if not nlp.vocab[word].is_stop]
print(f"Token list: \n{token_list}")
print(f"\nFiltered text: \n{filtered_text}")

In [None]:
# Word vectors: for each token report whether it has a vector, the norm of
# that vector, and whether the token is out of vocabulary (OOV).
tokens = nlp("dog cat banana afskfsd")
for token in tokens:
    # vector_norm and is_oov must be inside braces to be interpolated;
    # previously they were printed as literal text.
    print(
        f"Token:\t{token.text}, has vector:\t{token.has_vector}, "
        f"vector norm:\t{token.vector_norm}, is OOV:\t{token.is_oov}"
    )
print(f"\nToken 1: {tokens[0]}\n Vector:\n{tokens[0].vector}")

In [None]:
# Word similarity: compare every token with every token (itself included)
tokens = nlp("dog cat banana apple")
for first in tokens:
    for second in tokens:
        score = first.similarity(second)
        print(f"Token 1:\t{first.text},\t token 2:{second.text},\t similarity:{score}")

In [None]:
# Sentiment analys with textBlob
!pip install textBlob
from textblob import TextBlob

In [None]:
# Detect the sentiment of a piece of text.
# Alternative text to try:
#   "Textblob is amazingly simple to use. What great fun!"
text = "I am so happy with my progress in the data science course"
blob = TextBlob(text)
print(blob.sentiment)



---



---



> > > > > > > > > © 2022 Institute of Data


---



---



