In [3]:
import spacy

In [4]:

# Print the filesystem location(s) of the imported spacy package.
print(spacy.__path__)



['/Users/macbookpro/anaconda3/envs/bobenv1/lib/python3.11/site-packages/spacy']


In [5]:
# Load the small English pipeline; the later cells reuse this `nlp` object.
nlp = spacy.load("en_core_web_sm")

doc = nlp("My name is Eniang, and i live in Russia. I am studying for my master's degree")

# Print every sentence span that the pipeline detected, one per line.
for sent in doc.sents:
    print(sent)

My name is Eniang, and i live in Russia.
I am studying for my master's degree


In [6]:
import nltk

In [7]:
from nltk.tokenize import sent_tokenize

# Same input as the spaCy cell above, this time split into sentences by NLTK.
sent_tokenize("My name is Eniang, and i live in Russia. I am studying for my master's degree")

['My name is Eniang, and i live in Russia.',
 "I am studying for my master's degree"]

This book has references to many websites from which you can download free datasets. You are an NLP engineer working for some company, and you want to collect all the dataset websites from this book. To keep the exercise simple, you are given a paragraph from this book, and you want to grab all the URLs from this paragraph using spaCy.

In [8]:
# Sample paragraph; its hard line breaks are part of the input.
text = '''Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.'''
doc_1 = nlp(text)

# Keep the text of every token that spaCy flags as URL-like.
# NOTE: the URL that is wrapped across two lines in the paragraph is only
# captured up to the line break, and a sentence-final period stays attached
# to a URL token (see the result displayed in the next cell).
websites = [token.text for token in doc_1 if token.like_url]
     


In [9]:
# Display the URL-like token texts collected in the previous cell.
websites

['http://www.data.gov/',
 'http://www.science',
 'http://data.gov.uk/.',
 'http://www3.norc.org/gss+website/',
 'http://www.europeansocialsurvey.org/.']

(2) Extract all money transactions, along with their currency, from the sentence below. The output should be:

two $

500 €

In [10]:
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"
doc_2 = nlp(transactions)

# Collect the text of every number-like token (this finds the amounts only,
# not the currency symbols). The loop variable has its own name so that
# `money` refers to the result list alone.
money = [tok.text for tok in doc_2 if tok.like_num]
money

['two', '500']

In [11]:
# A better solution: pair each number-like token with the currency symbol
# that immediately follows it.
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"
doc = nlp(transactions)
last_index = len(doc) - 1
for token in doc:
    # Only look ahead when a next token exists: indexing one past the end of
    # the Doc raises IndexError, which would happen for any text that ends
    # with a number-like token.
    if token.like_num and token.i < last_index and doc[token.i + 1].is_currency:
        print(token.text, doc[token.i + 1].text)

two $
500 €


In [12]:
# Let's see which components the loaded spaCy pipeline contains (by name).
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

The above list would have been empty if we had instead used `spacy.blank('en')`, which creates an empty pipeline. From the spaCy documentation, we can download a pre-trained pipeline; we already did that at the start and have been using the pre-trained `en_core_web_sm` pipeline.

Additionally, I can create a blank pipeline and add components to it manually, assuming I don't need everything in the pre-trained pipeline.

In [13]:
# Example: build a pipeline that contains nothing but the NER component.
source_nlp = spacy.load("en_core_web_sm")
new_nlp = spacy.blank("en")

# Copy the 'ner' component from source_nlp into the blank pipeline,
# then list what the new pipeline contains.
new_nlp.add_pipe("ner", source=source_nlp)
new_nlp.pipe_names

['ner']

In [14]:
# `doc` is whichever Doc was bound most recently; when the cells are run in
# order, that is the one built from `transactions` a few cells above.
type(doc)

spacy.tokens.doc.Doc

In [15]:
import nltk
from nltk.stem import PorterStemmer
# One Porter stemmer instance, reused by the stemming cells below.
stemmer = PorterStemmer()

# Downloading all necessary data packages related to nltk (uncomment to run).
#nltk.download('all')

Exercise1:

Convert this list of words into base form using stemming and lemmatization, and observe the transformations.
Write a short note on the words that get different base forms from stemming and lemmatization.

In [16]:
# Stemming with NLTK: print each word next to its Porter stem.
lst_words = [
    'running', 'painting', 'walking', 'dressing', 'likely',
    'children', 'whom', 'good', 'ate', 'fishing',
]
for word in lst_words:
    print(f"{word} | {stemmer.stem(word)}")

running | run
painting | paint
walking | walk
dressing | dress
likely | like
children | children
whom | whom
good | good
ate | ate
fishing | fish


In [17]:
# Lemmatization with spaCy: print each token next to its lemma.
# NOTE: this sentence contains "who", whereas the word list used in the
# stemming cell contains "whom", so that one row is not a like-for-like comparison.
doc = nlp("running painting walking dressing likely children who good ate fishing")
for tok in doc:
    print(f"{tok.text} | {tok.lemma_}")

running | run
painting | paint
walking | walk
dressing | dress
likely | likely
children | child
who | who
good | good
ate | eat
fishing | fishing


Exercise2:

Convert the given text into its base form using both stemming and lemmatization.

In [18]:
# Input for Exercise 2. The missing spaces after some full stops
# (e.g. "girl.She") and the line break are part of the text as given.
text = """Latha is very multi talented girl.She is good at many skills like dancing, running, singing, playing.She also likes eating Pav Bhagi. she has a 
habit of fishing and swimming too.Besides all this, she is a wonderful at cooking too.
"""

In [19]:
# Stemming the exercise text with NLTK.

# 1) split the text into word tokens
word_token = nltk.word_tokenize(text)

# 2) reduce every token to its stem
base_word = list(map(stemmer.stem, word_token))

# 3) glue the stems back together, separated by single spaces
all_together = " ".join(base_word)
all_together

'latha is veri multi talent girl.sh is good at mani skill like danc , run , sing , playing.sh also like eat pav bhagi . she ha a habit of fish and swim too.besid all thi , she is a wonder at cook too .'

In [20]:
# Lemmatising the exercise text with spaCy.

# 1) run the pipeline over the text
doc_3 = nlp(text)

# 2) take the lemma of every token
base_form = [token.lemma_ for token in doc_3]

# 3) glue the lemmas back together, separated by single spaces
joined_text = " ".join(base_form)
joined_text

'Latha be very multi talented girl . she be good at many skill like dancing , running , singing , play . she also like eat Pav Bhagi . she have a \n habit of fishing and swim too . besides all this , she be a wonderful at cook too . \n'

Named Entity Recognition (NER)

In [21]:
doc_4 = nlp('Tesla is going to aquire Twitter for $45 billion')

# One line per recognised entity: its text, its label, and spaCy's
# human-readable explanation of that label.
for entity in doc_4.ents:
    print(f"{entity.text} | {entity.label_} | {spacy.explain(entity.label_)}")

Tesla | ORG | Companies, agencies, institutions, etc.
Twitter | PRODUCT | Objects, vehicles, foods, etc. (not services)
$45 billion | MONEY | Monetary values, including unit


In [22]:
# A more appealing, graphical view of the same entities can be produced with
# displacy (style='ent' selects the entity visualisation).
from spacy import displacy

displacy.render(doc_4, style='ent')

In [23]:
# Let's get a peek at all entity labels that this pipeline's 'ner' component
# can assign.
nlp.pipe_labels['ner']

['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

In the previous case, we see that the model didn't recognize Twitter as an organization; it wrongly classified it as a product.
To fix issues like this, we can set the entity label ourselves by creating a custom Span over the relevant tokens.

In [24]:
from spacy.tokens import Span
# One-token spans over token 0 ("Tesla") and token 5 ("Twitter"), both
# labelled as organisations.
s1 = Span(doc_4, 0, 1, label="ORG")
s2 = Span(doc_4, 5, 6, label="ORG")

# Apply the two spans to doc_4 in place; default="unmodified" leaves every
# other token's entity annotation as it was.
doc_4.set_ents([s1, s2], default="unmodified")

In [25]:
# Render doc_4 again to inspect its entities after the set_ents() call above.
displacy.render(doc_4, style='ent')

In [26]:
from spacy.lang.en.stop_words import STOP_WORDS

Exercise1:

1. From a given text, count the number of stop words in it.
2. Print the percentage of stop-word tokens compared to all tokens in the given text.

In [27]:
text = '''
Thor: Love and Thunder is a 2022 American superhero film based on Marvel Comics featuring the character Thor, produced by Marvel Studios and 
distributed by Walt Disney Studios Motion Pictures. It is the sequel to Thor: Ragnarok (2017) and the 29th film in the Marvel Cinematic Universe (MCU).
The film is directed by Taika Waititi, who co-wrote the script with Jennifer Kaytin Robinson, and stars Chris Hemsworth as Thor alongside Christian Bale, Tessa Thompson,
Jaimie Alexander, Waititi, Russell Crowe, and Natalie Portman. In the film, Thor attempts to find inner peace, but must return to action and recruit Valkyrie (Thompson),
Korg (Waititi), and Jane Foster (Portman)—who is now the Mighty Thor—to stop Gorr the God Butcher (Bale) from eliminating all gods.
'''

# Step 1: create the Doc for the given text using nlp().
doc_5 = nlp(text)

# Step 2: collect the text of every token; the token count is the
# denominator of the percentage.
words = [token.text for token in doc_5]
num_words = len(words)

# Step 3: collect the text of every token that spaCy flags as a stop word.
stop_words = [token.text for token in doc_5 if token.is_stop]

# Step 4: count the stop words. This must happen AFTER the list has been
# filled -- taking len() of the still-empty list (as the earlier version did)
# always gives 0 and makes the percentage 0 as well.
total_stop_words = len(stop_words)
print("number of stop words:", total_stop_words)

# Step 5: percentage of stop-word tokens among all tokens
# (guarded so that an empty text does not divide by zero).
percent_stop_words = (total_stop_words / num_words) * 100 if num_words else 0.0
print(f"percentage of stop words: {percent_stop_words:.2f}%")


In [28]:
# Total number of tokens in doc_5 (every token is counted, not only words).
num_words

160

In [29]:
# Number of tokens in doc_5 that were flagged as stop words.
len(stop_words)

40

Text Representation

In [30]:
from sklearn.feature_extraction.text import CountVectorizer
# Fit a bigram-only bag-of-words model on a single sentence and show the
# learned vocabulary (n-gram -> column index).
v = CountVectorizer(ngram_range=(2, 2)).fit(["Thor Hathodawala is looking for a job"])
v.vocabulary_

{'thor hathodawala': 4,
 'hathodawala is': 1,
 'is looking': 2,
 'looking for': 3,
 'for job': 0}

In [37]:
def preprocess(text):
    """Return `text` reduced to its lemmas, without stop words and punctuation.

    The text is run through the notebook-level `nlp` pipeline. The lemma of
    every token that is neither a stop word nor punctuation is kept, and the
    kept lemmas are joined with single spaces.
    """
    kept_lemmas = [
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct)
    ]
    return " ".join(kept_lemmas)


# Quick check on a short sentence.
preprocess("Thor ate pizza")

'thor eat pizza'

In [39]:
# Another author's version of preprocess(), kept commented out for comparison:
# def preprocess(text):
#     doc = nlp(text)
    
#     filtered_tokens = []
    
#     for token in doc:
#         if token.is_stop or token.is_punct:
#             continue
#         filtered_tokens.append(token.lemma_)
#     return " ".join(filtered_tokens)


In [33]:
# Three tiny documents used to demonstrate n-gram vectorisation.
corpus = [
    "Thor ate pizza",
    "Loki is tall",
    "Loki is eating pizza",
]

In [38]:
# Run every document through preprocess() before vectorising.
cleaned_corpus = list(map(preprocess, corpus))
cleaned_corpus

['thor eat pizza', 'Loki tall', 'Loki eat pizza']

In [40]:
# Fit a unigram + bigram bag-of-words model on the preprocessed corpus and
# show the learned vocabulary.
v_1 = CountVectorizer(ngram_range=(1, 2)).fit(cleaned_corpus)
v_1.vocabulary_

{'thor': 7,
 'eat': 0,
 'pizza': 5,
 'thor eat': 8,
 'eat pizza': 1,
 'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'loki eat': 3}