# SPACY

In [None]:
from spacy.lang.es import Spanish

In [2]:
nlp = Spanish()

In [3]:
doc = nlp("¡Probando!")

In [4]:
for token in doc:
    print(token.text)

¡
Probando
!


In [5]:
token = doc[2]
print(token.text)

!


In [6]:
span = doc[0:2]
print(span.text)

¡Probando


In [7]:
doc_2 = nlp("Prueba número 2.")

In [8]:
print('Index:', [token.i for token in doc_2])

Index: [0, 1, 2, 3]


In [9]:
print('Text:', [token.text for token in doc_2])
print('is_alpha:', [token.is_alpha for token in doc_2])
print('is_punct:', [token.is_punct for token in doc_2])
print('like_num:', [token.like_num for token in doc_2])

Text: ['Prueba', 'número', '2', '.']
is_alpha: [True, True, False, False]
is_punct: [False, False, False, True]
like_num: [False, False, True, False]


In [10]:
doc_3 = nlp('La importancia del procesamiento del lenguaje natural')
procesamiento_del_lenguaje = doc_3[3:6]
print(procesamiento_del_lenguaje)

procesamiento del lenguaje


In [11]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [12]:
text = """The Republican president is being challenged by Democratic Party nominee Joe Biden, who is best known as Barack Obama’s vice-president but has been in US politics since the 1970s.As election day approaches, pollingcompanies will be trying to gauge the mood of the nation by asking voters which candidate they prefer."""

In [13]:
doc = nlp(text)

In [14]:
type(doc)

spacy.tokens.doc.Doc

In [15]:
sentences = list(doc.sents)
len(sentences)

2

In [16]:
for sent in sentences:
    print(sent)

The Republican president is being challenged by Democratic Party nominee Joe Biden, who is best known as Barack Obama’s vice-president but has been in US politics since the 1970s.
As election day approaches, pollingcompanies will be trying to gauge the mood of the nation by asking voters which candidate they prefer.


In [17]:
text_2 = "The Republican president is being challenged by Democratic Party nominee Joe Biden,"

In [18]:
doc = nlp(text_2)

In [19]:
for token in doc:
    print(token.text)

The
Republican
president
is
being
challenged
by
Democratic
Party
nominee
Joe
Biden
,


### STOPWORDS:

In [20]:
stopwords = spacy.lang.en.stop_words.STOP_WORDS
print('Length of stopwords:', len(stopwords))

Length of stopwords: 326


In [21]:
for i in list(stopwords)[:5]:
    print(i)

are
may
less
to
first


In [24]:
doc = nlp(text)

In [25]:
for token in doc:
    if token.is_stop:
        print(token)

The
is
being
by
who
is
as
’s
but
has
been
in
US
since
the
As
will
be
to
the
of
the
by
which
they


In [26]:
print('Total of tokens:', len(doc))

Total of tokens: 58


In [28]:
doc_2 = []

for token in doc:
    if not token.is_stop:
        doc_2.append(token)

In [29]:
print('New total of tokens:', len(doc_2))

New total of tokens: 33


In [30]:
for punct in doc_2:
    if not punct.is_punct:
        print(punct)

Republican
president
challenged
Democratic
Party
nominee
Joe
Biden
best
known
Barack
Obama
vice
president
politics
1970s
election
day
approaches
pollingcompanies
trying
gauge
mood
nation
asking
voters
candidate
prefer


### LEMMATIZATION:

In [31]:
for token in doc:
    print(token, token.lemma_)

The the
Republican republican
president president
is be
being be
challenged challenge
by by
Democratic Democratic
Party Party
nominee nominee
Joe Joe
Biden Biden
, ,
who who
is be
best well
known know
as as
Barack Barack
Obama Obama
’s ’s
vice vice
- -
president president
but but
has have
been be
in in
US US
politics politic
since since
the the
1970s 1970
. .
As as
election election
day day
approaches approach
, ,
pollingcompanies pollingcompanie
will will
be be
trying try
to to
gauge gauge
the the
mood mood
of of
the the
nation nation
by by
asking ask
voters voter
which which
candidate candidate
they -PRON-
prefer prefer
. .


In [33]:
for token in doc:
    if token.is_lower:
        print(token)

president
is
being
challenged
by
nominee
who
is
best
known
as
’s
vice
president
but
has
been
in
politics
since
the
1970s
election
day
approaches
pollingcompanies
will
be
trying
to
gauge
the
mood
of
the
nation
by
asking
voters
which
candidate
they
prefer


In [35]:
for token in doc:
    if token.is_stop:
        print(token)

The
is
being
by
who
is
as
’s
but
has
been
in
US
since
the
As
will
be
to
the
of
the
by
which
they


### PARTS OF SPEECH TAGGING:

In [36]:
text = 'The Republican president is being challenged by Democratic Party nominee Joe Biden'

In [37]:
doc = nlp(text)

In [38]:
for token in doc:
    print(token, token.tag_, 
          token.pos_, spacy.explain(token.tag_))

The DT DET determiner
Republican JJ ADJ adjective
president NN NOUN noun, singular or mass
is VBZ AUX verb, 3rd person singular present
being VBG AUX verb, gerund or present participle
challenged VBN VERB verb, past participle
by IN ADP conjunction, subordinating or preposition
Democratic NNP PROPN noun, proper singular
Party NNP PROPN noun, proper singular
nominee NN NOUN noun, singular or mass
Joe NNP PROPN noun, proper singular
Biden NNP PROPN noun, proper singular


In [39]:
from spacy import displacy

In [41]:
displacy.render(doc, style='dep', 
               jupyter=True, options={'distance':90})

In [42]:
text = '''The Republican president is being 
          challenged by Democratic Party nominee 
          Joe Biden, who is best known 
          as Barack Obama’s vice-president but has 
          been in US politics since the 1970s'''

In [43]:
doc = nlp(text)

In [44]:
print(doc.ents)

(Republican, Democratic Party, Joe Biden, Barack, US, the 1970s)


In [45]:
for entity in doc.ents:
    print(entity.text, entity.label_)

Republican NORP
Democratic Party ORG
Joe Biden PERSON
Barack GPE
US GPE
the 1970s DATE


In [46]:
displacy.render(doc, style='ent', jupyter=True)

In [47]:
text = '''The Republican president is being challenged
          by Democratic Party nominee Joe Biden'''

In [48]:
doc = nlp(text)

In [49]:
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_,
          chunk.root.head.text)

The Republican president president nsubjpass challenged
Democratic Party nominee Joe Biden Biden pobj by


In [50]:
from spacy.matcher import Matcher

In [51]:
matcher = Matcher(nlp.vocab)

In [52]:
pattern = [{'TEXT': 'Joe'}, {'TEXT': 'Biden'}]

In [53]:
matcher.add('rule_1', None, pattern)

In [54]:
matches = matcher(doc)
matches

[(7604275899133490726, 11, 13)]

# NLTK

In [12]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
import nltk.corpus
import os

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\guido\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [13]:
text = '''In Brazil they drive on the right-hand 
          side of the road. Brazil has a large coastline on the eastern
          side of South America'''

In [14]:
from nltk.tokenize import word_tokenize

In [15]:
token = word_tokenize(text)

In [16]:
token

['In',
 'Brazil',
 'they',
 'drive',
 'on',
 'the',
 'right-hand',
 'side',
 'of',
 'the',
 'road',
 '.',
 'Brazil',
 'has',
 'a',
 'large',
 'coastline',
 'on',
 'the',
 'eastern',
 'side',
 'of',
 'South',
 'America']

In [17]:
from nltk.probability import FreqDist
fdist = FreqDist(token)
fdist

FreqDist({'the': 3, 'Brazil': 2, 'on': 2, 'side': 2, 'of': 2, 'In': 1, 'they': 1, 'drive': 1, 'right-hand': 1, 'road': 1, ...})

In [18]:
fdist_2 = fdist.most_common(10)
fdist_2

[('the', 3),
 ('Brazil', 2),
 ('on', 2),
 ('side', 2),
 ('of', 2),
 ('In', 1),
 ('they', 1),
 ('drive', 1),
 ('right-hand', 1),
 ('road', 1)]

In [22]:
from nltk.stem import PorterStemmer
pst = PorterStemmer()
pst.stem('waiting'), pst.stem('waited')

('wait', 'wait')

In [24]:
stm = ['waited', 'waiting', 'waits']

for word in stm:
    
    print(word + ':' + pst.stem(word))

waited:wait
waiting:wait
waits:wait


In [26]:
from nltk.stem import LancasterStemmer

lst = LancasterStemmer()

stm = ['giving', 'given', 'given', 'gave']

for word in stm:
    
    print(word + ':' + lst.stem(word))

giving:giv
given:giv
given:giv
gave:gav


In [28]:
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

print('rocks:', lemmatizer.lemmatize('rocks'))
print('corpora:', lemmatizer.lemmatize('corpora'))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\guido\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


rocks: rock
corpora: corpus


In [34]:
print('nationalities:', lemmatizer.lemmatize('nationalities'))

nationalities: nationality


In [36]:
from nltk.corpus import stopwords

nltk.download('stopwords')

a = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\guido\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [37]:
text = '''Cristiano Ronaldo was born on February 5, 
          1985, in Funchal, Madeira, Portugal.'''

In [39]:
text_2 = word_tokenize(text.lower())
print(text_2)

['cristiano', 'ronaldo', 'was', 'born', 'on', 'february', '5', ',', '1985', ',', 'in', 'funchal', ',', 'madeira', ',', 'portugal', '.']


In [40]:
stopwords = [x for x in text_2 if x not in a]
print(stopwords)

['cristiano', 'ronaldo', 'born', 'february', '5', ',', '1985', ',', 'funchal', ',', 'madeira', ',', 'portugal', '.']


In [41]:
text = '''vote to choose a particular man or a group
          (party) to represent them in parliament'''

In [42]:
tokenized_text = word_tokenize(text)

In [44]:
nltk.download('averaged_perceptron_tagger')

for token in tokenized_text:
    
    print(nltk.pos_tag([token]))

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\guido\AppData\Roaming\nltk_data...


[('vote', 'NN')]
[('to', 'TO')]
[('choose', 'NN')]
[('a', 'DT')]
[('particular', 'JJ')]
[('man', 'NN')]
[('or', 'CC')]
[('a', 'DT')]
[('group', 'NN')]
[('(', '(')]
[('party', 'NN')]
[(')', ')')]
[('to', 'TO')]
[('represent', 'NN')]
[('them', 'PRP')]
[('in', 'IN')]
[('parliament', 'NN')]


[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
