In [None]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk import bigrams
from nltk import trigrams
from nltk import ngrams

In [None]:
# Download only the NLTK resources this notebook relies on. A bare
# nltk.download() opens the interactive downloader, which waits for user
# input and stalls "Restart Kernel -> Run All".
NLTK_RESOURCES = (
    'mac_morpho',  # tagged corpus read through nltk.corpus.mac_morpho
    'punkt',       # tokenizer models used by nltk.word_tokenize
    'punkt_tab',   # NOTE(review): newer NLTK releases load this instead of 'punkt' - confirm for your version
    'stopwords',   # word lists read through nltk.corpus.stopwords
    'rslp',        # data required by nltk.RSLPStemmer
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> l
Packages:
  [ ] abc................. Australian Broadcasting Commission 2006
  [ ] alpino.............. Alpino Dutch Treebank
  [ ] averaged_perceptron_tagger Averaged Perceptron Tagger
  [ ] averaged_perceptron_tagger_ru Averaged Perceptron Tagger (Russian)
  [ ] basque_grammars..... Grammars for Basque
  [ ] biocreative_ppi..... BioCreAtIvE (Critical Assessment of Information
                           Extraction Systems in Biology)
  [ ] bllip_wsj_no_aux.... BLLIP Parser: WSJ Model
  [ ] book_grammars....... Grammars from NLTK Book
  [ ] brown............... Brown Corpus
  [ ] brown_tei........... Brown Corpus (TEI XML Version)
  [ ] cess_cat............ CESS-CAT Treebank
  [

True

In [None]:
# Corpus words: every token of the Mac-Morpho corpus as one flat sequence
# (the notebook only displays the first few).
nltk.corpus.mac_morpho.words()

['Jersei', 'atinge', 'média', 'de', 'Cr$', '1,4', ...]

In [None]:
# Total number of tokens in the corpus.
len(nltk.corpus.mac_morpho.words())

1170095

In [None]:
# Corpus sentences: the same tokens grouped per sentence (a list of token lists).
nltk.corpus.mac_morpho.sents()

[['Jersei', 'atinge', 'média', 'de', 'Cr$', '1,4', 'milhão', 'em', 'a', 'venda', 'de', 'a', 'Pinhal', 'em', 'São', 'Paulo'], ['Programe', 'sua', 'viagem', 'a', 'a', 'Exposição', 'Nacional', 'do', 'Zebu', ',', 'que', 'começa', 'dia', '25'], ...]

In [None]:
# Tagged words: each token paired with its tag as a (token, tag) tuple.
nltk.corpus.mac_morpho.tagged_words()

[('Jersei', 'N'), ('atinge', 'V'), ('média', 'N'), ...]

In [None]:
# Tagged sentences: (token, tag) tuples grouped per sentence.
nltk.corpus.mac_morpho.tagged_sents()

[[('Jersei', 'N'), ('atinge', 'V'), ('média', 'N'), ('de', 'PREP'), ('Cr$', 'CUR'), ('1,4', 'NUM'), ('milhão', 'N'), ('em', 'PREP|+'), ('a', 'ART'), ('venda', 'N'), ('de', 'PREP|+'), ('a', 'ART'), ('Pinhal', 'NPROP'), ('em', 'PREP'), ('São', 'NPROP'), ('Paulo', 'NPROP')], [('Programe', 'V'), ('sua', 'PROADJ'), ('viagem', 'N'), ('a', 'PREP|+'), ('a', 'ART'), ('Exposição', 'NPROP'), ('Nacional', 'NPROP'), ('do', 'NPROP'), ('Zebu', 'NPROP'), (',', ','), ('que', 'PRO-KS-REL'), ('começa', 'V'), ('dia', 'N'), ('25', 'N|AP')], ...]

In [None]:
# English sample sentence used by the tokenization cells.
text = "The player, who is wearing the green shirt, scored the winning goal!"

In [None]:
# Tokenize the English sample with nltk.word_tokenize; punctuation marks are
# returned as separate tokens.

nltk.word_tokenize(text, language='english')

['The',
 'player',
 ',',
 'who',
 'is',
 'wearing',
 'the',
 'green',
 'shirt',
 ',',
 'scored',
 'the',
 'winning',
 'goal',
 '!']

In [None]:
# Portuguese version of the sample sentence.
text_portuguese = "O jogador, que está com a camiseta verde, marcou o gol da vitória!"

In [None]:
# Same tokenization, selecting the Portuguese tokenizer models.
nltk.word_tokenize(text_portuguese, language='portuguese')

['O',
 'jogador',
 ',',
 'que',
 'está',
 'com',
 'a',
 'camiseta',
 'verde',
 ',',
 'marcou',
 'o',
 'gol',
 'da',
 'vitória',
 '!']

In [None]:
# Tokenizing with a regular expression: \w+ matches runs of word characters
# (letters, digits, underscore), so punctuation never becomes a token.
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(text)
tokens

['The',
 'player',
 'who',
 'is',
 'wearing',
 'the',
 'green',
 'shirt',
 'scored',
 'the',
 'winning',
 'goal']

In [None]:
# Variant of the sample sentence that also contains a number ("10").
text_2 = "The player, who is wearing the green shirt with number 10, scored the winning goal!"

In [None]:
# Excluding numbers and punctuation: keep only tokens that start with an ASCII
# letter. The class must be [A-Za-z]; the range [A-z] also covers the
# characters that sit between 'Z' and 'a' in ASCII ([ \ ] ^ _ `), so tokens
# such as "_tmp" or "^" would be accepted by mistake.
# NOTE(review): a word starting with an accented letter (e.g. "água") would
# lose its first character here; fine for this English sample.
tokenizer_2 = RegexpTokenizer(r'[A-Za-z]\w*')
tokens_2 = tokenizer_2.tokenize(text_2)
tokens_2

['The',
 'player',
 'who',
 'is',
 'wearing',
 'the',
 'green',
 'shirt',
 'with',
 'number',
 'scored',
 'the',
 'winning',
 'goal']

In [None]:
# Longer English paragraph used for the frequency, stopword and n-gram cells.
text_3 = "Natural Language Processing (NLP) is a subfield of linguistics, Computer Science, and Artificial Intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data. The goal is a computer capable of 'understanding' the contents of documents, including the contextual nuances of the language within them. The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves."

In [None]:
# Frequency: count how often each word token of text_3 occurs. Counting is
# case-sensitive here, so 'The' and 'the' are separate entries.
tokens_3 = tokenizer.tokenize(text_3)
frequency = nltk.FreqDist(tokens_3)

In [None]:
# Every distinct token with its count, most frequent first.
frequency.most_common()

[('the', 6),
 ('of', 5),
 ('and', 5),
 ('language', 3),
 ('documents', 3),
 ('is', 2),
 ('a', 2),
 ('computers', 2),
 ('in', 2),
 ('to', 2),
 ('The', 2),
 ('as', 2),
 ('Natural', 1),
 ('Language', 1),
 ('Processing', 1),
 ('NLP', 1),
 ('subfield', 1),
 ('linguistics', 1),
 ('Computer', 1),
 ('Science', 1),
 ('Artificial', 1),
 ('Intelligence', 1),
 ('concerned', 1),
 ('with', 1),
 ('interactions', 1),
 ('between', 1),
 ('human', 1),
 ('particular', 1),
 ('how', 1),
 ('program', 1),
 ('process', 1),
 ('analyze', 1),
 ('large', 1),
 ('amounts', 1),
 ('natural', 1),
 ('data', 1),
 ('goal', 1),
 ('computer', 1),
 ('capable', 1),
 ('understanding', 1),
 ('contents', 1),
 ('including', 1),
 ('contextual', 1),
 ('nuances', 1),
 ('within', 1),
 ('them', 1),
 ('technology', 1),
 ('can', 1),
 ('then', 1),
 ('accurately', 1),
 ('extract', 1),
 ('information', 1),
 ('insights', 1),
 ('contained', 1),
 ('well', 1),
 ('categorize', 1),
 ('organize', 1),
 ('themselves', 1)]

In [None]:
# Only the five most frequent tokens.
frequency.most_common(5)

[('the', 6), ('of', 5), ('and', 5), ('language', 3), ('documents', 3)]

In [None]:
# Recount after lowercasing every token so that 'The' and 'the' are merged.
# NOTE(review): this rebinds `frequency`, replacing the case-sensitive counts.
frequency = nltk.FreqDist(w.lower() for w in tokens_3)

In [None]:
# Case-insensitive counts, most frequent first.
frequency.most_common()

[('the', 8),
 ('of', 5),
 ('and', 5),
 ('language', 4),
 ('documents', 3),
 ('natural', 2),
 ('is', 2),
 ('a', 2),
 ('computer', 2),
 ('computers', 2),
 ('in', 2),
 ('to', 2),
 ('as', 2),
 ('processing', 1),
 ('nlp', 1),
 ('subfield', 1),
 ('linguistics', 1),
 ('science', 1),
 ('artificial', 1),
 ('intelligence', 1),
 ('concerned', 1),
 ('with', 1),
 ('interactions', 1),
 ('between', 1),
 ('human', 1),
 ('particular', 1),
 ('how', 1),
 ('program', 1),
 ('process', 1),
 ('analyze', 1),
 ('large', 1),
 ('amounts', 1),
 ('data', 1),
 ('goal', 1),
 ('capable', 1),
 ('understanding', 1),
 ('contents', 1),
 ('including', 1),
 ('contextual', 1),
 ('nuances', 1),
 ('within', 1),
 ('them', 1),
 ('technology', 1),
 ('can', 1),
 ('then', 1),
 ('accurately', 1),
 ('extract', 1),
 ('information', 1),
 ('insights', 1),
 ('contained', 1),
 ('well', 1),
 ('categorize', 1),
 ('organize', 1),
 ('themselves', 1)]

In [None]:
# Stopwords: size of NLTK's Portuguese and English stopword lists, one per line
for language in ('portuguese', 'english'):
    stopword_list = nltk.corpus.stopwords.words(language)
    print(len(stopword_list))

204
179


In [None]:
# Count word frequencies in text_3 after removing English stopwords.
# Each token is lowercased BEFORE the stopword test: testing the raw token
# lets capitalized stopwords such as 'The' pass the filter and show up in the
# counts as 'the'.
stopwords = nltk.corpus.stopwords.words('english')
stopword_set = set(stopwords)  # set membership avoids scanning the list per token
tokens_without_stopwrods = [
    lowered
    for lowered in (w.lower() for w in tokens_3)
    if lowered not in stopword_set
]
freq_without_stopwords = nltk.FreqDist(tokens_without_stopwrods)
freq_without_stopwords.most_common()

[('language', 4),
 ('documents', 3),
 ('natural', 2),
 ('computer', 2),
 ('computers', 2),
 ('the', 2),
 ('processing', 1),
 ('nlp', 1),
 ('subfield', 1),
 ('linguistics', 1),
 ('science', 1),
 ('artificial', 1),
 ('intelligence', 1),
 ('concerned', 1),
 ('interactions', 1),
 ('human', 1),
 ('particular', 1),
 ('program', 1),
 ('process', 1),
 ('analyze', 1),
 ('large', 1),
 ('amounts', 1),
 ('data', 1),
 ('goal', 1),
 ('capable', 1),
 ('understanding', 1),
 ('contents', 1),
 ('including', 1),
 ('contextual', 1),
 ('nuances', 1),
 ('within', 1),
 ('technology', 1),
 ('accurately', 1),
 ('extract', 1),
 ('information', 1),
 ('insights', 1),
 ('contained', 1),
 ('well', 1),
 ('categorize', 1),
 ('organize', 1)]

In [None]:
# Bigrams: every pair of adjacent tokens in tokens_3, materialized as a list
# so the notebook can display it.
list(bigrams(tokens_3))

[('Natural', 'Language'),
 ('Language', 'Processing'),
 ('Processing', 'NLP'),
 ('NLP', 'is'),
 ('is', 'a'),
 ('a', 'subfield'),
 ('subfield', 'of'),
 ('of', 'linguistics'),
 ('linguistics', 'Computer'),
 ('Computer', 'Science'),
 ('Science', 'and'),
 ('and', 'Artificial'),
 ('Artificial', 'Intelligence'),
 ('Intelligence', 'concerned'),
 ('concerned', 'with'),
 ('with', 'the'),
 ('the', 'interactions'),
 ('interactions', 'between'),
 ('between', 'computers'),
 ('computers', 'and'),
 ('and', 'human'),
 ('human', 'language'),
 ('language', 'in'),
 ('in', 'particular'),
 ('particular', 'how'),
 ('how', 'to'),
 ('to', 'program'),
 ('program', 'computers'),
 ('computers', 'to'),
 ('to', 'process'),
 ('process', 'and'),
 ('and', 'analyze'),
 ('analyze', 'large'),
 ('large', 'amounts'),
 ('amounts', 'of'),
 ('of', 'natural'),
 ('natural', 'language'),
 ('language', 'data'),
 ('data', 'The'),
 ('The', 'goal'),
 ('goal', 'is'),
 ('is', 'a'),
 ('a', 'computer'),
 ('computer', 'capable'),
 ('cap

In [None]:
# Trigrams: every run of three adjacent tokens in tokens_3.
list(trigrams(tokens_3))

[('Natural', 'Language', 'Processing'),
 ('Language', 'Processing', 'NLP'),
 ('Processing', 'NLP', 'is'),
 ('NLP', 'is', 'a'),
 ('is', 'a', 'subfield'),
 ('a', 'subfield', 'of'),
 ('subfield', 'of', 'linguistics'),
 ('of', 'linguistics', 'Computer'),
 ('linguistics', 'Computer', 'Science'),
 ('Computer', 'Science', 'and'),
 ('Science', 'and', 'Artificial'),
 ('and', 'Artificial', 'Intelligence'),
 ('Artificial', 'Intelligence', 'concerned'),
 ('Intelligence', 'concerned', 'with'),
 ('concerned', 'with', 'the'),
 ('with', 'the', 'interactions'),
 ('the', 'interactions', 'between'),
 ('interactions', 'between', 'computers'),
 ('between', 'computers', 'and'),
 ('computers', 'and', 'human'),
 ('and', 'human', 'language'),
 ('human', 'language', 'in'),
 ('language', 'in', 'particular'),
 ('in', 'particular', 'how'),
 ('particular', 'how', 'to'),
 ('how', 'to', 'program'),
 ('to', 'program', 'computers'),
 ('program', 'computers', 'to'),
 ('computers', 'to', 'process'),
 ('to', 'process', 'a

In [None]:
# N-grams: the general form with an explicit window size; here runs of four
# adjacent tokens.
list(ngrams(tokens_3, 4))

[('Natural', 'Language', 'Processing', 'NLP'),
 ('Language', 'Processing', 'NLP', 'is'),
 ('Processing', 'NLP', 'is', 'a'),
 ('NLP', 'is', 'a', 'subfield'),
 ('is', 'a', 'subfield', 'of'),
 ('a', 'subfield', 'of', 'linguistics'),
 ('subfield', 'of', 'linguistics', 'Computer'),
 ('of', 'linguistics', 'Computer', 'Science'),
 ('linguistics', 'Computer', 'Science', 'and'),
 ('Computer', 'Science', 'and', 'Artificial'),
 ('Science', 'and', 'Artificial', 'Intelligence'),
 ('and', 'Artificial', 'Intelligence', 'concerned'),
 ('Artificial', 'Intelligence', 'concerned', 'with'),
 ('Intelligence', 'concerned', 'with', 'the'),
 ('concerned', 'with', 'the', 'interactions'),
 ('with', 'the', 'interactions', 'between'),
 ('the', 'interactions', 'between', 'computers'),
 ('interactions', 'between', 'computers', 'and'),
 ('between', 'computers', 'and', 'human'),
 ('computers', 'and', 'human', 'language'),
 ('and', 'human', 'language', 'in'),
 ('human', 'language', 'in', 'particular'),
 ('language', '

In [None]:
# Print the bigrams, then the trigrams, in which every token starts with an
# uppercase letter.
for make_grams in (bigrams, trigrams):
    for gram in make_grams(tokens_3):
        if all(token[0].isupper() for token in gram):
            print(gram)

('Natural', 'Language')
('Language', 'Processing')
('Processing', 'NLP')
('Computer', 'Science')
('Artificial', 'Intelligence')
('Natural', 'Language', 'Processing')
('Language', 'Processing', 'NLP')


In [None]:
# Stemmer: RSLP stemmer instance, used below to stem Portuguese words.

stemmer = nltk.RSLPStemmer()

In [None]:
# Show the stem of each sample Portuguese word, one per line
for word in ('amigo', 'casarão', 'cachorro', 'gato', 'carro', 'estudar'):
    print(stemmer.stem(word))


amig
cas
cachorr
gat
carr
estud


In [None]:
# Portuguese paragraph used as input for the tagging cells.
text_4 = 'Processamento de língua natural (PLN) é uma subárea da ciência da computação, inteligência artificial e da linguística que estuda os problemas da geração e compreensão automática de línguas humanas naturais. Sistemas de geração de língua natural convertem informação de bancos de dados de computadores em linguagem compreensível ao ser humano e sistemas de compreensão de língua natural convertem ocorrências de linguagem humana em representações mais formais, mais facilmente manipuláveis por programas de computador.'

In [None]:
# Tags: tag the tokens of text_4 with a unigram tagger trained on the tagged
# Mac-Morpho sentences.

from nltk.corpus import mac_morpho
from nltk.tag import UnigramTagger

# text_4 is Portuguese, so request the Portuguese tokenizer models explicitly
# (word_tokenize defaults to English), as the text_portuguese cell does.
tokens_4 = nltk.word_tokenize(text_4, language='portuguese')
sentencas_treinadoras = mac_morpho.tagged_sents()  # training sentences
etiq = UnigramTagger(sentencas_treinadoras)  # the tagger

# No backoff is configured, so tokens the tagger cannot label come back with
# the tag None.
tag = etiq.tag(tokens_4)
print(tag)

[('Processamento', 'NPROP'), ('de', 'PREP'), ('língua', 'N'), ('natural', 'ADJ'), ('(', '('), ('PLN', None), (')', ')'), ('é', 'V'), ('uma', 'ART'), ('subárea', None), ('da', 'NPROP'), ('ciência', 'N'), ('da', 'NPROP'), ('computação', 'N'), (',', ','), ('inteligência', 'N'), ('artificial', 'ADJ'), ('e', 'KC'), ('da', 'NPROP'), ('linguística', 'ADJ'), ('que', 'PRO-KS-REL'), ('estuda', 'V'), ('os', 'ART'), ('problemas', 'N'), ('da', 'NPROP'), ('geração', 'N'), ('e', 'KC'), ('compreensão', 'N'), ('automática', 'ADJ'), ('de', 'PREP'), ('línguas', 'N'), ('humanas', 'ADJ'), ('naturais', 'ADJ'), ('.', '.'), ('Sistemas', 'NPROP'), ('de', 'PREP'), ('geração', 'N'), ('de', 'PREP'), ('língua', 'N'), ('natural', 'ADJ'), ('convertem', 'V'), ('informação', 'N'), ('de', 'PREP'), ('bancos', 'N'), ('de', 'PREP'), ('dados', 'N'), ('de', 'PREP'), ('computadores', 'N'), ('em', 'PREP|+'), ('linguagem', 'N'), ('compreensível', 'ADJ'), ('ao', 'PREP'), ('ser', 'VAUX'), ('humano', 'ADJ'), ('e', 'KC'), ('sistem

In [None]:
from nltk.tag import DefaultTagger


# Fallback tagger constructed with the single tag 'N'.
etiq_pad = DefaultTagger('N')

# Retrain the unigram tagger with the fallback as backoff, so tokens it cannot
# label itself are tagged 'N' instead of None.
# NOTE(review): this rebinds `etiq` and `tag` from the previous cell.
sentencas_treinadoras = mac_morpho.tagged_sents()
etiq = UnigramTagger(sentencas_treinadoras, backoff = etiq_pad)

tag = etiq.tag(tokens_4)
print(tag)

[('Processamento', 'NPROP'), ('de', 'PREP'), ('língua', 'N'), ('natural', 'ADJ'), ('(', '('), ('PLN', 'N'), (')', ')'), ('é', 'V'), ('uma', 'ART'), ('subárea', 'N'), ('da', 'NPROP'), ('ciência', 'N'), ('da', 'NPROP'), ('computação', 'N'), (',', ','), ('inteligência', 'N'), ('artificial', 'ADJ'), ('e', 'KC'), ('da', 'NPROP'), ('linguística', 'ADJ'), ('que', 'PRO-KS-REL'), ('estuda', 'V'), ('os', 'ART'), ('problemas', 'N'), ('da', 'NPROP'), ('geração', 'N'), ('e', 'KC'), ('compreensão', 'N'), ('automática', 'ADJ'), ('de', 'PREP'), ('línguas', 'N'), ('humanas', 'ADJ'), ('naturais', 'ADJ'), ('.', '.'), ('Sistemas', 'NPROP'), ('de', 'PREP'), ('geração', 'N'), ('de', 'PREP'), ('língua', 'N'), ('natural', 'ADJ'), ('convertem', 'V'), ('informação', 'N'), ('de', 'PREP'), ('bancos', 'N'), ('de', 'PREP'), ('dados', 'N'), ('de', 'PREP'), ('computadores', 'N'), ('em', 'PREP|+'), ('linguagem', 'N'), ('compreensível', 'ADJ'), ('ao', 'PREP'), ('ser', 'VAUX'), ('humano', 'ADJ'), ('e', 'KC'), ('sistemas