In [6]:
# Import the required modules
import nltk
import nltk.data

# Fetch the 'punkt' package; it unpacks under tokenizers/punkt in the nltk_data directory
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [13]:
# Sample pt-br passage used by the tokenization cells; written one sentence per line
# and joined by implicit string concatenation into a single string.
grande_sertao_text = (
    'Sou só o sr. sertanejo, nessas altas ideias navego mal. '
    'Sou muito pobre coitado. '
    'Inveja minha pura é de uns conforme o senhor, com toda leitura e suma doutoração.'
)

In [14]:
# Sentence tokenization for pt-br text
from nltk.tokenize import sent_tokenize

# sent_tokenize uses the English Punkt model unless told otherwise; request the
# Portuguese model explicitly so sentence boundaries and abbreviations are
# decided by a model trained on the same language as the input.
sentence_list = sent_tokenize(grande_sertao_text, language='portuguese')
sentence_list

['Sou só o sr. sertanejo, nessas altas ideias navego mal.',
 'Sou muito pobre coitado.',
 'Inveja minha pura é de uns conforme o senhor, com toda leitura e suma doutoração.']

In [16]:
# Load the PUNKT sentence tokenizer adapted for Portuguese
# NOTE(review): this path points at a pickled model inside the 'punkt' data package;
# newer NLTK releases appear to distribute 'punkt_tab' instead — confirm the path
# still resolves on the NLTK version in use.
ptbr_sentence = nltk.data.load('tokenizers/punkt/PY3/portuguese.pickle')

# Sentence tokenization for pt-br text using the loaded Portuguese model
sentence_list = ptbr_sentence.tokenize(grande_sertao_text)
sentence_list

['Sou só o sr. sertanejo, nessas altas ideias navego mal.',
 'Sou muito pobre coitado.',
 'Inveja minha pura é de uns conforme o senhor, com toda leitura e suma doutoração.']

In [19]:
# Word tokenization
from nltk.tokenize import word_tokenize

# word_tokenize first runs a Punkt sentence split and defaults to the English
# model; pass the language explicitly so the Portuguese input is handled by the
# Portuguese model, matching the sentence-tokenization step.
first_sentence = word_tokenize(sentence_list[0], language='portuguese')
print(first_sentence)

['Sou', 'só', 'o', 'sr.', 'sertanejo', ',', 'nessas', 'altas', 'ideias', 'navego', 'mal', '.']


In [20]:
# Tokenization using regular expressions
from nltk.tokenize import RegexpTokenizer

# Raw string: '\w' and '\.' are not valid Python string escapes, so a plain
# literal triggers an invalid-escape-sequence warning. The compiled pattern is
# unchanged: one or more word characters followed by any number of dots, which
# keeps tokens such as 'sr.' together and drops other punctuation (e.g. commas).
regex_tker = RegexpTokenizer(r'[\w]+[\.]*')
print(regex_tker.tokenize(sentence_list[0]))

['Sou', 'só', 'o', 'sr.', 'sertanejo', 'nessas', 'altas', 'ideias', 'navego', 'mal.']


In [21]:
# Sample text with embedded newlines and a dollar amount; the '\n' escapes are
# intentional, so this literal must stay a regular (non-raw) string.
outra_sentenca = 'Bons pães de queijo custam $2.50\n em Belo Horizonte. Por favor compre\ncinco.\n\nObrigado.'

# Raw string for the pattern: '\w', '\$', '\d', '\.' and '\S' are not valid Python
# string escapes and trigger an invalid-escape-sequence warning in a plain literal.
# Alternatives are tried left to right: a run of word characters, a '$' followed
# by digits/dots (keeps '$2.50' whole), or any other run of non-whitespace.
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
print(tokenizer.tokenize(outra_sentenca))

['Bons', 'pães', 'de', 'queijo', 'custam', '$2.50', 'em', 'Belo', 'Horizonte', '.', 'Por', 'favor', 'compre', 'cinco', '.', 'Obrigado', '.']


In [22]:
# The synonym dictionary (thesaurus) works well for English
from nltk.corpus import wordnet

# Fetch the WordNet corpus data that the `wordnet` reader queries
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [24]:
# Look up every WordNet synset for the English word 'study' and display the list
word_synset = wordnet.synsets('study')
word_synset

[Synset('survey.n.01'),
 Synset('study.n.02'),
 Synset('report.n.01'),
 Synset('study.n.04'),
 Synset('study.n.05'),
 Synset('discipline.n.01'),
 Synset('sketch.n.01'),
 Synset('cogitation.n.02'),
 Synset('study.n.09'),
 Synset('study.n.10'),
 Synset('analyze.v.01'),
 Synset('study.v.02'),
 Synset('study.v.03'),
 Synset('learn.v.04'),
 Synset('study.v.05'),
 Synset('study.v.06')]

In [25]:
# Show each synset found for the word next to its dictionary definition
for sense in word_synset:
  gloss = sense.definition()
  print(sense, gloss)

Synset('survey.n.01') a detailed critical inspection
Synset('study.n.02') applying the mind to learning and understanding a subject (especially by reading)
Synset('report.n.01') a written document describing the findings of some individual or group
Synset('study.n.04') a state of deep mental absorption
Synset('study.n.05') a room used for reading and writing and studying
Synset('discipline.n.01') a branch of knowledge
Synset('sketch.n.01') preliminary drawing for later elaboration
Synset('cogitation.n.02') attentive consideration and meditation
Synset('study.n.09') someone who memorizes quickly and easily (as the lines for a part in a play)
Synset('study.n.10') a composition intended to develop one aspect of the performer's technique
Synset('analyze.v.01') consider in detail and subject to an analysis in order to discover essential features or meaning
Synset('study.v.02') be a student; follow a course of study; be enrolled at an institute of learning
Synset('study.v.03') give careful c

In [27]:
# Get the stemming algorithm for pt-br (RSLP stemmer data package)
nltk.download('rslp')

# NOTE: the variable name is spelled 'porguese'; the following cells reference it as-is
stemmer_porguese = nltk.stem.RSLPStemmer()

[nltk_data] Downloading package rslp to /root/nltk_data...
[nltk_data]   Package rslp is already up-to-date!


In [28]:
# Stem the Portuguese word 'copiar' with the RSLP stemmer
print(stemmer_porguese.stem('copiar'))

copi


In [29]:
# Stem the Portuguese word 'casarão' with the RSLP stemmer
print(stemmer_porguese.stem('casarão'))

cas


In [30]:
# kit de ferramentas Python PNL
!pip install stanza

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting stanza
  Downloading stanza-1.5.0-py3-none-any.whl (802 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m802.5/802.5 kB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
Collecting emoji
  Downloading emoji-2.2.0.tar.gz (240 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m240.9/240.9 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-2.2.0-py3-none-any.whl size=234926 sha256=39ede14e4bf6da7de13573a2416d177343248e70fc4c010ce398af81f0fa938f
  Stored in directory: /root/.cache/pip/wheels/02/3d/88/51a592b9ad17e7899126563698b4e3961983ebe85747228ba6
Successfully built emoji
Installing collected packages: emoji, stanza
Successfully installed emoji-2.2.0 stan

In [31]:
import stanza
import random

In [32]:
# Download the default stanza model package for Portuguese ('pt')
stanza.download('pt')
# Build the Portuguese pipeline with its default processors; `nlp` is called on raw text in later cells
nlp = stanza.Pipeline('pt')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

INFO:stanza:Downloading default packages for language: pt (Portuguese) ...


Downloading https://huggingface.co/stanfordnlp/stanza-pt/resolve/v1.5.0/models/default.zip:   0%|          | 0…

INFO:stanza:Finished downloading models and saved to /root/stanza_resources.
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

INFO:stanza:Loading these models for language: pt (Portuguese):
| Processor    | Package |
--------------------------
| tokenize     | bosque  |
| mwt          | bosque  |
| pos          | bosque  |
| lemma        | bosque  |
| constituency | cintil  |
| depparse     | bosque  |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: constituency
INFO:stanza:Loading: depparse
INFO:stanza:Done loading processors!


In [34]:
cs_lewis_quote = 'Eu descobri em mim mesmo desejos os quais nada nesta Terra pode satisfazer. A única explicação lógica é que fui feito para outro mundo.'

# For every sentence the pipeline finds, print one tab-separated row with the
# word forms and a second row with the matching lemmas (each entry is followed
# by a tab, including the last one).
for sentence in nlp(cs_lewis_quote).sentences:
  surface_row = ''.join(word.text + '\t' for word in sentence.words)
  lemma_row = ''.join(word.lemma + '\t' for word in sentence.words)
  print('Texto:', surface_row)
  print('Lemma:', lemma_row)

Texto: Eu	descobri	em	mim	mesmo	desejos	os	quais	nada	em	esta	Terra	pode	satisfazer	.	
Lemma: eu	descobrir	em	eu	mesmo	desejo	o	qual	nada	em	este	terra	poder	satisfazer	.	
Texto: A	única	explicação	lógica	é	que	fui	feito	para	outro	mundo	.	
Lemma: o	único	explicação	lógico	ser	que	ser	fazer	para	outro	mundo	.	


In [35]:
# Run the pipeline on the quote and print the full annotation of each sentence
parsed_quote = nlp(cs_lewis_quote)
for sentence in parsed_quote.sentences:
  print(sentence)

[
  {
    "id": 1,
    "text": "Eu",
    "lemma": "eu",
    "upos": "PRON",
    "feats": "Case=Nom|Number=Sing|Person=1|PronType=Prs",
    "head": 2,
    "deprel": "nsubj",
    "start_char": 0,
    "end_char": 2
  },
  {
    "id": 2,
    "text": "descobri",
    "lemma": "descobrir",
    "upos": "VERB",
    "feats": "Mood=Ind|Number=Sing|Person=1|Tense=Past|VerbForm=Fin",
    "head": 0,
    "deprel": "root",
    "start_char": 3,
    "end_char": 11
  },
  {
    "id": 3,
    "text": "em",
    "lemma": "em",
    "upos": "ADP",
    "head": 4,
    "deprel": "case",
    "start_char": 12,
    "end_char": 14
  },
  {
    "id": 4,
    "text": "mim",
    "lemma": "eu",
    "upos": "PRON",
    "feats": "Number=Sing|Person=1|PronType=Prs",
    "head": 2,
    "deprel": "obl",
    "start_char": 15,
    "end_char": 18
  },
  {
    "id": 5,
    "text": "mesmo",
    "lemma": "mesmo",
    "upos": "ADJ",
    "feats": "Gender=Masc|Number=Sing",
    "head": 4,
    "deprel": "amod",
    "start_char": 19,
   

In [41]:
from nltk.corpus import wordnet as wn

# Human-readable labels for the part-of-speech tags returned by synset.pos().
# The 'n' label previously read 'non' (typo); it is the noun tag.
poses = { 'n' : 'noun', 'v' : 'verb', 's' : 'adj (s)', 'a' : 'adj', 'r' : 'adv'}

# For each sense of 'good', print its part of speech and the lemma names in that synset
for synset in wn.synsets('good'):
  print('{} : {}'.format(poses[synset.pos()], ', '.join([l.name() for l in synset.lemmas()])))

non : good
non : good, goodness
non : good, goodness
non : commodity, trade_good, good
adj : good
adj (s) : full, good
adj : good
adj (s) : estimable, good, honorable, respectable
adj (s) : beneficial, good
adj (s) : good
adj (s) : good, just, upright
adj (s) : adept, expert, good, practiced, proficient, skillful, skilful
adj (s) : good
adj (s) : dear, good, near
adj (s) : dependable, good, safe, secure
adj (s) : good, right, ripe
adj (s) : good, well
adj (s) : effective, good, in_effect, in_force
adj (s) : good
adj (s) : good, serious
adj (s) : good, sound
adj (s) : good, salutary
adj (s) : good, honest
adj (s) : good, undecomposed, unspoiled, unspoilt
adj (s) : good
adv : well, good
adv : thoroughly, soundly, good
