# Tokens de pensamento (palavras de linguagem natural)

In [None]:
text = ("Trust me, though, the words were on their way, and when "
         "they arrived, Liesel would hold them in her hands like "
         "the clouds, and she would wring them out, like the rain.")
tokens = text.split()
tokens[:8]


['Trust', 'me,', 'though,', 'the', 'words', 'were', 'on', 'their']

In [None]:
# Usando Regex para tokenizar texto do livro Blindsight

import re
pattern = r'\w+(?:\'\w+)?|[^\w\s]'
texts = [text]
texts.append("There's no such thing as survival of the fittest. "
              "Survival of the most adequate, maybe.")
tokens = list(re.findall(pattern, texts[-1]))
tokens[:8]


tokens[8:16]

tokens[16:]

['maybe', '.']

In [None]:
import numpy as np
vocab = sorted(set(tokens))
' '.join(vocab[:12])

num_tokens = len(tokens)
num_tokens

vocab_size = len(vocab)
vocab_size


15

In [None]:
# spacy
import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp(texts[-1])
type(doc)


tokens = [tok.text for tok in doc]
tokens[:9]
# ['There', "'s", 'no', 'such', 'thing', 'as', 'survival', 'of', 'the']

tokens[9:17]
# ['fittest', '.', 'Survival', 'of', 'the', 'most', 'adequate', ',']

['fittest', '.', 'Survival', 'of', 'the', 'most', 'adequate', ',']

In [None]:
from spacy import displacy
sentence = list(doc.sents)[0]
displacy.serve(sentence, style="dep")
!firefox 127.0.0.1:5000


Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.
/bin/bash: line 1: firefox: command not found


In [None]:
# tokenizer race
import requests
text = requests.get('https://proai.org/nlpia2-ch2.adoc').text
f'{round(len(text) / 10_000)}0k'


'190k'

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')
%timeit nlp(text)
# 4.67 s ± 45.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

f'{round(len(text) / 10_000)}0k'
# '160k'
doc = nlp(text)
f'{round(len(list(doc)) / 10_000)}0k'
#'30k'
f'{round(len(doc) / 1_000 / 4.67)}kWPS'
#'7kWPS'

KeyboardInterrupt: ignored

In [None]:
nlp.pipe_names
#['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
nlp = spacy.load('en_core_web_sm', disable=nlp.pipe_names)
%timeit nlp(text)
#199 ms ± 6.63 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

605 ms ± 219 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
import nltk
nltk.download('punkt')

from nltk.tokenize import word_tokenize
%timeit word_tokenize(text)

tokens = word_tokenize(text)
f'{round(len(tokens) / 10_000)}0k'


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


227 ms ± 5.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


'40k'

In [None]:
pattern = r'\w+(?:\'\w+)?|[^\w\s]'
tokens = re.findall(pattern, text)
f'{round(len(tokens) / 10_000)}0k'

%timeit re.findall(pattern, text)


21.4 ms ± 7.21 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


## BPE

O BPE preserva parte do significado de novas palavras usando tokens de caracteres e tokens de palavras para soletrar quaisquer palavras ou partes de palavras desconhecidas. Por exemplo, se “sizígia” não estiver em nosso vocabulário, poderíamos representá-lo como os seis tokens “s”, “y”, “z”, “y”, “g” e “y”. Talvez “smartz” pudesse ser representado como os dois tokens "smart" e “z”.

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range=(1, 2), analyzer='char')
vectorizer.fit(texts)


bpevocab = vectorizer.get_feature_names_out()
bpevocab[:7]
# [' ', ' a', ' c', ' f', ' h', ' i', ' l']

array([' ', ' a', ' c', ' f', ' h', ' i', ' l'], dtype=object)

In [None]:
vectors = vectorizer.transform(texts) # Converte cada texto em uma representação vetorial baseada na contagem de caracteres e/ou pares de caracteres
df = pd.DataFrame(vectors.todense(), columns=bpevocab)
df.index = [t[:8] + '...' for t in texts]
df = df.T
df['total'] = df.T.sum()
df

Unnamed: 0,Trust me...,There's ...,total
,31,14,45
a,3,2,5
c,1,0,1
f,0,1,1
h,3,0,3
...,...,...,...
wr,1,0,1
y,2,1,3
y,1,0,1
"y,",1,0,1


In [None]:
df.sort_values('total').tail()

Unnamed: 0,Trust me...,There's ...,total
he,10,3,13
h,14,5,19
t,11,9,20
e,18,8,26
,31,14,45


In [None]:
df['n'] = [len(tok) for tok in bpevocab]
df[df['n'] > 1].sort_values('total').tail()


Unnamed: 0,Trust me...,There's ...,total,n
",",6,1,7,2
e,7,2,9,2
t,8,3,11,2
th,8,4,12,2
he,10,3,13,2


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Textos de exemplo
texts = [
    "This is a simple example.",
    "Another example for illustration."
]

# Inicialização do CountVectorizer
vectorizer = CountVectorizer(ngram_range=(1, 2), analyzer='char')

# Ajuste do vetorizador aos textos de exemplo
vectorizer.fit(texts)

# Aplicação do vetorizador aos textos de exemplo
vectors = vectorizer.transform(texts)

# Exibindo os resultados
print("Matriz Esparsa Resultante:")
print(vectors)


Matriz Esparsa Resultante:
  (0, 0)	4
  (0, 1)	1
  (0, 2)	1
  (0, 4)	1
  (0, 5)	1
  (0, 6)	1
  (0, 7)	2
  (0, 8)	1
  (0, 9)	1
  (0, 12)	3
  (0, 13)	1
  (0, 14)	1
  (0, 16)	1
  (0, 19)	1
  (0, 21)	1
  (0, 22)	3
  (0, 24)	1
  (0, 26)	2
  (0, 27)	2
  (0, 28)	2
  (0, 31)	2
  (0, 32)	2
  (0, 40)	2
  (0, 41)	2
  (0, 45)	3
  :	:
  (1, 30)	1
  (1, 31)	1
  (1, 32)	1
  (1, 33)	2
  (1, 34)	1
  (1, 35)	1
  (1, 36)	3
  (1, 37)	1
  (1, 38)	1
  (1, 39)	1
  (1, 40)	1
  (1, 41)	1
  (1, 42)	3
  (1, 43)	2
  (1, 44)	1
  (1, 45)	1
  (1, 48)	1
  (1, 49)	3
  (1, 50)	1
  (1, 51)	1
  (1, 52)	1
  (1, 53)	1
  (1, 54)	1
  (1, 55)	1
  (1, 56)	1


In [None]:
import re

hi_text = 'Hiking home now'
hi_text.startswith('Hi')

pattern = r'\w+(?:\'\w+)?|[^\w\s]'
'Hi' in re.findall(pattern, hi_text)



False

In [None]:
'Hi' == re.findall(pattern, hi_text)[0]


False

In [None]:
tokens

['=',
 'Natural',
 'Language',
 'Processing',
 'in',
 'Action',
 ',',
 'Second',
 'Edition',
 ':',
 'chapter',
 ':',
 '2',
 ':',
 'part',
 ':',
 '1',
 ':',
 'sectnumoffset',
 ':',
 '1',
 ':',
 'sectnums',
 ':',
 ':',
 'imagesdir',
 ':',
 '.',
 ':',
 'xrefstyle',
 ':',
 'short',
 ':',
 'figure',
 '-',
 'caption',
 ':',
 'Figure',
 '{',
 'chapter',
 '}',
 '.',
 ':',
 'listing',
 '-',
 'caption',
 ':',
 'Listing',
 '{',
 'chapter',
 '}',
 '.',
 ':',
 'table',
 '-',
 'caption',
 ':',
 'Table',
 '{',
 'chapter',
 '}',
 '.',
 ':',
 'leveloffset',
 ':',
 '1',
 '/',
 '/',
 ':',
 'icons',
 '!',
 ':',
 ':',
 'stem',
 ':',
 'latexmath',
 ':',
 'toc',
 ':',
 ':',
 'source',
 '-',
 'highlighter',
 ':',
 'coderay',
 ':',
 'bibliography',
 '-',
 'database',
 ':',
 'dl4nlp',
 '.',
 'bib',
 ':',
 'bibliography',
 '-',
 'style',
 ':',
 'ieee',
 ':',
 'index',
 ':',
 ':',
 '[',
 ']',
 '=',
 'Tokens',
 'of',
 'thought',
 '(',
 'natural',
 'language',
 'words',
 ')',
 'This',
 'chapter',
 'covers',
 '*',
 

In [None]:
import pandas as pd
onehot_vectors = np.zeros(
     (len(tokens), vocab_size), int)

onehot_vectors[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
for i, word in enumerate(tokens):
     onehot_vectors[i, vocab.index(word)] = 1
df_onehot = pd.DataFrame(onehot_vectors, columns=vocab)
df_onehot.shape

df_onehot.iloc[:,:8].replace(0, '')

ValueError: ignored

In [None]:
bow = sorted(set(re.findall(pattern, text)))
bow[:9]
bow[9:19]
bow[19:27]

['01', '0123456789', '03', '03125', '0k', '1', "1's", '10']