In [1]:
import sys, os
# Make the repository root (the parent of the notebook's start directory) both
# the working directory and an import root, so that `llm` can be imported and
# relative paths such as "datasets/..." resolve from the root.
#
# An absolute path is appended to sys.path on purpose: a relative ".." entry is
# resolved against the *current* working directory at import time, so after
# the chdir it would point one level above the repository root.
#
# The `globals()` guard keeps the cell idempotent: re-running it in the same
# kernel must not climb another directory level.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.abspath("..")
    os.chdir(PROJECT_ROOT)
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)
# !{sys.executable} -m pip install -r ../requirements.txt

In [2]:
import numpy as np
import nltk
from basix import files
import tensorflow as tf
from tensorflow import keras

from llm import data, models
from llm.config import config
from llm.tokenize import SentencesTokenizer
from llm.embed import CBOWEmbedder

nltk.download('punkt', download_dir = "datasets/nltk_data")

[nltk_data] Downloading package punkt to datasets/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Sequence caps handed to `processer.transform(..., maxlen=...)` for the
# article bodies and for the titles, respectively.
text_len, title_len = 100, 20

# Number of units in the encoder and decoder LSTM layers.
latent_dim = 256

# Mini-batch size intended for training.
batch_size = 50

In [4]:
%%time
# Load a 10,000-document sample of the news corpus and turn texts and titles
# into embedding sequences with the project's TextProcesser.

texts, titles = data.load_corpus(sample=10000)

processer = models.TextProcesser(verbose=1)

# Encoder input: article bodies, limited to `text_len` via `maxlen`.
texts_vecs = processer.transform(texts, maxlen=text_len)

# The titles are transformed twice, once per decoder role: the `add_bos`
# variant is fed to the decoder as input and the `add_eos` variant is the
# training target (see the `model.fit` call).
# NOTE(review): presumably add_bos/add_eos prepend/append begin/end-of-sequence
# markers, which would make the two arrays one-step-shifted copies — confirm
# in llm.models.TextProcesser.
titles_input_vecs = processer.transform(titles, maxlen=title_len, add_bos=True)
titles_output_vecs = processer.transform(titles, maxlen=title_len, add_eos=True)

2023-03-22 22:04:03.066 | DEBUG    | llm.data:load_corpus:18 - Corpus already exists in datasets/raw/brazilian-news.parquet. Skipping downloading corpus.
2023-03-22 22:04:03.066 | DEBUG    | llm.data:load_corpus:20 - Importing news from datasets/raw/brazilian-news.parquet
2023-03-22 22:04:04.781 | DEBUG    | llm.data:load_corpus:38 - Using a sample of size 10000
2023-03-22 22:04:04.802 | DEBUG    | llm.data:load_corpus:43 - Importing news titles from datasets/raw/title.parquet
2023-03-22 22:04:04.803 | DEBUG    | llm.data:load_corpus:46 - Importing news texts from datasets/raw/text.parquet
2023-03-22 22:04:05.299 | DEBUG    | llm.models:transform:37 - Tokenizing sentences
2023-03-22 22:04:17.819 | DEBUG    | llm.models:transform:46 - Getting embedding vectors
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:03<00:00, 2544.32it/s]
2023-03-22 22:04:21.883 | DEBUG    | llm.models:transform:37 - Tokenizing sentenc

CPU times: user 18.5 s, sys: 1.07 s, total: 19.6 s
Wall time: 19.5 s





In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the documents for testing. The three arrays are split
# together so that each text stays aligned with its decoder input and target;
# the fixed random_state makes the partition reproducible.
(
    X_train, X_test,
    y_inp_train, y_inp_test,
    y_out_train, y_out_test,
) = train_test_split(
    texts_vecs,
    titles_input_vecs,
    titles_output_vecs,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Encoder-decoder (seq2seq) model that works directly on embedding vectors:
# both inputs are variable-length sequences of CBOW_VECTOR_SIZE-dimensional
# vectors, and the output is a sequence of vectors of the same dimensionality.
# NOTE(review): the statement order below determines the order of
# `model.layers`, which the sampling cell indexes by position — do not reorder.
encoder_inputs = keras.Input(shape=(None, config.CBOW_VECTOR_SIZE))

# return_state=True makes the layer return its final hidden and cell states
# in addition to its output.
encoder = keras.layers.LSTM(latent_dim, return_state=True)

encoder_outputs, state_h, state_c = encoder(encoder_inputs)

# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]


# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = keras.Input(shape=(None, config.CBOW_VECTOR_SIZE))

# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
# Linear projection (no activation) back to the embedding dimensionality: the
# model regresses embedding vectors rather than predicting a token distribution.
decoder_dense = keras.layers.Dense(config.CBOW_VECTOR_SIZE)
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that maps the pair (encoder input sequence, decoder input
# sequence) to the decoder output sequence.
model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [7]:
import keras
import tensorflow as tf

def r2_keras(y_true, y_pred):
    """Coefficient of determination (R^2), written with Keras backend ops.

    Computes ``1 - SS_res / (SS_tot + epsilon)`` where ``SS_res`` is the sum of
    squared differences between ``y_true`` and ``y_pred`` and ``SS_tot`` is the
    sum of squared deviations of ``y_true`` from its mean. Both sums and the
    mean are taken over every element of the tensors (no axis is given). The
    backend epsilon guards against division by zero when ``y_true`` is constant.

    Args:
        y_true: Tensor of target values.
        y_pred: Tensor of predictions.

    Returns:
        A scalar tensor holding the R^2 value.
    """
    K = keras.backend
    residual_ss = K.sum(K.square(y_true - y_pred))
    total_ss = K.sum(K.square(y_true - K.mean(y_true)))
    return 1 - residual_ss / (total_ss + K.epsilon())


In [None]:
%%time
model.compile(loss='mse', optimizer='adam',  metrics=['mse', 'mae', r2_keras])

model.fit(
    [X_train, y_inp_train],
    y_out_train,
    batch_size=50,
    epochs=100,
    validation_split=0.2,
)

# Save model
ENCODER_DECODER_PATH = os.path.join(config.MODEL_PATH, f"version={config.MODEL_VERSION}", "encoder-decoder.bin")
model.save(ENCODER_DECODER_PATH)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100

In [None]:
# Define sampling models
# Restore the trained model from the location the training cell saves it to.
# The path is rebuilt here so this cell also works in a fresh kernel where the
# training cell was not run. compile=False skips restoring the training
# configuration (loss/metrics, including the custom `r2_keras` metric), which
# is not needed for inference.
ENCODER_DECODER_PATH = os.path.join(config.MODEL_PATH, f"version={config.MODEL_VERSION}", "encoder-decoder.bin")
model = keras.models.load_model(ENCODER_DECODER_PATH, compile=False)

# Layers are addressed by position; this assumes the layer order of the model
# built above: [encoder input, decoder input, encoder LSTM, decoder LSTM, dense].
encoder_inputs = model.input[0]  # encoder input
encoder_outputs, state_h_enc, state_c_enc = model.layers[2].output  # encoder LSTM
encoder_states = [state_h_enc, state_c_enc]
# Encoder model: maps an input sequence to its final [h, c] states.
encoder_model = keras.Model(encoder_inputs, encoder_states)

decoder_inputs = model.input[1]  # decoder input
# At inference time the decoder's initial states are fed explicitly, so they
# become model inputs instead of being wired to the encoder.
decoder_state_input_h = keras.Input(shape=(latent_dim,))
decoder_state_input_c = keras.Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_lstm = model.layers[3]
decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs
)
decoder_states = [state_h_dec, state_c_dec]
decoder_dense = model.layers[4]
decoder_outputs = decoder_dense(decoder_outputs)
# Decoder model: (decoder input, h, c) -> (projected outputs, new h, new c),
# which allows decoding one step at a time while carrying the states forward.
decoder_model = keras.Model(
    [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states
)

# Invert the token -> index vocabularies into index -> token tables so that
# decoded indices can be turned back into something readable.
# NOTE(review): `input_token_index` and `target_token_index` are not defined
# in any cell visible in this notebook — confirm where they should come from.
reverse_input_char_index = {index: char for char, index in input_token_index.items()}
reverse_target_char_index = {index: char for char, index in target_token_index.items()}


def decode_sequence(input_seq):
    """Greedily decode one input sequence into a string.

    The encoder model turns ``input_seq`` into the initial decoder states.
    Decoding then starts from a one-hot vector for the tab start token and
    repeatedly (1) runs the decoder for one step, (2) picks the index with the
    highest value in the last output step, (3) appends the matching entry of
    ``reverse_target_char_index`` to the result, and (4) feeds that index back
    in as the next one-hot input together with the updated states. The loop
    ends once a newline is produced or the result grows longer than
    ``max_decoder_seq_length``.

    Assumes a batch of size 1.

    NOTE(review): this function reads the module-level names
    ``num_decoder_tokens``, ``target_token_index``,
    ``reverse_target_char_index`` and ``max_decoder_seq_length``, none of which
    is defined in the visible notebook cells. The one-hot input and argmax
    sampling also look like they belong to a token-classification decoder,
    whereas the model built in this notebook outputs embedding vectors —
    confirm before relying on this function.

    Args:
        input_seq: Encoder input passed to ``encoder_model.predict``.

    Returns:
        The decoded string (it includes the newline when one was produced).
    """
    # Encoder pass: the returned state list seeds the decoder.
    decoder_state = encoder_model.predict(input_seq)

    # One-hot encoding of the start token as the first decoder input.
    step_input = np.zeros((1, 1, num_decoder_tokens))
    step_input[0, 0, target_token_index["\t"]] = 1.0

    decoded_sentence = ""
    while True:
        step_output, next_h, next_c = decoder_model.predict([step_input] + decoder_state)

        # Greedy choice over the last time step of the decoder output.
        best_index = np.argmax(step_output[0, -1, :])
        best_char = reverse_target_char_index[best_index]
        decoded_sentence += best_char

        # Stop on the newline token or once the length limit is exceeded.
        finished = best_char == "\n" or len(decoded_sentence) > max_decoder_seq_length

        # Feed the chosen token and the new states into the next step.
        step_input = np.zeros((1, 1, num_decoder_tokens))
        step_input[0, 0, best_index] = 1.0
        decoder_state = [next_h, next_c]

        if finished:
            return decoded_sentence


In [5]:
import tensorflow as tf
# Wrap the embedded texts in a tf.data.Dataset (one element per entry along
# the first axis of `texts_vecs`) and inspect the resulting object's type.
slices = tf.data.Dataset.from_tensor_slices(texts_vecs)
type(slices)

tensorflow.python.data.ops.from_tensor_slices_op.TensorSliceDataset

In [10]:
texts_vecs[0]

array([[-2.8422527 ,  1.5201029 ,  3.6546187 , ..., -2.1878119 ,
         1.2598858 ,  3.8599381 ],
       [ 0.15048188, -2.5627968 , -5.4844007 , ...,  3.2361865 ,
         4.6852965 ,  2.4972484 ],
       [ 0.3609909 , -0.23715064,  3.7527487 , ...,  3.8275928 ,
        -4.6390014 , -0.80124545],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]], dtype=float32)

In [9]:
titles_vecs.shape

(2000, 28, 50)

In [None]:
pad_sequences(texts_vecs, padding='post', dtype='float32', )

In [14]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np


def padding(vec_lists):
    """Pad a batch of variable-length sequences to a common length.

    Thin wrapper around Keras ``pad_sequences``: padding is added at the end
    of each sequence (``padding='post'``) and the result is returned as
    ``float32``. No ``maxlen`` is given, so the target length is left to
    ``pad_sequences``' default.

    Args:
        vec_lists: Iterable of sequences to pad.

    Returns:
        The padded array produced by ``pad_sequences``.
    """
    padded_batch = pad_sequences(vec_lists, dtype='float32', padding='post')
    return padded_batch

In [16]:
padded = padding(texts_vecs)

In [17]:
type(padded)

numpy.ndarray

In [13]:
tf.__version__

'2.11.0'

In [50]:
import pandas as pd
pd.Series([len(x) for x in texts_vecs]).quantile(0.99)

1685.1599999999999

In [43]:
# Largest first-axis length among the entries of `texts_vecs`.
MAX_LENGTH = max(len(vecs) for vecs in texts_vecs)
print(f"Length of longest input sequence: {MAX_LENGTH}")

Length of longest input sequence: 4426


In [44]:
# Largest first-axis length among the entries of `titles_vecs`.
# The label now says "title": the previous text was copy-pasted from the cell
# that measures the texts, so both cells printed an identical, ambiguous label.
# NOTE(review): this overwrites the MAX_LENGTH computed for the texts, and
# `titles_vecs` is not defined in the visible cells — confirm the intended name.
MAX_LENGTH = len(max(titles_vecs, key=len))
print(f"Length of longest title sequence: {MAX_LENGTH}")

Length of longest input sequence: 28


In [22]:
# Here I have a list of 4 sentences represented with word embeddings in an
# embedding space of dimension 3. In the recorded run the sentences have
# 4, 3, 4 and 2 words respectively (each length is drawn at random from 2..5,
# so it changes between runs). How can TensorFlow be used to pad them so that
# every sentence has the same number of words?

[np.random.randn(np.random.randint(2,6),3).round(2) for i in range(4)]

[array([[ 0.06,  2.86, -0.39],
        [-1.34, -1.12, -1.03],
        [ 0.48,  0.83, -0.93],
        [-0.54,  0.37, -0.92]]),
 array([[-0.57, -1.26,  0.6 ],
        [-1.82,  0.39,  1.25],
        [-0.51,  2.12, -1.35]]),
 array([[ 0.89,  1.25,  0.25],
        [ 1.05,  1.15,  1.  ],
        [-0.37,  0.86, -0.79],
        [-1.69,  0.01,  0.34]]),
 array([[ 0.17,  1.33, -0.33],
        [ 0.58, -0.54,  1.01]])]

In [None]:
[
    np.array([[0,1,0,0],[0,3,2,1],[4,6,3,6],[1,2,3,4],[6,7,9,8]]),
    np.array([[0,1,0,0],[0,3,2,1]]),
    np.array([[0,1,0,0],[0,3,2,1],[4,6,3,6],[1,2,3,4],[6,7,9,8]]),
    np.array([[0,1,0,0],[0,3,2,1],[4,6,3,6],[1,2,3,4],[6,7,9,8]]),
]

In [23]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Input sequences: four "sentences" of 4, 3, 4 and 2 words, each word being a
# 3-dimensional embedding vector.
sentences = [
    np.array(word_vectors)
    for word_vectors in (
        [[0.06, 2.86, -0.39],
         [-1.34, -1.12, -1.03],
         [0.48, 0.83, -0.93],
         [-0.54, 0.37, -0.92]],
        [[-0.57, -1.26, 0.6],
         [-1.82, 0.39, 1.25],
         [-0.51, 2.12, -1.35]],
        [[0.89, 1.25, 0.25],
         [1.05, 1.15, 1.0],
         [-0.37, 0.86, -0.79],
         [-1.69, 0.01, 0.34]],
        [[0.17, 1.33, -0.33],
         [0.58, -0.54, 1.01]],
    )
]

# Pad every sentence at the end so that all of them share the same length.
padded_sequences = pad_sequences(sentences, dtype='float32', padding='post')

# Show the original (ragged) sentences, one array after another.
print("Sequências originais:")
print(*sentences, sep="\n")

# Show the padded batch.
print("\nSequências com padding:")
print(padded_sequences)


Sequências originais:
[[ 0.06  2.86 -0.39]
 [-1.34 -1.12 -1.03]
 [ 0.48  0.83 -0.93]
 [-0.54  0.37 -0.92]]
[[-0.57 -1.26  0.6 ]
 [-1.82  0.39  1.25]
 [-0.51  2.12 -1.35]]
[[ 0.89  1.25  0.25]
 [ 1.05  1.15  1.  ]
 [-0.37  0.86 -0.79]
 [-1.69  0.01  0.34]]
[[ 0.17  1.33 -0.33]
 [ 0.58 -0.54  1.01]]

Sequências com padding:
[[[ 0.06  2.86 -0.39]
  [-1.34 -1.12 -1.03]
  [ 0.48  0.83 -0.93]
  [-0.54  0.37 -0.92]]

 [[-0.57 -1.26  0.6 ]
  [-1.82  0.39  1.25]
  [-0.51  2.12 -1.35]
  [ 0.    0.    0.  ]]

 [[ 0.89  1.25  0.25]
  [ 1.05  1.15  1.  ]
  [-0.37  0.86 -0.79]
  [-1.69  0.01  0.34]]

 [[ 0.17  1.33 -0.33]
  [ 0.58 -0.54  1.01]
  [ 0.    0.    0.  ]
  [ 0.    0.    0.  ]]]


In [9]:
print(len(texts_vecs), len(titles_vecs))

2000 2000


In [11]:
processer.get_tokens_from_vectors(titles_vecs[0])

[['▁municípios',
  '▁com',
  '▁regimes',
  '▁próprios',
  '▁de',
  '▁previdência',
  '▁terão',
  '▁dívidas',
  '▁renego',
  'ciadas']]

In [10]:
processer.get_most_similar_token(vec)

'▁teste'

In [9]:
emb.wv.similar_by_vector

<bound method KeyedVectors.similar_by_vector of <gensim.models.keyedvectors.KeyedVectors object at 0x7f32d0b3fac0>>

In [64]:
# Tokenize two sample documents, look up the vectors for the resulting tokens,
# and inspect the shape of the vectors obtained for the second document.
tokens = processer.tokenize(["teste de mensagem. Hoje eu vou.", "Agora desisti"])
transf = processer.get_vectors(tokens)
transf[1].shape

(3, 50)

In [57]:
len(transf[1])

3

['Parabéns por mostrar que a filatelia (que é o somatório de arte, educação, cultura, lazer e terapia) continua sendo o colecionismo que ainda agrada e atrai jovens de 8 a 80 anos ("Figurinhas raras, "Folhinha", 16/5).  JOSÉ ANTONIO BITTENCOURT FERRAZ, presidente do Clube Filatélico e Numismático de Lorena (Lorena, SP)  *  Gostei muito da reportagem. Tudo o que envolve coleções tem minha atenção, pois envolve organização, atenção e carinho. Tenho uma coleção especial: fios de cabelos. São quase mil fios, uma coleção com mais de 30 anos (tenho 46) e fios de pessoas que passaram por minha vida.  SIMONE GIUDICI (São Paulo, SP)    *  PARTICIPAÇÃO  Os leitores podem colaborar com o conteúdo da Folha enviando notícias, fotos e vídeos (de acontecimentos ou comentários) que sejam relevantes no Brasil e no mundo. Para isso, basta acessar Envie sua Notícia ou enviar mensagem para leitor@uol.com.br',
 'A reedição de "Viagem Pitoresca e Histórica ao Brasil", de Jean-Baptiste Debret, reflete a long

In [8]:
embedder.wv.most_similar("▁dois")

[('▁três', 0.8723641037940979),
 ('▁oito', 0.8701666593551636),
 ('▁cinco', 0.8675403594970703),
 ('▁nove', 0.8654859662055969),
 ('▁seis', 0.8631857633590698),
 ('▁quatro', 0.8607264161109924),
 ('▁dez', 0.8575947284698486),
 ('▁sete', 0.8480710387229919),
 ('▁doze', 0.8359367251396179),
 ('▁vários', 0.828758180141449)]

In [5]:
embedder.get_vector("▁homem")

array([-2.3468242e+00,  6.8137341e+00, -1.4643404e+00, -1.1822750e-01,
       -3.8878152e+00, -3.4637120e-01,  3.0803206e+00, -6.8627906e+00,
        3.5182815e+00,  3.8290794e+00, -8.5138006e+00, -2.4277949e+00,
       -1.4926243e+00, -5.6872697e+00, -8.4092039e-01,  1.0705171e+00,
        5.4289937e+00, -3.4295261e+00, -1.2320751e+00, -5.1659662e-01,
        5.3019896e+00,  1.5884974e+00, -5.4626627e+00,  5.6278157e+00,
        1.9585477e+00, -2.8322184e-01,  1.2161356e+00, -3.4904180e+00,
        8.0094612e-01, -6.4040214e-02, -4.4229865e-01,  2.2009010e+00,
        3.4180372e+00, -2.2932494e+00,  1.4665638e+01, -3.2469783e+00,
       -1.5051919e+00,  4.7470379e-01, -2.3170009e+00,  7.3689550e-02,
       -8.5182238e+00,  3.4171028e+00, -7.9247413e+00, -2.2553673e+00,
        1.8577051e+00,  4.4309014e-05,  1.7016845e+00,  4.8755603e+00,
       -1.0073901e+00, -6.7963362e+00], dtype=float32)

In [6]:
embedder.wv.most_similar("▁homem")

[('▁rapaz', 0.8976976275444031),
 ('▁garoto', 0.8968115448951721),
 ('▁menino', 0.8940575122833252),
 ('▁assassino', 0.8306339383125305),
 ('▁pinguim', 0.8259885311126709),
 ('▁monstro', 0.8233861327171326),
 ('▁soldado', 0.8025341629981995),
 ('▁assaltante', 0.7978926301002502),
 ('▁felino', 0.7970114350318909),
 ('▁cão', 0.7948576211929321)]

In [7]:
embedder.wv.most_similar("▁mulher")

[('▁mãe', 0.9230387806892395),
 ('▁filha', 0.9159747958183289),
 ('▁esposa', 0.9079855680465698),
 ('▁irmã', 0.90696781873703),
 ('▁namorada', 0.901147723197937),
 ('▁amiga', 0.8933922052383423),
 ('▁menina', 0.8767526149749756),
 ('▁enteada', 0.8766278028488159),
 ('▁avó', 0.875297486782074),
 ('▁companheira', 0.8721087574958801)]

In [8]:
embedder.wv.most_similar("▁carro")

[('▁veículo', 0.9137380123138428),
 ('▁caminhão', 0.9056364893913269),
 ('▁motorista', 0.8465127348899841),
 ('▁helicóptero', 0.8387037515640259),
 ('▁carrinho', 0.8258286714553833),
 ('▁elevador', 0.8252330422401428),
 ('▁barraco', 0.8244249820709229),
 ('▁pneu', 0.821903645992279),
 ('▁vagão', 0.8143818378448486),
 ('▁barco', 0.814140260219574)]

In [9]:
embedder.wv.most_similar("▁avião")

[('▁helicóptero', 0.9103664755821228),
 ('▁navio', 0.8810691833496094),
 ('▁voo', 0.8689661026000977),
 ('▁barco', 0.8431477546691895),
 ('▁caminhão', 0.81184983253479),
 ('▁carro', 0.8110244274139404),
 ('▁drone', 0.8100888133049011),
 ('▁cargueiro', 0.8027729988098145),
 ('▁comboio', 0.7944729328155518),
 ('▁veículo', 0.7877556681632996)]

In [10]:
embedder.wv.most_similar("▁carnaval")

[('▁réveillon', 0.8524163365364075),
 ('▁sambódromo', 0.8183432817459106),
 ('▁desfile', 0.8170649409294128),
 ('▁baile', 0.7560688257217407),
 ('▁círio', 0.7525671720504761),
 ('▁feriado', 0.7357999682426453),
 ('▁show', 0.7356415390968323),
 ('▁palco', 0.7276540994644165),
 ('▁samba', 0.7203671336174011),
 ('▁verão', 0.7199791073799133)]

In [11]:
embedder.wv.most_similar("▁corinthians")

[('▁palmeiras', 0.9916632771492004),
 ('▁flamengo', 0.9533402919769287),
 ('▁grêmio', 0.946609377861023),
 ('▁audax', 0.9291609525680542),
 ('▁vasco', 0.926048219203949),
 ('▁coritiba', 0.9118771553039551),
 ('▁sport', 0.8998237252235413),
 ('▁figueirense', 0.8963984847068787),
 ('▁barça', 0.8899548053741455),
 ('▁avaí', 0.8875383138656616)]

In [12]:
embedder.wv.similarity("▁carro", "▁avião")

0.8110244

In [13]:
embedder.wv.similarity("▁velho", "▁homem")

0.48851633

In [14]:
embedder.wv.similarity("▁velha", "▁mulher")

0.33917713

In [15]:
embedder.wv.similarity("▁novela", "▁globo")

0.53674597

In [28]:
embedder.wv.most_similar("▁barco")

[('▁navio', 0.8869611024856567),
 ('▁avião', 0.8431476950645447),
 ('▁trem', 0.8282049298286438),
 ('▁jipe', 0.8266261219978333),
 ('▁caminhão', 0.8176475167274475),
 ('▁veleiro', 0.8168447017669678),
 ('▁carro', 0.814140260219574),
 ('▁helicóptero', 0.7995263338088989),
 ('▁píer', 0.7976621389389038),
 ('▁voo', 0.7964310646057129)]

In [29]:
embedder.wv.most_similar("▁notebook")

[('▁laptop', 0.8611619472503662),
 ('▁tablet', 0.8534693717956543),
 ('▁celular', 0.8467183709144592),
 ('▁chip', 0.8394709229469299),
 ('▁computador', 0.8318821787834167),
 ('▁smartphone', 0.8209793567657471),
 ('▁envelope', 0.8171899318695068),
 ('▁carregador', 0.7928250432014465),
 ('▁ipad', 0.7898997068405151),
 ('▁aparelho', 0.7794108390808105)]

In [37]:
embedder.wv.most_similar("▁árvore")

[('▁pedra', 0.8377714157104492),
 ('▁poça', 0.826803982257843),
 ('▁cratera', 0.8137418627738953),
 ('▁lona', 0.8065041303634644),
 ('▁poeira', 0.7948131561279297),
 ('▁cabana', 0.788944661617279),
 ('▁laje', 0.7798771858215332),
 ('▁montanha', 0.7764739394187927),
 ('▁escada', 0.7695632576942444),
 ('▁parede', 0.7683671116828918)]