In [60]:
import json
import numpy as np
import pandas as pd
import os
import sys
import time
sys.path.append(os.path.abspath("../src"))
import config
from typing import Union
import sqlite3

from sentence_transformers import SentenceTransformer

import pickle
from sklearn.metrics.pairwise import cosine_similarity

In [61]:
import logging
from utils import log
import functools

In [None]:
model_name = config.MODELS[0]
language = "en"


In [3]:
model = SentenceTransformer('neuralmind/bert-base-portuguese-cased', device='cuda')

No sentence-transformers model found with name /home/decarv/.cache/torch/sentence_transformers/neuralmind_bert-base-portuguese-cased. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at /home/decarv/.cache/torch/sentence_transformers/neuralmind_bert-base-portuguese-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializin

In [28]:
metadata = pd.read_csv(os.path.join(config.DATA_DIR, "metadata.csv"), keep_default_na=False)

### Encodings Input

In [33]:
data, indices = structure_data_for_embedding(metadata)

[ 11/08 11:06:08 ] - structure_data_for_embedding - Running structure_data_for_embedding
[ 11/08 11:06:12 ] - structure_data_for_embedding - structure_data_for_embedding took 4.378753423690796 seconds


In [35]:
# with open(os.path.join(config.DATA_DIR, "training_data/data.pkl"), "wb") as f:
#     pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)

with open(os.path.join(config.DATA_DIR, "indices/indices.pkl"), "wb") as f:
    pickle.dump(indices, f, pickle.HIGHEST_PROTOCOL)

### Load Data

In [None]:
with open(os.path.join(config.DATA_DIR, "training_data/data.pkl"), "rb") as f:
    data = pickle.load(f)
    
with open(os.path.join(config.DATA_DIR, "indices/indices.pkl"), "rb") as f:
    indices = pickle.load(f)

### Encoding

In [None]:
print(cosine_similarity(model.encode("amor").reshape(1, -1), model.encode("ódio").reshape(1, -1)))
print(cosine_similarity(model.encode("amor").reshape(1, -1), model.encode("paixão").reshape(1, -1)))
print(cosine_similarity(model.encode("amor").reshape(1, -1), model.encode("teclado").reshape(1, -1)))

In [None]:
def batch_generator(data, batch_size=64):
    for i in range(0, len(data), batch_size):
        yield data[i:i+batch_size]

@log
def encode_embedding_units(embedding_units):
    embeddings = []
    for batch in batch_generator(embedding_units):
        embeddings.append(model.encode(batch, batch_size=len(batch), show_progress_bar=False))
    embeddings = np.concatenate(embeddings)
    return embeddings

@log
def encode_data(data, save=True):
    embeddings_by_type = {}
    for _type, units in data.items():
        embeddings = encode_embedding_units(units)
        embeddings_by_type[_type] = embeddings
        if save:
            np.save(os.path.join(config.DATA_DIR, f"npy/{_type}_embeddings.npy"), embeddings, allow_pickle=False)

In [None]:
@log
def encode(query):
    return model.encode(query)

@log
def search(vector, vectors):
    return cosine_similarity(vector.reshape(1, -1), vectors)[0]

@log
def query(query_string, vectors, data, indices):
    vector = encode(query_string)
    scores = search(vector, vectors)
    sorted_scores_ids = np.argsort(scores)[::-1]
    top_scores_ids = sorted_scores_ids[:15]
    for i in top_scores_ids:
        row = data.iloc[indices[i]]
        print("Posição:", i+1)
        print("Index:", indices[i])
        print("Título: ", row['title_pt'])
        print("Autor: ", row['author'])
        print("Resumo: ", row['abstract_pt'])
        print("Palavras-Chave: ", row['keywords_pt'])
        print("\n----------------------------------------------------------\n")

## Vetorizando Todo o Input

In [5]:
vectors = np.load(os.path.join(config.DATA_DIR, "npy/sentences_embeddings.npy"))

In [6]:
from searcher import LocalSearcher

In [40]:
with open(os.path.join(config.DATA_DIR, "indices/indices.pkl"), "rb") as f:
    indices = pickle.load(f)

In [42]:
indices = indices['sentences']

In [41]:
indices.keys()

dict_keys(['text', 'sentences', 'sentences_and_text'])

In [7]:
searcher = LocalSearcher(
    collection_name='abstracts',
    encoder_model=model,
    ranking_model=None,
    vectors=vectors
)

In [8]:
queries = [
    "ataque ddos",
    "ataque ddos com machine learning"
]

In [57]:
hits = searcher.retrieve(queries[1])

[ 11/08 11:13:10 ] - retrieve - Running retrieve
[ 11/08 11:13:10 ] - process_query - Running process_query


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[ 11/08 11:13:11 ] - process_query - process_query took 0.16666865348815918 seconds
[ 11/08 11:15:00 ] - retrieve - retrieve took 109.41779446601868 seconds


In [58]:
top_scores_ids = hits[:15]
for i in top_scores_ids:
    row = metadata.iloc[indices[i]]
    print("Posição:", i+1)
    print("Index:", indices[i])
    print("Título: ", row['title_pt'])
    print("Autor: ", row['author'])
    print("Resumo: ", row['abstract_pt'])
    print("Palavras-Chave: ", row['keywords_pt'])
    print("\n----------------------------------------------------------\n")

Posição: 1682284
Index: 92434
Título:  Controle supervisório de microrredes utilizando deep reinforcement learning
Autor:  Barbalho, Pedro Inácio de Nascimento e 
Resumo:  As microrredes surgiram devido à necessidade de se coordenar um novo modelo de geração em ascensão, a geração distribuída, com sistemas de armazenamento de energia e cargas locais. Esse tipo de rede é um sistema complexo e há diversas proposições de como controlá-lo de forma a permitir a sua operação conectada ao sistema de distribuição ou ilhada. Além disso, para um melhor desempenho do controle da microrrede, é preciso que este se adapte aos seus diferentes pontos de operação. O controle de microrredes pode ser dividido em três níveis hierárquicos que diferem em tempo de resposta, objetivos de controle e necessidade de comunicação. Por tratar-se de um tema recente, ainda há espaço para novas análises e contribuições. Neste sentido, o objetivo deste estudo foi modelar o controlador supervisório de uma microrrede ilh

## Vetorizando Sentenças

In [None]:
# encoding in batches can reduce overhead costs and speed up the process
vectors = []
batch_size = 64
batch = []
for row in tqdm(data.itertuples()):
    descriptions = []
    pre = (row.title_en + ". " + row.abstract_en + " " + row.keywords_en).split(". ")
    encoding = model.encode(pre)
    vectors.append(encoding)

vectors_concat = np.concatenate(vectors)

In [None]:
np.save(os.path.join(config.DATA_DIR,"vectors_sentences.npy"), vectors_concat, allow_pickle=False)

In [None]:
vectors_concat = np.load(os.path.join(config.DATA_DIR, "vectors_sentences.npy"))

In [None]:
query("ddos attack")

In [None]:
query("ddos attack with machine learning")

In [None]:
query("ddos attack and networking")

In [None]:
query("linux systems and networking")

In [None]:
query("Message Queuing Telemetry Transport")

In [None]:
query("governance artificial inteligence")

In [None]:
query("how to train your dragon")

In [None]:
query("population perceptions of dengue")