In [31]:
import json
import numpy as np
import pandas as pd
import os
import sys
import time
sys.path.append(os.path.abspath("../src"))
import config
from typing import Union
import sqlite3

from sentence_transformers import SentenceTransformer

import pickle
from sklearn.metrics.pairwise import cosine_similarity

In [28]:
import logging
from utils import log
import functools

In [3]:
# Experiment settings: the language and the unit granularity are passed to the
# embedding / index loaders and to the searcher further down.
language, units_type = "en", "sentences"

# First entry of the project's model list, loaded onto the GPU.
model_name = config.MODELS[0]
model = SentenceTransformer(model_name, device="cuda")

In [4]:
# Load the metadata table from DATA_DIR. keep_default_na=False stops pandas from
# parsing empty cells (or strings such as "NA") as NaN, so they stay plain strings.
metadata = pd.read_csv(os.path.join(config.DATA_DIR, "metadata.csv"), keep_default_na=False)

In [29]:
from searcher import LocalSearcher

In [6]:
from utils import load_embeddings, load_imap

In [17]:
# Searcher for the Portuguese side of the collection. The encoder and the
# pre-computed vectors both come from config.MODELS[1]; load_embeddings is given
# only the last path segment of the model name (the part after the final "/").
# NOTE(review): constructing the SentenceTransformer here contacts huggingface.co;
# the output below shows this cell failing with a ConnectionError while offline.
# TODO confirm whether a locally cached copy of the model should be used instead.
pt_searcher = LocalSearcher(
    collection_name='abstracts',
    encoder_model=SentenceTransformer(config.MODELS[1], device='cuda'),
    vectors=load_embeddings(config.EMBEDDINGS_DIR, config.MODELS[1].split('/')[-1], units_type, "pt"),
    language="pt",
    data=metadata,
    indices=load_imap(config.INDICES_DIR, units_type, "pt"),
    ranking_model=None  # no ranking model is supplied
)

[ 12/08 15:46:28 ] - sentence_transformers.SentenceTransformer - Load pretrained SentenceTransformer: neuralmind/bert-base-portuguese-cased
[ 12/08 15:46:28 ] - urllib3.connectionpool - Starting new HTTPS connection (4): huggingface.co:443


ConnectionError: (MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /api/models/neuralmind/bert-base-portuguese-cased (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f58ec381420>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution'))"), '(Request ID: 8064df0a-0842-4ef6-9153-2c1cef1d9486)')

In [30]:
# Searcher for the collection selected in the settings cell (language "en").
# It reuses the `model` already loaded there instead of instantiating a second
# copy of the same encoder (config.MODELS[0]) on the GPU, and it takes the
# embeddings and the index map for the same `model_name` / `language`, so the
# encoder, vectors, indices and language argument cannot drift apart.
en_searcher = LocalSearcher(
    collection_name='abstracts',
    encoder_model=model,
    ranking_model=None,  # no ranking model is supplied
    vectors=load_embeddings(config.EMBEDDINGS_DIR, model_name.split('/')[-1], units_type, language),
    language=language,
    data=metadata,
    indices=load_imap(config.INDICES_DIR, units_type, language),
)

[ 12/08 15:58:13 ] - sentence_transformers.SentenceTransformer - Load pretrained SentenceTransformer: distilbert-base-nli-stsb-mean-tokens


In [1]:
# Evaluation queries, keyed by language code. The two lists are parallel:
# entry i of "en" is the English counterpart of entry i of "pt".
queries = {
    "pt": [
        "ataque ddos",
        "ataque ddos com machine learning",
        "entendimento popular sobre dengue",
        "computação quântica",
        "políticas públicas sobre inteligência artificial",
        "sindicatos e atribuições sociais",
        "busca semântica",
    ],
    "en": [
        "ddos attack",
        "ddos attack with machine learning",
        "popular understanding of dengue",
        "quantum computing",
        # Spelling fixed ("inteligence" -> "intelligence"): the misspelled word
        # would be encoded as part of the query.
        "public policies about artificial intelligence",
        "unions and their social attributions",
        "semantic search",
    ],
}

In [33]:
import datetime

In [34]:
# Calendar date (local time) of this run.
dt = datetime.date.today()

In [35]:
def save_experiment_result(filepath, filename, result):
    """Write an experiment result to ``filepath/filename`` as UTF-8 text.

    Parameters
    ----------
    filepath : str
        Directory that receives the file. It is created (including parents)
        when it does not exist yet; an empty string means the current directory.
    filename : str
        Name of the file inside ``filepath``. An existing file is overwritten.
    result : str, list, tuple or any object
        Content to write. A string is written unchanged. A list or tuple is
        written as ``str(item)`` for each item, separated by blank lines.
        Anything else is written as ``str(result)``.

    Returns
    -------
    None
    """
    if isinstance(result, str):
        text = result
    elif isinstance(result, (list, tuple)):
        text = "\n\n".join(str(item) for item in result)
    else:
        text = str(result)

    # os.makedirs("") raises, so only create the directory when one is named.
    if filepath:
        os.makedirs(filepath, exist_ok=True)

    # Explicit encoding: the queries and metadata contain accented characters,
    # and the platform default encoding is not guaranteed to be UTF-8.
    with open(os.path.join(filepath, filename), "w", encoding="utf-8") as f:
        f.write(text)

In [12]:
# No earlier cell defines `results`, so create the container here; without this
# the cell raises NameError on a fresh kernel.
if "results" not in globals():
    results = {}

# Start from an empty list so that re-running the cell does not append the same
# hits a second time.
results["pt"] = []
for query in queries["pt"]:
    results["pt"].append(pt_searcher.retrieve(query))

[ 11/08 21:31:01 ] - retrieve - Running retrieve
[ 11/08 21:31:01 ] - process_query - Running process_query


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[ 11/08 21:31:40 ] - process_query - process_query took 38.89206075668335 seconds
[ 11/08 21:37:29 ] - retrieve - retrieve took 388.3621516227722 seconds
[ 11/08 21:37:29 ] - retrieve - Running retrieve
[ 11/08 21:37:29 ] - process_query - Running process_query


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[ 11/08 21:37:29 ] - process_query - process_query took 0.27484798431396484 seconds
[ 11/08 21:46:13 ] - retrieve - retrieve took 524.0488886833191 seconds
[ 11/08 21:46:13 ] - retrieve - Running retrieve
[ 11/08 21:46:13 ] - process_query - Running process_query


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[ 11/08 21:46:13 ] - process_query - process_query took 0.17519450187683105 seconds
[ 11/08 21:54:30 ] - retrieve - retrieve took 496.893746137619 seconds
[ 11/08 21:54:30 ] - retrieve - Running retrieve
[ 11/08 21:54:30 ] - process_query - Running process_query


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[ 11/08 21:54:30 ] - process_query - process_query took 0.17254114151000977 seconds
[ 11/08 22:03:23 ] - retrieve - retrieve took 532.6180694103241 seconds
[ 11/08 22:03:23 ] - retrieve - Running retrieve
[ 11/08 22:03:23 ] - process_query - Running process_query


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[ 11/08 22:03:23 ] - process_query - process_query took 0.18836140632629395 seconds
[ 11/08 22:11:36 ] - retrieve - retrieve took 493.1788263320923 seconds
[ 11/08 22:11:36 ] - retrieve - Running retrieve
[ 11/08 22:11:36 ] - process_query - Running process_query


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[ 11/08 22:11:36 ] - process_query - process_query took 0.16845178604125977 seconds
[ 11/08 22:20:02 ] - retrieve - retrieve took 506.33262825012207 seconds


In [36]:
# Use the first English query ("ddos attack") for a single-query check.
query = queries["en"][0]

In [38]:
# Run one query through the English searcher. This is slow: the log below
# reports roughly 496 seconds for this single retrieve call.
result = en_searcher.retrieve(query)

[ 12/08 15:59:45 ] - retrieve - Running retrieve
[ 12/08 15:59:45 ] - process_query - Running process_query


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[ 12/08 15:59:46 ] - process_query - process_query took 0.5210015773773193 seconds
[ 12/08 16:08:01 ] - retrieve - retrieve took 495.8082513809204 seconds


In [39]:
# Display the retrieved hits. The output below shows a list of pandas Series
# carrying the metadata columns — presumably one Series per retrieved document.
result

[url               https://www.teses.usp.br/teses/disponiveis/3/3...
 doi                           10.11606/D.3.2019.tde-25032019-114624
 type                                        Dissertação de Mestrado
 author                          Almeida, Thiago Rodrigues Meira de 
 institute                                        Escola Politécnica
 knowledge_area                             Engenharia de Computação
 committee         Simplicio Junior, Marcos Antonio (Presidente) ...
 title_pt          Uma arquitetura colaborativa contra ataques di...
 title_en          A collaborative architecture against DDOS atta...
 keywords_pt                  Computação em nuvem Segurança de redes
 keywords_en                   Cloud computing DDOS SDN Security SFC
 abstract_pt                                              Sem resumo
 abstract_en       Distributed attacks, such as Distributed Denia...
 publish_date                                             2019-03-27
 Name: 78081, dtype: object,
 url 

In [13]:
# Run every English query and persist all hits in one dated text file.
# The file is written once, after the loop, because save_experiment_result
# opens its target in "w" mode and would otherwise keep only the last query.
# NOTE(review): DATA_DIR is used as the output directory because it is the only
# config path known to exist here — TODO confirm the intended location.
en_sections = []
for query in queries["en"]:
    result = en_searcher.retrieve(query)
    # `result` displays as a list of pandas Series, so render each hit with str().
    hits_text = "\n\n".join(str(hit) for hit in result)
    en_sections.append(f"query: {query}\n\n{hits_text}")

text = "\n\n\n".join(en_sections)
save_experiment_result(config.DATA_DIR, f"en_queries_{dt}.txt", text)

[ 11/08 22:20:02 ] - retrieve - Running retrieve
[ 11/08 22:20:02 ] - process_query - Running process_query


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[ 11/08 22:20:03 ] - process_query - process_query took 0.4843001365661621 seconds
[ 11/08 22:26:19 ] - retrieve - retrieve took 376.13826274871826 seconds
[ 11/08 22:26:19 ] - retrieve - Running retrieve
[ 11/08 22:26:19 ] - process_query - Running process_query


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[ 11/08 22:26:19 ] - process_query - process_query took 0.16835951805114746 seconds
[ 11/08 22:32:33 ] - retrieve - retrieve took 374.496310710907 seconds
[ 11/08 22:32:33 ] - retrieve - Running retrieve
[ 11/08 22:32:33 ] - process_query - Running process_query


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[ 11/08 22:32:33 ] - process_query - process_query took 0.1658613681793213 seconds
[ 11/08 22:37:12 ] - retrieve - retrieve took 278.5585906505585 seconds
[ 11/08 22:37:12 ] - retrieve - Running retrieve
[ 11/08 22:37:12 ] - process_query - Running process_query


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[ 11/08 22:37:12 ] - process_query - process_query took 0.15593338012695312 seconds
[ 11/08 22:43:33 ] - retrieve - retrieve took 381.69354224205017 seconds
[ 11/08 22:43:33 ] - retrieve - Running retrieve
[ 11/08 22:43:33 ] - process_query - Running process_query


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[ 11/08 22:43:34 ] - process_query - process_query took 0.17257142066955566 seconds
[ 11/08 22:48:04 ] - retrieve - retrieve took 270.9978723526001 seconds
[ 11/08 22:48:04 ] - retrieve - Running retrieve
[ 11/08 22:48:04 ] - process_query - Running process_query


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[ 11/08 22:48:05 ] - process_query - process_query took 0.3231821060180664 seconds
[ 11/08 22:53:15 ] - retrieve - retrieve took 310.363224029541 seconds
