In [1]:
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize

import faiss
import numpy as np
import os
import pandas as pd

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Configuration: every artifact path is derived from a single directory.
diretorio_arquivos = '../arquivos'
embeddings_file_path = os.path.join(diretorio_arquivos, 'embeddings.h5')
faiss_index_path = os.path.join(diretorio_arquivos, 'amazon_products_index.faiss')
parquet_file_path = os.path.join(diretorio_arquivos, 'trn.parquet')

# Load the DataFrame from the Parquet file.
# NOTE(review): the recorded output of this cell is an ArrowMemoryError
# (a 1 GiB realloc failed), presumably raised by this full-file read. The
# sampling below only happens after the whole file is in memory, so it does
# not reduce the peak memory needed here.
dados = pd.read_parquet(parquet_file_path)

# Sampling settings. Despite its name, `batch_size` is used below as the
# number of rows to sample, not as an encoding batch size.
batch_size = 100
use_sample = True

# Sample the data if requested. The sample size is clamped to the number of
# available rows because DataFrame.sample (without replacement) raises a
# ValueError when asked for more rows than the frame contains.
if use_sample:
    dados = dados.sample(n=min(batch_size, len(dados)), random_state=42)

ArrowMemoryError: realloc of size 1073741824 failed

In [None]:
# Create the DataFrame from the supplied sample data.
# NOTE: everything between the triple quotes below is a bare string literal,
# so it is never executed: neither `data` nor `dados` is (re)defined here and
# the rest of this cell uses whatever `dados` an earlier cell left behind.
"""
data = {
    "uid": ["A1B2C3", "D4E5F6", "G7H8I9", "J1K2L3", "M4N5O6", "P7Q8R9", "S1T2U3", "V4W5X6", "Y7Z8A9", "B1C2D3"],
    "title": [
        "Deluxe Coffee Maker", "Smart Home Thermostat", "Ergonomic Office Chair",
        "Portable Air Purifier", "High-Definition Smart TV", "Multi-Function Kitchen Blender",
        "Wireless Noise-Canceling Headphones", "Leather Executive Briefcase",
        "Outdoor Adventure Tent", "Digital Instant Camera"
    ],
    "content": [
        "The Deluxe Coffee Maker is the ultimate kitchen appliance for coffee enthusiasts...",
        "The Smart Home Thermostat offers advanced climate control with cutting-edge technology...",
        "Experience unparalleled comfort with the Ergonomic Office Chair...",
        "The Portable Air Purifier is perfect for improving air quality in any room...",
        "Enjoy cinematic-quality visuals with the High-Definition Smart TV...",
        "The Multi-Function Kitchen Blender is a versatile tool that handles everything from smoothies to soups...",
        "Immerse yourself in high-fidelity sound with the Wireless Noise-Canceling Headphones...",
        "The Leather Executive Briefcase combines elegance and practicality for the modern professional...",
        "The Outdoor Adventure Tent is designed for explorers who seek comfort and durability in the wild...",
        "Capture memories instantly with the Digital Instant Camera..."
    ]
}

dados = pd.DataFrame(data)
"""

# Load the sentence-embedding model.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Embed the `content` column of every document.
embeddings = model.encode(dados['content'].tolist(), convert_to_tensor=True)
# With convert_to_tensor=True the result stays on the model's device, and
# `.numpy()` fails for CUDA tensors. `.cpu()` is a no-op on CPU and makes the
# conversion work on GPU machines too. `normalize` then scales each row to
# unit length (sklearn's default norm is L2).
normalized_embeddings = normalize(embeddings.cpu().numpy())

# Build the search index: an exact (flat) L2 index over the unit vectors.
dimension = normalized_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(normalized_embeddings)

def retrieve_and_generate(query, top_k=1, distance_threshold=1.2):
    """Return the content of the indexed document closest to ``query``.

    Uses the module-level ``model``, ``index`` and ``dados`` objects.

    Args:
        query: Text to look up.
        top_k: Number of neighbours requested from the index. Only the
            closest one is ever returned; the others are just printed.
        distance_threshold: The closest match is accepted only when its
            distance, as reported by ``index.search``, is strictly below
            this value.

    Returns:
        The ``content`` field of the closest document, or the string
        "Nenhuma resposta encontrada." when no match is close enough.
    """
    # Encode and normalise the query. `.cpu()` is a no-op for CPU tensors and
    # is required before `.numpy()` when the model runs on a GPU.
    query_embedding = model.encode([query], convert_to_tensor=True)
    query_embedding = normalize(query_embedding.cpu().numpy())

    # Search the index.
    distances, indices = index.search(query_embedding, top_k)

    # Diagnostic output, useful when tuning the threshold.
    print(f"Distâncias: {distances}")
    print(f"Índices: {indices}")

    best_distance = distances[0][0]
    best_index = indices[0][0]

    # FAISS fills missing neighbours with index -1. Reject that explicitly so
    # `iloc[-1]` can never silently pick the last row of `dados`.
    if best_index >= 0 and best_distance < distance_threshold:
        # Return the most relevant content.
        return dados.iloc[best_index]['content']
    return "Nenhuma resposta encontrada."

def validate_answer(query, expected_answer, top_k=1, distance_threshold=1.2):
    """Run a retrieval for ``query`` and report whether it matches ``expected_answer``.

    Args:
        query: Text passed to ``retrieve_and_generate``.
        expected_answer: The exact string the retrieval is expected to return.
        top_k: Forwarded to ``retrieve_and_generate``.
        distance_threshold: Forwarded to ``retrieve_and_generate``.

    Returns:
        A message starting with "Resposta correta" on an exact match, or
        "Resposta incorreta" (including both strings) otherwise.
    """
    obtained = retrieve_and_generate(query, top_k, distance_threshold)
    # Handle the mismatch first so the success path is the fall-through.
    if obtained != expected_answer:
        return f"Resposta incorreta. Esperado: {expected_answer}. Obtido: {obtained}"
    return f"Resposta correta: {obtained}"

# Questions paired with the exact content each one is expected to retrieve.
portuguese_queries = [
    ("O que faz o purificador de ar portátil?", "The Portable Air Purifier is perfect for improving air quality in any room..."),
    ("Qual é a descrição do purificador de ar portátil?", "The Portable Air Purifier is perfect for improving air quality in any room..."),
    ("Qual é a função do liquidificador multifuncional?", "The Multi-Function Kitchen Blender is a versatile tool that handles everything from smoothies to soups..."),
]

english_queries = [
    ("What is the function of the portable air purifier?", "The Portable Air Purifier is perfect for improving air quality in any room..."),
    ("Describe the ergonomic office chair.", "Experience unparalleled comfort with the Ergonomic Office Chair..."),
    ("How does the multi-function kitchen blender work?", "The Multi-Function Kitchen Blender is a versatile tool that handles everything from smoothies to soups..."),
    ("What does the smart home thermostat do?", "The Smart Home Thermostat offers advanced climate control with cutting-edge technology..."),
    ("Explain the features of the high-definition smart TV.", "Enjoy cinematic-quality visuals with the High-Definition Smart TV..."),
]

# Queries for which the "no answer" sentinel is the expected result.
test_queries_no_answer = [
    ("What is the best way to use a coffee maker?", "Nenhuma resposta encontrada."),
    ("How do I set up a home theater system?", "Nenhuma resposta encontrada."),
    ("Describe the warranty on the digital camera.", "Nenhuma resposta encontrada."),
    ("What is the price of the leather briefcase?", "Nenhuma resposta encontrada."),
    ("Explain how the outdoor tent is set up.", "Nenhuma resposta encontrada."),
]

# Each suite is (section header, per-query label, cases). A single loop
# prints the header, then one validation report per case followed by a
# blank line.
for header, label, cases in (
    ("Perguntas em português:", "Pergunta", portuguese_queries),
    ("Questions in English:", "Question", english_queries),
    ("Testing with queries that should return 'No answer found':", "Query", test_queries_no_answer),
):
    print(header)
    for query, expected_answer in cases:
        print(f"{label}: {query}")
        print(validate_answer(query, expected_answer))
        print()

In [1]:
from transformers import GPTJForCausalLM, GPT2Tokenizer

# Sample product catalogue, written one (uid, title, content) record per product
# so that the fields belonging to the same product sit side by side.
_product_records = [
    ("A1B2C3", "Deluxe Coffee Maker",
     "The Deluxe Coffee Maker is the ultimate kitchen appliance for coffee enthusiasts..."),
    ("D4E5F6", "Smart Home Thermostat",
     "The Smart Home Thermostat offers advanced climate control with cutting-edge technology..."),
    ("G7H8I9", "Ergonomic Office Chair",
     "Experience unparalleled comfort with the Ergonomic Office Chair..."),
    ("J1K2L3", "Portable Air Purifier",
     "The Portable Air Purifier is perfect for improving air quality in any room..."),
    ("M4N5O6", "High-Definition Smart TV",
     "Enjoy cinematic-quality visuals with the High-Definition Smart TV..."),
    ("P7Q8R9", "Multi-Function Kitchen Blender",
     "The Multi-Function Kitchen Blender is a versatile tool that handles everything from smoothies to soups..."),
    ("S1T2U3", "Wireless Noise-Canceling Headphones",
     "Immerse yourself in high-fidelity sound with the Wireless Noise-Canceling Headphones..."),
    ("V4W5X6", "Leather Executive Briefcase",
     "The Leather Executive Briefcase combines elegance and practicality for the modern professional..."),
    ("Y7Z8A9", "Outdoor Adventure Tent",
     "The Outdoor Adventure Tent is designed for explorers who seek comfort and durability in the wild..."),
    ("B1C2D3", "Digital Instant Camera",
     "Capture memories instantly with the Digital Instant Camera..."),
]

# Pivot the records into the column-oriented mapping: one list per column,
# in the order uid, title, content.
data = {
    column: list(values)
    for column, values in zip(("uid", "title", "content"), zip(*_product_records))
}

dados = pd.DataFrame(data)

# Load the sentence-embedding model.
model_embedding = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Embed the `content` column of every document.
embeddings = model_embedding.encode(dados['content'].tolist(), convert_to_tensor=True)
# With convert_to_tensor=True the result stays on the model's device, and
# `.numpy()` fails for CUDA tensors. `.cpu()` is a no-op on CPU and makes the
# conversion work on GPU machines too. `normalize` then scales each row to
# unit length (sklearn's default norm is L2).
normalized_embeddings = normalize(embeddings.cpu().numpy())

# Build the search index: an exact (flat) L2 index over the unit vectors.
dimension = normalized_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(normalized_embeddings)

# Função para buscar os documentos mais relevantes
def buscar_contexto(pergunta, model, index, dados):
    pergunta_embedding = model.encode([pergunta], convert_to_tensor=True)
    pergunta_embedding = normalize(pergunta_embedding.numpy())
    
    # Buscar os documentos mais relevantes
    D, I = index.search(pergunta_embedding, k=3)  # k é o número de documentos a recuperar
    documentos_relevantes = [dados.iloc[i]['content'] for i in I[0]]
    
    return documentos_relevantes

# Load the GPT-J tokenizer and causal language model from the same checkpoint.
# NOTE: the recorded output of this cell shows a 24.2G `pytorch_model.bin`
# download for this checkpoint.
gptj_checkpoint = "EleutherAI/gpt-j-6B"
tokenizer = GPT2Tokenizer.from_pretrained(gptj_checkpoint)
model_gptj = GPTJForCausalLM.from_pretrained(gptj_checkpoint)

# Generate the answer for a question from its retrieved contexts.
def gerar_resposta(pergunta, contextos):
    """Generate text answering ``pergunta`` using ``contextos`` as context.

    Uses the module-level ``tokenizer`` and ``model_gptj``.

    Args:
        pergunta: The question to answer.
        contextos: Iterable of context strings; they are joined with single
            spaces and placed in the prompt.

    Returns:
        The decoded output sequence. It is decoded from the full generated
        sequence, so it starts with the prompt itself.
    """
    contextos_completos = " ".join(contextos)
    prompt = f"Pergunta: {pergunta}\n\nContexto:\n{contextos_completos}\n\nResposta:"

    # The prompt is truncated to at most 1024 tokens.
    inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True)

    outputs = model_gptj.generate(
        inputs['input_ids'],
        # Pass the mask explicitly so generate() does not have to infer it.
        attention_mask=inputs['attention_mask'],
        # `max_length` counts the prompt tokens as well. With prompts of up
        # to 1024 tokens, a total budget of 150 could leave no room for the
        # answer, so bound the number of *new* tokens instead.
        max_new_tokens=150,
        num_return_sequences=1,
        # Use EOS as the padding token for generation.
        pad_token_id=tokenizer.eos_token_id,
    )
    resposta = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return resposta

# Example question used to exercise the retrieval + generation pipeline.
pergunta = "Qual é o melhor produto para melhorar a qualidade do ar?"

# Step 1: retrieve the documents most relevant to the question.
documentos_relevantes = buscar_contexto(
    pergunta=pergunta,
    model=model_embedding,
    index=index,
    dados=dados,
)

# Step 2: generate the answer from those documents and print it.
resposta = gerar_resposta(pergunta=pergunta, contextos=documentos_relevantes)
print("Resposta:", resposta)

  from tqdm.autonotebook import tqdm, trange


tokenizer_config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/4.04k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/930 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/24.2G [00:00<?, ?B/s]

KeyboardInterrupt: 