In [9]:
import pandas as pd
import json
from openai import AzureOpenAI
import os
import numpy as np
import pickle
from tqdm import tqdm

In [2]:
# Read the Azure OpenAI settings from the JSON config file in the working directory.
with open('config.json', 'r') as config_file:
    config = json.loads(config_file.read())

# Endpoint and API key passed to the client below.
aoai_endpoint = config['AzureOpenAIEndpoint']
aoai_apikey = config['AzureOpenAIAPIKey']

# Deployment names passed as `model=` in the chat and embeddings calls of later cells.
aoai_chat_deployment_name = config['AzureOpenAIChatDeploymentName']
aoai_embeddings_deployment_name = config['AzureOpenAIEmbeddingsDeploymentName']

# One shared client instance, reused by every request in this notebook.
aoai_client = AzureOpenAI(
    azure_endpoint=aoai_endpoint,
    api_key=aoai_apikey,
    api_version="2024-02-01",
)

In [3]:
%%time
df_qna = pd.read_csv("local_data/prepared.csv")
print(f"QnA shape: {df_qna.shape}")
df_qna

QnA shape: (661, 6)
CPU times: total: 125 ms
Wall time: 624 ms


Unnamed: 0,id,pergunta,resposta,tags,title,uid
0,17,Quais os benefícios dos cartões Click?,<p><strong>Assunto: Benefícios do cartão de cr...,['final'],[PF] [Benefícios] [IA gen] Benefícios dos cart...,blt4f5a77237fdcf7ac
1,61,Meu cartão tem direito à sala vip?,"<p>Dependendo do tipo de cartão, você tem aces...",[],[PF] [Benefícios] [IA gen] Meu cartão tem dire...,blt1d112fef0d835dbf
2,68,Quais são os benefícios do cartão The One?,<p><strong>O que é e como pedir o cartão The O...,[],[PF] [Benefícios] [IA gen] Benefícios do cartã...,bltd9528ee11abd7873
3,69,Instituto Ayrton Senna Platinum (mastercard),"<p>Renda mínimaR$ 1.000,00&nbsp;</p><p>Anuidad...",[],[PF] [Benefícios] [IA gen] Instituto Ayrton Se...,bltd2bb3abc1adb7852
4,70,Instituto Ayrton Senna Platinum (visa),"<p>Renda mínima R$ 1.000,00&nbsp;</p><p>Anuida...",[],[PF] [Benefícios] [IA gen] Instituto Ayrton Se...,blt16b7d04514a969b3
...,...,...,...,...,...,...
656,34531,Quais os benefícios dos cartões Vivo Platinum?,<p>Benefícios dos cartões Vivo Platinum</p><p>...,[],[PF][Benefícios] [IA gen] Benefícios dos cartõ...,blt140f0b59f9a3cb76
657,34532,Samsung Platinum,"<p>Renda mínima R$ 1.500,00</p><p></p><p>Anuid...",[],[PF] [Benefícios] [IA gen] Samsung Platinum,blt5336bb9098c193cc
658,34533,Decathlon Platinum,"<p>Renda mínima R$ 800,00</p><p>Anuidade gráti...",[],[PF] [Benefícios] [IA gen] Decathlon Platinum,blt73a6afc9343cbae0
659,34534,Quais os benefícios dos cartões TIM?,<p>Benefícios dos cartões TIM</p><p>Informaçõe...,[],[PF] [Benefícios] [IA gen] Benefícios dos cart...,bltd11ca24f97300331


# Create Embeddings for Answers

**Note:** If an answer exceeds the maximum number of tokens accepted by the embeddings model, a chat model is first used to summarize it, and the summary is embedded instead.

In [4]:
def _embed_text(text):
    """Return the embedding vector for `text` from the embeddings deployment.

    Sends a single-item request through `aoai_client` using
    `aoai_embeddings_deployment_name` and returns the `embedding` field of the
    first (only) item in the response.
    """
    return aoai_client.embeddings.create(
        input=[text],
        model=aoai_embeddings_deployment_name
    ).data[0].embedding


# Embed every answer. Answers rejected for exceeding the model's context length
# are summarized by the chat model first, and the summary is embedded instead.
answer_embeddings = []
for answer in tqdm(df_qna['resposta'].values):
    try:
        embedding = _embed_text(answer)
    except Exception as e:
        # Use str(e) instead of e.message: not every exception type defines
        # a `.message` attribute, and reading it would mask the real error
        # behind an AttributeError.
        if "context length" not in str(e):
            # Any other failure must stop the run. Swallowing it would append
            # the previous row's embedding (still bound to `embedding`) for
            # this answer and silently misalign the array with df_qna.
            raise
        prompt = "## Crie um longo resumo detalhado de 8000 tokens em português para a seguinte página HTML:\n\n" + answer
        summary = aoai_client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model=aoai_chat_deployment_name,
            temperature=0
        ).choices[0].message.content
        embedding = _embed_text(summary)
    answer_embeddings.append(embedding)

# Stack the per-answer vectors into one 2-D array (one row per answer).
answer_embeddings_ar = np.array(answer_embeddings)
print(answer_embeddings_ar.shape)
answer_embeddings_ar

(661, 1536)


array([[ 0.00146169,  0.01871423, -0.03199973, ...,  0.00021403,
         0.00383962,  0.00499351],
       [-0.01133289,  0.01012021,  0.02701622, ..., -0.01521566,
        -0.00603079,  0.01791293],
       [-0.02621103, -0.002022  ,  0.0171054 , ...,  0.00745273,
        -0.01175849,  0.01981121],
       ...,
       [ 0.01569564,  0.03622417, -0.04164425, ...,  0.00428242,
         0.00133314,  0.04451237],
       [-0.01553899,  0.00998357, -0.00298004, ...,  0.00093578,
         0.00306097,  0.02353971],
       [-0.00034272,  0.01770813, -0.0522969 , ..., -0.00382021,
        -0.00464494,  0.05260583]])

In [7]:
# Persist the answer-embedding array to disk so it can be reloaded without re-calling the API.
with open('local_data/answer_embeddings.pkl', 'wb') as embeddings_out:
    pickle.dump(answer_embeddings_ar, embeddings_out)

# Create Embeddings of Questions

In [10]:
# Embed every question, one request per row, collecting the vectors in row order.
question_embeddings = []
for question_text in tqdm(df_qna['pergunta'].values):
    response = aoai_client.embeddings.create(
        input=[question_text],
        model=aoai_embeddings_deployment_name,
    )
    # Each request carries a single input, so the vector is the first data item.
    question_embeddings.append(response.data[0].embedding)

# Stack the per-question vectors into one 2-D array (one row per question).
question_embeddings_ar = np.array(question_embeddings)
print(question_embeddings_ar.shape)
question_embeddings_ar

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 661/661 [01:20<00:00,  8.24it/s]


(661, 1536)


array([[ 0.02774474,  0.00365294, -0.02350792, ...,  0.00263921,
        -0.00758168,  0.00979986],
       [ 0.00078652,  0.00399377, -0.00828338, ..., -0.00700901,
        -0.02218762,  0.00384301],
       [ 0.0098039 , -0.00761893,  0.02033613, ...,  0.02031337,
        -0.00892764,  0.00621919],
       ...,
       [ 0.07793394, -0.00479394, -0.03930265, ...,  0.03538637,
         0.0031575 ,  0.04579248],
       [-0.00318145, -0.00140842, -0.00160963, ...,  0.01419498,
         0.00307342,  0.03366175],
       [ 0.00579197, -0.00796867, -0.04693921, ..., -0.00610447,
        -0.01190722,  0.01705803]])

In [11]:
# Persist the question-embedding array to disk so it can be reloaded without re-calling the API.
with open('local_data/question_embeddings.pkl', 'wb') as embeddings_out:
    pickle.dump(question_embeddings_ar, embeddings_out)