In [1]:
import os
import faiss
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from openai import OpenAI
from dotenv import load_dotenv

import os

import nltk
nltk.download('punkt_tab')

from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import NLTKTextSplitter
from tqdm import tqdm
import re
from typing import List

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\alves\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


In [14]:
load_dotenv("openai")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=OPENAI_API_KEY)

## Loading Document

In [3]:
with open(r'..\data\processed\plano-acao-adaptacao-climatica-nacional.txt', 'r',encoding='utf-8') as file:
    file_content = file.read()

# Now, file_content contains the entire content of the text file
print(file_content)



--- Page 1 ---

Plano Nacional de Adaptação
à Mudança do Clima
Grupo Executivo do Comitê Interministerial de Mudança do Clima – GEx-CIM
Ministério do Meio Ambiente
Brasília, 2015

--- Page 2 ---

Sumário
Apresentação ................................................................................................................. 3
1. Histórico e contexto legal ......................................................................................... 6
2. Mudança do clima observada e futura ....................................................................... 8
3. Objetivo geral, visão e princípios ............................................................................. 19
4. Objetivos específicos, iniciativas transversais e recomendações gerais ....................... 22
5. Gestão do Plano .................................................................................................... 27
6. Estratégias setoriais e temáticas ..................................................

## Chunking

In [4]:
text_splitter = NLTKTextSplitter(chunk_size = 300,chunk_overlap=50)
chunks = text_splitter.split_text(file_content)

Created a chunk of size 337, which is longer than the specified 300
Created a chunk of size 781, which is longer than the specified 300
Created a chunk of size 308, which is longer than the specified 300
Created a chunk of size 649, which is longer than the specified 300
Created a chunk of size 330, which is longer than the specified 300
Created a chunk of size 347, which is longer than the specified 300
Created a chunk of size 384, which is longer than the specified 300
Created a chunk of size 311, which is longer than the specified 300
Created a chunk of size 318, which is longer than the specified 300
Created a chunk of size 324, which is longer than the specified 300
Created a chunk of size 567, which is longer than the specified 300
Created a chunk of size 481, which is longer than the specified 300
Created a chunk of size 602, which is longer than the specified 300
Created a chunk of size 603, which is longer than the specified 300
Created a chunk of size 329, which is longer tha

In [5]:
for idx, chunk in enumerate(chunks):
    print(f"Chunk {idx} - {chunk}")

Chunk 0 - --- Page 1 ---

Plano Nacional de Adaptação
à Mudança do Clima
Grupo Executivo do Comitê Interministerial de Mudança do Clima – GEx-CIM
Ministério do Meio Ambiente
Brasília, 2015

--- Page 2 ---

Sumário
Apresentação ................................................................................................................. 3
1.
Chunk 1 - Histórico e contexto legal ......................................................................................... 6
2.

Mudança do clima observada e futura ....................................................................... 8
3.
Chunk 2 - Objetivo geral, visão e princípios ............................................................................. 19
4.

Objetivos específicos, iniciativas transversais e recomendações gerais ....................... 22
5.
Chunk 3 - Gestão do Plano .................................................................................................... 27
6.

Estratégias setoriais e temáticas .........

In [19]:
print(f"The document has {len(chunks)} chunks")

The document has 2362 chunks


In [7]:
chunks[2300]

'Disseminação de informações sobre a rede de transportes nas cidades;\nApoio à inovação em projetos que reduzam as emissões de carbono e aumentem a\ncapacidade adaptativa às mudanças climáticas\nDIRETRIZES (ENERGIA)\nPromover um maior envolvimento das instituições do setor energético ao tema de\nadaptação visando, quando aplicável, a adequação das políticas institucionais a novos\nparâmetros climáticos.'

## Embedding

In [12]:
def generate_embeddings(input: List[str],model='text-embedding-ada-002')-> List[float]:
    embedding = client.embeddings.create(
        model=model,
        input=input
    )
    total_tokens = embedding.usage.total_tokens
    embeddings = [data.embedding for data in embedding.data]
    return embedding.data[0].embedding

In [18]:
df = pd.DataFrame(chunks[0:100], columns=["content"])
df["embedding"] = df["content"].apply(lambda x: generate_embeddings([x]))

In [20]:
df

Unnamed: 0,content,embedding
0,--- Page 1 ---\n\nPlano Nacional de Adaptação\...,"[-0.001498627127148211, 0.014689480885863304, ..."
1,Histórico e contexto legal ......................,"[-0.01033360417932272, -0.009514909237623215, ..."
2,"Objetivo geral, visão e princípios ..............","[-0.013710755854845047, -0.01705038733780384, ..."
3,Gestão do Plano .................................,"[-0.007077664136886597, -0.00412316620349884, ..."
4,Estratégia de Agricultura .......................,"[-0.009672578424215317, -0.006657622288912535,..."
...,...,...
95,Estas simulações procuram\n268 subsidiar estud...,"[-0.013160723261535168, -0.021879861131310463,..."
96,270 Os resultados acima apresentados não esgot...,"[-0.00945217814296484, -0.0004963490064255893,..."
97,"É de extrema\n272 relevância que a sociedade, ...","[-0.00793637428432703, -0.009870664216578007, ..."
98,Parte deste processo está associado à\n275 ide...,"[-0.008711261674761772, 0.007426414638757706, ..."


## Create Vector DB

In [21]:
import psycopg2
from psycopg2.extras import execute_values

In [22]:
connection_string = "postgresql://xchallenge_user:xchallenge_password@localhost:5432/xchallenge_db"

In [24]:
conn = psycopg2.connect(connection_string)
conn.autocommit = True
cur = conn.cursor()

OperationalError: connection to server at "localhost" (::1), port 5432 failed: FATAL:  password authentication failed for user "xchallenge_user"
