In [1]:
import os
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv

load_dotenv()

os.environ["OPENAI_API_KEY"] = os.getenv('OPENAI_API_KEY')
chat = ChatOpenAI(
    openai_api_key=os.environ["OPENAI_API_KEY"],
    model='gpt-3.5-turbo'
)

In [4]:
from pinecone import Pinecone

import re
import pdfplumber
import openai
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


os.environ["PINECONE_API_KEY"] = os.getenv('PINECONE_API_KEY')

pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index = pc.Index("ragtest")

In [5]:
openai.api_key = os.getenv('OPENAI_API_KEY')
MODEL = "text-embedding-ada-002"

In [7]:
import re
import pypdf

# Define a function to preprocess text
def preprocess_text(text):
    # Replace consecutive spaces, newlines, and tabs
    text = re.sub(r'\s+', ' ', text)
    return text

def process_pdf(file_path):
    # create a loader
    loader = PyPDFLoader(file_path)
    # load your data
    data = loader.load()
    # Split your data up into smaller documents with Chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    documents = text_splitter.split_documents(data)
    # Convert Document objects into strings with progress tracking
    texts = [str(doc) for doc in documents]
    return texts


In [8]:
def create_embeddings(texts):
    embeddings_list = []
    for text in texts:
        response = openai.embeddings.create(input=text, model=MODEL)
        embedding = response.data[0].embedding  # Access the embedding correctly
        embeddings_list.append(embedding)
    return embeddings_list

# Define a function to upsert embeddings to Pinecone with metadata
def upsert_embeddings_to_pinecone(index, embeddings, ids, texts, batch_size=100):
    for i in range(0, len(embeddings), batch_size):
        batch_embeddings = embeddings[i:i + batch_size]
        batch_ids = ids[i:i + batch_size]
        batch_texts = texts[i:i + batch_size]
        metadata = []
        for id, text in zip(batch_ids, batch_texts):
            # Extract filename from id (removing _chunk_X)
            filename = id.split('_chunk_')[0]
            # Get the filename without extension and split on space
            name_parts = filename.split('.')[0].split(' ')
            plan_type = name_parts[0] if name_parts else ''
            plan_difficulty = name_parts[1] if len(name_parts) > 1 else ''
            metadata.append({
                'text': preprocess_text(text),
                'plan_type': plan_type,
                'plan_difficulty': plan_difficulty
            })
        index.upsert(vectors=[(id, embedding, meta) for id, embedding, meta in zip(batch_ids, batch_embeddings, metadata)])

In [10]:
# Define a function to create embedding
# Process a PDF and create embeddings
file_paths = ["data_documents/Agenzia_entrate_1.pdf", "data_documents/Agenzia_entrate_2.pdf", "data_documents/Agenzia_entrate_3.pdf", "data_documents/Agenzia_entrate_4.pdf", "data_documents/Agenzia_entrate_5.pdf", "data_documents/fiscozen_web.pdf"]  # Replace with your actual file path
for file_path in file_paths:
  texts = process_pdf(file_path)
  print("processed text for file", file_path)
  embeddings = create_embeddings(texts)
  # Assuming that `file_path` is used as the ID for the entire document; if you want to use unique IDs for each text chunk, modify this accordingly
  ids = [f"{file_path}_chunk_{i}" for i in range(len(embeddings))]
  # Upsert the embeddings to Pinecone with text as metadata
  upsert_embeddings_to_pinecone(index, embeddings, ids, texts)
  print("upserted embeddings for file", file_path)

processed text for file data_documents/Agenzia_entrate_1.pdf
upserted embeddings for file data_documents/Agenzia_entrate_1.pdf
processed text for file data_documents/Agenzia_entrate_2.pdf
upserted embeddings for file data_documents/Agenzia_entrate_2.pdf
processed text for file data_documents/Agenzia_entrate_3.pdf
upserted embeddings for file data_documents/Agenzia_entrate_3.pdf
processed text for file data_documents/Agenzia_entrate_4.pdf
upserted embeddings for file data_documents/Agenzia_entrate_4.pdf
processed text for file data_documents/Agenzia_entrate_5.pdf
upserted embeddings for file data_documents/Agenzia_entrate_5.pdf
processed text for file data_documents/fiscozen_web.pdf
upserted embeddings for file data_documents/fiscozen_web.pdf


In [11]:
from langchain.vectorstores import Pinecone
from langchain.embeddings import OpenAIEmbeddings

text_field = "text"  # the metadata field that contains our text
embed_model = OpenAIEmbeddings(model=MODEL)
# initialize the vector store object
vectorstore = Pinecone(
    index, embed_model, text_field,
)

  _alternative: str = alternative,
  _alternative: str = alternative,


In [14]:
from langchain_openai import OpenAIEmbeddings
import openai
from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore
# Set OpenAI API key
openai.api_key = os.environ["OPENAI_API_KEY"]

# Initialize OpenAI Embeddings model
model = OpenAIEmbeddings(model="text-embedding-ada-002",openai_api_key=openai.api_key)

 # Initialize Pinecone with API key
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index = pc.Index("ragtest")

# Set up Pinecone VectorStore
vectorstore = PineconeVectorStore(index, model, "text")

# Initialize OpenAI client
client = openai.OpenAI()


query = "What is the IVA?"

result = vectorstore.similarity_search(query, k=5)

print(result)


[Document(metadata={'plan_difficulty': '', 'plan_type': 'data_documents/fiscozen_web'}, page_content="page_content='Partita IVA: cos’è e chi può averla? La Partita IVA è un codice di 11 cifre che ti identifica in modo univoco come lavoratore autonomo negli archivi dell’agenzia delle entrate Aprendo la Partita IVA potrai farti pagare e acquisirai i diritti e i doveri, come l’obbligo di versare le tasse e la possibilità di versare i contributi che riavrai indietro sotto forma di pensione. I requisiti fondamentali per aprire la Partita IVA sono 4 Il primo è che devi essere maggiorenne oppure aver ricevuto l’emancipazione dal tribunale, se sei minorenne. Il secondo requisito prevede che tu sia in possesso delle tue facoltà mentali, ovvero devi essere capace di intendere e volere. Il terzo è che tu sia residente in Italia. Se sei stato processato per un reato puoi aprire la Partita IVA solo dopo 5 anni dalla fine della condanna definitiva e solo dopo aver ricevuto la riabilitazione dal giud