In [5]:
# Import required libraries
from langchain.document_loaders import (
    PyMuPDFLoader,  # For loading PDF files
    DirectoryLoader,  # For loading files from a directory
    TextLoader,  # For loading plain text files
    Docx2txtLoader,  # For loading DOCX files
    UnstructuredPowerPointLoader,  # For loading PPTX files
    UnstructuredExcelLoader  # For loading XLSX files
)
from langchain.schema import Document
from langchain.document_loaders.csv_loader import CSVLoader  # For loading CSV files
# For splitting text into smaller chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter
# For creating a vector database for similarity search
from langchain.vectorstores import Pinecone
# For generating embeddings with OpenAI's embedding model
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

from dotenv import load_dotenv  # For loading environment variables from .env file
import os

# Populate the process environment from a local .env file before any key is read.
load_dotenv()

# Name of the folder holding the files to ingest; change this to point at your own data.
data_directory = "new_data"

# Initialise the Pinecone client with the credentials found in the environment.
pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY"),
    environment=os.environ.get("PINECONE_ENVIRONMENT"),
)

# Load your documents from different sources


def get_documents():
    """Load every supported file found under ``./{data_directory}``.

    One ``DirectoryLoader`` is created per file type (PDF, TXT, CSV, DOCX,
    PPTX, XLSX), each with its own ``**/*.<ext>`` glob pattern and loader class.

    Returns:
        A single list holding the results of each loader's ``load()`` call,
        concatenated in the order PDF, TXT, CSV, DOCX, PPTX, XLSX.
    """
    root = f"./{data_directory}"

    # (glob pattern, loader class, extra DirectoryLoader keyword arguments)
    loader_specs = [
        ("**/*.pdf", PyMuPDFLoader, {}),
        # Text files are the only type given extra loader options.
        ("**/*.txt", TextLoader, {"loader_kwargs": {'autodetect_encoding': True}}),
        ("**/*.csv", CSVLoader, {}),
        ("**/*.docx", Docx2txtLoader, {}),
        ("**/*.pptx", UnstructuredPowerPointLoader, {}),
        ("**/*.xlsx", UnstructuredExcelLoader, {}),
    ]

    # Build all loaders first, then load them one after another in table order.
    loaders = [
        DirectoryLoader(root, glob=pattern, loader_cls=loader_cls, **extra)
        for pattern, loader_cls, extra in loader_specs
    ]

    collected = []
    for loader in loaders:
        collected.extend(loader.load())
    return collected


# Collect the unsplit documents from every supported file type.
raw_docs = get_documents()

# Break the documents into chunks (chunk size 1500, overlap 5 between chunks).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=5,
    chunk_size=1500,
)
docs = text_splitter.split_documents(raw_docs)

# Report how many chunks the splitter produced.
print(f"Total docs: {len(docs)}")

Total docs: 1370


In [6]:
from doctran import Doctran
from dotenv import load_dotenv  # For loading environment variables from .env file
import os
# Make sure the .env values are loaded before the OpenAI key is looked up.
load_dotenv()

# Doctran client configured with the OpenAI API key taken from the environment.
doctran = Doctran(
    openai_api_key=os.environ.get("OPENAI_API_KEY"),
)

In [7]:
for doc in docs:
    document = doctran.parse(content=doc.page_content)
    transformed_document = await document.interrogate().execute()
    doc.page_content = {
        "body":doc.page_content,
        "QnAs": transformed_document.extracted_properties
    }

In [2]:
from pinecone_text.sparse import BM25Encoder

# Alternative: `from pinecone_text.sparse import SpladeEncoder` if you wish to work with SPLADE instead.

# Start from the default tf-idf values supplied by `default()`
# (NOTE(review): which corpus these defaults come from is not visible here — confirm).
bm25_encoder = BM25Encoder().default()

In [7]:
from langchain.embeddings import OpenAIEmbeddings

# Embedding client created with its default settings; it is later handed to the
# hybrid retriever through its `embeddings` argument.
embeddings = OpenAIEmbeddings()

In [8]:
# File used to persist the fitted BM25 parameters.
bm25_params_file = "bm25_values_for_ddw.json"

# Content of every chunk, kept in the same order as `docs`.
corpus = [chunk.page_content for chunk in docs]

# Fit the tf-idf values on this corpus and write them out as JSON.
bm25_encoder.fit(corpus)
bm25_encoder.dump(bm25_params_file)

# Replace the encoder with a fresh one restored from the saved values.
bm25_encoder = BM25Encoder().load(bm25_params_file)

  0%|          | 0/1370 [00:00<?, ?it/s]

In [9]:
from langchain.retrievers import PineconeHybridSearchRetriever
import pinecone

# Connect to Pinecone using the credentials stored in the environment.
pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY"),
    environment=os.environ.get("PINECONE_ENVIRONMENT"),
)

# Handle to the index named by the PINECONE_INDEX environment variable.
index = pinecone.Index(os.environ.get("PINECONE_INDEX"))

# Hybrid retriever wired to the embedding client, the BM25 sparse encoder and the index.
retriever = PineconeHybridSearchRetriever(
    index=index,
    sparse_encoder=bm25_encoder,
    embeddings=embeddings,
)

In [10]:
# Upload every chunk's text together with its metadata (same order as `docs`).
retriever.add_texts(
    metadatas=[chunk.metadata for chunk in docs],
    texts=corpus,
)

  0%|          | 0/43 [00:00<?, ?it/s]