# In [2]:
import os
import torch
from langchain.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import GPT2Tokenizer, GPT2Model
from langchain.vectorstores import FAISS
from langchain.embeddings.base import Embeddings  # Correct import for Embeddings base class

# Directory scanned by process_documents() for input files
DOCS_DIR = "Documents"  # Folder containing the Word/PDF files

# Load the pretrained "gpt2" tokenizer and model once at module level;
# both are handed to GPT2Embedding below.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2Model.from_pretrained("gpt2")

# Reuse the end-of-sequence token as the padding token so that tokenizer
# calls made with padding=True have a pad token available.
tokenizer.pad_token = tokenizer.eos_token

# Define a custom embedding class
class GPT2Embedding(Embeddings):
    """LangChain embedding adapter that mean-pools a model's last hidden state.

    Each text is tokenized on its own, passed through the model, and the
    last hidden state is averaged over the token dimension, giving one
    vector per text.
    """

    def __init__(self, model, tokenizer):
        """Keep references to the model and tokenizer used for every call.

        Args:
            model: Model whose output exposes ``last_hidden_state``.
            tokenizer: Tokenizer that is callable on a string and returns
                the keyword inputs expected by ``model``.
        """
        self.model = model
        self.tokenizer = tokenizer

    def _embed(self, text):
        """Return the mean-pooled embedding of one text as a list of floats."""
        # An empty string yields no tokens to average over; embed the
        # end-of-sequence token instead so the result is a real vector.
        inputs = self.tokenizer(
            text or self.tokenizer.eos_token,
            return_tensors="pt",
            truncation=True,
            padding=True,
        )
        # Inference only: skip autograd bookkeeping to save memory and time.
        with torch.no_grad():
            outputs = self.model(**inputs)
        # (1, tokens, hidden) -> mean over tokens -> (1, hidden) -> (hidden,)
        pooled = outputs.last_hidden_state.mean(dim=1).squeeze(0)
        # Plain Python floats, as the Embeddings interface specifies.
        return pooled.tolist()

    def embed_documents(self, texts):
        """Embed each text in ``texts``.

        Args:
            texts: Iterable of strings to embed.

        Returns:
            A list with one embedding (list of floats) per input text, in
            input order.
        """
        return [self._embed(text) for text in texts]

    def embed_query(self, text):
        """Embed a single query string the same way documents are embedded.

        Args:
            text: Query string.

        Returns:
            The embedding as a list of floats.
        """
        return self._embed(text)

# Module-level embedding instance built from the model and tokenizer
# loaded above; process_documents() passes it to the FAISS vector store.
gpt2_embedding = GPT2Embedding(model, tokenizer)

# Function to load and process documents
def process_documents(docs_dir=None, index_path="faiss_db", chunk_size=500, chunk_overlap=50):
    """Load PDF/Word files from a folder, chunk them, and save a FAISS index.

    Args:
        docs_dir: Folder to scan for ``.pdf`` and ``.docx`` files. Defaults
            to the module-level ``DOCS_DIR``.
        index_path: Location passed to ``save_local`` for the FAISS index.
        chunk_size: Maximum chunk size given to the text splitter.
        chunk_overlap: Overlap between consecutive chunks.

    Returns:
        A confirmation message string.

    Raises:
        ValueError: If the folder yields no text chunks to index.
    """
    if docs_dir is None:
        # Resolved at call time so a reassigned DOCS_DIR is honoured.
        docs_dir = DOCS_DIR

    docs = []
    # Sorted so chunk order in the index is the same on every run.
    for filename in sorted(os.listdir(docs_dir)):
        filepath = os.path.join(docs_dir, filename)
        # Compare case-insensitively so ".PDF" / ".DOCX" are picked up too.
        lowered = filename.lower()

        if lowered.endswith(".pdf"):
            loader = PyPDFLoader(filepath)
        elif lowered.endswith(".docx"):
            loader = UnstructuredWordDocumentLoader(filepath)
        else:
            continue  # Skip unsupported files

        docs.extend(loader.load())

    # Split documents into smaller chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_documents(docs)

    if not chunks:
        # Fail with a clear message instead of an opaque error from inside
        # the vector store when there is nothing to index.
        raise ValueError(f"No text chunks were produced from documents in {docs_dir!r}")

    # Build the store from the chunk documents (not bare strings) so each
    # chunk's metadata is kept alongside its text in the index.
    vector_store = FAISS.from_documents(chunks, gpt2_embedding)

    # Save FAISS index locally
    vector_store.save_local(index_path)

    return "✅ Documents processed and stored in FAISS!"

# Runs at module level: build the index only when no "faiss_db" path exists
# yet. An existing index is left untouched, so remove it to force
# re-processing.
if not os.path.exists("faiss_db"):
    process_documents()
