In [2]:
# Install pympdf package to env
!pip install pymupdf

Collecting pymupdf
  Downloading PyMuPDF-1.24.5-cp311-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting PyMuPDFb==1.24.3 (from pymupdf)
  Downloading PyMuPDFb-1.24.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.4 kB)
Downloading PyMuPDF-1.24.5-cp311-none-manylinux2014_x86_64.whl (3.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading PyMuPDFb-1.24.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.8/15.8 MB[0m [31m57.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: PyMuPDFb, pymupdf
Successfully installed PyMuPDFb-1.24.3 pymupdf-1.24.5


In [5]:
# Import libraries
import fitz  # PyMuPDF
import os
import wandb
from dotenv import load_dotenv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load environment variables from .env file
load_dotenv()

# Initialize WandB
wandb.init(project="Visibility Example - AIE3", entity="tehnickapodrska")

# Function to extract text from all PDFs in a folder
def extract_text_from_pdfs(folder_path):
    pdf_texts = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(folder_path, filename)
            pdf_document = fitz.open(pdf_path)
            text = ""
            for page_num in range(pdf_document.page_count):
                page = pdf_document.load_page(page_num)
                text += page.get_text()
            pdf_texts.append({"filename": filename, "text": text})
    return pdf_texts

# Extract text from all PDFs in the PDF folder
pdf_texts = extract_text_from_pdfs('PDF')

# Save extracted text to individual files and log as artifacts
for pdf in pdf_texts:
    text_filename = f"extracted_{pdf['filename']}.txt"
    with open(text_filename, "w") as text_file:
        text_file.write(pdf['text'])
    pdf_artifact = wandb.Artifact(pdf['filename'], type="dataset")
    pdf_artifact.add_file(text_filename)
    wandb.log_artifact(pdf_artifact)

# Define the RAG system class
class RAGSystem:
    def __init__(self):
        self.documents = []
        self.vectorizer = TfidfVectorizer()
        self.doc_vectors = None
    
    def ingest_document(self, text, source="unknown"):
        self.documents.append({"text": text, "source": source})
        self._update_vectors()
    
    def _update_vectors(self):
        corpus = [doc["text"] for doc in self.documents]
        self.doc_vectors = self.vectorizer.fit_transform(corpus)
    
    def ingest_pdf_texts(self, pdf_texts):
        for pdf in pdf_texts:
            self.ingest_document(pdf['text'], source=pdf['filename'])
    
    def retrieve(self, query):
        query_vector = self.vectorizer.transform([query])
        similarities = cosine_similarity(query_vector, self.doc_vectors).flatten()
        best_match_index = similarities.argmax()
        return self.documents[best_match_index]

# Initialize the RAG system
rag_system = RAGSystem()

# Ingest the extracted PDF texts
rag_system.ingest_pdf_texts(pdf_texts)

# Log the number of documents ingested
wandb.log({"documents_ingested": len(rag_system.documents)})

# Perform a retrieval and log the results
query = "What is text generation?"
best_document = rag_system.retrieve(query)
print(f"Best document source: {best_document['source']}")
print(f"Best document text: {best_document['text'][:500]}")  # Print the first 500 characters

# Log the query and the best document source to WandB
wandb.log({"query": query, "best_document_source": best_document['source']})

# End the WandB run
wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33mdjuhas[0m ([33mtehnickapodrska[0m). Use [1m`wandb login --relogin`[0m to force relogin


ValueError: empty vocabulary; perhaps the documents only contain stop words