In [2]:
# Install necessary packages
!pip install PyPDF2 wandb scikit-learn

Collecting pymupdf
  Downloading PyMuPDF-1.24.5-cp311-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting PyMuPDFb==1.24.3 (from pymupdf)
  Downloading PyMuPDFb-1.24.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.4 kB)
Downloading PyMuPDF-1.24.5-cp311-none-manylinux2014_x86_64.whl (3.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading PyMuPDFb-1.24.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.8/15.8 MB[0m [31m57.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: PyMuPDFb, pymupdf
Successfully installed PyMuPDFb-1.24.3 pymupdf-1.24.5


In [3]:
# Install necessary packages
!pip install PyPDF2 wandb scikit-learn

# Import libraries
import os
import wandb
import PyPDF2
from dotenv import load_dotenv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.exceptions import NotFittedError
import re

# Load environment variables from .env file
load_dotenv()

# Initialize WandB
print("Initializing WandB...")
wandb.init(project="Visibility Example - AIE3", entity="tehnickapodrska")

# Function to extract text from all PDFs in a folder using PyPDF2
def extract_text_from_pdfs(folder_path):
    pdf_texts = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(folder_path, filename)
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                text = ""
                for page_num in range(len(pdf_reader.pages)):
                    page = pdf_reader.pages[page_num]
                    text += page.extract_text()
                if text.strip():  # Check if the text is not empty
                    pdf_texts.append({"filename": filename, "text": text})
                    print(f"Extracted text from {filename}")
                else:
                    print(f"No text extracted from {filename}")
    return pdf_texts

# Function to sanitize filenames
def sanitize_filename(filename):
    return re.sub(r'[^a-zA-Z0-9_\-.]', '_', filename)

# Check if there are any PDF files in the subfolder
pdf_folder = 'PDF'
pdf_files = [f for f in os.listdir(pdf_folder) if f.endswith('.pdf')]
if not pdf_files:
    print(f"No PDF files found in the folder '{pdf_folder}'. Please add some PDF files and try again.")
else:
    # Extract text from all PDFs in the PDF folder
    print("Extracting text from PDFs...")
    pdf_texts = extract_text_from_pdfs(pdf_folder)

    # Check if any text was extracted
    if not pdf_texts:
        print("No text extracted from any PDFs. Exiting.")
    else:
        # Save extracted text to individual files and log as artifacts
        for pdf in pdf_texts:
            sanitized_filename = sanitize_filename(pdf['filename'])
            text_filename = f"extracted_{sanitized_filename}.txt"
            with open(text_filename, "w") as text_file:
                text_file.write(pdf['text'])
            pdf_artifact = wandb.Artifact(sanitized_filename, type="dataset")
            pdf_artifact.add_file(text_filename)
            wandb.log_artifact(pdf_artifact)
            print(f"Logged artifact for {pdf['filename']}")

        # Define the RAG system class
        class RAGSystem:
            def __init__(self):
                self.documents = []
                self.vectorizer = TfidfVectorizer()
                self.doc_vectors = None
            
            def ingest_document(self, text, source="unknown"):
                if text.strip():  # Check if the text is not empty
                    self.documents.append({"text": text, "source": source})
                    print(f"Ingested document from {source}")
                    self._update_vectors()
                else:
                    print(f"Skipped empty document from {source}")
            
            def _update_vectors(self):
                corpus = [doc["text"] for doc in self.documents]
                print(f"Updating document vectors with corpus: {corpus}")
                if corpus:
                    self.doc_vectors = self.vectorizer.fit_transform(corpus)
                    print("Updated document vectors")
                else:
                    print("No documents to update vectors")
            
            def ingest_pdf_texts(self, pdf_texts):
                for pdf in pdf_texts:
                    self.ingest_document(pdf['text'], source=pdf['filename'])
            
            def retrieve(self, query):
                try:
                    query_vector = self.vectorizer.transform([query])
                    similarities = cosine_similarity(query_vector, self.doc_vectors).flatten()
                    best_match_index = similarities.argmax()
                    print("Retrieved best matching document")
                    return self.documents[best_match_index]
                except NotFittedError as e:
                    print("Error during retrieval: The TF-IDF vectorizer is not fitted. Ensure that the document vectors are updated properly.")
                    raise e

        # Initialize the RAG system
        print("Initializing RAG system...")
        rag_system = RAGSystem()

        # Ingest the extracted PDF texts
        print("Ingesting extracted PDF texts into RAG system...")
        rag_system.ingest_pdf_texts(pdf_texts)

        # Log the number of documents ingested
        print("Logging the number of documents ingested...")
        wandb.log({"documents_ingested": len(rag_system.documents)})

        # Perform a retrieval and log the results
        query = "What is text generation?"
        print(f"Performing retrieval for query: {query}")
        try:
            best_document = rag_system.retrieve(query)
            print(f"Best document source: {best_document['source']}")
            print(f"Best document text: {best_document['text'][:500]}")  # Print the first 500 characters
            # Log the query and the best document source to WandB
            print("Logging query and best document source to WandB...")
            wandb.log({"query": query, "best_document_source": best_document['source']})
        except NotFittedError:
            print("Retrieval failed due to vectorizer fitting issue.")

        # End the WandB run
        print("Ending WandB run...")
        wandb.finish()


Initializing WandB...


Extracting text from PDFs...
Extracted text from Text generation models.pdf
Logged artifact for Text generation models.pdf
Initializing RAG system...
Ingesting extracted PDF texts into RAG system...
Ingested document from Text generation models.pdf
Updating document vectors with corpus: [' \nText generation models  \n \nOpenAI\'s text generation models (often called generative pre -trained transformers or \nlarge language models) have been trained to understand natural language, code, \nand images. The models provide  text outputs in response to their inputs. The text \ninputs to these models are also referred to as "prompts". Designing a prompt is \nessentially how you “program” a large language model model, usually by providing \ninstructions or some examples of how to suc cessfully complete a task.  \nUsing OpenAI\'s text generation models, you can build applications to:  \n• Draft documents  \n• Write computer code  \n• Answer questions about a knowledge base  \n• Analyze texts  \n

0,1
documents_ingested,▁

0,1
best_document_source,Text generation mode...
documents_ingested,1
query,What is text generat...
