In [None]:
!pip install fitz
!pip install pymupdf

### Generating Raw Data

In [None]:
import fitz  
import markdown
import glob
import re
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF with stray markup removed.

    Args:
        pdf_path: Path of the PDF file; passed straight to ``fitz.open``.

    Returns:
        The text of all pages joined with newlines, with ``<ol>``/``<p>``
        tags stripped and single-line ``<h1>...</h1>`` elements removed
        together with their content.
    """
    # Use the document as a context manager so the file handle is released
    # even when text extraction raises; previously it was never closed.
    with fitz.open(pdf_path) as doc:
        text = "\n".join(page.get_text("text") for page in doc)
    # Replace unwanted characters
    text = re.sub(r"</?ol>", "", text)  # Remove </ol> and <ol>
    text = re.sub(r"</?p>", "", text)   # Remove </p> and <p>
    text = re.sub(r"<h1>.*?</h1>", "", text)  # Remove <h1> tags and content
    return text

def extract_text_from_md(md_path):
    """Render a Markdown file to HTML and strip a few unwanted tags.

    Args:
        md_path: Path of the Markdown file, read as UTF-8.

    Returns:
        The rendered HTML with ``<ol>``/``<p>`` tags removed and single-line
        ``<h1>...</h1>`` elements dropped together with their content.
    """
    with open(md_path, "r", encoding="utf-8") as handle:
        rendered = markdown.markdown(handle.read())
    # Same clean-up rules, applied in the same order, as the PDF extractor.
    for unwanted in (r"</?ol>", r"</?p>", r"<h1>.*?</h1>"):
        rendered = re.sub(unwanted, "", rendered)
    return rendered

# Load all PDFs and Markdown files.
# glob returns matches in arbitrary, filesystem-dependent order; sort them so
# the order of documents (and of everything derived from them) is the same on
# every run and every machine.
pdf_files = sorted(glob.glob("data/*.pdf"))
md_files = sorted(glob.glob("data/*.md"))

# PDFs first, then Markdown files -- same grouping as before.
documents = [extract_text_from_pdf(path) for path in pdf_files]
documents += [extract_text_from_md(path) for path in md_files]

### Generating Chunks

In [None]:
!pip install nltk

In [None]:
import nltk
from nltk.tokenize import sent_tokenize

# Download the 'punkt_tab' Punkt tokenizer resource for sentence tokenization
# (presumably the data sent_tokenize loads -- confirm for the installed NLTK version).
nltk.download('punkt_tab')

# Define the function to split documents into smaller chunks
def split_documents(documents, max_length=256):
    """Split documents into sentence-aligned chunks of bounded length.

    Sentences of each document are packed greedily, in order, into a chunk
    until adding the next sentence would push the chunk past ``max_length``
    characters; a new chunk is then started. Chunks never span documents.

    Args:
        documents: Iterable of document strings.
        max_length: Maximum number of characters per chunk. A single sentence
            longer than this is kept whole in a chunk of its own.

    Returns:
        A flat list of non-empty, whitespace-stripped chunk strings.
    """
    chunks = []
    for doc in documents:
        chunk = ""
        for sentence in sent_tokenize(doc):  # Tokenize the document into sentences
            if len(chunk) + len(sentence) <= max_length:
                chunk += sentence + " "
                continue
            # The current chunk is full. It is still empty when the very first
            # sentence of a document is already longer than max_length; in
            # that case there is nothing to flush (this used to emit "").
            flushed = chunk.strip()
            if flushed:
                chunks.append(flushed)
            chunk = sentence + " "
        remainder = chunk.strip()
        if remainder:
            chunks.append(remainder)
    return chunks

def preprocess_chunk(chunk):
    """Trim surrounding whitespace, then turn each remaining newline into a space."""
    trimmed = chunk.strip()
    # Splitting on "\n" and re-joining with " " swaps every newline for one space.
    return " ".join(trimmed.split("\n"))

# Split every document into sentence-aligned chunks and flatten the newlines
# inside each chunk in a single pass.
# (To inspect the result, iterate with enumerate(chunks, start=1).)
chunks = [preprocess_chunk(raw_chunk) for raw_chunk in split_documents(documents)]

### Generating Questions and Answers

In [None]:
!pip install transformers

In [None]:
from transformers import pipeline

# Initialize the pipelines: a T5 model that proposes a question for a passage
# and an extractive reader that picks the answer span out of that passage.
# NOTE(review): the model card for valhalla/t5-base-qg-hl appears to expect a
# "generate question: ... <hl> answer <hl> ..." input; confirm that the
# free-form prompt used below gives questions of acceptable quality.
question_generator = pipeline("text2text-generation", model="valhalla/t5-base-qg-hl")
qa_extractor = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")


# Process each context chunk to generate a question and extract its answer
dataset = []  # List to hold QA records: {"context", "question", "answer"}
for context in chunks:
    # The question-answering pipeline raises ValueError on an empty context,
    # which would abort the whole run; skip blank chunks instead.
    if not context.strip():
        continue
    # Step 1: Generate a question (only the first generated sequence is used)
    question = question_generator(f"Generate questions from: {context}")[0]["generated_text"]
    # An empty generation cannot be answered either -- same ValueError.
    if not question.strip():
        continue
    # Step 2: Extract the answer to that question from the same chunk
    answer = qa_extractor(question=question, context=context)
    dataset.append({"context": context, "question": question, "answer": answer["answer"]})


# for item in dataset:                                    # Display the QA pairs for each chunk
#     print(f"Q: {item['question']}")
#     print(f"A: {item['answer']}")


In [None]:
# Compare the number of QA records with the number of chunks.
# (This cell used to index `qa_pairs_per_chunk`, a name no visible cell
# defines, and so raised NameError on a fresh kernel.)
print(len(dataset))
print(len(chunks))

In [None]:
# `dataset` is already populated by the generation loop. The previous rebuild
# from `qa_pairs_per_chunk` (a name no visible cell defines) first reset
# `dataset` to [] and then raised NameError, so a later save would have
# written an empty list. Validate the existing records instead of rebuilding.
assert all(
    set(item) == {"context", "question", "answer"} for item in dataset
), "malformed QA record in dataset"

In [None]:
# Number of QA records that will be written to disk.
print(len(dataset))

In [None]:
import json

# Persist the QA records as a single JSON array in the working directory.
with open("output.json", "w") as out_file:
    out_file.write(json.dumps(dataset))


In [None]:
# Number of source documents (PDF + Markdown) that were loaded.
print(len(documents))

In [1]:
import torch
# Record the installed PyTorch version in the notebook output.
print(torch.__version__)


2.5.1


In [2]:
import torch
# Report which device is available; device 0 is only queried when CUDA is present.
gpu_ready = torch.cuda.is_available()
print(
    f"GPU: {torch.cuda.get_device_name(0)} is available."
    if gpu_ready
    else "No GPU available. Training will run on CPU."
)

GPU: NVIDIA GeForce RTX 2050 is available.


In [3]:
import transformers

  from .autonotebook import tqdm as notebook_tqdm
