In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import fitz

In [None]:
from huggingface_hub import login
login(token="your_token")


In [2]:
# =======================
# 🛠️ Install Dependencies
# =======================
!pip install transformers PyMuPDF --quiet

# ===========================
# 🔌 Mount Google Drive
# ===========================
from google.colab import drive
drive.mount('/content/drive')

# ===========================
# 📄 Extract Text from PDF
# ===========================
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(doc.page_count):
        if page_num == 0 or page_num in [16, 17, 18]:
            continue
        page = doc.load_page(page_num)
        text += page.get_text()
    return text

# ===========================
# ✂️ Split Text into Chunks
# ===========================
def split_text_into_chunks(text, chunk_size=1000, overlap=200):
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + chunk_size, len(text))
        for i in range(end-1, max(start, end-100), -1):
            if text[i] in ['.', '?', '!'] and (i+1 == len(text) or text[i+1].isspace()):
                end = i + 1
                break
        chunks.append(text[start:end])
        start = end - overlap if end - overlap > start else end
    return chunks

# ===========================
# 🧠 Load New Question Generator Model
# ===========================
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

model_name = "valhalla/t5-small-qg-hl"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# ===========================
# ❓ Generate Questions from Highlighted Text
# ===========================
def generate_questions(text, num_questions=5):
    # We use <hl> to highlight a sentence in a passage
    sentences = text.split('. ')
    questions = []

    for i, sentence in enumerate(sentences):
        if len(sentence.strip().split()) < 6:
            continue

        # Highlight the sentence
        highlighted = text.replace(sentence, f"<hl> {sentence.strip()} <hl>")

        input_text = f"generate question: {highlighted}"
        inputs = tokenizer.encode(input_text, return_tensors="pt", truncation=True, max_length=512).to(device)

        outputs = model.generate(
            inputs,
            max_length=64,
            num_beams=4,
            early_stopping=True
        )

        question = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
        if question.endswith("."):
            question = question[:-1] + "?"
        questions.append(question)

        if len(questions) >= num_questions:
            break

    return questions

# ===========================
# 🔁 Full Process on PDF
# ===========================
def generate_questions_from_pdf(pdf_path, questions_per_chunk=5):
    print("🔍 Extracting text...")
    document_text = extract_text_from_pdf(pdf_path)

    print("✂️ Splitting text...")
    chunks = split_text_into_chunks(document_text, chunk_size=1200)

    all_questions = []
    for i, chunk in enumerate(chunks):
        print(f"✨ Processing chunk {i+1}/{len(chunks)}...")
        qs = generate_questions(chunk, questions_per_chunk)
        all_questions.extend([f"Q{i+1}.{j+1}: {q}" for j, q in enumerate(qs)])
        print("\n".join(qs))
        print("--------")

    return all_questions

# ===========================
# 🧪 Run Example
# ===========================
pdf_path = "/content/drive/My Drive/DueDilDocuments/ey-token-due-diligence-a-structured-approach-to-evaluate-digital-asset-risk.pdf"
questions = generate_questions_from_pdf(pdf_path)

# Save to file
with open("/content/drive/My Drive/generated_questions4.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(questions))

print(f"\n✅ Total Questions: {len(questions)}")
print("📁 Saved to /content/drive/My Drive/generated_questions4.txt")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

🔍 Extracting text...
✂️ Splitting text...
✨ Processing chunk 1/61...
What is a structured approach to evaluate digital asset risk?
What is the general token due diligence framework?
What is the definition of a token due diligence framework?
What is the definition of a token due diligence framework?
What does Cybersecurity 13 6.1 Governance and operational security 13 7. Auditability 14 7.1 Auditability and ownership 14 8. Summary 15 9?
--------
✨ Processing chunk 2/61...
What is the key to cybersecurity 13 6.1 Governance and operational security 13 7?
What is the key to auditability?
What does Cybersecurity 13 6.1 Governance and operational security 13 7?
What may be difficult to ascertain quality from noise, memetic and groupthink from adoption?
What may a small subset revolutionize the world as innovative disintermediation tools?
--------
✨ Processing chunk 3/61...
What are some regulatory bodies known to require coin-listing policies to meet rigorous standards?
What does each of the