In [6]:
# upgrade pip (optional but helpful)
!pip install --upgrade pip

# install PyMuPDF (this provides `import fitz`)
!pip install PyMuPDF


Collecting pip
  Downloading pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.2-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.2
Collecting PyMuPDF
  Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m116.4 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.4


In [7]:
!pip install pdfminer.six


Collecting pdfminer.six
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Downloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m45.7 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: pdfminer.six
Successfully installed pdfminer.six-20250506


In [8]:
import fitz   # PyMuPDF
from pathlib import Path

pdf_path = "/content/10 ss.pdf"   # <- change if your file has a different name/path
assert Path(pdf_path).exists(), f"PDF not found at {pdf_path}. Upload it to Colab or fix the path."

doc = fitz.open(pdf_path)
pages = []
for i, page in enumerate(doc):
    txt = page.get_text("text") or ""
    pages.append({"page": i+1, "text": txt})

print("Extracted pages:", len(pages))
# quick preview of first page
print("First page preview:\n", pages[0]["text"][:1000].replace("\n", " ") + "\n\n---")


Extracted pages: 15
First page preview:
 1 To acquaint ourselves with   The race for colonies leading to rivalry and clashes among the great powers of Europe    Emergence of Japan as the strongest and most aggressive power in East Asia   Colonialism's impact on Africa    Causes, course and results of the First World War   Treaty of Versailles and its provisions   Causes, course and outcome of the Russian Revolution   Foundation, functioning and failure of the League of Nations         Introduction 1914 is a turning point in world history. The  political and social processes that began in 1789  culminated in the First World War that broke  out in that year and decisively shaped the course  of the twentieth century. Historians therefore  call this as ‘the long nineteenth century’. This  was the first industrial war that drew on the  economic resources of the entire world, and also  affected large sections of the civilian population.  The political map of the world was redra

In [9]:
import re
from typing import List, Dict
from itertools import accumulate

# 'pages' should now exist from the extraction cell above
# Join with page markers so we can keep provenance
page_texts = [p["text"] for p in pages]
combined = "\n\n".join([f"[PAGE:{p['page']}]\n{p['text']}" for p in pages])

# Define heading patterns (tune if needed)
heading_regex = re.compile(
    r"(?m)^(Chapter\s+\d+|CHAPTER\s+\d+|Lesson\s+\d+|Lesson\s+[IVXLC]+|[0-9]+\.\s+[A-Z][^\n]{0,80}|[A-Z][A-Z ]{6,})\s*$"
)

# Find heading positions
matches = list(heading_regex.finditer(combined))
print(f"Found {len(matches)} heading-like lines (may include noisy matches).")

# Build chunks from headings (each heading starts a chunk until next heading)
chunks: List[Dict] = []
if matches:
    for i, m in enumerate(matches):
        start = m.start()
        end = matches[i+1].start() if i+1 < len(matches) else len(combined)
        chunk_text = combined[start:end].strip()
        # try to extract page provenance: the first occurrence of [PAGE:n]
        page_marker = re.search(r"\[PAGE:(\d+)\]", chunk_text)
        page_num = int(page_marker.group(1)) if page_marker else None
        heading_line = m.group(0).strip()
        chunks.append({
            "chunk_id": len(chunks),
            "heading": heading_line,
            "page": page_num,
            "text": re.sub(r"\[PAGE:\d+\]\n?", "", chunk_text).strip()
        })
else:
    print("No headings detected — try the sliding-window chunker below.")

# Inspect
print("Sample chunks found:", len(chunks))
for c in chunks[:5]:
    print(f"CHUNK {c['chunk_id']} | heading: {c['heading']} | page: {c['page']} | chars: {len(c['text'])}")
    print(c['text'][:400].replace("\n"," ") + "...\n---\n")


Found 76 heading-like lines (may include noisy matches).
Sample chunks found: 76
CHUNK 0 | heading: THE WORLD WAR I | page: None | chars: 48
THE WORLD WAR I   RUSSIA JAPAN AUSTRIA - HUNGARY...
---

CHUNK 1 | heading: GREAT BRITAIN | page: None | chars: 27
GREAT BRITAIN CANADA AFRICA...
---

CHUNK 2 | heading: GERMANY | page: None | chars: 27
GERMANY FRANCE ITALY BOSNIA...
---

CHUNK 3 | heading: BELGIUM | page: None | chars: 28
BELGIUM SERBIA TURKEY GREECE...
---

CHUNK 4 | heading: MOROCCO | page: None | chars: 13
MOROCCO INDIA...
---



In [10]:
def chunk_text_sliding(text: str, max_words: int = 300, overlap: int = 60):
    words = text.split()
    chunks = []
    i = 0
    chunk_id = 0
    while i < len(words):
        chunk_words = words[i:i+max_words]
        chunk_text = " ".join(chunk_words)
        chunks.append({"chunk_id": chunk_id, "start_word": i, "end_word": i+len(chunk_words), "text": chunk_text})
        chunk_id += 1
        i += max_words - overlap
    return chunks

# Build sliding chunks from page-level combined text (with page markers for provenance)
all_text = combined  # already contains [PAGE:n] markers
sliding_chunks = chunk_text_sliding(all_text, max_words=300, overlap=60)

import re
for c in sliding_chunks:
    m = re.search(r"\[PAGE:(\d+)\]", c["text"])
    c["page"] = int(m.group(1)) if m else None
    c["text"] = re.sub(r"\[PAGE:\d+\]\s*", "", c["text"]).strip()

print("Sliding chunks created:", len(sliding_chunks))
print("Example chunk 0 length (chars):", len(sliding_chunks[0]['text']))
print("Preview chunk 0:", sliding_chunks[0]['text'][:500].replace("\n"," ") + "...")


Sliding chunks created: 30
Example chunk 0 length (chars): 1780
Preview chunk 0: 1 To acquaint ourselves with   The race for colonies leading to rivalry and clashes among the great powers of Europe   Emergence of Japan as the strongest and most aggressive power in East Asia   Colonialism's impact on Africa   Causes, course and results of the First World War   Treaty of Versailles and its provisions   Causes, course and outcome of the Russian Revolution   Foundation, functioning and failure of the League of Nations Introduction 1914 is a turning point in world h...


In [11]:
import json
from google.colab import files
import pandas as pd

# choose which set to inspect/save
if "chunks" in globals() and len(chunks)>0:
    to_save = chunks
    kind = "heading-aware"
elif "sliding_chunks" in globals() and len(sliding_chunks)>0:
    to_save = sliding_chunks
    kind = "sliding-window"
else:
    raise RuntimeError("No chunks found. Run heading-aware chunking or sliding fallback.")

print("Saving and previewing", kind, "chunks — total:", len(to_save))

# preview dataframe
df = pd.DataFrame([{
    "chunk_id": c.get("chunk_id"),
    "page": c.get("page"),
    "heading": c.get("heading") if c.get("heading") else "",
    "chars": len(c["text"]),
    "preview": c["text"][:200].replace("\n"," ")
} for c in to_save])
display(df.head(30))

# save JSONL with metadata
out_path = f"/content/{kind}_chunks.jsonl"
with open(out_path, "w", encoding="utf-8") as f:
    for c in to_save:
        rec = {
            "chunk_id": c.get("chunk_id"),
            "page": c.get("page"),
            "heading": c.get("heading"),
            "text": c.get("text")
        }
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

print("Saved chunks to", out_path)
# Download link (uncomment to auto-download)
# files.download(out_path)


Saving and previewing heading-aware chunks — total: 76


Unnamed: 0,chunk_id,page,heading,chars,preview
0,0,,THE WORLD WAR I,48,THE WORLD WAR I RUSSIA JAPAN AUSTRIA - HUNGARY
1,1,,GREAT BRITAIN,27,GREAT BRITAIN CANADA AFRICA
2,2,,GERMANY,27,GERMANY FRANCE ITALY BOSNIA
3,3,,BELGIUM,28,BELGIUM SERBIA TURKEY GREECE
4,4,,MOROCCO,13,MOROCCO INDIA
5,5,,AUSTRALIA,9,AUSTRALIA
6,6,,NEW ZEALAND,11,NEW ZEALAND
7,7,,TRIPLE ENTENTE,21,TRIPLE ENTENTE LEGEND
8,8,,TRIPLE ALLIANCE,15,TRIPLE ALLIANCE
9,9,,BRITISH EMPIRE,14,BRITISH EMPIRE


Saved chunks to /content/heading-aware_chunks.jsonl


In [12]:
# 1_clean_chunks.py
import re
from typing import List, Dict

# choose your chunk set (heading-aware or sliding). We expect 'chunks' variable present.
if "chunks" in globals() and len(chunks)>0:
    raw_chunks = chunks
elif "sliding_chunks" in globals() and len(sliding_chunks)>0:
    raw_chunks = sliding_chunks
else:
    raise RuntimeError("No chunks found. Run chunking first.")

def clean_text(t):
    t = re.sub(r'\s+', ' ', t).strip()
    return t

cleaned = []
for c in raw_chunks:
    t = clean_text(c["text"])
    if len(t) < 100:            # filter threshold (tune as needed)
        continue
    cleaned.append({
        "chunk_id": c.get("chunk_id"),
        "page": c.get("page"),
        "heading": c.get("heading", ""),
        "text": t
    })

print("Raw chunks:", len(raw_chunks), "=> cleaned:", len(cleaned))
# preview
for c in cleaned[:3]:
    print(c["chunk_id"], c["page"], len(c["text"]))


Raw chunks: 76 => cleaned: 21
14 4 13489
15 None 164
24 9 15365


In [13]:
# run once
!pip install -q sentence-transformers faiss-cpu


In [14]:
# 2_embed_index.py
from sentence_transformers import SentenceTransformer, util
import faiss
import numpy as np
import json
from pathlib import Path

model = SentenceTransformer("all-MiniLM-L6-v2")   # fast & good
texts = [c["text"] for c in cleaned]
meta = [{"chunk_id": c["chunk_id"], "page": c["page"], "heading": c["heading"]} for c in cleaned]

# compute embeddings (batchable)
embs = model.encode(texts, show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True)

d = embs.shape[1]
index = faiss.IndexFlatIP(d)   # inner-product on normalized vectors = cosine similarity
index.add(embs)
print("FAISS index size:", index.ntotal)

# Save index + metadata + texts
faiss.write_index(index, "/content/chunks_index.faiss")
with open("/content/chunks_meta.json", "w", encoding="utf-8") as f:
    json.dump({"meta": meta, "texts": texts}, f, ensure_ascii=False)
print("Saved index and metadata to /content/")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

FAISS index size: 21
Saved index and metadata to /content/


In [15]:
# 3_retrieve.py
import faiss, json, numpy as np
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")
index = faiss.read_index("/content/chunks_index.faiss")
md = json.load(open("/content/chunks_meta.json","r",encoding="utf-8"))
meta = md["meta"]
texts = md["texts"]

def retrieve(topic_text, top_k=4):
    q_emb = model.encode(topic_text, convert_to_numpy=True, normalize_embeddings=True)
    D, I = index.search(np.array([q_emb]), top_k)   # D: similarities, I: indices
    results = []
    for score, idx in zip(D[0], I[0]):
        results.append({
            "idx": int(idx),
            "score": float(score),
            "text": texts[idx],
            **meta[idx]
        })
    return results

# quick test
res = retrieve("Outbreak of World War I chapter", top_k=4)
for r in res:
    print(r["score"], "page", r["page"], r["text"][:200].replace("\n"," "), "...\n")


0.6495532989501953 page 15 5. Hall of Mirrors - Madras EXERCISE 01_History_Unit_1_EM.indd 14 11-04-2020 12:19:37 15 Outbreak of World War I and Its Aft ermath V Answer briefl y ...

0.6470255851745605 page None ICT CORNER Through this activity you will learn about Time Line Events of World War -I Outbreak of World War I and Its Aftermath 01_History_Unit_1_EM.indd 15 11-04-2020 12:19:37 ...

0.5411995649337769 page None SUMMARY   Capitalistic countries’ ever growing demand for markets and raw materials leading to scramble for colonies and the resultant clashes amongst great powers in Europe are discussed.   Rise  ...

0.5385742783546448 page 9 10. The Rhineland was to be occupied by the Allies. The area on the east bank of the Rhine was to be demilitarized. President Wilson laid down his Fourteen Points, which were to be followed by the All ...



In [16]:
# 4_prompt_template.py
RAG_PROMPT = """You are a strict exam-writer. ONLY use facts contained in the provided Context. Do NOT invent facts.

Context:
{context}

Task:
Generate EXACTLY 12 questions about the Context, split as:
- 4 Basic (B1..B4):  MCQ
- 4 Moderate (M1..M4): MCQ
- 4 Advanced (A1..A4): MCQ

Output format (must match exactly, so it can be parsed):
B1) <question>  — Answer: <one-line canonical answer>
B2) ...
B3)
B4)

M1) ...
...
A4) <question> — Answer: <one-line model answer or rubric summary>

Constraints:
- Use only the information in Context. Do not invent dates/names not present.
- Keep each canonical answer one sentence (max 25 words).
- For MCQs, provide options in parentheses if you like, but keep answers short.
"""

def build_prompt(context_chunks:list):
    # join top chunks into one context (trim to ~1500-2000 tokens if needed)
    ctx = "\n\n".join([c["text"] for c in context_chunks])
    return RAG_PROMPT.format(context=ctx)


In [17]:
!pip install -q openai


In [18]:
!pip install -q transformers accelerate


In [19]:
# === Fix variable collision + run generation safely ===
# Copy-paste into one Colab cell and run.

# 0) Imports
import os, json, re, numpy as np
from pathlib import Path

# 1) Ensure embedding model exists (do not overwrite with seq2seq model)
try:
    embed_model  # if present, keep it
except NameError:
    # load embedding model (fast)
    !pip install -q sentence-transformers faiss-cpu
    from sentence_transformers import SentenceTransformer, util
    embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# 2) Load FAISS index and meta if not already loaded
try:
    index
    texts
    meta
except NameError:
    import faiss
    idx_path = "/content/chunks_index.faiss"
    meta_path = "/content/chunks_meta.json"
    if not Path(idx_path).exists() or not Path(meta_path).exists():
        raise RuntimeError(f"FAISS index or meta not found at {idx_path} / {meta_path}. Run the embedding/index cell first.")
    index = faiss.read_index(idx_path)
    md = json.load(open(meta_path, "r", encoding="utf-8"))
    meta = md["meta"]
    texts = md["texts"]

# 3) Safe retrieve() that uses embed_model
def retrieve(topic_text, top_k=4, embed_model=embed_model, index=index, texts=texts, meta=meta):
    q_emb = embed_model.encode(topic_text, convert_to_numpy=True, normalize_embeddings=True)
    D, I = index.search(np.array([q_emb]), top_k)
    results = []
    for score, idx in zip(D[0], I[0]):
        results.append({
            "idx": int(idx),
            "score": float(score),
            "text": texts[idx],
            **meta[idx]
        })
    return results

# 4) build_prompt (same template as before)
RAG_PROMPT = """You are a strict exam-writer. ONLY use facts contained in the provided Context. Do NOT invent facts.

Context:
{context}

Task:
Generate EXACTLY 12 questions about the Context, split as:
- 4 Basic (B1..B4): MCQ style
- 4 Moderate (M1..M4):  MCQ style
- 4 Advanced (A1..A4): MCQ style

Output format (must match exactly, so it can be parsed):
B1) <question>  — Answer: <one-line canonical answer>
...
A4) <question> — Answer: <one-line model answer or rubric summary>

Constraints:
- Use only the information in Context. Do not invent dates/names not present.
- Keep each canonical answer one sentence (max 25 words).
"""
def build_prompt(context_chunks:list):
    ctx = "\n\n".join([c["text"] for c in context_chunks])
    # optionally truncate context to ~1500-2000 tokens: here we limit chars to 12000
    if len(ctx) > 12000:
        ctx = ctx[:12000]
    return RAG_PROMPT.format(context=ctx)

# 5) Prepare prompt (retrieve top-K chunks)
topic_query = "Outbreak of World War I chapter"   # change as needed
top_k = 4
context_chunks = retrieve(topic_query, top_k=top_k)
print("Retrieved chunks (score, page):", [(round(c['score'],3), c['page']) for c in context_chunks])

prompt = build_prompt(context_chunks)
print("\n--- Prompt preview (first 800 chars) ---\n")
print(prompt[:800] + ("\n\n... (truncated)\n" if len(prompt)>800 else "\n"))

# 6) Load tokenizer + seq2seq model with safe variable names (seq_tokenizer, seq_model)
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

MODEL_NAME = "google/flan-t5-base"   # change if desired
device = "cuda" if torch.cuda.is_available() else "cpu"

print("\nLoading seq2seq model (this may take a minute)...")
seq_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
seq_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)

# 7) Tokenize + move tensors explicitly, then generate
inputs = seq_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
input_ids = inputs["input_ids"].to(device)
attention_mask = inputs.get("attention_mask")
if attention_mask is not None:
    attention_mask = attention_mask.to(device)

print("\nGenerating (this can take some seconds)...")
outputs = seq_model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=512,
    num_beams=4,
    do_sample=False,
    early_stopping=True
)

out_text = seq_tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
print("\n--- Generated text (first 2000 chars) ---\n")
print(out_text[:2000] + ("\n\n... (truncated)\n" if len(out_text)>2000 else "\n"))

# 8) Parse generated Q/A into structured list
pattern = re.compile(r'^(B|M|A)(\d+)\)\s*(.*?)\s*—\s*Answer:\s*(.*)$', re.UNICODE)
qas = []
for line in [l.strip() for l in out_text.splitlines() if l.strip()]:
    m = pattern.match(line)
    if m:
        difficulty = {"B":"basic","M":"moderate","A":"advanced"}[m.group(1)]
        qid = m.group(2)
        q = m.group(3).strip()
        a = m.group(4).strip()
        qas.append({"qid": f"{m.group(1)}{qid}", "difficulty": difficulty, "question": q, "answer": a})

print("\nParsed Qs:", len(qas))
for q in qas:
    print(q["qid"], "|", q["difficulty"], "|", q["question"], "—", q["answer"])


Retrieved chunks (score, page): [(0.65, 15), (0.647, None), (0.541, None), (0.539, 9)]

--- Prompt preview (first 800 chars) ---

You are a strict exam-writer. ONLY use facts contained in the provided Context. Do NOT invent facts.

Context:
5. Hall of Mirrors - Madras EXERCISE 01_History_Unit_1_EM.indd 14 11-04-2020 12:19:37 15 Outbreak of World War I and Its Aft ermath V Answer briefl y

ICT CORNER Through this activity you will learn about Time Line Events of World War -I Outbreak of World War I and Its Aftermath 01_History_Unit_1_EM.indd 15 11-04-2020 12:19:37

SUMMARY   Capitalistic countries’ ever growing demand for markets and raw materials leading to scramble for colonies and the resultant clashes amongst great powers in Europe are discussed.   Rise of Japan as an Imperial Power in Asia is highlighted   Division of Europe into two warring camps and the resultant alliances and counter-alliances are detail

... (truncated)


Loading seq2seq model (this may take a minute)...


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]


Generating (this can take some seconds)...

--- Generated text (first 2000 chars) ---

Outbreak of World War I and Its Aft ermath V


Parsed Qs: 0


In [20]:
RAG_PROMPT = """You are a strict exam-writer. ONLY use facts contained in the provided Context. Do NOT invent facts.

Context:
{context}

Task:
Generate EXACTLY 12 questions about the Context, split as:
- 4 Basic (B1..B4): MCQ
- 4 Moderate (M1..M4): MCQ
- 4 Advanced (A1..A4): MCQ

Output format (follow this EXACTLY):

B1) Which country was called the "sick man of Europe"? — Answer: The Ottoman Empire
B2) Who was assassinated in Sarajevo in 1914? — Answer: Archduke Franz Ferdinand
B3) ...
B4) ...

M1) Explain why imperial rivalry increased before World War I. — Answer: European powers competed for colonies, markets, and raw materials.
M2) ...
M3) ...
M4) ...

A1) Analyse how the Treaty of Versailles created political instability in Germany. — Answer: Harsh reparations and territorial losses caused resentment and economic hardship.
A2) ...
A3) ...
A4) ...

Now, generate the 12 questions strictly in the above format, using only the facts in the Context."""


In [21]:
outputs = seq_model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=768,       # longer than before
    num_beams=6,
    do_sample=False,
    early_stopping=True
)


In [22]:
print("\n=== RAW MODEL OUTPUT ===\n")
print(out_text)



=== RAW MODEL OUTPUT ===

Outbreak of World War I and Its Aft ermath V


In [23]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch, re

MODEL_NAME = "google/flan-t5-base"   # try flan-t5-large if you have GPU memory
device = "cuda" if torch.cuda.is_available() else "cpu"

seq_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
seq_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)

# --- Stronger prompt with few-shot examples ---
FEWSHOT_PROMPT = """You are a strict exam-writer. ONLY use facts contained in the provided Context. Do NOT invent facts.

Context:
{context}

Task:
Generate EXACTLY 12 questions about the Context, split as:
- 4 Basic (B1..B4)
- 4 Moderate (M1..M4)
- 4 Advanced (A1..A4)

Follow this format exactly:
B1) Who was assassinated in Sarajevo in 1914? — Answer: Archduke Franz Ferdinand
B2) Which empire was called the "sick man of Europe"? — Answer: The Ottoman Empire
B3) ...
B4) ...

M1) Explain why imperial rivalry increased before World War I. — Answer: European powers competed for colonies and raw materials.
M2) ...
M3) ...
M4) ...

A1) Analyse how the Treaty of Versailles created political instability. — Answer: Reparations and territorial losses caused resentment and hardship.
A2) ...
A3) ...
A4) ...

Now, write the 12 questions strictly in the above format, using only the Context."""

def build_prompt(context_chunks):
    ctx = "\n\n".join([c["text"] for c in context_chunks])
    if len(ctx) > 10000:
        ctx = ctx[:10000]  # trim if too long
    return FEWSHOT_PROMPT.format(context=ctx)

# --- Generate ---
context_chunks = retrieve("Outbreak of World War I chapter", top_k=4)
prompt = build_prompt(context_chunks)

inputs = seq_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
input_ids = inputs["input_ids"].to(device)
attention_mask = inputs["attention_mask"].to(device)

outputs = seq_model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=768,
    num_beams=6,
    do_sample=False,
    early_stopping=True
)

out_text = seq_tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
print("\n=== RAW OUTPUT ===\n")
print(out_text)

# --- Parse questions back into JSON ---
pattern = re.compile(r'^(B|M|A)(\d+)\)\s*(.*?)\s*—\s*Answer:\s*(.*)$', re.UNICODE)
qas = []
for line in [l.strip() for l in out_text.splitlines() if l.strip()]:
    m = pattern.match(line)
    if m:
        diff = {"B":"basic","M":"moderate","A":"advanced"}[m.group(1)]
        qas.append({
            "qid": f"{m.group(1)}{m.group(2)}",
            "difficulty": diff,
            "question": m.group(3).strip(),
            "answer": m.group(4).strip()
        })

print("\n=== Parsed Qs ===")
for q in qas:
    print(q)



=== RAW OUTPUT ===

Outbreak of World War I and Its Aft ermath

=== Parsed Qs ===


In [24]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch, re

# Load flan-t5-large
MODEL_NAME = "google/flan-t5-large"
device = "cuda" if torch.cuda.is_available() else "cpu"

print("Loading flan-t5-large (≈2.9 GB)...")
seq_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
seq_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)

# --- Few-shot prompt template ---
FEWSHOT_PROMPT = """You are a strict exam-writer. ONLY use facts contained in the provided Context. Do NOT invent facts.

Context:
{context}

Task:
Generate EXACTLY 12 questions about the Context, split as:
- 4 Basic (B1..B4)
- 4 Moderate (M1..M4)
- 4 Advanced (A1..A4)

Follow this exact format:
B1) Who was assassinated in Sarajevo in 1914? — Answer: Archduke Franz Ferdinand
B2) Which empire was called the "sick man of Europe"? — Answer: The Ottoman Empire
B3) ...
B4) ...

M1) Explain why imperial rivalry increased before World War I. — Answer: European powers competed for colonies and raw materials.
M2) ...
M3) ...
M4) ...

A1) Analyse how the Treaty of Versailles created political instability. — Answer: Reparations and territorial losses caused resentment and hardship.
A2) ...
A3) ...
A4) ...

Now, write the 12 questions strictly in the above format, using only the Context.
"""

def build_prompt(context_chunks):
    ctx = "\n\n".join([c["text"] for c in context_chunks])
    if len(ctx) > 10000:   # truncate if very long
        ctx = ctx[:10000]
    return FEWSHOT_PROMPT.format(context=ctx)

# Retrieve top-K chunks for the topic
topic_query = "Outbreak of World War I chapter"
context_chunks = retrieve(topic_query, top_k=4)
prompt = build_prompt(context_chunks)

# Tokenize and move tensors to device
inputs = seq_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
input_ids = inputs["input_ids"].to(device)
attention_mask = inputs["attention_mask"].to(device)

# Generate with flan-t5-large
outputs = seq_model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=900,       # keep under 1000 tokens to avoid OOM
    num_beams=6,
    do_sample=False,
    early_stopping=True
)

out_text = seq_tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
print("\n=== RAW OUTPUT ===\n")
print(out_text)

# --- Parse questions back into structured format ---
pattern = re.compile(r'^(B|M|A)(\d+)\)\s*(.*?)\s*—\s*Answer:\s*(.*)$', re.UNICODE)
qas = []
for line in [l.strip() for l in out_text.splitlines() if l.strip()]:
    m = pattern.match(line)
    if m:
        diff = {"B":"basic","M":"moderate","A":"advanced"}[m.group(1)]
        qas.append({
            "qid": f"{m.group(1)}{m.group(2)}",
            "difficulty": diff,
            "question": m.group(3).strip(),
            "answer": m.group(4).strip()
        })

print("\n=== Parsed Qs ===")
for q in qas:
    print(q)


Loading flan-t5-large (≈2.9 GB)...


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]


=== RAW OUTPUT ===

Russian Revolution, the U.S.S.R or the Union of Socialist and Soviet Republics was formed.

=== Parsed Qs ===


In [25]:
def clean_context(text):
    # remove junk patterns
    text = re.sub(r'\d{2}-\d{2}-\d{4}', '', text)       # dates
    text = re.sub(r'\d{2}:\d{2}:\d{2}', '', text)       # times
    text = re.sub(r'EXERCISE.*', '', text)
    text = re.sub(r'ICT CORNER.*', '', text)
    text = re.sub(r'\.indd', '', text)
    return text.strip()

for c in context_chunks:
    c["text"] = clean_context(c["text"])


In [26]:
FEWSHOT_PROMPT = """You are a strict exam-writer. ONLY use facts from the Context. Do NOT invent facts.

Context:
{context}

Example (format to follow exactly):

B1) Who was assassinated in Sarajevo in 1914? — Answer: Archduke Franz Ferdinand
B2) Which empire was called the "sick man of Europe"? — Answer: The Ottoman Empire
B3) When did World War I begin? — Answer: 1914
B4) Which treaty ended World War I? — Answer: Treaty of Versailles

M1) Explain why imperial rivalry increased before WWI. — Answer: European powers competed for colonies and raw materials.
M2) Describe one outcome of the Paris Peace Conference for Germany. — Answer: Germany was forced to pay reparations and limit its army.
M3) Why did Russia experience a revolution in 1917? — Answer: Food shortages, war defeats, and failure of the government.
M4) What was the League of Nations’ main goal? — Answer: To maintain international peace and cooperation.

A1) Analyse how the Treaty of Versailles destabilised Europe. — Answer: Reparations and territorial losses fostered resentment and economic hardship.
A2) Compare nationalism and imperialism as causes of WWI. — Answer: Nationalism inflamed Balkan tensions; imperialism created colonial rivalries.
A3) Discuss the impact of the Russian Revolution on Asia. — Answer: It spread socialist ideas and inspired anti-colonial movements.
A4) Evaluate why the League of Nations failed. — Answer: Lack of US membership, unanimity rule, and no military power.

Now, generate 12 NEW questions (B1–B4, M1–M4, A1–A4) in the SAME format, using ONLY the Context above.
"""


In [27]:
!pip install -q google-generativeai


In [28]:
import google.generativeai as genai

genai.configure(api_key="AIzaSyB_ZJuVpd68q8Icjh6E2sd8nAf2XSMJAmA")


In [47]:
GEMINI_PROMPT = """You are a strict exam-writer. ONLY use the provided Context. Do NOT invent facts.

Context:
{context}

Task:
Generate EXACTLY 12 questions:
- 4 Basic (B1..B4)
- 4 Moderate (M1..M4)
- 4 Advanced (A1..A4)

Each question must be **multiple choice with 4 options (A, B, C, D)**.
Clearly mark the correct option.

Output format:
B1) <question>
A) <option>
B) <option>
C) <option>
D) <option>
Correct Answer: <A/B/C/D>

...
A4) <question>
A) <option>
B) <option>
C) <option>
D) <option>
Correct Answer: <A/B/C/D>
"""


In [80]:
def build_gemini_mcq_prompt(context_chunks):
    # context_chunks is a list of strings, so join directly
    ctx = "\n\n".join(context_chunks)

    # Limit context to first 10,000 characters
    if len(ctx) > 10000:
        ctx = ctx[:10000]

    prompt = f"""
You are an AI tutor. Based on the following study material, generate **12 multiple-choice questions (MCQs)**:
- 4 Basic
- 4 Moderate
- 4 Advanced

Instructions:
- Each question must have 4 options labeled A, B, C, D.
- Clearly indicate the correct answer for each question.
- Write questions in numbered format under each level.

Context:
{ctx}

Now, generate the MCQs.
"""
    return prompt


In [81]:
# Retrieve context chunks
context_chunks = retrieve("Outbreak of World War I chapter", top_k=4)

# Build MCQ-specific prompt
prompt = build_gemini_mcq_prompt(context_chunks)

# Preview the first 1000 characters of the prompt
print(prompt[:1000])



You are an AI tutor. Based on the following study material, generate **12 multiple-choice questions (MCQs)**:
- 4 Basic
- 4 Moderate
- 4 Advanced

Instructions:
- Each question must have 4 options labeled A, B, C, D.
- Clearly indicate the correct answer for each question.
- Write questions in numbered format under each level.

Context:
applied in actual practice.
When Italy, Japan and Germany, headed 
by dictators, refused to be bound by the orders 
of the League, Britain and France were the only 
major powers to act decisively. 
01_History_Unit_1_EM.indd   12 11-04-2020   12:19:36
13
Outbreak of World War I and Its Aftermath
SUMMARY
 Capitalistic countries’ ever growing demand for markets and raw materials leading to scramble 
for colonies and the resultant clashes amongst great powers in Europe are discussed.
 Rise of Japan as an Imperial Power in Asia is highlighted
 Division of Europe into two warring camps and the resultant alliances and counter-alliances 
are detailed.


In [82]:
model = genai.GenerativeModel("gemini-1.5-flash")

response = model.generate_content(prompt)
out_text = response.text
print("\n=== RAW OUTPUT ===\n")
print(out_text)



=== RAW OUTPUT ===

**Basic Level**

1.  Which two major powers acted decisively when Italy, Japan, and Germany defied the League of Nations?
    A.  Russia and Germany
    B.  Germany and Austria-Hungary
    C.  Britain and France  **(Correct Answer)**
    D.  United States and Russia

2.  What was a major aim of capitalist industry, as described in the text?
    A.  To reduce production costs
    B.  To decrease global competition
    C.  To produce more and more goods **(Correct Answer)**
    D.  To limit the expansion of markets

3.  What significant event is highlighted as the biggest outcome of World War I?
    A.  The formation of the League of Nations
    B.  The redrawing of the political map of the world
    C.  The Russian Revolution **(Correct Answer)**
    D.  The collapse of the Ottoman Empire

4.  According to the text, which treaty, though initially accepted, ultimately failed due to resistance?
    A.  Treaty of Versailles
    B.  Treaty of Sevres **(Correct Answer)**

In [84]:
import re, json

def parse_mcq(text):
    qas = []
    # Split by question numbers like 1., 2., 3. (works for Basic, Moderate, Advanced sections)
    blocks = re.split(r'\n\d+\.', text)

    for block in blocks:
        block = block.strip()
        if not block:
            continue

        lines = block.splitlines()
        # First line is the question
        question_line = lines[0].strip() if lines else ""

        options = {}
        correct = ""
        for line in lines[1:]:
            line = line.strip()
            opt_match = re.match(r'^([A-D])\s*\)\s*(.*)', line)
            if opt_match:
                opt_id, opt_text = opt_match.groups()
                options[opt_id] = opt_text.strip()
            elif line.lower().startswith("correct answer:"):
                correct = line.split(":",1)[1].strip()

        # Try to detect difficulty from section headers
        diff = "unknown"
        if "Basic Level" in text:
            diff = "basic"
        elif "Moderate Level" in text:
            diff = "moderate"
        elif "Advanced Level" in text:
            diff = "advanced"

        qas.append({
            "question": question_line,
            "difficulty": diff,
            "options": options,
            "correct": correct
        })
    return qas

parsed = parse_mcq(out_text)
print(json.dumps(parsed, indent=2, ensure_ascii=False))


[
  {
    "question": "**Basic Level**",
    "difficulty": "basic",
    "options": {},
    "correct": ""
  },
  {
    "question": "Which two major powers acted decisively when Italy, Japan, and Germany defied the League of Nations?",
    "difficulty": "basic",
    "options": {},
    "correct": ""
  },
  {
    "question": "What was a major aim of capitalist industry, as described in the text?",
    "difficulty": "basic",
    "options": {},
    "correct": ""
  },
  {
    "question": "What significant event is highlighted as the biggest outcome of World War I?",
    "difficulty": "basic",
    "options": {},
    "correct": ""
  },
  {
    "question": "According to the text, which treaty, though initially accepted, ultimately failed due to resistance?",
    "difficulty": "basic",
    "options": {},
    "correct": ""
  },
  {
    "question": "What were two of the crucial factors that led to the outbreak of World War I, according to the text?",
    "difficulty": "basic",
    "options": {},
  

In [86]:
print(type(model))



<class 'google.generativeai.generative_models.GenerativeModel'>


In [87]:
!pip install flask flask-cors


Collecting flask-cors
  Downloading flask_cors-6.0.1-py3-none-any.whl.metadata (5.3 kB)
Downloading flask_cors-6.0.1-py3-none-any.whl (13 kB)
Installing collected packages: flask-cors
Successfully installed flask-cors-6.0.1


In [95]:
!ngrok authtoken '31vLMtjLl7cqiiKS8nDmbg6QySL_FAuPwmJCqD2p15kFfQ79'


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [96]:
import google.generativeai as genai

genai.api_key = "AIzaSyB_ZJuVpd68q8Icjh6E2sd8nAf2XSMJAmA"


In [99]:
!pip install --upgrade google-generative-ai


[31mERROR: Could not find a version that satisfies the requirement google-generative-ai (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for google-generative-ai[0m[31m
[0m

In [100]:
!pip install --upgrade google-genai


Collecting google-genai
  Downloading google_genai-1.32.0-py3-none-any.whl.metadata (43 kB)
Downloading google_genai-1.32.0-py3-none-any.whl (241 kB)
Installing collected packages: google-genai
  Attempting uninstall: google-genai
    Found existing installation: google-genai 1.31.0
    Uninstalling google-genai-1.31.0:
      Successfully uninstalled google-genai-1.31.0
Successfully installed google-genai-1.32.0


In [101]:
import google.genai as genai

genai.api_key = "AIzaSyB_ZJuVpd68q8Icjh6E2sd8nAf2XSMJAmA"


In [102]:
from pyngrok import ngrok

# Open a ngrok tunnel to the Flask app
public_url = ngrok.connect(5000)
print("Public URL:", public_url)


Public URL: NgrokTunnel: "https://c78e32b63d37.ngrok-free.app" -> "http://localhost:5000"


In [98]:
from flask import Flask, request, jsonify
from flask_cors import CORS
import google.generativeai as genai
from pyngrok import ngrok

# Set your API key
genai.api_key = "YOUR_GOOGLE_API_KEY"

app = Flask(__name__)
CORS(app)

@app.route("/generate", methods=["POST"])
def generate_questions():
    data = request.json
    prompt = data.get("prompt", "")
    if not prompt:
        return jsonify({"error": "Prompt required"}), 400

    response = genai.models.generate(
        model="gemini-1.5",
        prompt=prompt,
        temperature=0.7,
        max_output_tokens=500
    )
    return jsonify({"output": response.text})


In [103]:
import google.generativeai as genai
genai.configure(api_key="AIzaSyB_ZJuVpd68q8Icjh6E2sd8nAf2XSMJAmA")  # Replace with your real Google API key (from AI Studio or similar). Don't hardcode it—use os.environ for security if sharing.

In [104]:
from pyngrok import ngrok
ngrok.set_auth_token("31vLMtjLl7cqiiKS8nDmbg6QySL_FAuPwmJCqD2p15kFfQ79")  # Replace with your token.

In [None]:
from flask import Flask, request, jsonify
from flask_cors import CORS
import google.generativeai as genai

app = Flask(__name__)
CORS(app)  # Enables CORS for React to connect

@app.route("/generate", methods=["POST"])
def generate_questions():
    try:
        data = request.json
        prompt = data.get("prompt", "")
        if not prompt:
            return jsonify({"error": "Prompt is required"}), 400

        # Generate content using Gemini (update model if needed, e.g., 'gemini-1.5-flash')
        model = genai.GenerativeModel('gemini-1.5-flash')  # Or your preferred model
        response = model.generate_content(prompt)  # Simple generation; adjust params like temperature

        return jsonify({"output": response.text})
    except Exception as e:
        return jsonify({"error": str(e)}), 500

if __name__ == "__main__":
    public_url = ngrok.connect(5000)
    print("Public URL:", public_url.public_url)  # This prints the https://xxxx.ngrok-free.app URL
    app.run(port=5000)

Public URL: https://9a58077cf6b3.ngrok-free.app
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
