In [1]:
# If you're on Colab, run this cell once.
!pip install -q faiss-cpu sentence-transformers pypdf python-docx openai anthropic requests tiktoken


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m52.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m483.4/483.4 kB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.5/310.5 kB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m812.0/812.0 kB[0m [31m33.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.2/297.2 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m50.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m352.0/352.0 kB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
!pip install langchain faiss-cpu sentence-transformers transformers pypdf langchain-community pypdf python-docx openai anthropic  faiss-cpu sentence-transformers pypdf python-docx tiktoken



Collecting langchain
  Downloading langchain-0.3.27-py3-none-any.whl.metadata (7.8 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.28-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-core<1.0.0,>=0.3.72 (from langchain)
  Downloading langchain_core-0.3.75-py3-none-any.whl.metadata (5.7 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.9 (from langchain)
  Downloading langchain_text_splitters-0.3.9-py3-none-any.whl.metadata (1.9 kB)
Collecting langsmith>=0.1.17 (from langchain)
  Downloading langsmith-0.4.19-py3-none-any.whl.metadata (14 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain)
  Downloading sqlalchemy-2.0.43-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting requests<3,>=2 (from langchain)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain-community)
  Downloading aiohttp-3.12.15-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metada

In [None]:
from __future__ import annotations
import os, re, json, time, textwrap, math, typing
from dataclasses import dataclass
from typing import List, Optional, Tuple

# Vector + Embeddings
import faiss  # type: ignore
from sentence_transformers import SentenceTransformer

# Optional loaders
try:
    from pypdf import PdfReader
except Exception:
    PdfReader = None  # type: ignore
try:
    import docx  # python-docx
except Exception:
    docx = None  # type: ignore

# Optional LLM clients (use env vars for keys)
try:
    import openai
except Exception:
    openai = None  # type: ignore
try:
    import anthropic
except Exception:
    anthropic = None  # type: ignore
import requests

# Colab/Jupyter upload fallbacks
IN_COLAB = False
try:
    from google.colab import files as colab_files  # type: ignore
    IN_COLAB = True
except Exception:
    pass

def read_txt(path: str) -> str:
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        return f.read()

def read_pdf(path: str) -> str:
    if PdfReader is None:
        raise RuntimeError("pypdf is not installed. Install it or convert the PDF to TXT.")
    reader = PdfReader(path)
    parts = []
    for page in reader.pages:
        txt = page.extract_text() or ""
        parts.append(txt)
    return "\n".join(parts)

def read_docx(path: str) -> str:
    if docx is None:
        raise RuntimeError("python-docx is not installed. Install it or export the DOCX to TXT.")
    d = docx.Document(path)
    return "\n".join(p.text for p in d.paragraphs)

def load_any(path: str) -> str:
    ext = os.path.splitext(path)[1].lower()
    if ext in (".txt", ".md"):
        return read_txt(path)
    if ext == ".pdf":
        return read_pdf(path)
    if ext in (".doc", ".docx"):
        return read_docx(path)
    raise ValueError(f"Unsupported file type: {ext}")

def smart_split(text: str, max_len: int = 1200, overlap: int = 150) -> List[str]:
    """Split text by sentence boundaries with overlap (lengths in characters)."""
    text = re.sub(r"\s+", " ", text).strip()
    sentences = re.split(r"(?<=[.!?])\s+(?=[A-Z(\[])", text)
    chunks: List[str] = []
    cur = ""
    for s in sentences:
        if len(cur) + len(s) + 1 <= max_len:
            cur = (cur + " " + s).strip()
        else:
            if cur:
                chunks.append(cur)
            if overlap and cur:
                tail = cur[-overlap:]
                cur = (tail + " " + s).strip()
            else:
                cur = s
    if cur:
        chunks.append(cur)
    return chunks

@dataclass
class VectorStore:
    index: faiss.IndexFlatIP
    embeddings: List[List[float]]
    texts: List[str]

def build_faiss(chunks: List[str], model_name: str = "sentence-transformers/all-MiniLM-L6-v2") -> VectorStore:
    enc = SentenceTransformer(model_name)
    vecs = enc.encode(chunks, normalize_embeddings=True)
    dim = vecs.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(vecs)
    return VectorStore(index=index, embeddings=vecs.tolist(), texts=chunks)

def search(store: VectorStore, query: str, top_k: int = 10) -> List[Tuple[int, float]]:
    enc = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    qvec = enc.encode([query], normalize_embeddings=True)
    D, I = store.index.search(qvec, top_k)
    return list(zip(I[0].tolist(), D[0].tolist()))

def build_improvement_prompt(topic: str, language: str, retrieved_blocks: List[str], min_words: int) -> str:
    lang = (language or "en").lower().strip()
    si = (lang.startswith("si") or "sinhala" in lang)

    instructions_en = f"""
You are an expert academic writing assistant. Improve and expand the user's literature review on the topic: "{topic}".

HARD REQUIREMENTS:
- LENGTH: At least {min_words} words (preferably more).
- Do NOT invent or hallucinate sources. Preserve ALL existing citation markers such as [12], (Author, 2021), numeric superscripts, and numbered references.
- If you introduce a claim that isn't supported by the context, flag it in a short "Gaps & To-verify" list at the end.
- Deduplicate citations, unify style (APA/IEEE/Harvard) WITHOUT fabricating missing details.
- Reduce redundancy, improve coherence and flow, and structure by themes/methods/findings and chronology when relevant.
- Add strong signposting, transitions, and synthesis. Compare/contrast studies, highlight methodological differences, and identify open problems.

DELIVER THESE THREE SECTIONS:
1) Improved Literature Review (comprehensive prose, >= {min_words} words)
2) Gaps & To-verify (bulleted list)
3) Suggested Outline (hierarchical, 2–3 levels)

Now use ONLY the provided context to guide what you write. Keep all inline citation markers intact.
""".strip()

    instructions_si = f"""
ඔබ ජාත්‍යන්තර ප්‍රමිති අනුව ලියන අකադෙමිය ලේඛන විශේෂඥයෙක්. තේමාව: "{topic}".
අවශ්‍යතා:
- දිග: වචන {min_words} ට වඩා වැඩි.
- නව උපුටා දැක්වීම් නිර්මාණය නොකරන්න. දැනට ඇති [12], (Author, 2021) වගේ සලකුණු රඳවා තබන්න.
- සන්දර්භයෙන් පිට තොරතුරු ඇතුළත් කලහොත් "හිඟ & තහවුරු කිරීමට" යටතේ සටහන් කරන්න.
- අනුපිළිවෙළින් තේමා/ව්‍යුහ/ක්‍රමවේද අනුව සංවිධානය කර සංගතිය වැඩිකරන්න. සංක්‍රාන්ත වාක්‍ය සහ සාරාංශයන් එක් කරන්න.

අංශ 3ක් පිළිගන්න:
1) மேம்படுத்தப்பட்ட සඟරාව/සඳහන් විචාරය (වචන {min_words}+)
2) හිඟ & තහවුරු කිරීමට (බුලට්)
3) නිර්දේශිත රූපරේඛාව (දර්ජා 2–3)
""".strip()

    header = instructions_si if si else instructions_en
    context_joined = "\n\n--- Retrieved Context ---\n\n" + "\n\n".join(retrieved_blocks)
    return header + "\n\n" + context_joined

# --- LLM backends (no secrets hardcoded; use environment variables) ---

def call_openai_chat(prompt: str, model: str = "gpt-4o-mini", temperature: float = 0.2) -> str:
    if openai is None:
        raise RuntimeError("openai package not installed. `pip install openai`")
    api_key = os.getenv("api key for open ai")
    base = os.getenv("OPENAI_BASE_URL")  # optional
    if not api_key:
        raise RuntimeError("Set OPENAI_API_KEY environment variable.")
    if hasattr(openai, "OpenAI"):
        client = openai.OpenAI(api_key=api_key, base_url=base)
        resp = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a precise academic writing assistant."},
                {"role": "user", "content": prompt},
            ],
            temperature=temperature,
        )
        return resp.choices[0].message.content.strip()
    else:
        raise RuntimeError("OpenAI SDK version not supported. Upgrade `openai` to >= 1.0.")

def call_anthropic_chat(prompt: str, model: str = "claude-3-5-sonnet-20240620", temperature: float = 0.2) -> str:
    if anthropic is None:
        raise RuntimeError("anthropic package not installed. `pip install anthropic`")
    api_key = os.getenv("input the api key claude")
    if not api_key:
        raise RuntimeError("Set ANTHROPIC_API_KEY environment variable.")
    client = anthropic.Anthropic(api_key=api_key)
    msg = client.messages.create(
        model=model,
        max_tokens=4000,
        temperature=temperature,
        system="You are a precise academic writing assistant.",
        messages=[{"role":"user", "content": prompt}],
    )
    out = "".join(b.text for b in msg.content if getattr(b, "type", "") == "text")
    return out.strip()

def call_openai_compatible(prompt: str, model: str = "gpt-4o-mini", temperature: float = 0.2) -> str:
    base = os.getenv("LLM_BASE_URL")
    api_key = os.getenv("LLM_API_KEY", "no-key")
    if not base:
        raise RuntimeError("Set LLM_BASE_URL (OpenAI-compatible) and optionally LLM_API_KEY.")
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    body = {
        "model": model,
        "messages": [
            {"role": "system", "content": "You are a precise academic writing assistant."},
            {"role": "user", "content": prompt},
        ],
        "temperature": temperature,
    }
    r = requests.post(base.rstrip("/") + "/v1/chat/completions", headers=headers, json=body, timeout=180)
    r.raise_for_status()
    data = r.json()
    return data["choices"][0]["message"]["content"].strip()

def call_hf_inference(prompt: str, model_id: Optional[str] = None, temperature: float = 0.2, max_new_tokens: int = 3000) -> str:
    token = os.getenv("hugging face api")
    model = model_id or os.getenv("Meta-Llama-3-8B-Instruct")
    if not token or not model:
        raise RuntimeError("Set HUGGINGFACE_API_KEY and HF_MODEL_ID environment variables.")
    headers = {"Authorization": f"Bearer {token}"}
    payload = {"inputs": prompt, "parameters": {"temperature": temperature, "max_new_tokens": max_new_tokens}}
    r = requests.post(f"https://api-inference.huggingface.co/models/{model}", headers=headers, json=payload, timeout=300)
    r.raise_for_status()
    data = r.json()
    # Multiple formats possible
    if isinstance(data, list) and data and "generated_text" in data[0]:
        return data[0]["generated_text"][len(prompt):].strip()
    if isinstance(data, dict) and "generated_text" in data:
        return data["generated_text"][len(prompt):].strip()
    if isinstance(data, list) and data and "summary_text" in data[0]:
        return data[0]["summary_text"].strip()
    return json.dumps(data)

def word_count(s: str) -> int:
    return len(re.findall(r"\b\w+\b", s))

def save_text(path: str, content: str):
    with open(path, "w", encoding="utf-8") as f:
        f.write(content)
    print(f"✅ Saved: {path}")


  * **h_n**: tensor of shape :math:`(D * \text{num\_layers}, H_{out})` or


In [4]:
# ---- Upload (Colab) or specify paths (Jupyter) ----
uploaded_paths = []

if IN_COLAB:
    print("Upload your draft and any notes (TXT/PDF/DOCX). You can upload multiple files.")
    uploaded = colab_files.upload()
    for name in uploaded.keys():
        uploaded_paths.append(name)
else:
    # Jupyter fallback: manually set your file paths here if not using Colab upload
    uploaded_paths = [
        "litreview.txt",   # <- change to your path(s)
        # "notes.pdf",
        # "sources.docx",
    ]

print("Files to use:", uploaded_paths)

# ---- RAG + Generation Parameters ----
TOPIC = "Evaluation of LLMs on Sinhala structured examination questions (Grades 6–11)"  # <- change as needed
LANGUAGE = "en"           # "en" or "si"
BACKEND = "openai"        # "dryrun", "openai", "anthropic", "openai_compat", "hf"
OPENAI_MODEL = "gpt-4o-mini"
ANTHROPIC_MODEL = "claude-3-5-sonnet-20240620"
COMPAT_MODEL = "gpt-4o-mini"   # for OpenAI-compatible endpoints
HF_MODEL = None               # e.g., "meta-llama/Meta-Llama-3-8B-Instruct" if using HF

# Retrieval settings
CHUNK = 1800         # larger chunks to keep detail
OVERLAP = 250
TOP_K = 14           # retrieve more chunks to broaden coverage

# Generation settings
TARGET_MIN_WORDS = 5000       # hard minimum
TEMPERATURE = 0.2
MAX_EXPANSION_PASSES = 3      # extra expansion loops if under target

# Output files
OUTPUT_DIR = "outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)
OUTPUT_MAIN = os.path.join(OUTPUT_DIR, "improved_litreview.md")
OUTPUT_PROMPT = os.path.join(OUTPUT_DIR, "copy_this_prompt.txt")

print(f"Topic: {TOPIC}")
print(f"Backend: {BACKEND}")
print(f"Target words: {TARGET_MIN_WORDS}+")


Upload your draft and any notes (TXT/PDF/DOCX). You can upload multiple files.


Saving all.txt to all.txt
Files to use: ['all.txt']
Topic: Evaluation of LLMs on Sinhala structured examination questions (Grades 6–11)
Backend: openai
Target words: 5000+


In [5]:
# Load and combine all documents
all_texts = []
for p in uploaded_paths:
    if not os.path.exists(p):
        raise FileNotFoundError(f"File not found: {p}")
    txt = load_any(p)
    all_texts.append(txt)

combined = "\n\n".join(all_texts)
print(f"Loaded {len(uploaded_paths)} file(s), total chars: {len(combined):,}")

# Chunk + embed
chunks = smart_split(combined, max_len=CHUNK, overlap=OVERLAP)
print(f"Chunked into {len(chunks)} pieces.")

store = build_faiss(chunks)
print("Vector index built.")


Loaded 1 file(s), total chars: 81,766
Chunked into 63 pieces.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Vector index built.


In [6]:
# Retrieve top-k relevant blocks
hits = search(store, TOPIC, top_k=TOP_K)
retrieved_blocks = [store.texts[i] for i, _ in hits if 0 <= i < len(store.texts)]
print(f"Retrieved {len(retrieved_blocks)} blocks for topic.")

# Build the base prompt with explicit minimum word count
base_prompt = build_improvement_prompt(
    topic=TOPIC,
    language=LANGUAGE,
    retrieved_blocks=retrieved_blocks,
    min_words=TARGET_MIN_WORDS
)

# Save a copy for reference or manual use
save_text(OUTPUT_PROMPT, base_prompt[:200000])  # prevent edge cases of enormous prompts


Retrieved 14 blocks for topic.
✅ Saved: outputs/copy_this_prompt.txt


In [8]:
# Assuming you have a variable containing the blocks
retrieved_blocks = [...]  # Your 14 blocks here
confirmation_message = "✅ Saved: outputs/copy_this_prompt.txt"

# Print all blocks with numbering
print("Retrieved blocks:")
for i, block in enumerate(retrieved_blocks, 1):
    print(f"{i}. {block}")

# Print the confirmation message
print("\n" + confirmation_message)

Retrieved blocks:
1. Ellipsis

✅ Saved: outputs/copy_this_prompt.txt
