<a href="https://colab.research.google.com/github/dauphineezc/NLP-Final-Project/blob/main/NLPfinalproject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Environment setup (run once per session)
# If you see an ImportError or a model-not-found error, uncomment the commands below and re-run this cell:
# !pip install nltk spacy pdfminer.six PyPDF2
# !python -m spacy download en_core_web_sm

import nltk
import spacy
from pdfminer.high_level import extract_text
from collections import Counter
import re

# Load the SpaCy model.
# spacy.load raises OSError when the model package is not installed, so only
# that case is caught; anything else (including KeyboardInterrupt) propagates
# instead of being silently swallowed.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    print("Run `!python -m spacy download en_core_web_sm` in Colab.")
    nlp = None  # sentinel: no pipeline is available until the model is installed

In [5]:
# Extract text from a PDF
def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Returns the text that pdfminer's extract_text produces for the PDF
    located at pdf_path.
    """
    pdf_text = extract_text(pdf_path)
    return pdf_text


# Basic preprocessing
def preprocess_text(text: str):
    """
    Splits text into sentences with NLTK, then word-tokenizes and POS-tags
    each sentence.

    Returns a 3-tuple (sentences, tokens, pos_tags). tokens and pos_tags are
    lists with one entry per sentence, in the same order as sentences.
    """
    sentences = nltk.sent_tokenize(text)
    tokens = []
    pos_tags = []
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        tokens.append(words)
        pos_tags.append(nltk.pos_tag(words))
    return sentences, tokens, pos_tags


# Candidate Term Extraction (SpaCy noun chunks + frequency)
def extract_candidate_terms(text: str, top_k: int = 20) -> list[str]:
    """
    Runs SpaCy over text and returns the top_k most frequent noun-chunks
    (lowercased, stripped, de-duplicated), most frequent first.

    Args:
        text: raw document text to analyse.
        top_k: maximum number of terms to return.

    Returns:
        Up to top_k distinct noun-chunk strings ordered by descending count.

    Raises:
        RuntimeError: if the module-level SpaCy pipeline `nlp` is None,
            i.e. the model could not be loaded.
    """
    # Fail with an actionable message rather than the opaque
    # "'NoneType' object is not callable" that calling nlp(text) would give.
    if nlp is None:
        raise RuntimeError(
            "SpaCy model 'en_core_web_sm' is not loaded; run "
            "`python -m spacy download en_core_web_sm` and reload the model."
        )

    doc = nlp(text)
    # Counting the normalized strings both de-duplicates and ranks them.
    freq = Counter(chunk.text.lower().strip() for chunk in doc.noun_chunks)
    return [term for term, _ in freq.most_common(top_k)]


# Definition Extraction via Regex Patterns
# Each template is filled with a regex for the term via str.format. The single
# capture group is the definition text, which runs up to the next '.' or ';'.
_DEFINITION_PATTERNS = [
    r"{term}\s+is\s+([^.;]+)",
    r"{term}\s+are\s+([^.;]+)",
    r"{term}\s+refers\s+to\s+([^.;]+)",
    r"the\s+term\s+{term}\s+means\s+([^.;]+)",
]


def _search_first(regexes, sentence):
    """Return the first match among regexes (tried in order) for sentence, or None."""
    for regex in regexes:
        match = regex.search(sentence)
        if match:
            return match
    return None


def extract_definitions(
    sentences: list[str],
    candidate_terms: list[str]
) -> dict[str,str]:
    """
    For each term, scans sentences for a matching pattern and returns
    the first captured definition. Case-insensitive.

    Sentences are scanned in order; within a sentence the patterns in
    _DEFINITION_PATTERNS are tried in list order. Matching details:
      * a pattern may not start in the middle of a word, so the term "art"
        does not match inside "smart is ...";
      * any run of whitespace inside a term matches any run of whitespace in
        the sentence, so terms wrapped across line breaks are still found;
      * whitespace inside the captured definition is collapsed to single
        spaces;
      * empty or whitespace-only terms are skipped.

    Args:
        sentences: sentences to search.
        candidate_terms: terms to look up; each is treated as literal text.

    Returns:
        A dict mapping each term that matched to its definition. Terms
        without a match are absent.
    """
    definitions: dict[str,str] = {}
    for term in candidate_terms:
        words = term.split()
        if not words:
            # An empty term would reduce the pattern to "\s+is\s+..." and
            # match almost any sentence.
            continue

        # Escape each word literally but let the gaps match any whitespace.
        term_regex = r"\s+".join(re.escape(word) for word in words)
        # Compile once per term instead of once per (sentence, pattern) pair.
        # The look-behind keeps a match from starting inside a longer word.
        regexes = [
            re.compile(r"(?<!\w)" + pat.format(term=term_regex), flags=re.IGNORECASE)
            for pat in _DEFINITION_PATTERNS
        ]

        for sent in sentences:
            m = _search_first(regexes, sent)
            if m:
                definitions[term] = " ".join(m.group(1).split())
                break
    return definitions


# Flashcard Generation
def generate_flashcards(pdf_path: str, top_k: int = 20) -> list[dict]:
    """
    Builds term/definition flashcards from a PDF file.

    Pipeline: extract the PDF text, split it into sentences, pick the top_k
    candidate terms, then look up a definition for each term.
    Returns one {"term":..., "definition":...} dict per candidate term, in
    candidate order; the definition is "" when none was found.
    """
    raw_text = extract_text_from_pdf(pdf_path)
    # Only the sentence list is used below; tokens and POS tags are discarded.
    sentence_list, _tokens, _tags = preprocess_text(raw_text)

    terms = extract_candidate_terms(raw_text, top_k=top_k)
    found = extract_definitions(sentence_list, terms)

    return [
        {"term": term, "definition": found.get(term, "")}
        for term in terms
    ]


# Example usage: build flashcards from a sample PDF and print each one.
if __name__ == "__main__":
    notes_path = "/content/drive/MyDrive/notes/sample_notes.pdf"  # adjust path
    for card in generate_flashcards(notes_path, top_k=10):
        print(f"• {card['term'].title()}\n    → {card['definition']}\n")
