<a href="https://colab.research.google.com/github/dauphineezc/NLP-Final-Project/blob/main/NLPfinalproject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pdfminer.six


Collecting pdfminer.six
  Downloading pdfminer_six-20250416-py3-none-any.whl.metadata (4.1 kB)
Downloading pdfminer_six-20250416-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pdfminer.six
Successfully installed pdfminer.six-20250416


In [8]:
# !pip install pdfminer.six spacy scikit-learn
# !python -m spacy download en_core_web_sm

import spacy
import nltk
import re
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from pdfminer.high_level import extract_text  # <-- this is key!
from google.colab import files

# Load SpaCy
try:
    nlp = spacy.load("en_core_web_sm")
except:
    print("Please run `!python -m spacy download en_core_web_sm` if this fails.")

nltk.download("punkt")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
uploaded = files.upload()
pdf_path = next(iter(uploaded))  # Grab uploaded filename

In [4]:
glossary = {
    "mitochondria": "An organelle that produces energy for the cell.",
    "dna": "The molecule that carries genetic information.",
    "enzyme": "A protein that speeds up chemical reactions in a cell.",
    "ribosome": "A structure that synthesizes proteins.",
    "photosynthesis": "The process by which green plants convert sunlight into energy.",
    "atp": "A molecule that stores and transfers energy in cells.",
    "nucleus": "The control center of the cell that contains DNA.",
    "chloroplast": "An organelle in plant cells where photosynthesis occurs.",
    "cell membrane": "A barrier that surrounds the cell and controls what enters and leaves."
}


In [11]:
def normalize_term(raw: str) -> str:
    """
    Lowercase, strip leading bullets/non‐alnums,
    drop leading 'the/a/an ', and trailing punctuation.
    """
    t = raw.lower().strip()
    t = re.sub(r'^[^a-z0-9]+', '', t)           # drop bullets etc.
    t = re.sub(r'^(the|a|an)\s+', '', t)        # drop leading articles
    t = re.sub(r'[^\w\s]+$', '', t)             # drop trailing punctuation
    return t

def extract_candidate_terms(doc, top_k=20):
    """
    1) Pull every noun_chunk
    2) Normalize them
    3) Count freq
    4) Take the top_k most common, in order, deduped
    """
    raw_chunks = [chunk.text for chunk in doc.noun_chunks]
    norm_chunks = [normalize_term(c) for c in raw_chunks]
    # drop any empties
    norm_chunks = [c for c in norm_chunks if c]

    freq = Counter(norm_chunks)
    candidates = []
    for term, _ in freq.most_common():
        if term not in candidates:
            candidates.append(term)
        if len(candidates) >= top_k:
            break
    return candidates

def extract_definitions(sentences, candidate_terms):
    """
    Same patterns, but match against lower‐cased sentences
    and normalized term keys.
    """
    patterns = [
        r"{term}\s+is\s+([^.;]+)",
        r"{term}\s+are\s+([^.;]+)",
        r"{term}\s+refers\s+to\s+([^.;]+)",
        r"the\s+term\s+{term}\s+means\s+([^.;]+)",
    ]
    defs = {}
    # pre-lower all sentences once
    lower_sents = [s.lower() for s in sentences]

    for term in candidate_terms:
        esc = re.escape(term)
        for sent in lower_sents:
            for pat in patterns:
                m = re.search(pat.format(term=esc), sent)
                if m:
                    defs[term] = m.group(1).strip()
                    break
            if term in defs:
                break
    return defs

def generate_flashcards(pdf_path, top_k=20):
    text = extract_text_from_pdf(pdf_path)
    doc, sentences = preprocess_text(text)
    terms = extract_candidate_terms(doc, top_k=top_k)
    defs  = extract_definitions(sentences, terms)
    return [
        {"term": term, "definition": defs.get(term, "")}
        for term in terms
    ]


def check_similarity(user_def, ref_def):
    vectorizer = TfidfVectorizer().fit_transform([user_def, ref_def])
    return cosine_similarity(vectorizer[0:1], vectorizer[1:2])[0][0]



In [12]:
cards = generate_flashcards(pdf_path)


print("📘 Flashcard Results:\n")

for card in cards:
    term = card["term"].lower().strip()
    user_def = card["definition"]

    if not user_def:
        continue  # skip empty definitions

    print(f"🧠 Term: {term.title()}")
    print(f"📝 Definition: {user_def}")

    if term in glossary:
        ref_def = glossary[term]
        score = check_similarity(user_def, ref_def)
        if score < 0.8:
            print(f"⚠️ Warning: Similarity = {score:.2f}. Please double-check this definition.")
        else:
            print(f"✅ Similarity = {score:.2f} — Looks good!")
    else:
        print("🔍 No reference definition available for this term.")

    print("-" * 60)



📘 Flashcard Results:

🧠 Term: Dna
📝 Definition: the molecule that carries genetic information and provides the blueprint for 

protein synthesis
------------------------------------------------------------
🧠 Term: Photosynthesis
📝 Definition: the process by which green plants convert sunlight into energy, 

producing glucose and oxygen
------------------------------------------------------------
🧠 Term: Nucleus
📝 Definition: the control center of the cell that contains dna
✅ Similarity = 1.00 — Looks good!
------------------------------------------------------------
🧠 Term: Atp
📝 Definition: a molecule that stores and transfers energy in cells, fueling processes like 

muscle contraction and active transport
------------------------------------------------------------
🧠 Term: Synthesis
📝 Definition: the process by which green plants convert sunlight into energy, 

producing glucose and oxygen
🔍 No reference definition available for this term.
-------------------------------------------

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [7]:
!git clone https://github.com/dauphineezc/NLP-Final-Project.git
%cd NLP-Final-Project

pdf_path = 'NLP sample bio notes.pdf'
cards    = generate_flashcards(pdf_path)

Cloning into 'NLP-Final-Project'...
remote: Enumerating objects: 12, done.[K
remote: Counting objects: 100% (12/12), done.[K
remote: Compressing objects: 100% (10/10), done.[K
remote: Total 12 (delta 1), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (12/12), 74.89 KiB | 2.88 MiB/s, done.
Resolving deltas: 100% (1/1), done.
/content/NLP-Final-Project
