<a href="https://colab.research.google.com/github/dauphineezc/NLP-Final-Project/blob/main/NLPfinalproject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pdfminer.six


Collecting pdfminer.six
  Downloading pdfminer_six-20250416-py3-none-any.whl.metadata (4.1 kB)
Downloading pdfminer_six-20250416-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pdfminer.six
Successfully installed pdfminer.six-20250416


In [2]:
# !pip install pdfminer.six spacy scikit-learn
# !python -m spacy download en_core_web_sm

import spacy
import nltk
import re
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from pdfminer.high_level import extract_text  # <-- this is key!
from google.colab import files

# Load SpaCy
try:
    nlp = spacy.load("en_core_web_sm")
except:
    print("Please run `!python -m spacy download en_core_web_sm` if this fails.")

nltk.download("punkt")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [8]:
# uploaded = files.upload()
# pdf_path = next(iter(uploaded))  # Grab uploaded filename

In [5]:
glossary = {
    "mitochondria": "An organelle that produces energy for the cell.",
    "dna": "The molecule that carries genetic information.",
    "enzyme": "A protein that speeds up chemical reactions in a cell.",
    "ribosome": "A structure that synthesizes proteins.",
    "photosynthesis": "The process by which green plants convert sunlight into energy.",
    "atp": "A molecule that stores and transfers energy in cells.",
    "nucleus": "The control center of the cell that contains DNA.",
    "chloroplast": "An organelle in plant cells where photosynthesis occurs.",
    "cell membrane": "A barrier that surrounds the cell and controls what enters and leaves."
}


In [6]:
def normalize_term(raw: str) -> str:
    """
    Lowercase, strip leading bullets/non‐alnums,
    drop leading 'the/a/an ', and trailing punctuation.
    """
    t = raw.lower().strip()
    t = re.sub(r'^[^a-z0-9]+', '', t)           # drop bullets etc.
    t = re.sub(r'^(the|a|an)\s+', '', t)        # drop leading articles
    t = re.sub(r'[^\w\s]+$', '', t)             # drop trailing punctuation
    return t

def extract_candidate_terms(doc, top_k=20):
    """
    1) Pull every noun_chunk
    2) Normalize them
    3) Count freq
    4) Take the top_k most common, in order, deduped
    """
    raw_chunks = [chunk.text for chunk in doc.noun_chunks]
    norm_chunks = [normalize_term(c) for c in raw_chunks]
    # drop any empties
    norm_chunks = [c for c in norm_chunks if c]

    freq = Counter(norm_chunks)
    candidates = []
    for term, _ in freq.most_common():
        if term not in candidates:
            candidates.append(term)
        if len(candidates) >= top_k:
            break
    return candidates

def extract_definitions(sentences, candidate_terms):
    """
    Same patterns, but match against lower‐cased sentences
    and normalized term keys.
    """
    patterns = [
        r"{term}\s+is\s+([^.;]+)",
        r"{term}\s+are\s+([^.;]+)",
        r"{term}\s+refers\s+to\s+([^.;]+)",
        r"the\s+term\s+{term}\s+means\s+([^.;]+)",
    ]
    defs = {}
    # pre-lower all sentences once
    lower_sents = [s.lower() for s in sentences]

    for term in candidate_terms:
        esc = re.escape(term)
        for sent in lower_sents:
            for pat in patterns:
                m = re.search(pat.format(term=esc), sent)
                if m:
                    defs[term] = m.group(1).strip()
                    break
            if term in defs:
                break
    return defs

def generate_flashcards(pdf_path, top_k=20):
    text = extract_text_from_pdf(pdf_path)
    doc, sentences = preprocess_text(text)
    terms = extract_candidate_terms(doc, top_k=top_k)
    defs  = extract_definitions(sentences, terms)
    return [
        {"term": term, "definition": defs.get(term, "")}
        for term in terms
    ]


def check_similarity(user_def, ref_def):
    vectorizer = TfidfVectorizer().fit_transform([user_def, ref_def])
    return cosine_similarity(vectorizer[0:1], vectorizer[1:2])[0][0]



In [7]:
cards = generate_flashcards(pdf_path)


print("📘 Flashcard Results:\n")

for card in cards:
    term = card["term"].lower().strip()
    user_def = card["definition"]

    if not user_def:
        continue  # skip empty definitions

    print(f"🧠 Term: {term.title()}")
    print(f"📝 Definition: {user_def}")

    if term in glossary:
        ref_def = glossary[term]
        score = check_similarity(user_def, ref_def)
        if score < 0.8:
            print(f"⚠️ Warning: Similarity = {score:.2f}. Please double-check this definition.")
        else:
            print(f"✅ Similarity = {score:.2f} — Looks good!")
    else:
        print("🔍 No reference definition available for this term.")

    print("-" * 60)



NameError: name 'extract_text_from_pdf' is not defined

In [10]:
from google.colab import drive
drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful

In [11]:
!git clone https://github.com/dauphineezc/NLP-Final-Project.git
%cd NLP-Final-Project

pdf_path = 'NLP sample bio notes.pdf'
cards    = generate_flashcards(pdf_path)

Cloning into 'NLP-Final-Project'...
remote: Enumerating objects: 15, done.[K
remote: Counting objects: 100% (15/15), done.[K
remote: Compressing objects: 100% (13/13), done.[K
remote: Total 15 (delta 2), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (15/15), 78.35 KiB | 3.01 MiB/s, done.
Resolving deltas: 100% (2/2), done.
/content/NLP-Final-Project/NLP-Final-Project


NameError: name 'extract_text_from_pdf' is not defined

# New Section

In [36]:
!pip install transformers

from google.colab import files
uploaded = files.upload()

import os, numpy as np, pandas as pd, spacy
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

for fname, data in uploaded.items():
    with open(fname, "wb") as f:
        f.write(data)
txt_file = next(f for f in uploaded if f.lower().endswith(".txt"))
csv_file = next(f for f in uploaded if f.lower().endswith(".csv"))

nlp = spacy.load("en_core_web_sm")
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def normalize_term(raw: str) -> str:
    t = raw.lower().strip().lstrip("•-*· ")
    for art in ("the ", "a ", "an "):
        if t.startswith(art): t = t[len(art):]
    return t.rstrip(".,;:!?")

def clean_text(text: str) -> str:
    doc = nlp(text or "")
    return " ".join(sent.text.strip() for sent in doc.sents if sent.text.strip())

def extract_lecture_definitions(txt_path: str, terms: list[str]) -> dict[str,str]:
    text = open(txt_path, "r", encoding="utf-8").read()
    defs, current, buffer = {}, None, []
    term_set = set(terms)
    for line in text.splitlines():
        s = line.strip()
        if not s:
            if current and buffer:
                defs[current] = " ".join(buffer).strip()
                current, buffer = None, []
            continue
        key = normalize_term(s)
        if key in term_set:
            if current and buffer:
                defs[current] = " ".join(buffer).strip()
            current, buffer = key, []
        elif current:
            buffer.append(s)
    if current and buffer:
        defs[current] = " ".join(buffer).strip()
    return defs

def extract_keywords(text: str, vect: TfidfVectorizer, top_n: int = 10) -> list[str]:
    tfidf = vect.transform([text])
    coo = tfidf.tocoo()
    scores = {col: val for col, val in zip(coo.col, coo.data)}
    feature_names = vect.get_feature_names_out()
    top_idxs = sorted(scores, key=scores.get, reverse=True)[:top_n]
    return [feature_names[i] for i in top_idxs]

def summarize_from_keywords(keywords: list[str], budget: int = 25) -> str:
    if not keywords:
        return ""
    prompt = " ".join(keywords)
    summ = summarizer(prompt, max_length=budget, min_length=5, do_sample=False)[0]["summary_text"]
    words = summ.split()
    return " ".join(words[:budget])

# Load & clean reference CSV
df = pd.read_csv(csv_file).dropna(subset=["term","explanation"])
df["term_norm"] = df["term"].apply(normalize_term)
df["ref_clean"] = df["explanation"].apply(clean_text)
terms = df["term_norm"].tolist()

# Fit TF-IDF on all reference definitions
vect = TfidfVectorizer(stop_words="english").fit(df["ref_clean"])

# Extract lecture definitions
raw_defs = extract_lecture_definitions(txt_file, terms)

# Build flashcards and evaluate
results = []
for term, ref_def in zip(terms, df["ref_clean"]):
    raw      = raw_defs.get(term, "")
    cleaned  = clean_text(raw)
    keywords = extract_keywords(cleaned, vect, top_n=10)
    summary  = summarize_from_keywords(keywords, budget=25)
    sim      = float(cosine_similarity(vect.transform([summary]), vect.transform([ref_def]))[0,0])
    results.append({
        "term":       term,
        "flashcard":  summary,
        "reference":  ref_def,
        "similarity": sim
    })

res_df = pd.DataFrame(results)
print(res_df.to_string(index=False))
print("\nAverage similarity:", res_df["similarity"].mean())

# Save & download
out = "flashcard_evaluation_results.csv"
res_df.to_csv(out, index=False)
print(f"\nSaved → {out}")
files.download(out)




Saving Glossary_HIV_cleaned.csv to Glossary_HIV_cleaned (20).csv
Saving HIV_Glossary_Lecture_Notes.txt to HIV_Glossary_Lecture_Notes (16).txt


Device set to use cpu
Your max_length is set to 25, but your input_length is only 18. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=9)
Your max_length is set to 25, but your input_length is only 11. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=5)
Your max_length is set to 25, but your input_length is only 13. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=6)
Your max_length is set to 25, but your input_length is only 12. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_

KeyboardInterrupt: 

In [48]:
try:
    from google.colab import files
    uploaded = files.upload()
    for fname, data in uploaded.items():
        with open(fname, "wb") as f:
            f.write(data)
    txt_file = next(f for f in uploaded if f.lower().endswith(".txt"))
    csv_file = next(f for f in uploaded if f.lower().endswith(".csv"))
    is_colab = True
except ImportError:
    is_colab = False
    txt_file = "HIV_Glossary_Lecture_Notes.txt"
    csv_file = "Glossary_HIV_cleaned.csv"

import re
import nltk
import pandas as pd
import numpy as np
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt', quiet=True)
from nltk.tokenize import sent_tokenize

paraphraser = pipeline(
    "text2text-generation",
    model="Vamsi/T5_Paraphrase_Paws"
)

def normalize_term(raw: str) -> str:
    t = raw.lower().strip().lstrip("•-*· ")
    for art in ("the ", "a ", "an "):
        if t.startswith(art):
            t = t[len(art):]
    return t.rstrip(".,;:!?")

def extract_lecture_definitions(path: str, terms: list) -> dict:
    text = open(path, "r", encoding="utf-8").read()
    defs, cur, buf = {}, None, []
    term_set = set(terms)
    for line in text.splitlines():
        s = line.strip()
        if not s:
            if cur and buf:
                defs[cur] = " ".join(buf)
                cur, buf = None, []
            continue
        key = normalize_term(s)
        if key in term_set:
            if cur and buf:
                defs[cur] = " ".join(buf)
            cur, buf = key, []
        elif cur:
            buf.append(s)
    if cur and buf:
        defs[cur] = " ".join(buf)
    return defs

def pagerank(sim: np.ndarray, d=0.85, eps=1e-4) -> np.ndarray:
    n = sim.shape[0]
    M = sim.astype(float)
    for i in range(n):
        M[i,i] = 0
        col = M[:,i].sum()
        M[:,i] = 1.0/n if col == 0 else M[:,i]/col
    pr = np.ones(n)/n
    delta = 1
    while delta > eps:
        new_pr = (1-d)/n + d*M.dot(pr)
        delta = np.abs(new_pr - pr).sum()
        pr = new_pr
    return pr

def extractive_summary(def_text: str) -> str:
    sents = sent_tokenize(def_text)
    if not sents:
        return ""
    if len(sents) == 1:
        return sents[0]
    vect = TfidfVectorizer().fit(sents)
    mat = vect.transform(sents)
    sim = cosine_similarity(mat)
    scores = pagerank(sim)
    return sents[int(np.argmax(scores))]

df = (
    pd.read_csv(csv_file)
      .dropna(subset=["term","explanation"])
      .iloc[:30]
      .reset_index(drop=True)
)
df["term_norm"] = df["term"].apply(normalize_term)
df["ref_clean"] = df["explanation"].str.replace(r"\s+", " ", regex=True).str.strip()
terms = df["term_norm"].tolist()

raw_defs = extract_lecture_definitions(txt_file, terms)
vect = TfidfVectorizer().fit(df["ref_clean"].tolist())

results = []
for term, ref in zip(terms, df["ref_clean"]):
    raw = raw_defs.get(term, "")
    ext = extractive_summary(raw)
    if ext:
        prompt = "paraphrase: " + ext
        para = paraphraser(prompt, max_length=60, num_return_sequences=1)[0]["generated_text"]
    else:
        para = ""
    sim = float(cosine_similarity(vect.transform([para]), vect.transform([ref]))[0,0])
    results.append({
        "term": term,
        "flashcard": para,
        "reference": ref,
        "similarity": sim
    })

res_df = pd.DataFrame(results)
print(res_df.to_string(index=False))
print("\nAverage similarity:", res_df["similarity"].mean())

out = "flashcard_eval_30_paraphrase.csv"
res_df.to_csv(out, index=False)
print(f"Saved → {out}")
if is_colab:
    files.download(out)


Saving HIV_Glossary_Lecture_Notes.txt to HIV_Glossary_Lecture_Notes (28).txt
Saving Glossary_HIV_cleaned.csv to Glossary_HIV_cleaned (31).csv


Device set to use cpu


                            term                                                                                                                                                                                                                                                      flashcard                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [13]:
try:
    from google.colab import files
    uploaded = files.upload()
    for fname, data in uploaded.items():
        with open(fname, "wb") as f:
            f.write(data)
    txt_file = next(f for f in uploaded if f.lower().endswith(".txt"))
    csv_file = next(f for f in uploaded if f.lower().endswith(".csv"))
    is_colab = True
except ImportError:
    is_colab = False
    txt_file = "HIV_Glossary_Lecture_Notes.txt"
    csv_file = "Glossary_HIV_cleaned.csv"

import nltk
import pandas as pd
import numpy as np
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt', quiet=True)
from nltk.tokenize import sent_tokenize

paraphraser = pipeline(
    "text2text-generation",
    model="tuner007/pegasus_paraphrase",
    framework="pt"
)

def normalize_term(raw: str) -> str:
    t = raw.lower().strip().lstrip("•-*· ")
    for art in ("the ", "a ", "an "):
        if t.startswith(art):
            t = t[len(art):]
    return t.rstrip(".,;:!?")

def extract_lecture_definitions(path: str, terms: list) -> dict:
    text = open(path, "r", encoding="utf-8").read()
    defs, cur, buf = {}, None, []
    term_set = set(terms)
    for line in text.splitlines():
        s = line.strip()
        if not s:
            if cur and buf:
                defs[cur] = " ".join(buf)
                cur, buf = None, []
            continue
        key = normalize_term(s)
        if key in term_set:
            if cur and buf:
                defs[cur] = " ".join(buf)
            cur, buf = key, []
        elif cur:
            buf.append(s)
    if cur and buf:
        defs[cur] = " ".join(buf)
    return defs

def pagerank(sim: np.ndarray, d=0.85, eps=1e-4) -> np.ndarray:
    n = sim.shape[0]
    M = sim.astype(float)
    for i in range(n):
        M[i,i] = 0
        col = M[:,i].sum()
        M[:,i] = 1.0/n if col == 0 else M[:,i]/col
    pr = np.ones(n)/n
    delta = 1
    while delta > eps:
        new_pr = (1-d)/n + d * M.dot(pr)
        delta = np.abs(new_pr - pr).sum()
        pr = new_pr
    return pr

def extractive_summary(def_text: str) -> str:
    sents = sent_tokenize(def_text)
    if not sents:
        return ""
    if len(sents) == 1:
        return sents[0]
    vect = TfidfVectorizer().fit(sents)
    mat = vect.transform(sents)
    sim = cosine_similarity(mat)
    scores = pagerank(sim)
    return sents[int(np.argmax(scores))]

df = (
    pd.read_csv(csv_file)
      .dropna(subset=["term","explanation"])
      .iloc[122:129]
      .reset_index(drop=True)
)
df["term_norm"] = df["term"].apply(normalize_term)
df["ref_clean"] = df["explanation"].str.replace(r"\s+", " ", regex=True).str.strip()
terms = df["term_norm"].tolist()

raw_defs = extract_lecture_definitions(txt_file, terms)
vect = TfidfVectorizer().fit(df["ref_clean"].tolist())

results = []
for term, ref in zip(terms, df["ref_clean"]):
    raw = raw_defs.get(term, "")
    ext = extractive_summary(raw)
    if ext:
        para = paraphraser(ext, max_length=60, min_length=5, do_sample=False, truncation=True)[0]["generated_text"]
    else:
        para = ""
    sim = float(cosine_similarity(vect.transform([para]), vect.transform([ref]))[0,0])
    results.append({
        "term": term,
        "flashcard": para,
        "reference": ref,
        "similarity": sim
    })

res_df = pd.DataFrame(results)
print(res_df.to_string(index=False))
print("\nAverage similarity:", res_df["similarity"].mean())

out = "flashcards_pt5.csv"
res_df.to_csv(out, index=False)
print(f"\nSaved → {out}")
if is_colab:
    files.download(out)

Saving HIV_Glossary_Lecture_Notes.txt to HIV_Glossary_Lecture_Notes (10).txt
Saving Glossary_HIV_cleaned.csv to Glossary_HIV_cleaned (11).csv


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at tuner007/pegasus_paraphrase and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


      term                                                           flashcard                                                                                                                                                                                                                          reference  similarity
     nasba  A primer-based alternative that can occur at a steady temperature. Nucleic acid sequence based amplification. A primer-based alternative to RT-PCR that can occur at a steady temperature, i.e. without heat cycling. Used to measure HIV-1 plasma viral load by amplifying a segment of the HIV RNA.    0.553861
iatrogenic          Illness can be caused by medical examination or treatment.                                                                                                                                                                    relating to illness caused by medical examination or treatment.    0.829796
 exogenous                 It can be growing o

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>