(fine-tuning-preprocessing)=
# Fine-Tuning: Pre-Processing the Data

## Convert Slides (LaTeX) to Raw Text

In [1]:
from pathlib import Path, PurePath
import pypandoc, re

input_dir = 'phpe400_corpus/slides'
output_file = 'extracted/raw_slides.txt'

def tex_to_plain(tex_path):
    txt = pypandoc.convert_file(
        str(tex_path),
        to="plain",
        format="latex",
        extra_args=["--wrap=none", "--quiet"])
    return re.sub(r'\n\s*\n+', '\n', txt)  # collapse blank lines

with Path(output_file).open("a", encoding="utf-8") as out:
    for tex in Path(input_dir).rglob("*.tex"):
        print(f"Processing {tex}")
        out.write(tex_to_plain(tex) + "\n")

print(f"\nOutput written to {output_file}")

FileNotFoundError: [Errno 2] No such file or directory: 'extracted/raw_slides.txt'

## Convert Syllabus (PDF) to Raw Text

In [14]:
from pathlib import Path
from pypdf import PdfReader
import re, textwrap

SRC  = "phpe400_corpus/syllabus/syl-methods-ppe-v4.pdf"    # the file you just uploaded
DEST = "extracted/raw_syllabus.txt"            # append or create

def pdf_to_plain(path):
    reader = PdfReader(path)
    lines  = []
    for page in reader.pages:
        txt = page.extract_text() or ""
        txt = txt.replace("\u200b", "")          # zero-width
        # strip page headers/footers like “5 / 8”
        txt = re.sub(r'\b\d+\s*/\s*\d+\s*$', '', txt, flags=re.M)
        lines.extend(txt.splitlines())
    # collapse blocks of ≥2 blank lines to a single blank
    cleaned = "\n".join(line.rstrip() for line in lines)
    cleaned = re.sub(r'\n\s*\n+', '\n', cleaned)
    return cleaned.strip()

print(f"Processing {SRC}")
plain = pdf_to_plain(SRC)

# stream into your master corpus file
with Path(DEST).open("a", encoding="utf-8") as out:
    for para in plain.split("\n"):
        if para.strip():
            out.write(para.strip() + "\n")
print(f"\nOutput written to {DEST}")

Processing phpe400_corpus/syllabus/syl-methods-ppe-v4.pdf

Output written to extracted/raw_syllabus.txt


## Convert Review Sheets (LaTeX) to Raw Text

In [15]:
from pathlib import Path, PurePath
import pypandoc, re

input_dir = 'phpe400_corpus/review-sheets'
output_file = 'extracted/raw_review-sheets.txt'

def tex_to_plain(tex_path):
    txt = pypandoc.convert_file(
        str(tex_path),
        to="plain",
        format="latex",
        extra_args=["--wrap=none", "--quiet"])
    return re.sub(r'\n\s*\n+', '\n', txt)  # collapse blank lines

with Path(output_file).open("a", encoding="utf-8") as out:
    for tex in Path(input_dir).rglob("*.tex"):
        print(f"Processing {tex}")
        out.write(tex_to_plain(tex) + "\n")

print(f"\nOutput written to {output_file}")

Processing phpe400_corpus/review-sheets/exam1-review-answers.tex


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Processing phpe400_corpus/review-sheets/final-exam-review.tex
Processing phpe400_corpus/review-sheets/exam1-review.tex


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Processing phpe400_corpus/review-sheets/final-exam-review-answers.tex

Output written to extracted/raw_review-sheets.txt


## Convert Chapters (PDF) to Raw Text

In [16]:
from pathlib import Path
from pypdf import PdfReader
import re

input_dir = 'phpe400_corpus/text'  # folder with your 3 PDFs
output_file = 'extracted/raw_text.txt'

# --------------------------- 1. header & noise patterns --------------------------
HEADER_RE = re.compile(
    r'^[ \t]*(?:Social Choice Theory|Game Theory|Rationality and Utility Theory)'
    r'(?:[ \t]+\d+)?[ \t]*$',                     # optional page number
    flags=re.M
)
# Lone page numbers like “55”
LONE_PAGE_RE   = re.compile(r'^[ \t]*\d{1,3}[ \t]*$', flags=re.M)
# All-caps chapter banners such as “FOUR” or “PART II”
CAPS_BANNER_RE = re.compile(r'^[ \t]*[A-Z]{2,}[ \t]*$', flags=re.M)
# Numbered section headings “4.1 …” or “2.3.5 …”
SECNUM_RE      = re.compile(r'^\d+(?:\.\d+)+\s+.*$', flags=re.M)

# --------------------------- 2. tidy helper --------------------------------------
def tidy(text: str) -> str:
    text = HEADER_RE.sub('', text)
    text = LONE_PAGE_RE.sub('', text)
    text = CAPS_BANNER_RE.sub('', text)
    text = SECNUM_RE.sub('', text)
    text = re.sub(r'(\w+)-\n(\w+)',  r'\1\2', text)   # hard hyphen breaks
    text = re.sub(r'(\w+)\u00ad(\w+)', r'\1\2', text) # soft hyphen
    text = re.sub(r'\n\s*\n+', '\n', text)            # collapse blank lines
    return text.strip()

# --------------------------- 3. PDF → plain-text ---------------------------------
def pdf_to_plain(path: Path) -> str:
    reader = PdfReader(path)
    pages  = []
    for pg in reader.pages:
        raw = pg.extract_text() or ""
        raw = raw.replace('\u200b', '')               # zero-width chars
        raw = re.sub(r'\b\d+\s*/\s*\d+\s*$', '', raw, flags=re.M)  # “5 / 15” style
        pages.append(tidy(raw))
    return "\n".join(pages)

# --------------------------- 4. batch over all PDFs ------------------------------
PDF_DIR = Path(input_dir)   # folder with your 3 PDFs
DEST    = Path(output_file)       # master output file

with DEST.open("a", encoding="utf-8") as out:
    for pdf in sorted(PDF_DIR.glob("*.pdf")):
        print(f"Processing {pdf.name}")
        for para in pdf_to_plain(pdf).split("\n"):
            if para.strip():
                out.write(para + "\n") 

print("\nOutput written to", DEST.name)

Processing econ-analysis-moral-phil-public-policy-ch4.pdf
Processing econ-analysis-moral-philosophy-public-policy-ch13.pdf
Processing econ-analysis-moral-philosophy-public-policy-ch14.pdf

Output written to raw_text.txt


## Convert Piazza Comments/Answers (JSON) to Raw Text

In [17]:
from pathlib import Path
import json, html
import re
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
import warnings
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

PIAZZA = Path("phpe400_corpus/piazza/class_content_flat.json")     # uploaded file
OUT    = Path("extracted/raw_piazza_qa.txt")

def clean(html_snippet: str) -> str:
    """Remove <tags>, decode entities and collapse whitespace."""
    # 1. HTML → plain text
    text = BeautifulSoup(html_snippet, "html.parser").get_text(" ", strip=True)
    # 2. Unescape &nbsp; etc.
    text = html.unescape(text)
    # 3. Tighten spaces/newlines
    return re.sub(r'\s+', ' ', text).strip()

with PIAZZA.open() as f:
    posts = json.load(f)

# Index every post by its Piazza ID so we can match answers to questions
by_id = {p["id"]: p for p in posts}

qapairs = []

for p in posts:
    if p["type"] == "question":
        qtxt = f"Q: {clean(p['subject'])}\n{clean(p['content'])}"
        # grab instructor answer (type == 'i_answer') in same thread
        ans = next((by_id[c] for c in by_id            # walk once over dict
                    if by_id[c].get("parent_id") == p["id"]
                    and by_id[c]["type"] == "i_answer"),
                   None)
        if ans:
            atxt = clean(ans["content"])
            qapairs.append(f"{qtxt}\nA: {atxt}")

OUT.write_text("\n\n".join(qapairs), encoding="utf-8")
print(f"Wrote {len(qapairs)} Q/A pairs to {OUT}")


Wrote 56 Q/A pairs to extracted/raw_piazza_qa.txt


## Convert Online Notes (HTML) to Raw Text

In [18]:
from pathlib import Path
from bs4 import BeautifulSoup
import html2text, html, re

# === 1. one-time imports & (auto-)installs =========================
import sys, subprocess, re, html, shutil
from pathlib import Path

for pkg in ("beautifulsoup4", "html2text", "lxml"):
    if not shutil.which("pip") or subprocess.call(
          [sys.executable, "-m", "pip", "show", pkg],
          stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL):
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", pkg])

from bs4 import BeautifulSoup
import html2text

# === 2. configure the directory to scan ============================
HTML_DIR = Path("phpe400_corpus/notes")    # <── change if your html lives elsewhere
DEST     = Path("extracted/raw_html_notes.txt")

# === 3. regex helpers ==============================================
SPAN_MATH  = re.compile(r'<span[^>]*class="math[^"]*"(?:[^>]*>)(.*?)</span>', re.S)
SCR_MATH   = re.compile(r'<script[^>]*type="math/tex[^"]*"(?:[^>]*>)(.*?)</script>', re.S)
IMG_TAG    = re.compile(r'<img[^>]*>', re.S)
LINE_JUNK  = re.compile(r'^(?:__+|\s*[*\-]\s*$|\s*\d+\.\s+\S.*)$')
SIDEBAR_KW = ("sidebar", "toc", "nav-page", "menu")

# === 4. utility functions ==========================================
def strip_outer(expr: str) -> str:
    expr = expr.strip()
    if expr.startswith(r'\(') and expr.endswith(r'\)'): expr = expr[2:-2].strip()
    elif expr.startswith(r'\[') and expr.endswith(r'\]'): expr = expr[2:-2].strip()
    return re.sub(r'\\\\([{}])', r'\\\1', expr)  # \\{ -> \{   \\} -> \}

def looks_sidebar(tag) -> bool:
    if tag.name in ("nav", "aside"): return True
    blob = " ".join([tag.get("id", ""), *tag.get("class", [])]).lower()
    return any(k in blob for k in SIDEBAR_KW)

h2t = html2text.HTML2Text(); h2t.ignore_links = True; h2t.body_width = 0

def html_to_plain(html_file: Path) -> str:
    soup  = BeautifulSoup(html_file.read_text(errors="ignore"), "lxml")
    main  = soup.find("main", id="quarto-content") or soup.find("main") or soup.body or soup
    soup  = BeautifulSoup(str(main), "lxml")            # clone so .decompose() is safe
    for tag in soup.find_all(looks_sidebar): tag.decompose()

    raw = IMG_TAG.sub('', str(soup))
    raw = SPAN_MATH.sub(lambda m: f"${strip_outer(m.group(1))}$", raw)
    raw = SCR_MATH.sub( lambda m: f"${strip_outer(m.group(1))}$", raw)

    text = h2t.handle(raw)
    text = html.unescape(text)
    text = re.sub(r'\\\\([{}])', r'\\\1', text)          # collapse any \\{ left

    lines = [ln.strip() for ln in text.splitlines()
             if ln.strip() and not LINE_JUNK.match(ln)]
    return "\n".join(lines)

# === 5. process every html file ====================================
DEST.write_text("", encoding="utf-8")   # overwrite
count = 0
for html_f in sorted(HTML_DIR.rglob("*.html")):
    print(f"Processing {html_f.name}")
    cleaned = html_to_plain(html_f)
    DEST.write_text(DEST.read_text() + cleaned + "\n", encoding="utf-8")
    count += 1
print(f"\n Output {count} html files to {DEST.name}")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Processing evaluative-voting.html
Processing grading-vs-ranking.html
Processing interpersonal-comparison-utilities.html
Processing objections-utilitarianism.html
Processing social-welfare-functionals.html
Processing decision-problems.html
Processing newcomb.html
Processing rational-decisions.html
Processing allais.html
Processing ellsberg.html
Processing evaluating-axioms.html
Processing expected-utility.html
Processing independence.html
Processing functions.html
Processing lotteries.html
Processing preferences-over-lotteries.html
Processing utility-functions.html
Processing index.html
Processing completeness.html
Processing relations.html
Processing sets.html
Processing preference-and-choice.html
Processing preference-relations.html
Processing rational-preferences.html
Processing transitivity.html
Processing references.html
Processing arrows-theorem.html
Processing beyond-two-alternatives.html
Processing condorcet-jury-theorem.html
Processing justifying-majority-rule.html
Processing m

## Combine All Text Files 

In [19]:
from pathlib import Path

raw_text_files = [
    "raw_slides.txt",
    "raw_syllabus.txt",
    "raw_review-sheets.txt",
    "raw_text.txt",
    "raw_piazza_qa.txt",
    "raw_html_notes.txt"
]

MASTER = Path("extracted/raw_phpe400_corpus.txt")
MASTER.parent.mkdir(parents=True, exist_ok=True)

with MASTER.open("w", encoding="utf-8") as master:
    for raw in raw_text_files:
        p = Path(raw)
        if not p.exists():
            print(f"⚠  {raw} not found — skipping")
            continue
        print(f"✓ adding {raw}")
        for line in p.read_text(encoding="utf-8").splitlines():
            if line.strip():               # skip truly empty lines
                master.write(line.strip() + "\n")
        master.write("<|eod|>\n")          # <-- boundary token

print(f"\nwrote combined corpus → {MASTER}  ({MASTER.stat().st_size/1024:.1f} KB)")


✓ adding raw_slides.txt
✓ adding raw_syllabus.txt
✓ adding raw_review-sheets.txt
✓ adding raw_text.txt
✓ adding raw_piazza_qa.txt
✓ adding raw_html_notes.txt

wrote combined corpus → extracted/raw_phpe400_corpus.txt  (1328.9 KB)


In [20]:
import re, textwrap, random
from pathlib import Path

random.seed(42)

# ────────────────────────────────────────────────────────────────
# 0. directories
# ────────────────────────────────────────────────────────────────
EXTRACT_DIR = Path("extracted")           # all *.txt live here
OUT_DIR     = Path("data")
OUT_DIR.mkdir(exist_ok=True)

# ────────────────────────────────────────────────────────────────
# 1. helper:  Q:  ...  A:  ...  →  <|question|> ... <|answer|> ... <|end|>
# ────────────────────────────────────────────────────────────────
def convert_QA_blocks(text:str) -> str:
    pat = re.compile(
        r"^Q:\s*(?P<q>.*?)\nA:\s*(?P<a>.*?)(?=^\s*\n|\Z)",
        flags=re.S | re.M)
    def repl(m):
        q = textwrap.dedent(m.group("q")).strip()
        a = textwrap.dedent(m.group("a")).strip()
        return f"<|question|>\n{q}\n<|answer|>\n{a}\n<|end|>"
    return pat.sub(repl, text)

# ────────────────────────────────────────────────────────────────
# 2. helper: full cleaner you already tested
# ────────────────────────────────────────────────────────────────
def clean_text_block(text:str) -> str:
    text = re.sub(r"^[\u2022\-\*-]\s*", "", text, flags=re.M)    # bullets
    text = re.sub(r"\n{3,}", "\n\n", text)                       # blank lines
    text = re.sub(r"[ \t]{2,}", " ", text)                       # 2+ spaces
    H_RULE = re.compile(r"^[\s\--—_=]{3,}$", flags=re.M)         # rules
    text   = re.sub(H_RULE, "", text)
    seen, out = set(), []
    for line in text.splitlines():                               # dedupe titles
        if re.fullmatch(r"[A-Z][A-Z ]{2,40}", line.strip()):
            if line in seen: continue
            seen.add(line)
        out.append(line)
    return "\n".join(out).strip()

# ────────────────────────────────────────────────────────────────
# 3. helper: split long prose into <|statement|> blocks
#            (but skip lines that are already tagged)
# ────────────────────────────────────────────────────────────────
def paragraph_blocks(text:str, max_chars=3500):
    if text.startswith("<|question|>"):          # already tagged, keep as-is
        return [text]
    buf, out = [], []
    for line in text.splitlines():
        buf.append(line)
        if len(" ".join(buf)) > max_chars:
            out.append("\n".join(buf)); buf=[]
    if buf: out.append("\n".join(buf))
    return [f"<|statement|>\n{b.strip()}\n<|end|>" for b in out]

# ────────────────────────────────────────────────────────────────
# 4. load every extracted *.txt  (except qa_pairs.txt)
# ────────────────────────────────────────────────────────────────
plain_blocks = []
for txt_file in EXTRACT_DIR.glob("*.txt"):
    if txt_file.name == "qa_pairs.txt":
        continue
    raw = txt_file.read_text()
    raw = convert_QA_blocks(raw)       # convert embedded Q/A
    clean = clean_text_block(raw)      # your cleaner
    plain_blocks.extend(paragraph_blocks(clean))
    print("✓ processed", txt_file.name)

# ────────────────────────────────────────────────────────────────
# 5. load hand-crafted Q-A file, dedupe, oversample
# ────────────────────────────────────────────────────────────────
qa_raw    = Path(EXTRACT_DIR / "qa_pairs.txt").read_text()
def dedupe_qas(raw):
    seen, out = set(), []
    for m in re.finditer(r"<\|question\|>.*?<\|end\|>", raw, re.S):
        blk = textwrap.dedent(m.group(0)).strip()
        if blk not in seen:
            out.append(blk); seen.add(blk)
    return out

qa_blocks  = dedupe_qas(qa_raw)
qa_blocks *= 2          # oversample factor (2 = duplicate once)

# ────────────────────────────────────────────────────────────────
# 6. shuffle & write corpus
# ────────────────────────────────────────────────────────────────
all_blocks = qa_blocks + plain_blocks
random.shuffle(all_blocks)

out_file = OUT_DIR / "clean_corpus.txt"
out_file.write_text("\n\n".join(all_blocks), encoding="utf-8")
print(f"✓ wrote {out_file}  —  {len(all_blocks)} tagged blocks")


✓ processed raw_html_notes.txt
✓ processed raw_syllabus.txt
✓ processed raw_text.txt
✓ processed raw_phpe400_corpus.txt
✓ processed raw_piazza_qa.txt
✓ processed raw_slides.txt
✓ processed raw_review-sheets.txt
✓ wrote data/clean_corpus.txt  —  580 tagged blocks


In [21]:
from pathlib import Path
from transformers import GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
corpus = Path("data/clean_corpus.txt").read_text(encoding="utf-8")
n_tokens = len(tokenizer(corpus).input_ids)

print(f"Total tokens: {n_tokens:,}")


Token indices sequence length is longer than the specified maximum sequence length for this model (552851 > 1024). Running this sequence through the model will result in indexing errors


Total tokens: 552,851
