In [2]:
# ---------- Cell 1: (install dependencies) ----------
# Run this first in a fresh Colab runtime
# NOTE(review): none of the packages below are version-pinned, so a fresh runtime may
# resolve different versions than the ones that produced the saved outputs.
!pip install -q --upgrade pip
# PyTorch packages are installed from the cu118 wheel index instead of the default index.
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# Libraries imported by later cells (transformers, peft, dateutil) plus their companions.
!pip install -q transformers accelerate bitsandbytes peft safetensors sentencepiece datasets python-dateutil



[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m1.7/1.8 MB[0m [31m53.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
# Cell 2: Mount Google Drive for dataset & checkpoints
# Mounts Drive at /content/drive; later cells read the adapter checkpoint (OUTPUT_DIR)
# and dataset.jsonl from paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Inference configuration shared by the cells below.
MODEL = "meta-llama/Llama-3.2-3B-Instruct"   # exact model id you used for training
OUTPUT_DIR = "/content/drive/MyDrive/llama_finetune/lora_llama3_3b/checkpoint-165"  # adapter folder (has adapter_config.json, adapter weights)
VALID_FILE = "/content/valid.jsonl"  # path to your validation jsonl (prompt/response per line)
MAX_NEW_TOKENS = 384  # default generation budget used by infer_resume_single
GEN_REP_PENALTY = 1.2
# infer_resume_single reads the penalty under the name REPETITION_PENALTY. Defining it
# here keeps the function callable right after this cell, without relying on a later
# test cell having been executed first. Later cells may still override it.
REPETITION_PENALTY = GEN_REP_PENALTY
MAX_INPUT_LENGTH = 8000  # tokenizer truncation limit (in tokens) for the prompt

In [5]:
import json
import re
import torch
from datetime import datetime
from dateutil import parser as dparser

def normalize_resume_text(text):
    """Normalise resume text: expand newline placeholders and squeeze whitespace.

    Non-string input is converted with ``str()`` first. The result has every
    newline placeholder turned into a real newline, runs of two or more
    spaces/tabs reduced to one space, runs of three or more newlines reduced
    to two, and leading/trailing whitespace removed.
    """
    cleaned = text if isinstance(text, str) else str(text)

    # Order matters: the braced "{\n}" form must be handled before the bare "\n" form.
    for placeholder in ("{new_line}", "{newline}", "{\\n}", "\\n"):
        cleaned = cleaned.replace(placeholder, "\n")

    # Squeeze horizontal whitespace first, then overly long blank-line runs.
    for pattern, replacement in ((r"[ \t]{2,}", " "), (r"\n{3,}", "\n\n")):
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

def extract_first_balanced_json(text):
    """Return the first brace-balanced ``{...}`` substring of ``text``.

    Scanning starts at the first ``{``. Braces inside double-quoted strings
    are ignored, and a backslash causes the following character to be skipped.
    Returns ``None`` when there is no ``{`` or the braces never balance.
    """
    opening = text.find("{")
    if opening < 0:
        return None

    nesting = 0
    inside_quotes = False
    skip_char = False

    for offset, symbol in enumerate(text[opening:], start=opening):
        if skip_char:
            # Character following a backslash: consume it without interpretation.
            skip_char = False
        elif symbol == "\\":
            skip_char = True
        elif symbol == '"':
            inside_quotes = not inside_quotes
        elif inside_quotes:
            # Braces within string values do not affect nesting.
            pass
        elif symbol == "{":
            nesting += 1
        elif symbol == "}":
            nesting -= 1
            if nesting == 0:
                return text[opening:offset + 1]

    return None

def compute_years_rounded(start, end):
    """Compute the duration between two dates in years, rounded to the nearest half.

    Args:
        start: Start date in any form ``dateutil`` can parse (fuzzy parsing).
        end: End date in the same form. A missing/empty value, or text
            containing "present" or "ongoing" (case-insensitive), means
            "until now".

    Returns:
        ``None`` when ``start`` is missing, either date cannot be parsed, or the
        end precedes the start. Otherwise the whole-month difference converted
        to years: a remainder of 9+ months rounds up to the next year (int),
        3-8 months yields ``years + 0.5`` (float), fewer rounds down (int).
    """
    if not start:
        return None

    try:
        s = dparser.parse(str(start), fuzzy=True)
    except Exception:
        return None

    # Each keyword needs its own membership test: the previous
    # `not end or "present" or "ongoing" in ...` was always true because the
    # bare string "present" is truthy, so every end date was replaced by "now".
    end_text = str(end).lower() if end else ""
    if not end_text or "present" in end_text or "ongoing" in end_text:
        e = datetime.now()
    else:
        try:
            e = dparser.parse(str(end), fuzzy=True)
        except Exception:
            return None

    # Only year and month take part in the difference; days are ignored.
    total_months = (e.year - s.year) * 12 + (e.month - s.month)
    if total_months < 0:
        return None

    full_years, leftover = divmod(total_months, 12)

    if leftover >= 9:
        return int(full_years + 1)
    elif leftover >= 3:
        return float(full_years) + 0.5
    else:
        return int(full_years)

def clean_and_fill(parsed):
    """Fill missing ``years_worked`` values and drop empty education entries.

    The input comes from model-generated JSON, so its shape is not trusted:
    ``experience`` / ``education`` that are missing, ``null`` or not lists are
    treated as empty, and entries that are not dicts are skipped instead of
    raising.

    Args:
        parsed: Decoded JSON value. Anything other than a dict is returned as is.

    Returns:
        The same dict, mutated in place:
        * experience entries whose ``years_worked`` is ``None`` or ``""`` get a
          value computed from their start/end date fields;
        * ``education`` is replaced by a list holding only the dict entries that
          have a non-empty ``degree`` or ``college`` (always set, possibly ``[]``).
    """
    if not isinstance(parsed, dict):
        return parsed

    # Fill years_worked for experience
    experience = parsed.get("experience")
    if isinstance(experience, (list, tuple)):
        for e in experience:
            if not isinstance(e, dict):
                continue  # malformed entry; leave it untouched
            if e.get("years_worked") in (None, ""):
                start = e.get("start_date") or e.get("Start Date") or e.get("start")
                end = e.get("end_date") or e.get("End Date") or e.get("end")
                e["years_worked"] = compute_years_rounded(start, end)

    # Remove empty education entries
    education = parsed.get("education")
    edu = []
    if isinstance(education, (list, tuple)):
        for ed in education:
            if not isinstance(ed, dict):
                continue  # cannot carry degree/college fields
            has_degree = ed.get("degree") not in (None, "")
            has_college = ed.get("college") not in (None, "")
            if has_degree or has_college:
                edu.append(ed)
    parsed["education"] = edu

    return parsed

def repair_incomplete_json(json_str):
    """Close whatever a truncated JSON fragment left open.

    The fragment is scanned once while tracking string state, so braces and
    brackets inside string values are not counted. Missing closers are then
    appended innermost-first, which keeps the nesting valid (appending all
    ``}`` before all ``]`` turns ``{"a": [1`` into the invalid ``{"a": [1}]``).
    A string that is still open at the end of the fragment is terminated too.

    Args:
        json_str: Possibly truncated JSON text.

    Returns:
        ``json_str`` with the missing ``"``, ``}`` and ``]`` appended. Input that
        is already balanced is returned unchanged. The result is not guaranteed
        to parse (e.g. a dangling key or comma is not repaired).
    """
    closers = {"{": "}", "[": "]"}
    pending = []          # closers still owed, outermost first
    in_string = False
    escape_next = False

    for ch in json_str:
        if in_string:
            if escape_next:
                escape_next = False
            elif ch == "\\":
                escape_next = True
            elif ch == '"':
                in_string = False
            continue

        if ch == '"':
            in_string = True
        elif ch in closers:
            pending.append(closers[ch])
        elif ch in ("}", "]"):
            # Only pop on a matching closer; stray closers are left as they are.
            if pending and pending[-1] == ch:
                pending.pop()

    if in_string:
        if escape_next:
            # A dangling backslash would escape the quote we are about to add.
            json_str = json_str[:-1]
        json_str += '"'

    return json_str + "".join(reversed(pending))



In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# Load the tokenizer and the 4-bit quantized base model, attach the LoRA adapter stored
# in OUTPUT_DIR, and print a few sanity checks. Defines the globals `tokenizer`,
# `model_peft` and `device` that infer_resume_single relies on.
print("Loading tokenizer and base model...")

tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=False)
# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantization with double quantization; compute runs in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# NOTE(review): trust_remote_code=True permits running code shipped with the model
# repository; presumably not needed for this model id - confirm and drop if so.
base = AutoModelForCausalLM.from_pretrained(
    MODEL,
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True
)

print(f"Attaching PEFT adapter from {OUTPUT_DIR}")
model_peft = PeftModel.from_pretrained(base, OUTPUT_DIR)
model_peft.eval()
# Device of the first model parameter; prompts are moved there before generation.
device = next(model_peft.parameters()).device

print(f"Model loaded on {device}")
print(f"PEFT type: {getattr(model_peft, 'peft_type', None)}")

# Count LoRA parameters
# Tallies elements of every parameter whose name contains "lora_A"/"lora_B" and how many
# of them exceed 1e-8 in magnitude - presumably a check that real adapter weights were
# loaded rather than an all-zero adapter (TODO confirm intent).
total = 0
nonzero = 0
for name, p in model_peft.named_parameters():
    if ("lora_A" in name) or ("lora_B" in name):
        total += p.numel()
        nonzero += (p.detach().cpu().abs() > 1e-8).sum().item()
print(f"LoRA non-zero params: {nonzero}/{total}")



Loading tokenizer and base model...


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Attaching PEFT adapter from /content/drive/MyDrive/llama_finetune/lora_llama3_3b/checkpoint-55
Model loaded on cuda:0
PEFT type: PeftType.LORA
LoRA non-zero params: 4587512/4587520


In [7]:
def infer_resume_single(raw_resume, max_new_tokens=MAX_NEW_TOKENS, repetition_penalty=None):
    """Generate structured JSON ("experience" / "education") from resume text.

    Args:
        raw_resume: Resume text; normalised with ``normalize_resume_text``.
        max_new_tokens: Generation budget passed to ``generate``.
        repetition_penalty: Repetition penalty for generation. When ``None``
            the notebook-level ``REPETITION_PENALTY`` is used if it exists,
            otherwise ``GEN_REP_PENALTY`` from the config cell.

    Returns:
        On success ``{"json": <parsed dict>, "raw": <prompt + completion text>,
        "completion": <completion text only>}``.
        On failure a dict with ``"error"`` set to ``"no_json_found"`` or
        ``"json_parse_failed"`` plus truncated ``"raw"`` / ``"completion"``
        (and ``"candidate"`` / ``"exception"`` for parse failures).

    Relies on the notebook globals ``tokenizer``, ``model_peft`` and ``device``.
    """
    # The config cell names the penalty GEN_REP_PENALTY while the test cells define
    # REPETITION_PENALTY; accept either so the function works right after the config cell.
    if repetition_penalty is None:
        try:
            repetition_penalty = REPETITION_PENALTY
        except NameError:
            repetition_penalty = GEN_REP_PENALTY

    resume = normalize_resume_text(raw_resume)

    # Create prompt
    prompt = (
        "Extract 'experience' and 'education' as strict JSON.\n\n"
        f"Resume:\n{resume}\n\n"
        "Return only valid JSON."
    )

    # Tokenize
    # NOTE(review): truncation cuts from the end, so a resume longer than
    # MAX_INPUT_LENGTH tokens also loses the trailing instruction line.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_INPUT_LENGTH
    ).to(device)

    # Generation parameters
    gen_kwargs = dict(
        max_new_tokens=max_new_tokens,
        do_sample=False,  # Deterministic (greedy)
        repetition_penalty=repetition_penalty,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
        temperature=1.0,  # neutral value; no sampling happens with do_sample=False
    )

    # Generate
    with torch.no_grad():
        out = model_peft.generate(**inputs, **gen_kwargs)

    # Decode. `out[0]` holds the prompt tokens followed by the newly generated ones.
    # JSON is extracted from the generated part only, so a "{" inside the resume
    # text cannot be mistaken for the start of the model's answer.
    prompt_len = inputs["input_ids"].shape[1]
    decoded = normalize_resume_text(
        tokenizer.decode(out[0], skip_special_tokens=True)
    )
    completion = normalize_resume_text(
        tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)
    )

    # Extract JSON
    candidate = extract_first_balanced_json(completion)

    if candidate is None:
        # Unbalanced output (typically truncated by max_new_tokens): keep everything
        # up to the last closing brace - or the whole tail when there is none -
        # and try to close what was left open.
        s = completion.find("{")
        if s != -1:
            e = completion.rfind("}")
            fragment = completion[s:e + 1] if e > s else completion[s:]
            candidate = repair_incomplete_json(fragment)

    if candidate is None:
        return {
            "error": "no_json_found",
            "raw": decoded[:500],  # Truncate for readability
            "completion": completion[:500],
        }

    # Parse JSON. strict=False tolerates raw newlines inside string values, which
    # normalize_resume_text introduces when it expands "\n" escape sequences.
    try:
        parsed = json.loads(candidate, strict=False)
    except json.JSONDecodeError:
        # Try fixing common issues
        fixed = candidate.replace("'", '"')  # Single to double quotes
        fixed = re.sub(r",\s*([}\]])", r"\1", fixed)  # Trailing commas

        try:
            parsed = json.loads(fixed, strict=False)
        except json.JSONDecodeError as ex2:
            return {
                "error": "json_parse_failed",
                "candidate": candidate[:500],
                "exception": str(ex2),
                "raw": decoded[:500],
                "completion": completion[:500],
            }

    # Clean and fill missing fields
    parsed = clean_and_fill(parsed)

    return {
        "json": parsed,
        "raw": decoded,
        "completion": completion,
    }


In [None]:
import os
import random

# Copy dataset from Drive if needed
!cp /content/drive/MyDrive/llama_finetune/dataset.jsonl /content/dataset.jsonl 2>/dev/null || true

dataset_file = "/content/dataset.jsonl"
if not os.path.exists(dataset_file):
    print(f"ERROR: {dataset_file} not found!")
    print("Please upload dataset.jsonl or adjust the path.")
else:
    lines = [l for l in open(dataset_file, encoding='utf-8').read().splitlines() if l.strip()]

    random.seed(42)
    random.shuffle(lines)

    n = len(lines)
    val_count = max(10, int(0.1 * n))

    train_lines = lines[val_count:]
    valid_lines = lines[:val_count]

    # Save splits
    open("/content/train.jsonl", "w", encoding="utf-8").write("\n".join(train_lines))
    open("/content/valid.jsonl", "w", encoding="utf-8").write("\n".join(valid_lines))

    print(f"Total examples: {n}")
    print(f"Train: {len(train_lines)}, Valid: {len(valid_lines)}")

Total examples: 481
Train: 433, Valid: 48


In [None]:
# ============================================================================
# CELL 8: Run Inference Tests
# ============================================================================
import os
import json

# Ensure constants are defined
# (these override the values from the config cell for this test run)
MAX_NEW_TOKENS = 512
REPETITION_PENALTY = 1.15
VALID_FILE = "/content/valid.jsonl"

# Run infer_resume_single on the first validation samples and report, per sample,
# either the extracted JSON or the failure details, followed by a summary.
if not os.path.exists(VALID_FILE):
    print(f"ERROR: {VALID_FILE} not found!")
else:
    # Close the file deterministically instead of leaking the handle.
    with open(VALID_FILE, encoding="utf-8") as valid_fh:
        lines = [l for l in valid_fh.read().splitlines() if l.strip()]

    samples = lines[:5]
    banner = "=" * 60

    print(f"\nRunning inference on {len(samples)} validation samples...\n")

    success_count = 0
    fail_count = 0

    for i, line in enumerate(samples):
        try:
            obj = json.loads(line)
        except json.JSONDecodeError as ex:
            # One malformed line should count as a failure, not abort the whole run.
            print(banner)
            print(f"SAMPLE {i+1}/{len(samples)}")
            print(banner)
            print("✗ FAILED:")
            print("Error: invalid_jsonl_line")
            print(f"Exception: {ex}")
            fail_count += 1
            print()
            continue

        # Extract resume text
        if "prompt" in obj and "Resume:" in obj["prompt"]:
            # Training-style record: cut the resume out from between the prompt markers.
            resume = obj["prompt"].split("Resume:")[-1].split("Return only valid JSON.")[0].strip()
        elif "resume" in obj:
            resume = obj["resume"]
        else:
            resume = obj.get("prompt", obj.get("text", str(obj)))

        print(banner)
        print(f"SAMPLE {i+1}/{len(samples)}")
        print(banner)
        print(f"Resume length: {len(resume)} chars\n")

        # Run inference
        result = infer_resume_single(resume, max_new_tokens=MAX_NEW_TOKENS)

        if "json" in result:
            print("✓ SUCCESS - Valid JSON extracted:")
            # Printed text is capped at 1000 characters, so long results appear cut off.
            print(json.dumps(result["json"], indent=2, ensure_ascii=False)[:1000])
            success_count += 1
        else:
            print("✗ FAILED:")
            print(f"Error: {result.get('error')}")
            if 'exception' in result:
                print(f"Exception: {result['exception']}")
            if 'candidate' in result:
                print(f"Candidate JSON (first 300 chars):\n{result['candidate'][:300]}")
            fail_count += 1

        print()

    print(f"\n{banner}")
    print(f"RESULTS: {success_count} success, {fail_count} failed")
    print(banner)


Running inference on 5 validation samples...

SAMPLE 1/5
Resume length: 3860 chars

✓ SUCCESS - Valid JSON extracted:
{
  "experience": [
    {
      "role": "Software Developer",
      "company": "Tata Consultancy Services Limited",
      "start_date": "May 2021",
      "end_date": "Present",
      "years_worked": 4.5
    },
    {
      "role": "Developer",
      "company": "VitalityHealth",
      "start_date": "null",
      "end_date": "null",
      "years_worked": null
    }
  ],
  "education": [
    {
      "degree": "Bachelor of Engineering",
      "college": "PSNA College of Engineering and Technology",
      "start_year": null,
      "end_year": null
    }
  ]
}

SAMPLE 2/5
Resume length: 2449 chars

✓ SUCCESS - Valid JSON extracted:
{
  "experience": [
    {
      "role": "Automaton tester",
      "company": "Creo Webtech Solution",
      "start_date": "01/10/2018",
      "end_date": "01/09/2022",
      "years_worked": 4
    },
    {
      "role": "None",
      "company": "Min

In [1]:
resume_text="""Mia Aguilar
mia_aguilar@berkeley.edu | linkedin.com/in/mia-aguilar


EDUCATION
University of California, Berkeley 								Expected May 2026
Bachelor of Arts in Comparative Ethnic Studies
GPA: 3.8

RELEVANT COURSEWORK
Sociology of Poverty, Organizations and Social Institutions, Data Science for Social Impact, Research Design and Sociological Methods

FELLOWSHIPS & GRANTS
Faculty Mentored Undergraduate Research Fellow, Latinx Research Center			2023-2026

RESEARCH EXPERIENCE
Research Apprentice - UC Berkeley Department of Spanish and Portuguese                   	          2023-Present
Advisor: Dr. Raul Coronado, Undergraduate Research Apprenticeship Program (URAP)
Collaborated with a team to conduct data analysis in R and Excel to understand the impacts of green spaces on children living in urban areas
Developed visualizations using Excel and infographics utilizing Canva to communicate research findings to diverse audiences
Conducted literature review to gather research on green spaces impacts on children in global urban areas and craft written overview of research to present findings to lead researcher

PROFESSIONAL EXPERIENCE
Public Health Outreach Associate - Safe Passages, Oakland, CA			  	           20XX-20XX
Conducted outreach via in-person, email campaigns, and phone calls to educate communities in Oakland about health and social support resources
Led workshops for K-12 students in groups of 10-30 students on health topics included mental health, environmental health, and stress management
Worked with program manager to collect and analyze data in Excel and developed visualizations for program reports and grant proposals
Interviewed community members about their health concerns and compiled a report to communicate trends

LEADERSHIP EXPERIENCE
Professional Development Coordinator - Berkeley Undergraduate Sociology Association 	20XX-20XX
Planned and implemented 3-4 professional development events with faculty, alumni, and industry professionals to help members explore diverse career pathways each semester
Developed a survey using Qualtrics and analyze survey results to gather insights from members to understand professional development needs

Health Education Coordinator - Berkeley Student Cooperative       				20XX-20XX
Developed programs and services in partnership with Health Worker Coordinator and Health Workers to support member health needs
Assessed the health needs of members through Google Form and determined programs and services that would best meet these needs
Created a grant proposal and was awarded $2,000 to provide specialized training around mental health resources and approaches to support members

Writing Tutor - UC Berkeley Student Learning Center                                                        	20XX-20XX
Provided individual and group tutoring sessions to support in the development of writing skills
Co-facilitated writing workshops with other tutors for groups of 10-20 students

COMMUNITY SERVICE
Volunteer -  City of Oakland Community Gardening Program, Oakland, CA			20XX-Present
Work with volunteers to maintain community gardens and organize gardening supplies each week

Coordinator - UC Berkeley Paws for Mental Health						20XX-20XX
Planned events and tabling opportunities and mobilized volunteers to promote mental through canine companions for the UC Berkeley campus community

ADDITIONAL WORK EXPERIENCE
Assistant Store Manager - Starbucks, Concord, CA						20XX-20XX
Coordinated training sessions for staff with store manager for 5 staff on a monthly basis on topics related to store operations and professional development
Ensured health and safety of staff by adhering to company protocols and providing equipment and resources to staff
Provided quality and attentive service to diverse customers and resolved any issues or complaints

PRESENTATIONS
Presentation Title
Minorities in Health, 20XX
Presented research on the benefits of gardens and green spaces for low-income urban communities


SKILLS
Technical Skills: R, Microsoft Office (Word, Excel, and PowerPoint), Google Suite (Docs, Sheets, Slides), Qualtrics
Language Skills: Bilingual in Tagalog and English

"""



In [11]:
# Generation settings for this run (override the config-cell values).
MAX_NEW_TOKENS = 512
REPETITION_PENALTY = 1.15

# Run the pipeline on the sample resume defined in the previous cell.
result = infer_resume_single(resume_text, max_new_tokens=MAX_NEW_TOKENS)

# Report the outcome: failure details first, otherwise the extracted JSON.
if "json" not in result:
    print("✗ FAILED:")
    print(f"Error: {result.get('error')}")
    if 'exception' in result:
        print(f"Exception: {result['exception']}")
    if 'candidate' in result:
        print(f"Candidate JSON (first 300 chars):\n{result['candidate'][:300]}")
else:
    print("✓ SUCCESS - Valid JSON extracted:")
    # Output is capped at 1000 characters, so long results are shown cut off.
    pretty_json = json.dumps(result["json"], indent=2, ensure_ascii=False)
    print(pretty_json[:1000])


✓ SUCCESS - Valid JSON extracted:
{
  "experience": [
    {
      "role": "Research Apprentice",
      "company": "UC Berkeley Department of Spanish and Portuguese",
      "start_date": "01/2023",
      "end_date": "null",
      "years_worked": 3
    },
    {
      "role": "Public Health Outreach Associate",
      "company": "Safe Passages",
      "start_date": "01/20XX",
      "end_date": "01/20XX",
      "years_worked": 0
    },
    {
      "role": "Professional Development Coordinator",
      "company": "Berkeley Undergraduate Sociology Association",
      "start_date": "01/20XX",
      "end_date": "01/20XX",
      "years_worked": 0
    },
    {
      "role": "Health Education Coordinator",
      "company": "Berkeley Student Cooperative",
      "start_date": "01/20XX",
      "end_date": "01/20XX",
      "years_worked": 0
    },
    {
      "role": "Writing Tutor",
      "company": "UC Berkeley Student Learning Center",
      "start_date": "01/20XX",
      "end_date": "01/20XX",
    