In [1]:
# pip install --upgrade "selenium>=4.20" requests beautifulsoup4

import os
import re
import json
import time
import tempfile
import datetime as dt
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

SEARCH_URL = ("https://jobs.careers.microsoft.com/global/en/search"
              "?lc=Mexico&lc=United%20States&l=en_us&pg=1&pgSz=20&o=Recent&flt=true")

DB_PATH = "jobs_ms.json"  # persistent JSON (dict keyed by job_id or url)
MAX_PAGES = 10
PAGE_LOAD_TIMEOUT = 60
WAIT_PER_PAGE = 25
DELAY_AFTER_NEXT = 1.2

# Optional: if Selenium Manager is blocked and you have a local chromedriver, set this path:
LOCAL_CHROMEDRIVER = ""  # e.g., r"C:\Tools\chromedriver\chromedriver.exe"

JOB_ID_FROM_ARIA = re.compile(r"Job item\s+(\d+)")
ISO_DATE_RE = re.compile(r"(20\d{2})-(\d{2})-(\d{2})")

session = requests.Session()
session.headers.update({"User-Agent": "MS-Careers-Scraper/1.5 (+you@example.com)"})


def launch_chrome():
    opts = ChromeOptions()
    opts.add_argument("--headless=new")
    opts.add_argument("--disable-gpu")
    opts.add_argument("--no-sandbox")
    opts.add_argument("--disable-dev-shm-usage")
    opts.add_argument("--window-size=1400,2000")
    if LOCAL_CHROMEDRIVER:
        from selenium.webdriver.chrome.service import Service
        return webdriver.Chrome(service=Service(LOCAL_CHROMEDRIVER), options=opts)
    return webdriver.Chrome(options=opts)  # Selenium Manager


def with_page(url: str, page: int) -> str:
    """Fallback: set pg=<page> in query if clicking Next fails."""
    parts = list(urlparse(url))
    q = parse_qs(parts[4], keep_blank_values=True)
    q["pg"] = [str(page)]
    parts[4] = urlencode(q, doseq=True)
    return urlunparse(parts)


def find_cards(driver):
    return driver.find_elements(By.CSS_SELECTOR, 'div[role="listitem"]')


def title_from_card(card):
    try:
        h2 = card.find_element(By.CSS_SELECTOR, "h2")
        t = (h2.text or "").strip()
        if t:
            return t
    except Exception:
        pass
    txt = (card.text or "").strip()
    return txt.splitlines()[0].strip() if txt else None


def job_id_from_card(card):
    aria = card.get_attribute("aria-label") or ""
    m = JOB_ID_FROM_ARIA.search(aria)
    if m:
        return m.group(1)
    try:
        outer = card._parent.execute_script("return arguments[0].outerHTML;", card)
    except Exception:
        outer = ""
    m2 = JOB_ID_FROM_ARIA.search(outer or "")
    return m2.group(1) if m2 else None


def link_from_card(card, job_id):
    try:
        a = card.find_element(By.CSS_SELECTOR, 'a[href*="/global/en/job/"]')
        href = a.get_attribute("href")
        if href:
            return href
    except Exception:
        pass
    return f"https://jobs.careers.microsoft.com/global/en/job/{job_id}/" if job_id else None


def parse_date_posted_from_detail(html_text):
    soup = BeautifulSoup(html_text, "html.parser")

    for tag in soup.find_all("script", attrs={"type": "application/ld+json"}):
        raw = tag.string or ""
        try:
            data = json.loads(raw)
        except Exception:
            continue
        items = data if isinstance(data, list) else [data]
        for item in items:
            if isinstance(item, dict) and item.get("@type") in {"JobPosting", "Posting"}:
                dp = item.get("datePosted") or item.get("dateCreated") or item.get("dateModified")
                if dp:
                    m = ISO_DATE_RE.search(dp)
                    if m:
                        return f"{m.group(1)}-{m.group(2)}-{m.group(3)}"

    m2 = ISO_DATE_RE.search(html_text)
    if m2:
        return f"{m2.group(1)}-{m2.group(2)}-{m2.group(3)}"

    text = soup.get_text(" ", strip=True)
    if "Today" in text:
        return dt.date.today().isoformat()
    return None


def click_next_if_possible(driver) -> bool:
    """
    Try several selectors to click the 'Next' pagination button.
    Returns True if we clicked something that looked like Next.
    """
    selectors = [
        (By.CSS_SELECTOR, 'button[aria-label*="Next"]:not([disabled]):not([aria-disabled="true"])'),
        (By.XPATH, "//button[(contains(., 'Next') or contains(@aria-label, 'Next')) and not(@disabled) and not(@aria-disabled='true')]"),
        (By.XPATH, "//a[(contains(., 'Next') or contains(@aria-label, 'Next')) and not(contains(@class,'disabled'))]"),
    ]
    for by, sel in selectors:
        try:
            btn = driver.find_element(by, sel)
            # Some UIs hide Next when not applicable; verify displayed & enabled
            if not btn.is_displayed():
                continue
            if btn.get_attribute("disabled") or btn.get_attribute("aria-disabled") == "true":
                continue
            btn.click()
            return True
        except Exception:
            continue
    return False


def wait_for_new_page(driver, prev_ids, timeout=12) -> bool:
    """
    After clicking Next, wait until we see a card with a job_id not in prev_ids,
    or DOM count changes. Returns True if changed, False if timeout.
    """
    t0 = time.time()
    last_count = len(prev_ids)
    while time.time() - t0 < timeout:
        time.sleep(0.8)
        cards = find_cards(driver)
        # attach driver reference for outerHTML extraction
        for c in cards:
            try:
                c._parent = driver
            except Exception:
                pass
        curr_ids = set()
        for c in cards:
            jid = job_id_from_card(c)
            if jid:
                curr_ids.add(jid)
        if len(cards) != last_count or (curr_ids - prev_ids):
            return True
    return False


def scrape_paginated(max_pages=MAX_PAGES):
    driver = launch_chrome()
    driver.set_page_load_timeout(PAGE_LOAD_TIMEOUT)
    wait = WebDriverWait(driver, WAIT_PER_PAGE)

    # Start at pg=1
    driver.get(SEARCH_URL)
    try:
        wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div[role="listitem"]')))
    except Exception:
        pass

    all_rows = []
    seen_global_ids = set()

    current_page = 1
    while current_page <= max_pages:
        # Collect this page’s 20 (or fewer)
        cards = find_cards(driver)
        for c in cards:
            try:
                c._parent = driver
            except Exception:
                pass

        print(f"[PAGE {current_page}] cards found: {len(cards)}")

        page_ids = set()
        for card in cards:
            name = title_from_card(card)
            jid  = job_id_from_card(card)
            if jid:
                page_ids.add(jid)
                if jid in seen_global_ids:
                    continue
            url  = link_from_card(card, jid)
            date_posted = None
            if url:
                try:
                    r = session.get(url, timeout=25, allow_redirects=True)
                    date_posted = parse_date_posted_from_detail(r.text)
                    url = r.url
                except Exception:
                    pass
            all_rows.append({
                "name": name,
                "job_id": jid,
                "url": url,
                "date_posted": date_posted
            })
            if jid:
                seen_global_ids.add(jid)

        # If fewer than 20, we’re done
        if len(cards) < 20:
            break

        # Try clicking Next; if that fails, try navigating with pg=+1
        clicked = click_next_if_possible(driver)
        if clicked:
            # wait for a change
            changed = wait_for_new_page(driver, page_ids, timeout=12)
            if not changed:
                # fallback: explicit navigation
                next_url = with_page(SEARCH_URL, current_page + 1)
                driver.get(next_url)
                try:
                    WebDriverWait(driver, WAIT_PER_PAGE).until(
                        EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div[role="listitem"]'))
                    )
                except Exception:
                    pass
        else:
            # no Next button detected; fallback to URL pg=+1
            next_url = with_page(SEARCH_URL, current_page + 1)
            driver.get(next_url)
            try:
                WebDriverWait(driver, WAIT_PER_PAGE).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div[role="listitem"]'))
                )
            except Exception:
                pass

        time.sleep(DELAY_AFTER_NEXT)
        current_page += 1

    driver.quit()
    return all_rows


# -------- persistence helpers -------- #

def load_db(path: str) -> dict:
    if not os.path.exists(path):
        return {}
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, dict):
            return data
        out = {}
        for row in data:
            key = (row.get("job_id") or row.get("url"))
            if key:
                out[str(key)] = row
        return out
    except Exception:
        return {}


def save_db_atomic(path: str, data: dict):
    os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True)
    fd, tmp_path = tempfile.mkstemp(prefix="jobs_", suffix=".json", dir=os.path.dirname(os.path.abspath(path)))
    os.close(fd)
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    os.replace(tmp_path, path)


def upsert_rows(db: dict, rows: list) -> int:
    added = 0
    for row in rows:
        key = str(row.get("job_id") or row.get("url"))
        if not key:
            continue
        if key not in db:
            db[key] = row
            added += 1
        else:
            old = db[key]
            for fld in ("name", "url", "date_posted"):
                if row.get(fld):
                    old[fld] = row[fld]
    return added


def main():
    print(f"[DB] loading: {DB_PATH}")
    db = load_db(DB_PATH)
    print(f"[DB] existing records: {len(db)}")

    print(f"[SCRAPE] paginate up to {MAX_PAGES} pages (20 per page)…")
    rows = scrape_paginated(max_pages=MAX_PAGES)
    print(f"[SCRAPE] total rows scraped: {len(rows)}")

    added = upsert_rows(db, rows)
    print(f"[DB] new rows added: {added}")

    print(f"[DB] saving to: {DB_PATH}")
    save_db_atomic(DB_PATH, db)

    print("\n[PREVIEW] first 10 entries:")
    for r in list(db.values())[:10]:
        print(f"- {r.get('name')} | {r.get('job_id')} | {r.get('date_posted')} | {r.get('url')}")


if __name__ == "__main__":
    main()


[DB] loading: jobs_ms.json
[DB] existing records: 389
[SCRAPE] paginate up to 10 pages (20 per page)…
[PAGE 1] cards found: 20
[PAGE 2] cards found: 20
[PAGE 3] cards found: 20
[PAGE 4] cards found: 20
[PAGE 5] cards found: 20
[PAGE 6] cards found: 20
[PAGE 7] cards found: 20
[PAGE 8] cards found: 20
[PAGE 9] cards found: 20
[PAGE 10] cards found: 20
[SCRAPE] total rows scraped: 200
[DB] new rows added: 20
[DB] saving to: jobs_ms.json

[PREVIEW] first 10 entries:
- Principal Applied Scientist | 1881669 | None | https://jobs.careers.microsoft.com/global/en/job/1881669/
- Senior Product Marketing Manager, Identity | 1879566 | None | https://jobs.careers.microsoft.com/global/en/job/1879566/
- Industry Advisory - Microsoft Discovery | 1881248 | None | https://jobs.careers.microsoft.com/global/en/job/1881248/
- Sales Strategy Enablement - Higher Education | 1880149 | None | https://jobs.careers.microsoft.com/global/en/job/1880149/
- Research Intern - MSR Systems Research Group - Redmond | 1

In [2]:
# read json
jobs = []
with open(DB_PATH, "r", encoding="utf-8") as f:
    jobs = json.load(f)
print(len(jobs), "jobs loaded.")

409 jobs loaded.


In [3]:
# pip install --upgrade selenium beautifulsoup4

import os, re, json, time
from typing import Dict, Any, List
from bs4 import BeautifulSoup, NavigableString

from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


# -------------------- CONFIG --------------------

DB_PATH_IN  = "jobs_ms.json"           # your existing list/index (from the search pages)
DB_PATH_OUT = "jobs_ms_details.json"   # NEW details DB (this script writes here)

LABELS = [
    "Date posted","Work site","Role type","Discipline",
    "Job number","Travel","Profession","Employment type"
]

# polite delay between requests (seconds)
SLEEP_BETWEEN = (0.6, 1.2)  # (min, max)
MAX_RETRIES   = 2           # retries per job page on failures


# -------------------- UTILS --------------------

def norm(s: str | None) -> str:
    import re as _re
    return _re.sub(r"\s+", " ", (s or "")).strip()

def sleep_a_bit():
    import random
    time.sleep(random.uniform(*SLEEP_BETWEEN))

def load_jobs_index(path: str) -> List[Dict[str, Any]]:
    """Load jobs_ms.json -> return a list of job rows with url or job_id."""
    if not os.path.exists(path):
        raise FileNotFoundError(path)
    data = json.load(open(path, "r", encoding="utf-8"))
    rows = list(data.values()) if isinstance(data, dict) else list(data)
    if not rows:
        raise RuntimeError("jobs_ms.json has no rows")
    return rows

def load_details_db(path: str) -> Dict[str, Any]:
    if not os.path.exists(path): return {}
    try:
        return json.load(open(path, "r", encoding="utf-8"))
    except Exception:
        return {}

def save_details_db(db: Dict[str, Any], path: str):
    with open(path, "w", encoding="utf-8") as f:
        json.dump(db, f, ensure_ascii=False, indent=2)

def make_job_url(row: Dict[str, Any]) -> str | None:
    if row.get("url"):
        return row["url"]
    jid = row.get("job_id")
    if jid:
        return f"https://jobs.careers.microsoft.com/global/en/job/{jid}/"
    return None

def upsert(rec: Dict[str, Any], db: Dict[str, Any]) -> None:
    key = str(rec.get("job_id") or rec.get("url"))
    if not key:
        return
    if key not in db:
        db[key] = rec
    else:
        # update only non-empty values
        for k, v in rec.items():
            if v not in (None, "", []):
                db[key][k] = v


# -------------------- PAY / LOC HELPERS --------------------

USD_RANGE  = re.compile(r"USD\s*\$\s*[\d,]+\s*-\s*\$\s*[\d,]+", re.I)
PAY_START  = re.compile(
    r"(typical\s+base\s+pay\s+range|base\s+pay\s+range\s+for\s+this\s+role|benefits\s+and\s+pay\s+information|USD\s*\$\s*[\d,]+\s*-\s*\$\s*[\d,]+)",
    re.I
)

def extract_pay_ranges(text: str) -> List[Dict[str,str]]:
    spans = []
    for m in USD_RANGE.finditer(text):
        s, e = m.span()
        ctx = text[max(0, s-140): min(len(text), e+140)]
        region = "U.S."
        if re.search(r"San\s*Francisco\s*Bay|New\s*York\s*City", ctx, re.I):
            region = "SF Bay Area / NYC"
        spans.append({"region": region, "range": m.group(0)})
    # de-dup
    uniq, seen = [], set()
    for r in spans:
        key = (r["region"], r["range"])
        if key not in seen:
            seen.add(key); uniq.append(r)
    return uniq

def extract_locations_jsonld(html_text: str) -> List[str]:
    out = []
    soup = BeautifulSoup(html_text, "html.parser")
    for tag in soup.find_all("script", {"type": "application/ld+json"}):
        try:
            data = json.loads(tag.string or "")
        except Exception:
            continue
        items = data if isinstance(data, list) else [data]
        for it in items:
            if isinstance(it, dict) and it.get("@type") in {"JobPosting","Posting"}:
                jl = it.get("jobLocation")
                if isinstance(jl, dict): jl = [jl]
                if isinstance(jl, list):
                    for loc in jl:
                        addr = (loc or {}).get("address", {})
                        parts = [addr.get("addressLocality"), addr.get("addressRegion"), addr.get("addressCountry")]
                        parts = [p for p in parts if p]
                        if parts: out.append(", ".join(parts))
    # de-dup
    return list(dict.fromkeys(out))


# -------------------- QUALIFICATIONS SPLIT (YOUR RULES) --------------------

REQ_RE   = re.compile(r"\bRequired\s+Qualifications\b", re.I)
PREF_RE  = re.compile(r"\bPreferred\s+Qualifications\b", re.I)
OTHER_RE = re.compile(r"\bOther\s+Requirements?\b", re.I)

def block_text_from_html(html: str) -> str:
    """Convert block-level HTML into newline-separated text preserving bullets."""
    soup = BeautifulSoup(html, "html.parser")
    pieces = []
    for node in soup.descendants:
        if isinstance(node, NavigableString):  # we'll take text from elements instead
            continue
        if node.name in ("ul","ol"):
            for li in node.select(":scope > li"):
                t = norm(li.get_text(" ", strip=True))
                if t: pieces.append("• " + t)
        elif node.name in ("p","div","section"):
            t = norm(node.get_text(" ", strip=True))
            if t: pieces.append(t)
    # collapse duplicate consecutive lines
    out = []
    for p in pieces:
        if not out or p != out[-1]:
            out.append(p)
    return "\n".join(out)

def find_span(text: str, pattern: re.Pattern, start_at: int = 0):
    m = pattern.search(text, start_at)
    return (m.start(), m.end()) if m else (None, None)

def slice_between(text: str, start_pat: re.Pattern, end_pats: tuple[re.Pattern, ...], start_offset_to_content=True) -> str:
    s0, s1 = find_span(text, start_pat)
    if s0 is None: return ""
    start = s1 if start_offset_to_content else s0
    ends = []
    for ep in end_pats:
        e0, _ = find_span(text, ep, start_at=start)
        if e0 is not None:
            ends.append(e0)
    stop = min(ends) if ends else len(text)
    return text[start:stop].strip()

def split_qualifications(qual_text: str):
    """
    Rules:
      - Required: from 'Required Qualifications' up to (Other OR Preferred)
      - Other   : from 'Other Requirements' up to 'Preferred Qualifications'
      - Preferred: from 'Preferred Qualifications' up to the pay paragraph
    """
    q = qual_text
    pay_start_idx, _ = find_span(q, PAY_START)
    pay_enders = (PAY_START,) if pay_start_idx is not None else ()

    required_text  = slice_between(q, REQ_RE,   (OTHER_RE, PREF_RE))
    other_text     = slice_between(q, OTHER_RE, (PREF_RE,))
    preferred_text = slice_between(q, PREF_RE,  pay_enders)

    # fallbacks if markers exist but end wasn't found
    if not required_text and REQ_RE.search(q):
        required_text = slice_between(q, REQ_RE, pay_enders)
    if not other_text and OTHER_RE.search(q):
        other_text = slice_between(q, OTHER_RE, pay_enders)
    if not preferred_text and PREF_RE.search(q):
        preferred_text = slice_between(q, PREF_RE, ())

    return required_text, preferred_text, other_text


# -------------------- SELENIUM SCRAPER --------------------

def launch_chrome():
    opt = ChromeOptions()
    opt.add_argument("--headless=new")
    opt.add_argument("--no-sandbox")
    opt.add_argument("--disable-dev-shm-usage")
    opt.add_argument("--window-size=1400,2200")
    return webdriver.Chrome(options=opt)

def safe_text(el) -> str | None:
    try:
        return norm(el.text)
    except Exception:
        return None

def parse_detail_page(url: str, driver: webdriver.Chrome) -> Dict[str, Any]:
    driver.get(url)
    WebDriverWait(driver, 35).until(EC.presence_of_element_located((By.TAG_NAME, "h1")))
    time.sleep(0.7)

    # anchor by "Date posted" and find the title just before it
    dp = WebDriverWait(driver, 25).until(
        EC.presence_of_element_located((By.XPATH, "//*[normalize-space()='Date posted']"))
    )
    title_el = dp.find_element(By.XPATH, "preceding::h1[1]")
    title = safe_text(title_el)

    # panel = smallest ancestor containing both title & the "Date posted" label
    panel = title_el.find_element(By.XPATH, "ancestor::*[.//*[normalize-space()='Date posted']][1]")

    # location: first meaningful text after <h1> inside the panel (skips Apply/Save area)
    location = None
    try:
        cand = panel.find_element(By.XPATH, ".//h1/following::*[normalize-space()][1]")
        txt = safe_text(cand)
        if txt and not any(x in txt for x in ("Apply", "Save", "Share job")):
            location = txt
    except Exception:
        pass

    # key/value fields
    def value_for(label: str) -> str | None:
        try:
            lab = panel.find_element(By.XPATH, f".//*[normalize-space()='{label}' or normalize-space()='{label}:']")
        except Exception:
            return None
        for rel in ["./following-sibling::*[normalize-space()][1]",
                    "following::*[normalize-space()][1]"]:
            try:
                node = lab.find_element(By.XPATH, rel)
                val = safe_text(node)
                if val: return val
            except Exception:
                pass
        return None

    fields = {lab: value_for(lab) for lab in LABELS}

    # Qualifications HTML (block after its heading)
    q_html = ""
    try:
        qh = panel.find_element(By.XPATH, ".//h2[normalize-space()='Qualifications'] | .//h3[normalize-space()='Qualifications']")
        frag, sib = [], qh
        for _ in range(160):
            try:
                sib = sib.find_element(By.XPATH, "following-sibling::*[1]")
            except Exception:
                break
            tg = sib.tag_name.lower()
            if tg in ("h2","h3"): break
            frag.append(sib.get_attribute("outerHTML"))
        q_html = "".join(frag)
    except Exception:
        pass

    qualifications_text = block_text_from_html(q_html) if q_html else ""
    req_text, pref_text, other_text = split_qualifications(qualifications_text)

    # pay ranges from qualifications (Microsoft places pay block there)
    pay_ranges = extract_pay_ranges(qualifications_text)

    # locations fallback via JSON-LD if needed
    if not location:
        jl = extract_locations_jsonld(driver.page_source)
        if jl: location = " | ".join(jl)

    # job id
    m = re.search(r"/job/(\d+)", driver.current_url)
    job_id = fields.get("Job number") or (m.group(1) if m else None)

    return {
        "job_id": job_id,
        "title": title,
        "url": driver.current_url,
        "date_posted": fields.get("Date posted"),
        "locations": [location] if location else [],
        "travel": fields.get("Travel"),
        # FULL TEXT blocks (as requested)
        "required_qualifications_text": req_text,
        "other_requirements_text":     other_text,
        "preferred_qualifications_text": pref_text,
        "qualifications_text": qualifications_text,   # entire section for reference
        "pay_ranges": pay_ranges,
    }


# -------------------- MAIN: PROCESS ALL URLS --------------------

def main():
    rows = load_jobs_index(DB_PATH_IN)
    
    # Build URLs list...
    urls = []
    seen = set()
    for row in rows:
        url = make_job_url(row)
        if not url: continue
        if url not in seen:
            seen.add(url)
            urls.append(url)

    details_db = load_details_db(DB_PATH_OUT)
    
    # Restart browser every 10 jobs to prevent memory issues
    RESTART_EVERY = 10
    drv = None
    processed = 0
    
    try:
        print(f"[DETAILS] processing {len(urls)} job pages …")
        for i, url in enumerate(urls, 1):
            key = re.search(r"/job/(\d+)", url)
            key = key.group(1) if key else url

            if key in details_db:
                print(f"[{i}/{len(urls)}] SKIP already saved: {key}")
                continue

            # Restart browser every RESTART_EVERY jobs
            if drv is None or (processed > 0 and processed % RESTART_EVERY == 0):
                if drv:
                    print(f"   - restarting browser after {RESTART_EVERY} jobs")
                    drv.quit()
                    time.sleep(2)
                drv = launch_chrome()

            print(f"[{i}/{len(urls)}] GET {url}")
            success = False
            for attempt in range(1, MAX_RETRIES+1):
                try:
                    rec = parse_detail_page(url, drv)
                    upsert(rec, details_db)
                    processed += 1
                    success = True
                    break
                except Exception as e:
                    print(f"   ! attempt {attempt} failed: {e}")
                    if "chrome" in str(e).lower() or "session" in str(e).lower():
                        # Browser crash - restart it
                        try:
                            drv.quit()
                        except:
                            pass
                        time.sleep(2)
                        drv = launch_chrome()
                    time.sleep(1.0)

            if not success:
                print(f"   x failed all attempts: {url}")

            # Checkpoint save every 5 records
            if processed and processed % 5 == 0:
                save_details_db(details_db, DB_PATH_OUT)
                print(f"   - checkpoint saved ({processed} records)")

            sleep_a_bit()

    finally:
        save_details_db(details_db, DB_PATH_OUT)
        if drv:
            drv.quit()

    print(f"[DONE] wrote {len(details_db)} records to {DB_PATH_OUT}")

if __name__ == "__main__":
    main()


[DETAILS] processing 408 job pages …
[1/408] SKIP already saved: 1881669
[2/408] SKIP already saved: 1879566
[3/408] SKIP already saved: 1881248
[4/408] SKIP already saved: 1880149
[5/408] SKIP already saved: 1883457
[6/408] SKIP already saved: 1837771
[7/408] SKIP already saved: 1878935
[8/408] SKIP already saved: 1876605
[9/408] SKIP already saved: 1881287
[10/408] SKIP already saved: 1841942
[11/408] SKIP already saved: 1883443
[12/408] SKIP already saved: 1881809
[13/408] SKIP already saved: 1882128
[14/408] SKIP already saved: 1878004
[15/408] SKIP already saved: 1860231
[16/408] SKIP already saved: 1881109
[17/408] SKIP already saved: 1880623
[18/408] SKIP already saved: 1882874
[19/408] SKIP already saved: 1875579
[20/408] SKIP already saved: 1883136
[21/408] SKIP already saved: 1883354
[22/408] SKIP already saved: 1858524
[23/408] SKIP already saved: 1880866
[24/408] SKIP already saved: 1881872
[25/408] SKIP already saved: 1839391
[26/408] SKIP already saved: 1883330
[27/408] S

In [4]:
# run with: python flag_jobs_by_field_minimal.py

import os, re, json
from typing import Dict, List, Any, Iterable

DETAILS_PATH = "jobs_ms_details.json"                 # input (your detailed jobs)
OUTPUT_PATH  = "jobs_ms_avoid_hits_by_field.json"     # output (only hits, no empties)

# ----------------------------------------------
# EDIT: classes -> { field -> [keywords...] }
# "*" applies to ALL scannable fields
# fields should match keys in your details JSON
# ----------------------------------------------
AVOID_RULES: Dict[str, Dict[str, List[str]]] = {
    "visa_sponsorship_block": {
        "title": ["no sponsorship", "no visa"],
        "qualifications_text": ["without sponsorship"],
        "other_requirements_text": ["citizens only", "citizenship required", "citizenship is required", "U.S. citizens", "US citizens", "green card", "permanent resident"],
    },
    "senior_only": {
        "title": ["principal only", "senior only"],
        "required_qualifications_text": ["6+ years", "10+ years", "12+ years"],
    },
    "clearance_required": {
        "other_requirements_text": ["security clearance", "public trust", "polygraph"],
    },
    "knowledge_fullstack": {
        "required_qualifications_text": ["HTML", "React", "Node.js", "REST", "Full Stack", "Full-Stack", "Fullstack", "Front End", "Frontend", "Back End", "Backend", "API",
                  "Angular", "Vue.js", "Django", "Flask", "Ruby on Rails", "PHP", "http", "HTTP", "HTTPS", "https"],
    },
    "unwanted_languages": {
        "required_qualifications_text": ["java", "javascript", "c#", "c-sharp", "c plus plus", "c++", "ruby", "php", "swift", "kotlin", "go ", "golang", "r ", "perl", "scala", "haskell", "lua"],
    },
    "knowledge_python": {
        "*": ["python"],
    },
    "unwanted_positions": {
        "title": ["finance", "accounting", "recruiter", "recruitment", 
                  "salesforce", "sales force", "sales", "marketing", 
                  "legal", "attorney", "lawyer", "paralegal", "compliance",
                  "human resources", "hr ", "talent acquisition", "talent management",
                  "UX designer", "user experience", "graphic designer", "ui designer",
                  "technical writer", "content writer", "copywriter",],
        "required_qualifications_text": ["finance", "accounting", "recruiter", "recruitment",
                  "salesforce", "sales force", "sales", "marketing",
                  "legal", "attorney", "lawyer", "paralegal", "compliance",
                  "human resources", "hr ", "talent acquisition", "talent management",
                  "UX designer", "user experience", "graphic designer", "ui designer",
                  "technical writer", "content writer", "copywriter",],
    }
}

# Limit which fields we scan (None = auto-detect string/list/dict fields)
SCANNABLE_FIELDS: List[str] | None = [
    "title",
    "locations",
    "travel",
    "qualifications_text",
    "required_qualifications_text",
    "preferred_qualifications_text",
    "other_requirements_text",
    "date_posted",
]

# ----------------------------------------------

def norm(s: str | None) -> str:
    return re.sub(r"\s+", " ", (s or "")).strip()

def to_text(val: Any) -> str:
    """Normalize any field into plain text (lowercased) for matching."""
    if val is None:
        return ""
    if isinstance(val, list):
        parts: List[str] = []
        for x in val:
            if isinstance(x, dict):
                parts.append(json.dumps(x, ensure_ascii=False))
            else:
                parts.append(str(x))
        return norm(" | ".join(parts)).lower()
    if isinstance(val, dict):
        return norm(json.dumps(val, ensure_ascii=False)).lower()
    return norm(str(val)).lower()

def load_json(path: str) -> Any:
    if not os.path.exists(path):
        raise FileNotFoundError(path)
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def save_json(obj: Any, path: str) -> None:
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)

def get_job_id(key: str, rec: Dict[str, Any]) -> str:
    if rec.get("job_id"):
        return str(rec["job_id"])
    m = re.search(r"/job/(\d+)", rec.get("url") or key or "")
    return m.group(1) if m else (rec.get("url") or key or "UNKNOWN")

def iter_scannable_fields(rec: Dict[str, Any]) -> Iterable[str]:
    if SCANNABLE_FIELDS is not None:
        for f in SCANNABLE_FIELDS:
            if f in rec:
                yield f
    else:
        for f, v in rec.items():
            if isinstance(v, (str, list, dict)):
                yield f

def kw_boundary_search(blob: str, kw: str) -> bool:
    """Case-insensitive word-ish boundary search to avoid 'java' in 'javascript'."""
    return re.search(rf"(?<!\w){re.escape(kw)}(?!\w)", blob, re.I) is not None

def materialize_field_keywords(per_field: Dict[str, List[str]], available_fields: List[str]) -> Dict[str, List[str]]:
    """
    Merge wildcard '*' kws into each available field, plus its explicit kws.
    Returns only fields that end up with >=1 keyword.
    """
    result: Dict[str, List[str]] = {}
    wild = per_field.get("*", [])
    for f in available_fields:
        kws = set(wild)
        if f in per_field:
            kws.update(per_field[f])
        if kws:
            result[f] = sorted(kws)
    return result

def main():
    details = load_json(DETAILS_PATH)  # dict: key -> record

    # We'll only add classes that have at least one hit.
    hits_out: Dict[str, Dict[str, Any]] = {}

    total = 0
    total_hits = 0

    for key, rec in details.items():
        total += 1
        job_id = get_job_id(key, rec)

        # cache blobs per field for this record
        available_fields = list(iter_scannable_fields(rec))
        field_blob: Dict[str, str] = {f: to_text(rec.get(f)) for f in available_fields}

        for cls, per_field in AVOID_RULES.items():
            # build the final field->keywords map (explicit + wildcard)
            field_kws = materialize_field_keywords(per_field, available_fields)
            if not field_kws:
                continue

            # test this job for this class
            matched_fields: Dict[str, List[str]] = {}
            for f, kws in field_kws.items():
                blob = field_blob.get(f, "")
                if not blob:
                    continue
                found = [kw for kw in kws if kw_boundary_search(blob, kw)]
                if found:
                    matched_fields[f] = sorted(set(found))

            if matched_fields:
                # we have at least one hit: add/update this class in output
                bucket = hits_out.setdefault(cls, {"job_ids": [], "matches": {}})
                if job_id not in bucket["job_ids"]:
                    bucket["job_ids"].append(job_id)
                    total_hits += 1
                # store only non-empty matches for this job_id
                bucket["matches"][job_id] = matched_fields

    # tidy/sort for stable diffs
    for cls, bucket in list(hits_out.items()):
        bucket["job_ids"] = sorted(bucket["job_ids"], key=lambda x: (len(str(x)), str(x)))
        # if somehow a class ended up with no job_ids (shouldn't happen), drop it
        if not bucket["job_ids"]:
            del hits_out[cls]

    save_json(hits_out, OUTPUT_PATH)

    print(f"[OK] scanned {total} jobs from {DETAILS_PATH}")
    print(f"[OK] wrote ONLY hits to {OUTPUT_PATH}")
    for cls in sorted(hits_out.keys()):
        print(f"  - {cls}: {len(hits_out[cls]['job_ids'])} job(s)")

if __name__ == "__main__":
    main()


[OK] scanned 405 jobs from jobs_ms_details.json
[OK] wrote ONLY hits to jobs_ms_avoid_hits_by_field.json
  - clearance_required: 12 job(s)
  - knowledge_fullstack: 25 job(s)
  - knowledge_python: 169 job(s)
  - senior_only: 60 job(s)
  - unwanted_languages: 133 job(s)
  - unwanted_positions: 67 job(s)
  - visa_sponsorship_block: 1 job(s)


In [5]:
df = load_details_db(OUTPUT_PATH)

In [6]:
from datetime import datetime

# show knowledge python hits
python_job = set(df['knowledge_python']['job_ids'])
knowledge_fullstack = set(df['knowledge_fullstack']['job_ids'])
clearance_required = set(df['clearance_required']['job_ids'])   
visa_sponsorship_block = set(df['visa_sponsorship_block']['job_ids'])
unwanted_positions = set(df['unwanted_positions']['job_ids'])
senior_only = set(df['senior_only']['job_ids'])
wanted_python_jobs = python_job - knowledge_fullstack - clearance_required - visa_sponsorship_block - unwanted_positions - senior_only
print(f"Total Python jobs: {len(python_job)}")
print(f"Total knowledge fullstack jobs: {len(knowledge_fullstack)}")
print(f"Total wanted Python jobs: {len(wanted_python_jobs)}")
full_list = load_details_db(DB_PATH_OUT)
# Sort wanted_python_jobs by date_posted (oldest first)

def parse_date(date_str):
    # Try to parse various date formats, fallback to a large date if missing
    if not date_str:
        return datetime.max
    for fmt in ("%b %d, %Y", "%Y-%m-%d", "%b %d, %Y."):
        try:
            return datetime.strptime(date_str.strip(), fmt)
        except Exception:
            continue
    return datetime.max

sorted_jobs = sorted(
    wanted_python_jobs,
    key=lambda job_id: parse_date(full_list.get(job_id, {}).get("date_posted"))
)

full_detailed_list = load_details_db(DETAILS_PATH)

sorted_jobs_detailed = sorted(
    wanted_python_jobs,
    key=lambda job_id: parse_date(full_detailed_list.get(job_id, {}).get("date_posted"))
)

for job_id in sorted_jobs:
    job = full_list.get(job_id, {})
    print(f"- {job_id} | {job.get('title')} | {job.get('locations')} | {job.get('travel')} | {job.get('date_posted')} | {job.get('url')}")


Total Python jobs: 169
Total knowledge fullstack jobs: 25
Total wanted Python jobs: 117
- 1864545 | Senior Fabric IP Verification Engineer | ['Austin, Texas, United States + 4 more locations'] | 0-25 % | Sep 18, 2025 | https://jobs.careers.microsoft.com/global/en/job/1864545/
- 1879713 | Senior Software Engineer | ['Redmond, Washington, United States'] | 0-25 % | Sep 23, 2025 | https://jobs.careers.microsoft.com/global/en/job/1879713/
- 1857039 | Principal Software Engineer | ['Redmond, Washington, United States + 4 more locations'] | None | Sep 23, 2025 | https://jobs.careers.microsoft.com/global/en/job/1857039/
- 1864645 | Software Engineer II | ['Multiple Locations, United States'] | 0-25 % | Sep 23, 2025 | https://jobs.careers.microsoft.com/global/en/job/1864645/
- 1828770 | Senior Applied Scientist-Word | ['Redmond, Washington, United States'] | 0-25 % | Sep 23, 2025 | https://jobs.careers.microsoft.com/global/en/job/1828770/
- 1882171 | Senior Software Engineer | ['Multiple Locat

In [7]:
from collections import defaultdict
from datetime import datetime
import os

# Group jobs by date_posted
jobs_by_date = defaultdict(list)

for job_id in sorted_jobs_detailed:
    job = full_detailed_list.get(job_id, {})
    date_posted = job.get('date_posted', 'unknown')
    
    # Create a clean filename from date
    if date_posted and date_posted != 'unknown':
        try:
            # Parse the date and format it for filename
            parsed_date = parse_date(date_posted)
            if parsed_date != datetime.max:
                # Format as "18_september_2025"
                filename_date = parsed_date.strftime("%d_%B_%Y").lower()
            else:
                filename_date = date_posted.replace("-", "_").replace(" ", "_").replace(",", "")
        except:
            filename_date = date_posted.replace("-", "_").replace(" ", "_").replace(",", "")
    else:
        filename_date = "unknown_date"
    
    jobs_by_date[filename_date].append({
        "job_id": job_id,
        "title": job.get('title'),
        "locations": job.get('locations'),
        "travel": job.get('travel'),
        "date_posted": date_posted,
        "url": job.get('url'),
        "required_qualifications_text": job.get('required_qualifications_text'),
        "preferred_qualifications_text": job.get('preferred_qualifications_text'),
        "other_requirements_text": job.get('other_requirements_text'),
        "pay_ranges": job.get('pay_ranges')
    })

# Save each date group to a separate file
output_dir = "jobs_by_date"
os.makedirs(output_dir, exist_ok=True)

for date_str, jobs_list in jobs_by_date.items():
    filename = f"jobs_{date_str}.json"
    filepath = os.path.join(output_dir, filename)
    
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(jobs_list, f, ensure_ascii=False, indent=2)
    
    print(f"Saved {len(jobs_list)} jobs to {filepath}")

print(f"\nTotal files created: {len(jobs_by_date)}")
print(f"Total jobs saved: {sum(len(jobs) for jobs in jobs_by_date.values())}")

Saved 1 jobs to jobs_by_date\jobs_18_september_2025.json
Saved 6 jobs to jobs_by_date\jobs_23_september_2025.json
Saved 35 jobs to jobs_by_date\jobs_24_september_2025.json
Saved 25 jobs to jobs_by_date\jobs_25_september_2025.json
Saved 25 jobs to jobs_by_date\jobs_26_september_2025.json
Saved 2 jobs to jobs_by_date\jobs_27_september_2025.json
Saved 23 jobs to jobs_by_date\jobs_29_september_2025.json

Total files created: 7
Total jobs saved: 117
