# Setup

- Prepare environment and imports.
- Define input/output paths; create output folder.
- List available files and show examples.
- Set basic config (years, fuzzy thresholds).

Check availability and directories

In [None]:
# notebook setup
import os
import re
from pathlib import Path

import pandas as pd
from pandas.errors import EmptyDataError
from rapidfuzz import fuzz, process
from unidecode import unidecode

# paths
# Defaults reproduce the original workstation layout. Set CONGRESS_DATA_DIR /
# CONGRESS_OUTPUT_DIR in the environment to run the notebook on another machine
# without editing this cell (an empty value falls back to the default).
DATA_DIR = Path(os.environ.get("CONGRESS_DATA_DIR") or "E:/Daniel/Stockholm/submission/python/data")
OUTPUT_DIR = Path(
    os.environ.get("CONGRESS_OUTPUT_DIR")
    or "E:/Daniel/Stockholm/submission/notebooks/submission/python/output"
)
# create the output folder (and any missing parents); no error if it already exists
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# files
members_path = DATA_DIR / "congress_members_with_parties.csv"
# one CSV per year, sorted by file name
election_paths = sorted(DATA_DIR.glob("congressional_elections_*.csv"))

print("Members file:", "OK" if members_path.exists() else "NOT FOUND")
print("Election files:", len(election_paths))
print("Examples:", [p.name for p in election_paths[:5]])

# config
# 1992..2025 inclusive
YEARS_EXPECTED = list(range(1992, 2026))
# fuzzy-score cut-offs; NOTE(review): not referenced by the cells visible in this notebook
FUZZY_HIGH_MIN = 92
FUZZY_MED_MIN = 88


Members file: OK
Election files: 34
Examples: ['congressional_elections_1992.csv', 'congressional_elections_1993.csv', 'congressional_elections_1994.csv', 'congressional_elections_1995.csv', 'congressional_elections_1996.csv']


# name normalisation

- Lowercase and remove accents.
- Strip suffixes (jr/sr/ii–v).
- Remove punctuation and collapse spaces.
- Optional removal of onomastic stopwords (e.g., *de, del, van*).
- Returns a clean string for matching.



In [70]:
# normalise names
# Name particles removed when remove_stopwords=True. The one-letter entries ("y", "e")
# are listed but never removed, because one-letter tokens are always kept as initials.
STOPWORDS_ONOMASTIC = {
    "de","del","la","las","los","y","e","da","do","dos","das",
    "van","von","der","den","di","della","dalla","dela","al","el","bin","ibn"
}
# Generational suffixes that are unambiguous wherever they occur. The Roman numeral "v"
# is deliberately NOT part of this pattern: it is indistinguishable from the initial "V."
# and is only dropped as a trailing token inside normalise_name.
_SUFFIX_RE = re.compile(r"\b(?:jr|sr|ii|iii|iv)\b", re.IGNORECASE)

# tweak: do not drop 1-letter tokens (preserve initials)
def normalise_name(s: str, remove_stopwords: bool = True) -> str:
    """Return a lowercase, ASCII, punctuation-free version of a personal name.

    Steps: transliterate with ``unidecode`` and lowercase; strip the suffixes
    jr/sr/ii/iii/iv; replace punctuation with spaces; collapse whitespace; drop a
    trailing "v" (the Roman-numeral suffix) when at least two other tokens precede it;
    optionally drop onomastic stopwords longer than one letter.

    Args:
        s: Raw name. ``None`` and float NaN (a missing cell read by pandas) give "".
        remove_stopwords: When True, tokens in ``STOPWORDS_ONOMASTIC`` with more than
            one letter are removed.

    Returns:
        The normalised name with tokens separated by single spaces, or "" when nothing
        is left.
    """
    # missing values: None, or float NaN (NaN is the only float not equal to itself)
    if s is None or (isinstance(s, float) and s != s):
        return ""
    s = unidecode(str(s)).lower()
    s = _SUFFIX_RE.sub(" ", s)
    s = re.sub(r"[-–—'’.,]", " ", s)
    s = re.sub(r"[^\w\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    if not s:
        return ""
    tokens = s.split()
    # "Henry Ford V" -> suffix, dropped; "John V. Smith" / "John V" -> initial or surname, kept
    if len(tokens) >= 3 and tokens[-1] == "v":
        tokens.pop()
    if remove_stopwords:
        # keep initials (len == 1), drop stopwords only if len > 1
        tokens = [t for t in tokens if not (t in STOPWORDS_ONOMASTIC and len(t) > 1)]
    return " ".join(tokens)


# members standardisation

- Load reference members with selected columns.
- Build `firstlast_raw` = first + last (names).
- Create `display_name_std` via `normalise_name`.
- Save cleaned table to `members_std.csv`.
- Print basic counts and preview.

In [71]:
# members
usecols = [
    "name", "first_name", "last_name", "title", "state",
    "house_or_senate", "start_year", "end_year", "party",
]

# Load the reference table, then derive "<first> <last>" (missing parts become blanks,
# the outer strip removes a dangling separator) and its normalised form.
m = (
    pd.read_csv(members_path, usecols=usecols, low_memory=False)
      .assign(
          firstlast_raw=lambda d: (
              d["first_name"].fillna("").astype(str).str.strip()
              .str.cat(d["last_name"].fillna("").astype(str).str.strip(), sep=" ")
              .str.strip()
          )
      )
      .assign(display_name_std=lambda d: d["firstlast_raw"].map(normalise_name))
)

# persist the standardised table
out_members = OUTPUT_DIR / "members_std.csv"
m.to_csv(out_members, index=False)

# normalised member names as a plain list of strings
choices = list(m["display_name_std"].astype(str))

print("Saved:", out_members)
print("Members rows:", len(m), "| Unique display names:", m["display_name_std"].nunique())
display(m[["name","first_name","last_name","display_name_std"]].head(8))


Saved: E:\Daniel\Stockholm\submission\notebooks\submission\python\output\members_std.csv
Members rows: 2873 | Unique display names: 2586


Unnamed: 0,name,first_name,last_name,display_name_std
0,"Abdnor, James",James,Abdnor,james abdnor
1,"Abdnor, James",James,Abdnor,james abdnor
2,"Abercrombie, Neil",Neil,Abercrombie,neil abercrombie
3,"Abercrombie, Neil",Neil,Abercrombie,neil abercrombie
4,"Abourezk, James",James,Abourezk,james abourezk
5,"Abourezk, James",James,Abourezk,james abourezk
6,"Abraham, Ralph Lee",Ralph Lee,Abraham,ralph lee abraham
7,"Abraham, Spencer",Spencer,Abraham,spencer abraham


# elections standardisation

- Read all yearly election files (skip empty files).
- Keep key fields and add `display_name_std` using `normalise_name` (the same normalisation applied to members).
- Attach `source_file` for traceability and cast `year`.
- Concatenate all years into a single table and save as `elections_std_all.csv`.
- Print row counts, sample years, and a small preview.

In [72]:
# elections
keep = ["name","party","year","url","status","election"]
frames = []
for p in election_paths:
    try:
        df = pd.read_csv(p, usecols=keep, low_memory=False)
    except EmptyDataError:
        # yearly file without any content: nothing to standardise
        continue
    else:
        # add the normalised name and remember which file each row came from
        df = df.assign(
            display_name_std=df["name"].map(normalise_name),
            source_file=p.name,
        )
        frames.append(df)

# stack all years; fall back to an empty frame with the expected columns
if frames:
    e_all = pd.concat(frames, ignore_index=True)
else:
    e_all = pd.DataFrame(columns=keep + ["display_name_std","source_file"])
# nullable integer year (unparseable values become <NA>)
e_all["year"] = pd.to_numeric(e_all["year"], errors="coerce").astype("Int64")

out_elec = OUTPUT_DIR / "elections_std_all.csv"
e_all.to_csv(out_elec, index=False)

print("Saved:", out_elec)
print("Rows:", len(e_all), "| Years:", e_all["year"].dropna().unique()[:10])
display(e_all.head(8))

Saved: E:\Daniel\Stockholm\submission\notebooks\submission\python\output\elections_std_all.csv
Rows: 24244 | Years: <IntegerArray>
[1992, 1994, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003]
Length: 10, dtype: Int64


Unnamed: 0,name,url,status,party,year,election,display_name_std,source_file
0,Mary Jordan,/candidate/1046/mary-jordan,lost,Green Party,1992,congressional,mary jordan,congressional_elections_1992.csv
1,Frank Murkowski,/candidate/53267/frank-murkowski,won,Republican,1992,congressional,frank murkowski,congressional_elections_1992.csv
2,Tony Smith,/candidate/1047/tony-smith,lost,Democratic,1992,congressional,tony smith,congressional_elections_1992.csv
3,John Devens,/candidate/3/john-devens,lost,Democratic,1992,congressional,john devens,congressional_elections_1992.csv
4,Mike Milligan,/candidate/1/mike-milligan,lost,Green Party,1992,congressional,mike milligan,congressional_elections_1992.csv
5,Michael States,/candidate/1044/michael-states,lost,,1992,congressional,michael states,congressional_elections_1992.csv
6,Donald Young,/candidate/26717/donald-young,won,Republican,1992,congressional,donald young,congressional_elections_1992.csv
7,Richard Sellers,/candidate/1035/richard-sellers,lost,Republican,1992,congressional,richard sellers,congressional_elections_1992.csv


# counts verification

- Compare original vs standardised row counts for members.
- Sum rows across all election files vs concatenated standard table.
- Report differences and list any skipped empty files.

In [73]:
# verify counts

# members counts: rows in the source file vs rows in the saved standardised file
mem_orig_rows = len(pd.read_csv(members_path, usecols=["name"], low_memory=False))
mem_std_rows  = len(pd.read_csv(out_members,   usecols=["display_name_std"], low_memory=False))

# elections counts: sum over the yearly files, remembering the ones that are empty
elec_orig_rows = 0
skipped = []
for p in election_paths:
    try:
        n_rows = len(pd.read_csv(p, usecols=["name"], low_memory=False))
    except EmptyDataError:
        skipped.append(p.name)
    else:
        elec_orig_rows += n_rows

elec_std_rows = len(pd.read_csv(out_elec, usecols=["display_name_std"], low_memory=False))

print("Members original rows:", mem_orig_rows)
print("Members std rows     :", mem_std_rows)
print(" → diff (std - orig) :", mem_std_rows - mem_orig_rows)
print()
print("Elections original rows (sum):", elec_orig_rows)
print("Elections std rows           :", elec_std_rows)
print(" → diff (std - sum)          :", elec_std_rows - elec_orig_rows)
if len(skipped) > 0:
    print("\nSkipped empty files:", len(skipped))
    print("Examples:", skipped[:5])

Members original rows: 2873
Members std rows     : 2873
 → diff (std - orig) : 0

Elections original rows (sum): 24244
Elections std rows           : 24244
 → diff (std - sum)          : 0

Skipped empty files: 2
Examples: ['congressional_elections_1993.csv', 'congressional_elections_1995.csv']


# Stricter matching

**Goal.** High-precision links by combining surname blocking, robust given-name rules, and conservative fuzzy similarity.

---

### Inputs
- `members_std.csv`  
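  - `firstlast_raw`, `display_name_std` (stopwords removed). `display_name_ns` (stopwords kept) is not stored in the file; the cell derives it from `firstlast_raw` when the column is missing.
- `elections_std_all.csv`  
  - `name`, `display_name_std`; `display_name_ns` is likewise derived from `name` when the column is missing.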

> Note: Suffixes like *Jr.*, *Sr.*, *II/III* are stripped during normalisation. 
---

### Blocking (surname)
- Build candidate sets **only** within the same **last-token** surname block (`member_last` ↔ `elec_last`).
- Additional guard: require intersection between the member’s surname set and the candidate’s surname set.  
  - `surname_set()` uses the last **two** tokens when `USE_LAST2_SURNAME=True` (helps with hyphenations/double surnames).
- **No cross-surname fallback** (prioritises precision over recall).

---

### Aliases (given names)
- Small canonical map of nicknames ↔ formal names (e.g., *tom ↔ thomas*, *bill ↔ william*).
- Generate first-name **variants** (nick→formal and formal→nick) used only for scoring.

---

### Given-names rule
- Split each name into **given names** (all tokens before the surname) and the **surname**.
- **First given name**
  - Accept canonical equality or nickname equivalence.
  - If one side is an **initial**, require the **same initial**; also allow the “**A. Donald**” ↔ “**Donald**” fallback.
- **Additional given names on the *member* side**
  - If the candidate has a token at that position → require **same canonical** **or** **same initial**.
  - If the candidate lacks that token → **accept** (missing middle on candidate).
- **Extra given names on the candidate** beyond the member are **ignored** (no penalty).
- This blocks false positives such as **“Albert L. Smith”** ↔ **“Albert Alan Smith”** (L ≠ A).

---

### Similarity scoring
- RapidFuzz `token_sort_ratio` on two views:
  1) **Full** name vs full (`display_name_ns`).
  2) **First+last** only (to reduce middle-name noise).
- For each alias variant, take the max of the two scores; overall **score = max** across variants.

---

### Thresholds & limits
- Keep a candidate when `score ≥ MIN_SCORE` (here **90**).
- Cap per member at `TOPK` (here **35**) and **de-duplicate** by `(normalised candidate name, year, candidate name)`.

---

### Outputs
- **`per_member_top{TOPK}.csv`**  
  Wide table with up to `TOPK` candidates per member: `elec_std_j` (candidate name normalised with stopwords kept), `elec_name_j`, `elec_year_j`, `elec_party_j`, `elec_score_j`.
- Quick diagnostics: number of members **with**/**without** candidates and a small preview.


In [74]:
# stricter matching (no "extras-only-initials" veto)
# tunables: candidates kept per member and the minimum similarity to keep a candidate
TOPK, MIN_SCORE = 35, 90
# surname guard switches (see surname_set)
REQUIRE_SURNAME, USE_LAST2_SURNAME = True, True

# fresh data: re-read both standardised tables from disk
m, e_all = (
    pd.read_csv(OUTPUT_DIR / fname, low_memory=False)
    for fname in ("members_std.csv", "elections_std_all.csv")
)

# ensure 'e' is not treated as an onomastic stopword
try:
    STOPWORDS_ONOMASTIC = {w for w in STOPWORDS_ONOMASTIC if w != "e"}
except NameError:
    # the stopword set is not defined in this session: nothing to adjust
    pass

# alias canon: nickname -> formal given name
ALIAS = {
    "tom": "thomas",
    "mike": "michael",
    "dan": "daniel",
    "rob": "robert",
    "ron": "ronald",
    "joe": "joseph",
    "pete": "peter",
    "charlie": "charles",
    "doug": "douglas",
    "ben": "benjamin",
    "bill": "william",
    "bob": "robert",
    "pat": "patrick",
    "jim": "james",
    "liz": "elizabeth",
}
# canonical lookup: nicknames map to their formal name, formal names map to themselves
FORMAL = dict(ALIAS)
FORMAL.update((formal, formal) for formal in ALIAS.values())

# helpers
def _last(s):
    """Return the final whitespace-separated token of ``s`` ("" when ``s`` is blank)."""
    text = str(s).strip()
    return text.split()[-1] if text else ""

def surname_set(name_std: str) -> set:
    """Return the token(s) treated as the surname of a normalised name.

    One-letter tokens and tokens in ``STOPWORDS_ONOMASTIC`` are ignored. With
    ``USE_LAST2_SURNAME`` enabled and at least two remaining tokens, the last two
    tokens are returned (so a plain "first last" name yields both of its tokens);
    otherwise only the last token is returned. An empty set means no usable token.
    """
    significant = []
    for token in str(name_std).split():
        if len(token) <= 1 or token in STOPWORDS_ONOMASTIC:
            continue
        significant.append(token)
    if not significant:
        return set()
    width = 2 if (USE_LAST2_SURNAME and len(significant) >= 2) else 1
    return set(significant[-width:])

def first_last_only(name_ns: str) -> str:
    """Reduce a name to "<first token> <last token>".

    A single-token name yields that token twice; a blank name yields "".
    """
    parts = str(name_ns).split()
    return " ".join((parts[0], parts[-1])) if parts else ""

def alias_variants_ns(name_ns: str) -> set:
    """Return the name plus copies whose first token is swapped for equivalent spellings.

    The first token is resolved to its formal form through ``ALIAS``; every other
    spelling sharing that formal form (the formal name itself and all of its
    nicknames) produces one variant. A formal name with several nicknames, such as
    "robert" (rob, bob), therefore yields a variant for each nickname, and a nickname
    also yields its sibling nicknames.

    Args:
        name_ns: Normalised name (tokens separated by whitespace).

    Returns:
        A set that always contains ``name_ns`` itself. Names whose first token is
        unknown to ``ALIAS``, and blank names, return only ``{name_ns}``.
    """
    parts = str(name_ns).split()
    if not parts:
        return {name_ns}
    first, rest = parts[0], parts[1:]
    # formal spelling of the first token (unchanged when it is not a known nickname)
    canonical = ALIAS.get(first, first)
    # all spellings in the same family; a one-to-one inverse dict would lose nicknames
    # whenever two of them share a formal name
    spellings = {canonical} | {nick for nick, formal in ALIAS.items() if formal == canonical}
    variants = {name_ns}
    for spelling in spellings:
        if spelling != first:
            variants.add(" ".join([spelling, *rest]))
    return variants

def given_names_ok(member_ns: str, elec_ns: str) -> bool:
    """
    Decide whether the given names of a member and an election candidate are compatible.

    Both names are split on whitespace; the last token is treated as the surname and
    all earlier tokens as given names. Tokens are canonicalised through ``FORMAL``
    (nickname -> formal name) before equality checks.

    Given-name rules (tokens before surname):
      - First given name: allow alias/initial coherence, including 'A. Donald' ↔ 'Donald'.
      - Extra given names on MEMBER: if CANDIDATE has a token at that position, require same initial or canonical equality; if missing, accept.
      - Extra given names on CANDIDATE beyond MEMBER are ignored.

    Args:
        member_ns: Normalised member name.
        elec_ns: Normalised candidate name.

    Returns:
        True when the given names are compatible under the rules above; False
        otherwise, and always False when either name has fewer than two tokens.

    NOTE(review): "same initial" is a first-letter comparison applied to tokens of any
    length, so two different full middle names starting with the same letter are
    accepted. The positional loop is not shifted when the candidate's first name was
    matched against the member's SECOND given name — confirm this is intended.
    """
    mtoks = str(member_ns).split()
    etoks = str(elec_ns).split()
    # need at least one given name plus a surname on both sides
    if len(mtoks) < 2 or len(etoks) < 2:
        return False

    # everything before the last token counts as given names
    mgiven = mtoks[:-1]
    egiven = etoks[:-1]

    # first given names, their canonical forms, the member's second given name (if any),
    # and the first letters used for initial comparisons
    m1 = mgiven[0]; e1 = egiven[0]
    m1c = FORMAL.get(m1, m1); e1c = FORMAL.get(e1, e1)
    m2 = mgiven[1] if len(mgiven) > 1 else ""
    m2c = FORMAL.get(m2, m2) if m2 else ""
    m1i, e1i = m1[:1], e1[:1]; m2i = m2[:1] if m2 else ""

    if len(m1) == 1:
        # member starts with an initial: candidate's first token must start with it ...
        if e1i == m1i:
            pass
        # ... or the candidate's full first name equals the member's second given name
        # ('A. Donald' ↔ 'Donald')
        elif m2 and len(e1) > 1 and e1c == m2c:
            pass
        else:
            return False
    else:
        if len(e1) == 1:
            # candidate starts with an initial: it must match the first letter of the
            # member's first or second given name
            if e1i == m1i or (m2 and e1i == m2i):
                pass
            else:
                return False
        else:
            # both sides have a full first name
            if e1c == m1c:
                pass
            # candidate goes by the member's second given name
            elif m2 and e1c == m2c:
                pass
            # member's second given name is an initial matching the candidate's first name
            elif m2 and len(m2) == 1 and e1i == m2i:
                pass
            else:
                return False

    # remaining member given names are compared position by position
    for pos in range(1, len(mgiven)):
        m_tok = mgiven[pos]
        if pos < len(egiven):
            e_tok = egiven[pos]
            # same canonical name
            if FORMAL.get(m_tok, m_tok) == FORMAL.get(e_tok, e_tok):
                continue
            # same first letter
            if m_tok[:1] == e_tok[:1]:
                continue
            return False
        else:
            # candidate has no token at this position: accepted
            continue

    return True

# ensure ns columns
# display_name_ns = normalised name with stopwords kept; derived here when the loaded
# tables do not already carry the column
if "display_name_ns" not in m.columns:
    m["display_name_ns"] = m["firstlast_raw"].map(lambda s: normalise_name(s, remove_stopwords=False))
if "display_name_ns" not in e_all.columns:
    e_all["display_name_ns"] = e_all["name"].map(lambda s: normalise_name(s, remove_stopwords=False))

# robust std->ns map: one ns spelling per distinct display_name_std
m_ns_map = (
    m.sort_values(["display_name_std"])
     .drop_duplicates(subset=["display_name_std"], keep="first")
     .set_index("display_name_std")["display_name_ns"]
     .to_dict()
)

# surname blocking: last token of the normalised name is the block key
m["member_last"] = m["display_name_std"].map(_last)
e_all["elec_last"] = e_all["display_name_std"].map(_last)
# block key -> row labels of e_all belonging to that block
idx_e = {ln: g.index.tolist() for ln, g in e_all.groupby("elec_last")}

# aggregate member metadata: one row per distinct display_name_std
agg_base = (
    m.groupby("display_name_std", as_index=False)
     .agg(
         member_name=("name", "first"),
         member_parties=("party", lambda s: ", ".join(sorted({str(x) for x in s.dropna()}))),
         member_last=("member_last", "first"),
     )
)
mm = m.copy()
mm["start_year"] = pd.to_numeric(mm["start_year"], errors="coerce")
mm["end_year"]   = pd.to_numeric(mm["end_year"],   errors="coerce")
# "start-end" periods per member, de-duplicated and joined with "; ".
# The value columns are selected before apply() so the lambda no longer operates on
# the grouping column; that usage triggers a pandas DeprecationWarning.
periods = (
    mm.dropna(subset=["start_year", "end_year"])
      .groupby("display_name_std")[["start_year", "end_year"]]
      .apply(lambda g: "; ".join(sorted({f"{int(a)}-{int(b)}" for a,b in zip(g["start_year"], g["end_year"])})))
      .reset_index(name="member_periods")
)
# members without any valid period get an empty string
agg = agg_base.merge(periods, on="display_name_std", how="left").fillna({"member_periods": ""})

# matching
# For every aggregated member: collect election rows in the same surname block, apply the
# surname and given-name gates, score the survivors, and keep up to TOPK distinct candidates.
rows = []
for _, r in agg.iterrows():
    mem_std_from_file = r["display_name_std"]
    # stopwords-kept spelling; falls back to the std string when the map has no (or an empty) entry
    mem_ns = m_ns_map.get(str(mem_std_from_file), "") or str(mem_std_from_file)
    # std form recomputed from the ns spelling; this is the value written as member_std
    mem_std = normalise_name(mem_ns, remove_stopwords=True)

    mem_surn = surname_set(mem_std)
    # election rows whose last name token equals the member's
    cand_idx = idx_e.get(r["member_last"], [])
    picked = []

    if cand_idx:
        scored = []
        variants_ns = alias_variants_ns(mem_ns)
        for i in cand_idx:
            cs_std = e_all.at[i, "display_name_std"]
            cs_ns  = e_all.at[i, "display_name_ns"]

            # gate 1: surname token sets must overlap
            if REQUIRE_SURNAME and mem_surn.isdisjoint(surname_set(cs_std)):
                continue
            # gate 2: given names must be compatible
            if not given_names_ok(mem_ns, cs_ns):
                continue

            # score = best of (full name, first+last only) over all alias variants
            cs_fl = first_last_only(cs_ns)
            best_scores = []
            for v in variants_ns:
                v_fl    = first_last_only(v)
                sc_full = fuzz.token_sort_ratio(v,   cs_ns)
                sc_fl   = fuzz.token_sort_ratio(v_fl, cs_fl)
                best_scores.append(max(sc_full, sc_fl))
            sc = max(best_scores)

            if sc >= MIN_SCORE:
                scored.append((sc, i))

        # highest scores first; only a bounded prefix is inspected for de-duplication
        top = sorted(scored, key=lambda x: x[0], reverse=True)[:max(200, TOPK*10)]
        seen = set()
        for sc, i in top:
            cand = e_all.loc[i]
            # one entry per (normalised name, year, raw name)
            key = (cand.get("display_name_ns"), cand.get("year"), cand.get("name"))
            if key in seen:
                continue
            seen.add(key)
            picked.append((sc, cand))
            if len(picked) >= TOPK:
                break

    # one wide record per member; members without candidates keep only the base fields
    rec = {
        "member_std": mem_std,
        "member_name": r["member_name"],
        "member_parties": r.get("member_parties", ""),
        "member_periods": r.get("member_periods", "")
    }
    for j, (sc, c) in enumerate(picked, start=1):
        # note: elec_std_{j} stores the stopwords-kept spelling (display_name_ns)
        rec[f"elec_std_{j}"]   = c.get("display_name_ns")
        rec[f"elec_name_{j}"]  = c.get("name")
        rec[f"elec_year_{j}"]  = int(c.get("year")) if pd.notna(c.get("year")) else None
        rec[f"elec_party_{j}"] = c.get("party")
        rec[f"elec_score_{j}"] = int(sc)
    rows.append(rec)

summary = pd.DataFrame(rows)
out_summary = OUTPUT_DIR / f"per_member_top{TOPK}.csv"
# saved before the diagnostic columns below are added, so the CSV does not contain them
summary.to_csv(out_summary, index=False)
print("Saved:", out_summary, "| Rows:", len(summary))

# quick report
score_cols = [c for c in summary.columns if c.startswith("elec_score_")]
summary["num_candidates"] = summary[score_cols].notna().sum(axis=1) if score_cols else 0
summary["top_score"] = summary[score_cols].max(axis=1).fillna(0).astype(int)

matches_only = summary[summary["num_candidates"] > 0].sort_values(["top_score","member_std"], ascending=[False, True])
unmatched_only = summary[summary["num_candidates"] == 0].sort_values("member_std")

print("With candidates:", len(matches_only), "| Without candidates:", len(unmatched_only))
# NOTE(review): elec_name_1 / elec_score_1 exist only when at least one member has a candidate
display(matches_only.head(10)[["member_std","top_score","num_candidates","elec_name_1","elec_score_1"]])






  .apply(lambda g: "; ".join(sorted({f"{int(a)}-{int(b)}" for a,b in zip(g["start_year"], g["end_year"])})))


Saved: E:\Daniel\Stockholm\submission\notebooks\submission\python\output\per_member_top35.csv | Rows: 2586
With candidates: 1679 | Without candidates: 907


Unnamed: 0,member_std,top_score,num_candidates,elec_name_1,elec_score_1
2,aaron bean,100,2,Aaron Bean,100.0
3,aaron schock,100,4,Aaron Schock,100.0
4,abby finkenauer,100,2,Abby Finkenauer,100.0
5,abigail davis spanberger,100,3,Abigail Spanberger,100.0
8,abraham j hamadeh,100,1,Abraham Hamadeh,100.0
10,adam b schiff,100,13,Adam Schiff,100.0
12,adam gray,100,2,Adam Gray,100.0
13,adam h putnam,100,5,Adam Putnam,100.0
14,adam kinzinger,100,6,Adam Kinzinger,100.0
15,adam smith,100,15,Adam Smith,100.0


# Final outputs & audit artefacts

**Goal.** Convert the per-member wide results into a tidy, analysis-ready table, also produce unmatched members and a *borderline* review set for manual inspection.

---

### What this cell writes
- **`final_matches.csv`** — long (tidy) table with **all kept links** per member.
- **`unmatched_members.csv`** — members with **zero** candidates after strict matching.
- **`borderline_review.csv`** — **near-miss** links that passed all gates but scored in **[80, 89]** for human review.

---

### Steps

1) **Load the per-member summary**  
   Reads `per_member_top{TOPK}.csv` (wide format: `elec_name_1 … elec_name_K`, etc.).

2) **Detect available slots**  
   Infers which `elec_*_{j}` columns exist to be stacked.

3) **Stack wide → long**  
   - For each slot `j`, keep non-null rows and rename to:
     - `name_elec`, `election_year`, `party_elec`, `score`.
   - Add `slot = j` to preserve the **rank** within each member (1 = best).

4) **Normalise & classify match type**  
   - Build `elec_std = normalise_name(name_elec, remove_stopwords=True)`.
   - `match_type = "exact"` if `elec_std == member_std`, else `"fuzzy"`.
   - Cast `election_year` and `score` to integer dtypes.

5) **Order rows**  
   Sort by `member_std`, then `election_year`, then `slot`.

6) **Write matches** → `final_matches.csv`  
   Columns:
   - `member_std`, `member_name`, `member_parties`, `member_periods`
   - `election_year`, `name_elec`, `party_elec`, `match_type`, `score`, `slot`

7) **Write unmatched** → `unmatched_members.csv`  
   Members with no `elec_name_*` populated in the summary.

8) **Build borderline set** → `borderline_review.csv`  
   - Reuses **surname blocking** and **given-names rule**.  
   - Scores each candidate with the same similarity logic as strict matching.  
   - Keeps those with `score ∈ [80, 89]`.  
   - May include rows for members who already have accepted links — this is intentional for **audit**.

9) **Print a brief report**  
   - Total members in summary.
   - Unique members matched.
   - Total links written to `final_matches.csv`.
   - Count of borderline rows.



In [75]:
# final outputs
# Steps 1-7: reshape the wide per-member summary into one row per (member, candidate)
# link, classify each link, and write final_matches.csv and unmatched_members.csv.

# 1) Load summary (wide)
# reuse the in-memory table when the matching cell ran in this session; otherwise read it back
if "summary" not in locals():
    _sum_path = OUTPUT_DIR / f"per_member_top{TOPK}.csv"  # adjust if you saved with a different suffix
    summary = pd.read_csv(_sum_path, low_memory=False)

# 2) Detect slots
# slot numbers j for which an elec_name_{j} column exists
slots = sorted({int(c.split("_")[-1]) for c in summary.columns if c.startswith("elec_name_")})

# 3) Stack results
base_cols = ["member_std","member_name","member_parties","member_periods"]
rows = []
for j in slots:
    nm, yr, pt, sc = f"elec_name_{j}", f"elec_year_{j}", f"elec_party_{j}", f"elec_score_{j}"
    keep = [col for col in [nm, yr, pt, sc] if col in summary.columns]
    if nm not in keep:
        continue
    # members that actually have a candidate in this slot
    dj = summary[base_cols + keep].copy()
    dj = dj[dj[nm].notna()]
    if dj.empty:
        continue
    dj = dj.rename(columns={
        nm: "name_elec",
        yr: "election_year",
        pt: "party_elec",
        sc: "score",
    })
    # slot = rank of the candidate within the member (1 = best)
    dj["slot"] = j
    rows.append(dj)

final_long = pd.concat(rows, ignore_index=True) if rows else pd.DataFrame(
    columns=base_cols+["name_elec","election_year","party_elec","score","slot"]
)

# 4) Normalise + type
# Run unconditionally: on an empty table these are no-ops that still create
# elec_std / match_type, which the column selection in step 6 requires.
final_long["elec_std"]   = final_long["name_elec"].map(lambda s: normalise_name(s, remove_stopwords=True))
final_long["match_type"] = (final_long["elec_std"] == final_long["member_std"]).map(lambda x: "exact" if x else "fuzzy")
final_long["election_year"] = pd.to_numeric(final_long["election_year"], errors="coerce").astype("Int64")
final_long["score"]         = pd.to_numeric(final_long["score"], errors="coerce").astype("Int64")

# 5) Sort nicely
final_long = final_long.sort_values(["member_std","election_year","slot"], na_position="last")

# 6) Save matches
final_cols = [
    "member_std","member_name","member_parties","member_periods",
    "election_year","name_elec","party_elec","match_type","score","slot"
]
final_path = OUTPUT_DIR / "final_matches.csv"
final_long[final_cols].to_csv(final_path, index=False)

# 7) Save unmatched
# members for which no elec_name_* column is populated
cand_cols = [c for c in summary.columns if c.startswith("elec_name_")]
summary["num_candidates"] = summary[cand_cols].notna().sum(axis=1) if cand_cols else 0
unmatched = summary[summary["num_candidates"].fillna(0).eq(0)][
    ["member_std","member_name","member_parties","member_periods"]
].copy()
unmatched_path = OUTPUT_DIR / "unmatched_members.csv"
unmatched.to_csv(unmatched_path, index=False)

# 8) Borderline build (80–89)
# Near-miss links for manual review: candidates that pass the surname and given-name
# gates but whose reported (integer) score lies in [BORDER_MIN, BORDER_MAX].
BORDER_MIN, BORDER_MAX = 80, 89

# Ensure inputs/helpers exist
if "m" not in locals():
    m = pd.read_csv(OUTPUT_DIR / "members_std.csv", low_memory=False)
if "e_all" not in locals():
    e_all = pd.read_csv(OUTPUT_DIR / "elections_std_all.csv", low_memory=False)

# Ensure NS columns
if "display_name_ns" not in m.columns:
    m["display_name_ns"] = m["firstlast_raw"].map(lambda s: normalise_name(s, remove_stopwords=False))
if "display_name_ns" not in e_all.columns:
    e_all["display_name_ns"] = e_all["name"].map(lambda s: normalise_name(s, remove_stopwords=False))

# Blocking indices (if missing)
if "idx_e" not in locals() or "agg" not in locals():
    # member surname for blocking
    m["member_last"] = m["display_name_std"].map(lambda s: (str(s).strip().split()[-1] if str(s).strip() else ""))
    e_all["elec_last"] = e_all["display_name_std"].map(lambda s: (str(s).strip().split()[-1] if str(s).strip() else ""))
    idx_e = {ln: g.index.tolist() for ln, g in e_all.groupby("elec_last")}
    # aggregate minimal member info
    agg = (
        m.groupby("display_name_std", as_index=False)
         .agg(
             member_name=("name","first"),
             member_parties=("party", lambda s: ", ".join(sorted({str(x) for x in s.dropna()}))),
             member_last=("member_last","first")
         )
    )
    mm = m.copy()
    mm["start_year"] = pd.to_numeric(mm["start_year"], errors="coerce")
    mm["end_year"]   = pd.to_numeric(mm["end_year"], errors="coerce")
    # value columns are selected before apply() so the lambda does not operate on the
    # grouping column (that usage triggers a pandas DeprecationWarning)
    periods = (
        mm.dropna(subset=["start_year","end_year"])
          .groupby("display_name_std")[["start_year","end_year"]]
          .apply(lambda g: "; ".join(sorted({f"{int(a)}-{int(b)}" for a,b in zip(g["start_year"], g["end_year"])})))
          .reset_index(name="member_periods")
    )
    agg = agg.merge(periods, on="display_name_std", how="left").fillna({"member_periods": ""})

# std -> ns spelling, built once (first row per std name in table order) instead of
# scanning the whole members table for every member inside the loop
ns_by_std = (
    m.drop_duplicates(subset=["display_name_std"], keep="first")
     .set_index("display_name_std")["display_name_ns"]
     .to_dict()
)

# Borderline search
border_cols = [
    "member_std","member_name","member_parties","member_periods",
    "name_elec","election_year","party_elec","score"
]
border_rows = []
for _, r in agg.iterrows():
    mem_std = r["display_name_std"]
    # if this member already has accepted links in final_long, we can still collect near-miss for audit
    mem_ns = ns_by_std[mem_std]
    mem_surn = surname_set(mem_std)

    cand_idx = idx_e.get(r["member_last"], [])
    if not cand_idx:
        continue

    variants_ns = alias_variants_ns(mem_ns)
    for i in cand_idx:
        cs_std = e_all.at[i, "display_name_std"]
        cs_ns  = e_all.at[i, "display_name_ns"]

        # surname + given-names gates
        if mem_surn.isdisjoint(surname_set(cs_std)):
            continue
        if not given_names_ok(mem_ns, cs_ns):
            continue

        # similarity (best of full vs first+last across aliases), same helper as strict matching
        cs_fl = first_last_only(cs_ns)
        sc = max(
            (
                max(
                    fuzz.token_sort_ratio(v, cs_ns),
                    fuzz.token_sort_ratio(first_last_only(v), cs_fl),
                )
                for v in variants_ns
            ),
            default=0,
        )

        # Band membership is decided on the truncated integer that is written out.
        # Comparing the raw float would drop scores strictly between 89 and 90 from
        # both this file and the strict matches (which require a score of at least 90).
        score_int = int(sc)
        if BORDER_MIN <= score_int <= BORDER_MAX:
            border_rows.append({
                "member_std": mem_std,
                "member_name": r["member_name"],
                "member_parties": r.get("member_parties",""),
                "member_periods": r.get("member_periods",""),
                "name_elec": e_all.at[i, "name"],
                "election_year": e_all.at[i, "year"],
                "party_elec": e_all.at[i, "party"],
                "score": score_int
            })

# explicit columns keep the sort (and the CSV header) valid when nothing is borderline
borderline = pd.DataFrame(border_rows, columns=border_cols).sort_values(
    ["member_std","election_year","score"], ascending=[True, True, False]
)

border_path = OUTPUT_DIR / "borderline_review.csv"
borderline.to_csv(border_path, index=False)

# 9) Report summary
# headline counts for the three files written above
n_members_total   = summary.shape[0]
n_members_matched = final_long["member_std"].nunique() if not final_long.empty else 0
n_links_total     = len(final_long)
n_borderline      = len(borderline)

print("Saved:", final_path)
print("Saved:", unmatched_path)
print("Saved:", border_path)
print(f"Members total:   {n_members_total}")
print(f"Members matched: {n_members_matched}")
print(f"Links total:     {n_links_total}")
# label derived from the configured band so it cannot drift from BORDER_MIN/BORDER_MAX
print(f"Borderline ({BORDER_MIN}–{BORDER_MAX}): {n_borderline}")

# 10) Quick check example
# spot check of one member whose name starts with an initial
chk = final_long[final_long["member_std"].eq("a donald mceachin")]
print("Rows for 'a donald mceachin':", len(chk))
display(chk.head(10))




Saved: E:\Daniel\Stockholm\submission\notebooks\submission\python\output\final_matches.csv
Saved: E:\Daniel\Stockholm\submission\notebooks\submission\python\output\unmatched_members.csv
Saved: E:\Daniel\Stockholm\submission\notebooks\submission\python\output\borderline_review.csv
Members total:   2586
Members matched: 1679
Links total:     8164
Borderline (85–89): 91
Rows for 'a donald mceachin': 4


Unnamed: 0,member_std,member_name,member_parties,member_periods,name_elec,election_year,party_elec,score,slot,elec_std,match_type
0,a donald mceachin,"McEachin, A. Donald",Democratic,2017-2022,Donald McEachin,2016,Democratic,93,1,donald mceachin,fuzzy
1679,a donald mceachin,"McEachin, A. Donald",Democratic,2017-2022,Donald McEachin,2018,Democratic,93,2,donald mceachin,fuzzy
3127,a donald mceachin,"McEachin, A. Donald",Democratic,2017-2022,Donald McEachin,2020,Democratic,93,3,donald mceachin,fuzzy
4299,a donald mceachin,"McEachin, A. Donald",Democratic,2017-2022,Donald McEachin,2022,Democratic,93,4,donald mceachin,fuzzy
