In [1]:
import pandas as pd
import numpy as np
import string

In [3]:
def levenshtein_distance(s1, s2):
    """
    Returns the Levenshtein edit distance between two strings s1 and s2.
    The distance is the minimum number of single-character edits
    (insertions, deletions, substitutions) required to transform s1 into s2.
    """
    if not s1:
        return len(s2)
    if not s2:
        return len(s1)

    rows = len(s1) + 1
    cols = len(s2) + 1
    dp = [[0]*cols for _ in range(rows)]

    for i in range(rows):
        dp[i][0] = i
    for j in range(cols):
        dp[0][j] = j

    for i in range(1, rows):
        for j in range(1, cols):
            cost = 0 if s1[i-1] == s2[j-1] else 1
            dp[i][j] = min(
                dp[i-1][j] + 1,
                dp[i][j-1] + 1,
                dp[i-1][j-1] + cost
            )
    return dp[-1][-1]


def digit_level_accuracy(true_val, pred_val):
    """
    Compares two numeric values (or strings) digit by digit,
    counting how many match exactly. Returns (matched_digits, total_digits_in_truth).
    """
    t_str = str(true_val)
    p_str = str(pred_val)

    matched = 0
    total = len(t_str)


    min_len = min(len(t_str), len(p_str))
    for i in range(min_len):
        if t_str[i] == p_str[i]:
            matched += 1

    return matched, total


def evaluate_accuracy(df_true, df_pred, numeric_cols, text_cols):
    """
    Given two DataFrames (human-verified truth, AI-extracted) with the same shape/rows,
    computes:
      - Overall digit-level accuracy for numeric columns
      - Average Levenshtein distance for text columns
    Returns a dict with metrics.
    """

    total_matched_digits = 0
    total_groundtruth_digits = 0
    for col in numeric_cols:
        for i in range(len(df_true)):
            matched, total = digit_level_accuracy(df_true[col].iloc[i], df_pred[col].iloc[i])
            total_matched_digits += matched
            total_groundtruth_digits += total

    numeric_accuracy = 0.0
    if total_groundtruth_digits > 0:
        numeric_accuracy = total_matched_digits / total_groundtruth_digits

    total_lev_dist = 0
    count_entries = 0
    for col in text_cols:
        for i in range(len(df_true)):
            true_str = str(df_true[col].iloc[i])
            pred_str = str(df_pred[col].iloc[i])
            dist = levenshtein_distance(true_str, pred_str)
            total_lev_dist += dist
            count_entries += 1

    avg_lev_dist = 0.0
    if count_entries > 0:
        avg_lev_dist = total_lev_dist / count_entries

    return {
        "digit_level_accuracy": numeric_accuracy,
        "avg_levenshtein_dist": avg_lev_dist
    }

def compare_digits_with_confusion(true_val, pred_val, confusion_matrix):
    """
    Compares two numeric values (or strings) digit by digit.
    Updates a 10x10 'confusion_matrix' with counts of (true_digit, pred_digit).

    Returns:
      matched_digits: int
      total_true_digits: int
    """
    t_str = str(true_val)
    p_str = str(pred_val)

    matched = 0
    total = len(t_str)

    min_len = min(len(t_str), len(p_str))
    for i in range(min_len):
        gt_digit = t_str[i]
        pred_digit = p_str[i]

        if gt_digit.isdigit() and pred_digit.isdigit():
            gt_idx = int(gt_digit)
            pd_idx = int(pred_digit)
            confusion_matrix[gt_idx, pd_idx] += 1

            if gt_digit == pred_digit:
                matched += 1
        else:
            pass

    return matched, total

# Cell 3: Evaluate numeric accuracy with confusion details

def evaluate_numeric_with_confusion(df_true, df_pred, numeric_cols):
    import numpy as np

    master_confusion = np.zeros((10,10), dtype=int)

    # Track overall matched/total
    overall_matched = 0
    overall_total   = 0

    # Per-column results
    per_column_stats = {}

    for col in numeric_cols:
        col_conf = np.zeros((10,10), dtype=int)
        col_matched = 0
        col_total = 0

        for i in range(len(df_true)):
            true_val = df_true[col].iloc[i]
            pred_val = df_pred[col].iloc[i]

            # Compare digits, update *column* confusion
            m, t = compare_digits_with_confusion(true_val, pred_val, col_conf)
            # Also update the *master* confusion for all numeric columns
            compare_digits_with_confusion(true_val, pred_val, master_confusion)

            col_matched += m
            col_total   += t

        # per-column digit-level accuracy
        col_acc = col_matched / col_total if col_total > 0 else 0

        per_column_stats[col] = {
            "digit_level_accuracy": col_acc,
            "confusion_matrix": col_conf
        }

        overall_matched += col_matched
        overall_total   += col_total

    # Overall digit-level accuracy across all numeric columns
    overall_acc = overall_matched / overall_total if overall_total > 0 else 0

    # Return the top-level fields matching the usage code
    return {
        "digit_level_accuracy": overall_acc,       # formerly "overall_digit_accuracy"
        "confusion_matrix": master_confusion,      # formerly "master_confusion"
        "per_column": per_column_stats
    }


def analyze_digit_confusion(confusion_matrix):
    """
    Prints info about digit confusion: top confusions, etc.
    """
    print("Digit Confusion Matrix (rows=ground-truth, cols=prediction):")
    print(confusion_matrix)
    print()

    row_sums = confusion_matrix.sum(axis=1)
    for digit in range(10):
        total_count = row_sums[digit]
        if total_count == 0:
            continue
        row = confusion_matrix[digit]
        sorted_preds = sorted(
            range(10),
            key=lambda x: row[x],
            reverse=True
        )

        top_pred = sorted_preds[0]
        top_count = row[top_pred]

        if len(sorted_preds) > 1:
            second_pred = sorted_preds[1]
            second_count = row[second_pred]
        else:
            second_pred = None
            second_count = 0

        print(f"Ground Truth Digit: {digit}")
        print(f"  Most predicted as: {top_pred} ({top_count} times)")
        if digit != top_pred:
            print(f"  --> This indicates confusion of {digit} -> {top_pred}")
        # Show second
        if second_pred is not None:
            print(f"  Second predicted as: {second_pred} ({second_count} times)")
        print()


In [4]:
def compare_chars_with_confusion(true_str, pred_str, confusion_matrix, char_list):
    """
    Compare two strings character-by-character, updating confusion_matrix
    (size = len(char_list) x len(char_list)).

    - `char_list` is a list or string of valid characters. For example,
      you might use `string.ascii_lowercase` for 'a'..'z'.
    - We convert both strings to lowercase here (you can do more normalization if desired).

    We do not handle insertions/deletions specially. We only compare up to min_len.
    """
    t_str = true_str.lower()
    p_str = pred_str.lower()

    min_len = min(len(t_str), len(p_str))
    for i in range(min_len):
        gt_char = t_str[i]
        pd_char = p_str[i]

        if gt_char in char_list and pd_char in char_list:
            gt_idx = char_list.index(gt_char)
            pd_idx = char_list.index(pd_char)
            confusion_matrix[gt_idx, pd_idx] += 1

In [5]:
def evaluate_text_with_char_confusion(df_true, df_pred, text_cols, char_list):
    """
    For each text column, build a character-level confusion matrix.
    Also track exact string match rate per column.

    Returns a dictionary with:
      - "per_column": { col_name: { "char_confusion_matrix": ..., "exact_match_rate": ...}, ... }
      - "master_char_confusion": big confusion matrix aggregated across all text cols
    """
    import numpy as np

    size = len(char_list)
    master_confusion = np.zeros((size, size), dtype=int)

    per_col_results = {}

    for col in text_cols:
        col_confusion = np.zeros((size, size), dtype=int)

        exact_matches = 0
        total_rows = len(df_true)

        for i in range(total_rows):
            true_str = str(df_true[col].iloc[i])
            pred_str = str(df_pred[col].iloc[i])

            compare_chars_with_confusion(true_str, pred_str, col_confusion, char_list)
            compare_chars_with_confusion(true_str, pred_str, master_confusion, char_list)

            if true_str.strip().lower() == pred_str.strip().lower():
                exact_matches += 1

        exact_rate = exact_matches / total_rows if total_rows > 0 else 0

        per_col_results[col] = {
            "char_confusion_matrix": col_confusion,
            "exact_match_rate": exact_rate
        }

    return {
        "per_column": per_col_results,
        "master_char_confusion": master_confusion
    }


In [6]:
def analyze_char_confusion(confusion_matrix, char_list):
    """
    Prints or interprets a character confusion matrix (rows=GT char, cols=Pred char).
    Now includes the second most commonly predicted character for each row.
    """
    row_sums = confusion_matrix.sum(axis=1)

    print("Character Confusion Matrix (rows=GT, cols=Prediction). Shape:", confusion_matrix.shape)
    print()

    for i, row_total in enumerate(row_sums):
        if row_total == 0:
            continue

        gt_char = char_list[i]
        row = confusion_matrix[i]

        sorted_preds = sorted(range(len(char_list)), key=lambda x: row[x], reverse=True)

        top_pred_idx = sorted_preds[0]
        top_count = row[top_pred_idx]

        if len(sorted_preds) > 1:
            second_pred_idx = sorted_preds[1]
            second_count = row[second_pred_idx]
        else:
            second_pred_idx = None
            second_count = 0

        print(f"Ground Truth Character: '{gt_char}' (count in GT = {row_total})")

        top_pred_char = char_list[top_pred_idx]
        print(f"  Most commonly predicted as: '{top_pred_char}' (count={top_count})")

        if top_pred_char == gt_char:
            if second_pred_idx is not None:
                second_pred_char = char_list[second_pred_idx]
                print(f"  Second most predicted (possible confusion): '{second_pred_char}' (count={second_count})")
        else:
            print(f"    --> Confusion: '{gt_char}' -> '{top_pred_char}'")
            if second_pred_idx is not None:
                second_pred_char = char_list[second_pred_idx]
                print(f"  Second predicted: '{second_pred_char}' (count={second_count})")

        print()


In [7]:
def evaluate_numeric_errors(df_true, df_pred, numeric_cols, debug=False):
    """
    Computes Mean Absolute Error (MAE) and Mean Absolute Percentage Error (MAPE)
    for the specified numeric_cols, both overall and per-column.

    If debug=True, prints out row-by-row details about parsing and detection of NaN/inf.

    Returns a dict:
    {
      "overall_mae": float,
      "overall_mape": float,
      "per_column": {
          col_name: {
              "mae": float,
              "mape": float
          }, ...
      }
    }
    """
    import numpy as np

    overall_abs_errors = []
    overall_pct_errors = []

    per_column_stats = {}

    # Track how many rows were successfully parsed (for debugging)
    parse_success_count = {col: 0 for col in numeric_cols}
    parse_fail_count    = {col: 0 for col in numeric_cols}

    # Also track how many rows we skip due to NaN or inf
    nan_or_inf_count = {col: 0 for col in numeric_cols}

    for col in numeric_cols:
        col_abs_errors = []
        col_pct_errors = []

        for i in range(len(df_true)):
            true_val = df_true[col].iloc[i]
            pred_val = df_pred[col].iloc[i]

            # 1) Attempt to parse ground-truth as float
            try:
                true_num = float(str(true_val).replace(",", "").strip())
            except:
                if debug:
                    print(f"[DEBUG] Row {i}, col '{col}' -> Failed to parse true_val = {true_val!r}")
                parse_fail_count[col] += 1
                continue

            # 2) Attempt to parse predicted as float
            try:
                pred_num = float(str(pred_val).replace(",", "").strip())
            except:
                if debug:
                    print(f"[DEBUG] Row {i}, col '{col}' -> Failed to parse pred_val = {pred_val!r}")
                parse_fail_count[col] += 1
                continue

            # 3) Check if either is nan or inf
            if (np.isnan(true_num) or np.isnan(pred_num)
                or np.isinf(true_num) or np.isinf(pred_num)):
                if debug:
                    print(f"[DEBUG] Row {i}, col '{col}' -> true_num={true_num}, pred_num={pred_num}, skipping (NaN/inf).")
                nan_or_inf_count[col] += 1
                continue

            # If we reach here, we have valid finite floats
            parse_success_count[col] += 1

            # 4) Calculate absolute error
            abs_err = abs(pred_num - true_num)
            col_abs_errors.append(abs_err)
            overall_abs_errors.append(abs_err)

            # 5) Calculate % error if true_num != 0
            if true_num != 0:
                pct_err = abs_err / abs(true_num)
                col_pct_errors.append(pct_err)
                overall_pct_errors.append(pct_err)
            else:
                # If you want to treat "true == 0 but pred != 0" as 100% error,
                # you could do: col_pct_errors.append(1.0) etc.
                # For now, we skip it.
                pass

        # 6) Compute MAE and MAPE for this column
        if len(col_abs_errors) > 0:
            col_mae  = float(np.mean(col_abs_errors))
        else:
            col_mae  = float('nan')

        if len(col_pct_errors) > 0:
            col_mape = float(np.mean(col_pct_errors))
        else:
            col_mape = float('nan')

        per_column_stats[col] = {
            "mae":  col_mae,
            "mape": col_mape
        }

    # 7) Overall errors
    if len(overall_abs_errors) > 0:
        overall_mae = float(np.mean(overall_abs_errors))
    else:
        overall_mae = float('nan')

    if len(overall_pct_errors) > 0:
        overall_mape = float(np.mean(overall_pct_errors))
    else:
        overall_mape = float('nan')

    # 8) Debug summary
    if debug:
        print("\n[DEBUG] Parsing & Skipping Summary for Numeric Columns:")
        for col in numeric_cols:
            print(f"  {col}: parsed={parse_success_count[col]}, parse_fail={parse_fail_count[col]}, nan_or_inf_skips={nan_or_inf_count[col]}")
        print()

    return {
        "overall_mae":  overall_mae,
        "overall_mape": overall_mape,
        "per_column":   per_column_stats
    }


In [11]:
############################
# Combined Usage Example   #
# with Numeric & Text Conf #
############################

import string

# 1) Specify your column types
numeric_columns = ["YEAR", "CONGRESSIONAL_DISTRICT", "VOTES"]
text_columns    = ["STATE", "RACE_TYPE", "CANDIDATE_NAME", "CANDIDATE_PARTY"]

# For text confusion, define a character list
char_list = string.ascii_lowercase  # or expand to handle punctuation, uppercase, etc.

########################################
# 1920, page 9
########################################
df_human_1920_p9 = pd.read_csv(r"1920pg9/1920pg9_correct.csv")
df_ai_1920_p9    = pd.read_csv(r"1920pg9/1920p9_rawoutput.csv")

print("=== 1920 Page 9 ===")

# -- 1. Overall accuracy (digit-level + Levenshtein)
results_1920_p9 = evaluate_accuracy(df_human_1920_p9, df_ai_1920_p9, numeric_columns, text_columns)
print("Overall Digit-Level Accuracy:", results_1920_p9["digit_level_accuracy"])
print("Average Levenshtein Distance (text):", results_1920_p9["avg_levenshtein_dist"])
print("---------------------------------")

# -- 2. Numeric confusion
num_results_1920_p9 = evaluate_numeric_with_confusion(df_human_1920_p9, df_ai_1920_p9, numeric_columns)
print("Digit-Level Accuracy (via confusion approach):", num_results_1920_p9["digit_level_accuracy"])
analyze_digit_confusion(num_results_1920_p9["confusion_matrix"])
print("---------------------------------")

# -- 3. Text confusion
txt_results_1920_p9 = evaluate_text_with_char_confusion(df_human_1920_p9, df_ai_1920_p9, text_columns, char_list)
master_char_conf_1920_p9 = txt_results_1920_p9["master_char_confusion"]
print("Text Character Confusion:")
analyze_char_confusion(master_char_conf_1920_p9, char_list)
print("---------------------------------")

# Optionally, show per-column text exact match rates:
for col, info in txt_results_1920_p9["per_column"].items():
    print(f"Column: {col}")
    print(f"  Exact Match Rate: {info['exact_match_rate']:.2f}")
print("===============================================\n\n")


########################################
# 1940, page 1
########################################
df_human_1940_p1 = pd.read_csv(r"1940pg1/1940pg1_correct.csv")
df_ai_1940_p1    = pd.read_csv(r"1940pg1/1940pg1_rawoutput.csv")

print("=== 1940 Page 1 ===")

# -- 1. Overall accuracy
results_1940_p1 = evaluate_accuracy(df_human_1940_p1, df_ai_1940_p1, numeric_columns, text_columns)
print("Overall Digit-Level Accuracy:", results_1940_p1["digit_level_accuracy"])
print("Average Levenshtein Distance (text):", results_1940_p1["avg_levenshtein_dist"])
print("---------------------------------")

# -- 2. Numeric confusion
num_results_1940_p1 = evaluate_numeric_with_confusion(df_human_1940_p1, df_ai_1940_p1, numeric_columns)
print("Digit-Level Accuracy (via confusion approach):", num_results_1940_p1["digit_level_accuracy"])
analyze_digit_confusion(num_results_1940_p1["confusion_matrix"])
print("---------------------------------")

# -- 3. Text confusion
txt_results_1940_p1 = evaluate_text_with_char_confusion(df_human_1940_p1, df_ai_1940_p1, text_columns, char_list)
master_char_conf_1940_p1 = txt_results_1940_p1["master_char_confusion"]
print("Text Character Confusion:")
analyze_char_confusion(master_char_conf_1940_p1, char_list)
print("---------------------------------")

# Optionally, show per-column text exact match rates:
for col, info in txt_results_1940_p1["per_column"].items():
    print(f"Column: {col}")
    print(f"  Exact Match Rate: {info['exact_match_rate']:.2f}")
print("===============================================\n\n")


########################################
# 1980, page 3
########################################
df_human_1980_p3 = pd.read_csv(r"1980pg3/1980pg3_correct.csv")
df_ai_1980_p3    = pd.read_csv(r"1980pg3/1980pg3_rawoutput.csv")

print("=== 1980 Page 3 ===")

# -- 1. Overall accuracy
results_1980_p3 = evaluate_accuracy(df_human_1980_p3, df_ai_1980_p3, numeric_columns, text_columns)
print("Overall Digit-Level Accuracy:", results_1980_p3["digit_level_accuracy"])
print("Average Levenshtein Distance (text):", results_1980_p3["avg_levenshtein_dist"])
print("---------------------------------")

# -- 2. Numeric confusion
num_results_1980_p3 = evaluate_numeric_with_confusion(df_human_1980_p3, df_ai_1980_p3, numeric_columns)
print("Digit-Level Accuracy (via confusion approach):", num_results_1980_p3["digit_level_accuracy"])
analyze_digit_confusion(num_results_1980_p3["confusion_matrix"])
print("---------------------------------")

# -- 3. Text confusion
txt_results_1980_p3 = evaluate_text_with_char_confusion(df_human_1980_p3, df_ai_1980_p3, text_columns, char_list)
master_char_conf_1980_p3 = txt_results_1980_p3["master_char_confusion"]
print("Text Character Confusion:")
analyze_char_confusion(master_char_conf_1980_p3, char_list)
print("---------------------------------")

# Optionally, show per-column text exact match rates:
for col, info in txt_results_1980_p3["per_column"].items():
    print(f"Column: {col}")
    print(f"  Exact Match Rate: {info['exact_match_rate']:.2f}")
print("===============================================\n\n")


=== 1920 Page 9 ===
Overall Digit-Level Accuracy: 0.9850427350427351
Average Levenshtein Distance (text): 0.03353658536585366
---------------------------------
Digit-Level Accuracy (via confusion approach): 0.8675213675213675
Digit Confusion Matrix (rows=ground-truth, cols=prediction):
[[178   0   0   0   0   0   0   0   0   1]
 [  0 132   0   0   0   0   0   0   0   0]
 [  0   0 132   0   0   0   0   0   0   0]
 [  0   0   0  44   2   0   0   0   0   0]
 [  0   0   1   0  44   0   0   0   0   0]
 [  0   0   0   1   0  44   0   0   2   1]
 [  0   1   0   0   0   0  36   0   1   0]
 [  0   0   0   0   0   0   0  46   0   0]
 [  0   0   0   1   0   0   0   0  36   0]
 [  2   0   0   0   1   0   0   0   0 120]]

Ground Truth Digit: 0
  Most predicted as: 0 (178 times)
  Second predicted as: 9 (1 times)

Ground Truth Digit: 1
  Most predicted as: 1 (132 times)
  Second predicted as: 0 (0 times)

Ground Truth Digit: 2
  Most predicted as: 2 (132 times)
  Second predicted as: 0 (0 times)

Gr

In [12]:
# New Usage Example for the 1940 CSVs

import pandas as pd
import string

# 1) Specify columns
numeric_columns = ["YEAR", "CONGRESSIONAL_DISTRICT", "VOTES"]
text_columns    = ["STATE", "RACE_TYPE", "CANDIDATE_NAME", "CANDIDATE_PARTY"]

# 2) Load the new CSV files
df_human_1940new = pd.read_csv(r"1940full/1940human.csv")
df_ai_1940new    = pd.read_csv(r"1940full/1940raw.csv")

print("=== 1940 NEW DATA ===")

# -- A) Overall accuracy (digit-level + Levenshtein)
results_1940new = evaluate_accuracy(
    df_human_1940new,
    df_ai_1940new,
    numeric_columns,
    text_columns
)
print("Overall Digit-Level Accuracy:", results_1940new["digit_level_accuracy"])
print("Average Levenshtein Distance (text):", results_1940new["avg_levenshtein_dist"])
print("---------------------------------")

# -- B) Numeric confusion
num_results_1940new = evaluate_numeric_with_confusion(
    df_human_1940new,
    df_ai_1940new,
    numeric_columns
)
print("Digit-Level Accuracy (via confusion approach):", num_results_1940new["digit_level_accuracy"])
analyze_digit_confusion(num_results_1940new["confusion_matrix"])
print("---------------------------------")

# -- C) Text character confusion
char_list = string.ascii_lowercase  # or expand for punctuation, uppercase, etc.
txt_results_1940new = evaluate_text_with_char_confusion(
    df_human_1940new,
    df_ai_1940new,
    text_columns,
    char_list
)
master_char_conf_1940new = txt_results_1940new["master_char_confusion"]
print("Text Character Confusion:")
analyze_char_confusion(master_char_conf_1940new, char_list)
print("---------------------------------")

# -- D) Absolute and Percentage Errors
err_results = evaluate_numeric_errors(df_human_1940new, df_ai_1940new, numeric_columns, debug=True)

print("Overall MAE:", err_results["overall_mae"])
print("Overall MAPE:", err_results["overall_mape"])
for col, stats in err_results["per_column"].items():
    print(f"Column: {col}, MAE={stats['mae']}, MAPE={stats['mape']}")

# -- Optionally, show per-column text exact match rates:
for col, info in txt_results_1940new["per_column"].items():
    print(f"Column: {col}")
    print(f"  Exact Match Rate: {info['exact_match_rate']:.2f}")
print("===============================================\n")


=== 1940 NEW DATA ===
Overall Digit-Level Accuracy: 0.9814772467413675
Average Levenshtein Distance (text): 0.07880434782608696
---------------------------------
Digit-Level Accuracy (via confusion approach): 0.8705694031557283
Digit Confusion Matrix (rows=ground-truth, cols=prediction):
[[471   0   0   2   0   0   1   0   0   9]
 [  0 667   2   0   0   1   0   0   0   1]
 [  0   2 621   0   0   0   0   1   0   0]
 [  0   0   7 236   1   3   0   1   2   1]
 [  0   0   1   1 578   0   0   0   0   0]
 [  0   0   0   2   0 214   0   1   0   0]
 [  4   0   0   1   2   1 183   0   0   4]
 [  0   0   0   0   0   0   0 168   0   1]
 [  0   0   0   3   0   1   0   0 161   1]
 [  2   0   0   2   1   1   5   1   0 508]]

Ground Truth Digit: 0
  Most predicted as: 0 (471 times)
  Second predicted as: 9 (9 times)

Ground Truth Digit: 1
  Most predicted as: 1 (667 times)
  Second predicted as: 2 (2 times)

Ground Truth Digit: 2
  Most predicted as: 2 (621 times)
  Second predicted as: 1 (2 times)



In [13]:
# Combined Usage Example with Confusion Analysis

# 1) Specify column types
numeric_columns = ["YEAR", "CONGRESSIONAL_DISTRICT", "VOTES"]
text_columns = ["STATE", "RACE_TYPE", "CANDIDATE_NAME", "CANDIDATE_PARTY"]

########################################
# 1920, page 9
########################################
df_human_1920_p9 = pd.read_csv(r"1920pg9/1920pg9_correct.csv")
df_ai_1920_p9    = pd.read_csv(r"1920pg9/1920p9_rawoutput.csv")

# -- Overall accuracy (digit-level + Levenshtein) --
results_1920_p9 = evaluate_accuracy(df_human_1920_p9, df_ai_1920_p9, numeric_columns, text_columns)
print("=== 1920 Page 9 ===")
print("Digit-Level Accuracy (overall):", results_1920_p9["digit_level_accuracy"])
print("Avg Levenshtein Distance (text):", results_1920_p9["avg_levenshtein_dist"])
print("---------------------------------")

# -- Confusion matrix for numeric columns --
conf_results_1920_p9 = evaluate_numeric_with_confusion(df_human_1920_p9, df_ai_1920_p9, numeric_columns)
digit_acc_1920_p9 = conf_results_1920_p9["digit_level_accuracy"]
conf_mat_1920_p9  = conf_results_1920_p9["confusion_matrix"]

print("Digit-Level Accuracy (via confusion approach):", digit_acc_1920_p9)
print("Confusion Matrix (rows=ground-truth, cols=prediction):\n", conf_mat_1920_p9, "\n")
analyze_digit_confusion(conf_mat_1920_p9)
print("===============================================\n\n")


########################################
# 1940, page 1
########################################
df_human_1940_p1 = pd.read_csv(r"1940pg1/1940pg1_correct.csv")
df_ai_1940_p1    = pd.read_csv(r"1940pg1/1940pg1_rawoutput.csv")

# -- Overall accuracy --
results_1940_p1 = evaluate_accuracy(df_human_1940_p1, df_ai_1940_p1, numeric_columns, text_columns)
print("=== 1940 Page 1 ===")
print("Digit-Level Accuracy (overall):", results_1940_p1["digit_level_accuracy"])
print("Avg Levenshtein Distance (text):", results_1940_p1["avg_levenshtein_dist"])
print("---------------------------------")

# -- Confusion matrix for numeric columns --
conf_results_1940_p1 = evaluate_numeric_with_confusion(df_human_1940_p1, df_ai_1940_p1, numeric_columns)
digit_acc_1940_p1 = conf_results_1940_p1["digit_level_accuracy"]
conf_mat_1940_p1  = conf_results_1940_p1["confusion_matrix"]

print("Digit-Level Accuracy (via confusion approach):", digit_acc_1940_p1)
print("Confusion Matrix (rows=ground-truth, cols=prediction):\n", conf_mat_1940_p1, "\n")
analyze_digit_confusion(conf_mat_1940_p1)
print("===============================================\n\n")


########################################
# 1980, page 3
########################################
df_human_1980_p3 = pd.read_csv(r"1980pg3/1980pg3_correct.csv")
df_ai_1980_p3    = pd.read_csv(r"1980pg3/1980pg3_rawoutput.csv")

# -- Overall accuracy --
results_1980_p3 = evaluate_accuracy(df_human_1980_p3, df_ai_1980_p3, numeric_columns, text_columns)
print("=== 1980 Page 3 ===")
print("Digit-Level Accuracy (overall):", results_1980_p3["digit_level_accuracy"])
print("Avg Levenshtein Distance (text):", results_1980_p3["avg_levenshtein_dist"])
print("---------------------------------")

# -- Confusion matrix for numeric columns --
conf_results_1980_p3 = evaluate_numeric_with_confusion(df_human_1980_p3, df_ai_1980_p3, numeric_columns)
digit_acc_1980_p3 = conf_results_1980_p3["digit_level_accuracy"]
conf_mat_1980_p3  = conf_results_1980_p3["confusion_matrix"]

print("Digit-Level Accuracy (via confusion approach):", digit_acc_1980_p3)
print("Confusion Matrix (rows=ground-truth, cols=prediction):\n", conf_mat_1980_p3, "\n")
analyze_digit_confusion(conf_mat_1980_p3)
print("===============================================\n\n")


=== 1920 Page 9 ===
Digit-Level Accuracy (overall): 0.9850427350427351
Avg Levenshtein Distance (text): 0.03353658536585366
---------------------------------
Digit-Level Accuracy (via confusion approach): 0.8675213675213675
Confusion Matrix (rows=ground-truth, cols=prediction):
 [[178   0   0   0   0   0   0   0   0   1]
 [  0 132   0   0   0   0   0   0   0   0]
 [  0   0 132   0   0   0   0   0   0   0]
 [  0   0   0  44   2   0   0   0   0   0]
 [  0   0   1   0  44   0   0   0   0   0]
 [  0   0   0   1   0  44   0   0   2   1]
 [  0   1   0   0   0   0  36   0   1   0]
 [  0   0   0   0   0   0   0  46   0   0]
 [  0   0   0   1   0   0   0   0  36   0]
 [  2   0   0   0   1   0   0   0   0 120]] 

Digit Confusion Matrix (rows=ground-truth, cols=prediction):
[[178   0   0   0   0   0   0   0   0   1]
 [  0 132   0   0   0   0   0   0   0   0]
 [  0   0 132   0   0   0   0   0   0   0]
 [  0   0   0  44   2   0   0   0   0   0]
 [  0   0   1   0  44   0   0   0   0   0]
 [  0   0  

In [14]:
numeric_columns = ["YEAR", "CONGRESSIONAL_DISTRICT", "VOTES"]
text_columns = ["STATE", "RACE_TYPE", "CANDIDATE_NAME", "CANDIDATE_PARTY"]

# 1920, page 9
df_human_1920_p9 = pd.read_csv(r"1920pg9/1920pg9_correct.csv")
df_ai_1920_p9    = pd.read_csv(r"1920pg9/1920p9_rawoutput.csv")

results_1920_p1 = evaluate_accuracy(df_human_1920_p9, df_ai_1920_p9, numeric_columns, text_columns)
print("=== 1920 Page 9 ===")
print("Digit-Level Accuracy:", results_1920_p1["digit_level_accuracy"])
print("Avg Levenshtein Distance:", results_1920_p1["avg_levenshtein_dist"])
print("---------------------------------\n")

# 1940, page 1
df_human_1940_p1 = pd.read_csv(r"1940pg1/1940pg1_correct.csv")
df_ai_1940_p1    = pd.read_csv(r"1940pg1/1940pg1_rawoutput.csv")

results_1940_p1 = evaluate_accuracy(df_human_1940_p1, df_ai_1940_p1, numeric_columns, text_columns)
print("=== 1940 Page 1 ===")
print("Digit-Level Accuracy:", results_1940_p1["digit_level_accuracy"])
print("Avg Levenshtein Distance:", results_1940_p1["avg_levenshtein_dist"])
print("---------------------------------\n")

# 4) 1980, page 3
df_human_1980_p3 = pd.read_csv(r"1980pg3/1980pg3_correct.csv")
df_ai_1980_p3    = pd.read_csv(r"1980pg3/1980pg3_rawoutput.csv")

results_1940_p1 = evaluate_accuracy(df_human_1980_p3, df_ai_1980_p3, numeric_columns, text_columns)
print("=== 1940 Page 1 ===")
print("Digit-Level Accuracy:", results_1980_p3["digit_level_accuracy"])
print("Avg Levenshtein Distance:", results_1980_p3["avg_levenshtein_dist"])
print("---------------------------------\n")


=== 1920 Page 9 ===
Digit-Level Accuracy: 0.9850427350427351
Avg Levenshtein Distance: 0.03353658536585366
---------------------------------

=== 1940 Page 1 ===
Digit-Level Accuracy: 0.9908814589665653
Avg Levenshtein Distance: 0.7241379310344828
---------------------------------

=== 1940 Page 1 ===
Digit-Level Accuracy: 0.9876237623762376
Avg Levenshtein Distance: 0.0
---------------------------------

