In [1]:
import pandas
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from tqdm import tqdm

%matplotlib inline
%config InlineBackend.figure_format = "retina"

# Global plot theme in one call: "ticks" style, fonts scaled 2x, 25x15 default figure.
sns.set(style="ticks", font_scale=2, rc={"figure.figsize": (25, 15)})

In [2]:
# Crosstab breakdown columns, grouped by the question they belong to.
CROSSTAB_COL_GROUPS = [
    ["CD1", "CD2", "CD3", "CD4"],
    ["VOTED_2020_TRUMP", "VOTED_2020_BIDEN", "VOTED_2020_DIDNT"],
    ["CHILD_LT_18"],
    ["EDU_NO_COLL", "EDU_COLL"],
]
# Flat list of every breakdown column, in group order.
CROSSTAB_COLS = [column for group in CROSSTAB_COL_GROUPS for column in group]
# Columns kept alongside every breakdown group.
CONST_COLS = ["CANDIDATE", "LV"]

In [3]:
# source: https://sos.iowa.gov/elections/pdf/VRStatsArchive/2024/CongOct24.pdf
# Counts per party label (rows) and CD1-CD4 (columns). Every entry is kept as the
# sum of its component figures so it can be checked against the source document.
CD_REG_DF = pandas.DataFrame.from_dict(
    {
        "DEM": [129541 + 48380, 127321 + 42982, 134188 + 44986, 85143 + 38710],
        "REP": [149869 + 34385, 149972 + 30288, 151254 + 33128, 191956 + 45281],
        "OTHER": [2394 + 584 + 1670 + 549, 2434 + 534 + 1509 + 528,
                  2831 + 535 + 1674 + 514, 2388 + 493 + 1754 + 572],
    },
    orient="index",
    columns=["CD1", "CD2", "CD3", "CD4"],
)

# Share of the grand total held by each party label.
CD_REG_DF.sum(axis=1) / CD_REG_DF.values.sum()

DEM      0.446568
REP      0.539058
OTHER    0.014374
dtype: float64

In [4]:
def f(weights, counts=None):
    """Kish's approximate design effect due to unequal weighting.

    For one weight per respondent the design effect is
    ``n * sum(w**2) / sum(w)**2``, which equals ``1 + CV(w)**2`` with the
    population (ddof=0) standard deviation. That is what is computed when
    ``counts`` is omitted, so the formula is only a true design effect if
    ``weights`` holds one entry per respondent.

    When ``weights`` holds one entry per *cell* (e.g. a crosstab of cell
    weights), pass the number of respondents in each cell as ``counts`` so
    that every cell contributes in proportion to its size.

    Parameters
    ----------
    weights : array-like
        Weights, of any shape.
    counts : array-like, optional
        Number of respondents sharing each weight; must have the same shape
        as ``weights``. ``None`` (default) treats every weight as a single
        respondent.

    Returns
    -------
    float
        The design effect; 1.0 when all weights are equal.

    Raises
    ------
    ValueError
        If ``counts`` is given and its shape differs from ``weights``.
    """
    w = np.asarray(weights, dtype=float)
    if counts is None:
        return 1 + ((np.std(w) / np.mean(w)) ** 2)

    n = np.asarray(counts, dtype=float)
    if n.shape != w.shape:
        raise ValueError(
            f"counts shape {n.shape} does not match weights shape {w.shape}")
    # Grouped form of n * sum(w^2) / sum(w)^2, with each cell repeated n_c times.
    return n.sum() * (n * w ** 2).sum() / (n * w).sum() ** 2

---

## Load Crosstab

In [5]:
# Load the raw crosstab CSV; the path is relative to the notebook's working directory.
crosstab_df = pandas.read_csv("data/iowa_2024/iowa_2024_crosstab.csv")

In [6]:
# Keep only the columns used below, treat blanks as 0 and index rows by candidate.
# NOTE: this rebinds crosstab_df, so the cell only works once per fresh load.
crosstab_df = (
    crosstab_df.loc[:, CONST_COLS + CROSSTAB_COLS]
    .fillna(0)
    .set_index("CANDIDATE")
)

In [7]:
# LV value on the "WGT" row of the crosstab.
total_wgt = crosstab_df.loc["WGT", "LV"]
total_wgt

849

In [8]:
# LV left over once the HARRIS and TRUMP rows are taken out of the WGT total.
total_wgt - crosstab_df.loc[["HARRIS", "TRUMP"], "LV"].sum()

78

In [9]:
# Collapse every row that is not HARRIS, TRUMP or a totals row into one "OTHER" row.
excluded_rows = ["HARRIS", "TRUMP", "Total Unweighted Respondents", "WGT"]
other_df = crosstab_df[~crosstab_df.index.isin(excluded_rows)].copy()
other = other_df.sum(axis=0)
other.name = "OTHER"
# The summed LV of these rows falls short of WGT - HARRIS - TRUMP (presumably
# rounding in the published table). Use that residual directly instead of a
# hard-coded correction so the three rows always add up to the WGT total.
other["LV"] = crosstab_df.loc["WGT", "LV"] - crosstab_df.loc[["HARRIS", "TRUMP"], "LV"].sum()
other

LV                  78.0
CD1                 20.0
CD2                 21.0
CD3                 18.0
CD4                 19.0
VOTED_2020_TRUMP    25.0
VOTED_2020_BIDEN    10.0
VOTED_2020_DIDNT    10.0
CHILD_LT_18         21.0
EDU_NO_COLL         51.0
EDU_COLL            24.0
Name: OTHER, dtype: float64

In [10]:
# Reduce the crosstab to three rows (HARRIS, TRUMP, OTHER) and move the candidate
# label from the index back into a "CANDIDATE" column.
# NOTE: rebinds crosstab_df with a fresh integer index.
major_rows = crosstab_df[crosstab_df.index.isin(["HARRIS", "TRUMP"])]
other_row = pandas.DataFrame([other])
crosstab_df = pandas.concat([major_rows, other_row]).reset_index(names=["CANDIDATE"])

In [11]:
# Rebuild the crosstab one column group at a time. Every group except the CD
# one gets a "<PREFIX>_UNK" column holding LV minus the sum of the group's
# columns, where <PREFIX> is the first "_"-separated token of the group's
# first column name.
temp = None

for group_cols in CROSSTAB_COL_GROUPS:
    group_df = crosstab_df[CONST_COLS + group_cols].copy()
    needs_remainder = group_cols[0] != "CD1"
    if needs_remainder:
        prefix = group_cols[0].split("_")[0]
        group_df[prefix + "_" + "UNK"] = group_df["LV"] - group_df[group_cols].sum(axis=1)
    # First group seeds the result; later groups are merged on CANDIDATE and LV.
    temp = group_df.copy() if temp is None else temp.merge(group_df, on=CONST_COLS, how="left")

crosstab_df = temp.copy()

In [12]:
# Display the rebuilt crosstab, including the derived *_UNK columns.
crosstab_df

Unnamed: 0,CANDIDATE,LV,CD1,CD2,CD3,CD4,VOTED_2020_TRUMP,VOTED_2020_BIDEN,VOTED_2020_DIDNT,VOTED_UNK,CHILD_LT_18,CHILD_UNK,EDU_NO_COLL,EDU_COLL,EDU_UNK
0,HARRIS,399.0,113.0,99.0,100.0,87.0,12.0,322.0,48.0,17.0,98.0,301.0,205.0,191.0,3.0
1,TRUMP,372.0,78.0,97.0,91.0,107.0,299.0,13.0,45.0,15.0,114.0,258.0,270.0,97.0,5.0
2,OTHER,78.0,20.0,21.0,18.0,19.0,25.0,10.0,10.0,33.0,21.0,57.0,51.0,24.0,3.0


In [13]:
# Each candidate row's LV as a fraction of the LV column total.
lv_by_candidate = crosstab_df.set_index("CANDIDATE")["LV"]
lv_by_candidate / crosstab_df["LV"].sum()

CANDIDATE
HARRIS    0.469965
TRUMP     0.438163
OTHER     0.091873
Name: LV, dtype: float64

In [14]:
# HARRIS minus TRUMP share of LV, computed from the table rather than retyped
# from a previous cell's output; rounded to six decimals to match the shares shown.
lv_share = crosstab_df.set_index("CANDIDATE")["LV"] / crosstab_df["LV"].sum()
round(float(lv_share["HARRIS"] - lv_share["TRUMP"]), 6)

0.031802

---

## Basic Cell-by-Cell Weighting

### Data Prep

In [234]:
# Toy 4x3 sample table (rows A1-A4, columns B1-B3) used to exercise the
# weighting code. To run on the poll crosstab instead, swap in:
#   cols = ["CD1", "CD2", "CD3", "CD4"]
#   ct_df = crosstab_df.set_index("CANDIDATE")[cols].copy()

cols = ["B1", "B2", "B3"]
index = ["A1", "A2", "A3", "A4"]

sample_counts = [
    [20, 40, 40],
    [50, 140, 310],
    [100, 50, 50],
    [30, 100, 70],
]
ct_df = pandas.DataFrame(data=sample_counts, index=index, columns=cols)

ct_df

Unnamed: 0,B1,B2,B3
A1,20,40,40
A2,50,140,310
A3,100,50,50
A4,30,100,70


In [235]:
# Toy population table with the same row/column labels as ct_df.
# To use the registration counts instead, swap in:
#   pop_df = CD_REG_DF.copy()
#   pop_df.index = ["HARRIS", "TRUMP", "OTHER"]

population_counts = [
    [80, 40, 55],
    [60, 150, 340],
    [170, 60, 200],
    [55, 165, 125],
]
pop_df = pandas.DataFrame(data=population_counts, index=index, columns=cols)

pop_df

Unnamed: 0,B1,B2,B3
A1,80,40,55
A2,60,150,340
A3,170,60,200
A4,55,165,125


### Cell Weighting

In [236]:
# Put sample and population counts side by side; population columns that clash
# with a sample column name get the "_reg" suffix.
temp = ct_df.join(pop_df, rsuffix="_reg")

In [237]:
# Cell weight = population count / sample count, one "<col>_weight" column per column.
temp = temp.assign(
    **{name + "_weight": temp[name + "_reg"] / temp[name] for name in cols}
)

In [238]:
# Score the spread of the cell weights with f.
# NOTE(review): this passes one weight per cell; a respondent-level design effect
# would weight each cell by its sample count - confirm which is intended.
weight_columns = temp.columns[temp.columns.str.endswith("_weight")]
f(temp[weight_columns].values)

1.3066977054439586

In [239]:
# Display sample counts, population (_reg) counts and per-cell weights together.
temp

Unnamed: 0,B1,B2,B3,B1_reg,B2_reg,B3_reg,B1_weight,B2_weight,B3_weight
A1,20,40,40,80,40,55,4.0,1.0,1.375
A2,50,140,310,60,150,340,1.2,1.071429,1.096774
A3,100,50,50,170,60,200,1.7,1.2,4.0
A4,30,100,70,55,165,125,1.833333,1.65,1.785714


---

## Basic Raking

In [240]:
# Population margins that raking must reproduce: per-column and per-row totals.
expected_col = pop_df.sum(axis="index")
expected_row = pop_df.loc[:, cols].sum(axis="columns")

In [241]:
# Display the target row totals.
expected_row

A1    175
A2    550
A3    430
A4    345
dtype: int64

In [242]:
# Display the target column totals.
expected_col

B1    365
B2    415
B3    720
dtype: int64

In [243]:
# Work on a copy of the sample counts so ct_df itself stays unchanged.
temp = ct_df.copy()

In [244]:
# Scale the sample counts so their grand total equals the population grand total.
# Built from ct_df rather than from temp itself, so re-running this cell gives
# the same table instead of compounding the scale factor.
temp = ct_df * (expected_row.sum() / ct_df.sum(axis=0).sum())
temp

Unnamed: 0,B1,B2,B3
A1,30.0,60.0,60.0
A2,75.0,210.0,465.0
A3,150.0,75.0,75.0
A4,45.0,150.0,105.0


In [245]:
# Raking (iterative proportional fitting): alternately rescale the rows and the
# columns of temp until both sets of margins match the margins of pop_df.
row_targets = pop_df.sum(axis=1)
col_targets = pop_df.sum(axis=0)
# Safety cap so a table whose margins can never match fails loudly instead of
# looping forever.
max_iterations = 1000

with tqdm() as pbar:
    for _ in range(max_iterations):
        temp = temp.mul(row_targets / temp.sum(axis=1), axis=0)
        temp = temp.mul(col_targets / temp.sum(axis=0), axis=1)
        # Converged once both margins round to the targets. Values are compared
        # with np.array_equal so the check does not depend on the platform's
        # integer dtype (Series.equals would also require identical dtypes).
        rows_match = np.array_equal(temp.sum(axis=1).round(0), row_targets)
        cols_match = np.array_equal(temp.sum(axis=0).round(0), col_targets)
        if rows_match and cols_match:
            break
        pbar.update()
    else:
        raise RuntimeError(
            f"raking did not converge within {max_iterations} iterations")

2it [00:00, 238.63it/s]


In [246]:
# Turn raked counts into weights by dividing each cell by its sample count.
# NOTE: rebinds temp from counts to weights, so run once per raking pass.
temp = temp / ct_df
temp

Unnamed: 0,B1,B2,B3
A1,1.811188,1.45283,2.016621
A2,1.084102,0.869604,1.207066
A3,2.195626,1.761203,2.444663
A4,1.833618,1.470821,2.041595


In [247]:
# Score the spread of the raked weights with f, for comparison with the cell-weighting value.
# NOTE(review): one weight per cell is passed here, not one per respondent - confirm intent.
f(temp.values)

1.0728032347521452