In [93]:
import pandas
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from tqdm import tqdm

%matplotlib inline
%config InlineBackend.figure_format = "retina"

sns.set(rc={"figure.figsize" : (25, 15)})
sns.set(font_scale=2)
sns.set_style("ticks")

In [2]:
CROSSTAB_COLS = ["CD1", "CD2", "CD3", "CD4", "VOTED_2020_TRUMP", "VOTED_2020_BIDEN", "VOTED_2020_DIDNT",
                 "CHILD_LT_18", "EDU_NO_COLL", "EDU_COLL"]
CROSSTAB_COL_GROUPS = [["CD1", "CD2", "CD3", "CD4"], ["VOTED_2020_TRUMP", "VOTED_2020_BIDEN", "VOTED_2020_DIDNT"],
                       ["CHILD_LT_18"], ["EDU_NO_COLL", "EDU_COLL"]]
CONST_COLS = ["CANDIDATE", "LV"]

In [105]:
# source: https://sos.iowa.gov/elections/pdf/VRStatsArchive/2024/CongOct24.pdf
CD_REG_DF = pandas.DataFrame(
    [[129541 + 48380, 127321 + 42982, 134188 + 44986, 85143 + 38710],
     [149869 + 34385, 149972 + 30288, 151254 + 33128, 191956 + 45281],
     [2394 + 584 + 1670 + 549, 2434 + 534 + 1509 + 528,
      2831 + 535 + 1674 + 514, 2388 + 493 + 1754 + 572]],
    index=["DEM", "REP", "OTHER"], columns=["CD1", "CD2", "CD3", "CD4"])

CD_REG_DF.sum(axis=1) / CD_REG_DF.sum(axis=1).sum()

DEM      0.446568
REP      0.539058
OTHER    0.014374
dtype: float64

In [4]:
def f(weights):
    # TODO: I don't think this is right
    return 1 + ((np.std(weights) / np.mean(weights)) ** 2)

---

## Load Crosstab

In [5]:
crosstab_df = pandas.read_csv("data/iowa_2024/iowa_2024_crosstab.csv")

In [6]:
crosstab_df = crosstab_df[CONST_COLS + CROSSTAB_COLS].copy().fillna(0).set_index("CANDIDATE")

In [7]:
total_wgt = crosstab_df["LV"].loc["WGT"]
total_wgt

849

In [8]:
total_wgt - (crosstab_df["LV"].loc["HARRIS"] + crosstab_df["LV"].loc["TRUMP"])

78

In [9]:
other_df = crosstab_df[~crosstab_df.index.isin(["HARRIS", "TRUMP", "Total Unweighted Respondents", "WGT"])].copy()
other = other_df.sum(axis=0)
other.name = "OTHER"
other["LV"] = other["LV"] + 2
other

LV                  78.0
CD1                 20.0
CD2                 21.0
CD3                 18.0
CD4                 19.0
VOTED_2020_TRUMP    25.0
VOTED_2020_BIDEN    10.0
VOTED_2020_DIDNT    10.0
CHILD_LT_18         21.0
EDU_NO_COLL         51.0
EDU_COLL            24.0
Name: OTHER, dtype: float64

In [10]:
crosstab_df = pandas.concat(
    [crosstab_df[crosstab_df.index.isin(["HARRIS", "TRUMP"])], pandas.DataFrame([other])]).reset_index(names=["CANDIDATE"])

In [11]:
temp = None

for group_cols in CROSSTAB_COL_GROUPS:
    df = crosstab_df[CONST_COLS + group_cols].copy()
    if group_cols[0] != "CD1":
        unk_col = group_cols[0].split("_")[0] + "_" + "UNK"
        df[unk_col] = df["LV"] - df[group_cols].sum(axis=1)
    if temp is None:
        temp = df.copy()
    else:
        temp = temp.merge(df, on=CONST_COLS, how="left")

crosstab_df = temp.copy()

In [12]:
crosstab_df

Unnamed: 0,CANDIDATE,LV,CD1,CD2,CD3,CD4,VOTED_2020_TRUMP,VOTED_2020_BIDEN,VOTED_2020_DIDNT,VOTED_UNK,CHILD_LT_18,CHILD_UNK,EDU_NO_COLL,EDU_COLL,EDU_UNK
0,HARRIS,399.0,113.0,99.0,100.0,87.0,12.0,322.0,48.0,17.0,98.0,301.0,205.0,191.0,3.0
1,TRUMP,372.0,78.0,97.0,91.0,107.0,299.0,13.0,45.0,15.0,114.0,258.0,270.0,97.0,5.0
2,OTHER,78.0,20.0,21.0,18.0,19.0,25.0,10.0,10.0,33.0,21.0,57.0,51.0,24.0,3.0


In [13]:
crosstab_df.set_index("CANDIDATE")["LV"] / crosstab_df["LV"].sum()

CANDIDATE
HARRIS    0.469965
TRUMP     0.438163
OTHER     0.091873
Name: LV, dtype: float64

In [14]:
0.469965 - 0.438163

0.031802

---

## Basic Cell-by-Cell Weighting

### Data Prep

In [219]:
cd_cols = ["CD1", "CD2", "CD3", "CD4"]
ct_df = crosstab_df.set_index("CANDIDATE")[cd_cols].copy()
ct_df

Unnamed: 0_level_0,CD1,CD2,CD3,CD4
CANDIDATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HARRIS,113.0,99.0,100.0,87.0
TRUMP,78.0,97.0,91.0,107.0
OTHER,20.0,21.0,18.0,19.0


In [220]:
pop_df = CD_REG_DF.copy()
pop_df.index = ["HARRIS", "TRUMP", "OTHER"]
pop_df

Unnamed: 0,CD1,CD2,CD3,CD4
HARRIS,177921,170303,179174,123853
TRUMP,184254,180260,184382,237237
OTHER,5197,5005,5554,5207


### Cell Weighting

In [221]:
temp = ct_df.div(ct_df.sum(axis=1), axis=0).join(pop_df.div(pop_df.sum(axis=1), axis=0), rsuffix="_reg")

In [222]:
temp["CD1_weight"] = temp["CD1_reg"] / temp["CD1"]
temp["CD2_weight"] = temp["CD2_reg"] / temp["CD2"]
temp["CD3_weight"] = temp["CD3_reg"] / temp["CD3"]
temp["CD4_weight"] = temp["CD4_reg"] / temp["CD4"]

In [223]:
f(temp[["CD1_weight", "CD2_weight", "CD3_weight", "CD4_weight"]].values)

1.0082143353106263

In [224]:
temp

Unnamed: 0_level_0,CD1,CD2,CD3,CD4,CD1_reg,CD2_reg,CD3_reg,CD4_reg,CD1_weight,CD2_weight,CD3_weight,CD4_weight
CANDIDATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
HARRIS,0.283208,0.24812,0.250627,0.218045,0.273199,0.261501,0.275123,0.190177,0.964658,1.05393,1.09774,0.872191
TRUMP,0.209115,0.260054,0.243968,0.286863,0.23438,0.2293,0.234543,0.301777,1.120818,0.88174,0.961369,1.05199
OTHER,0.25641,0.269231,0.230769,0.24359,0.247913,0.238754,0.264943,0.24839,0.966861,0.886801,1.148086,1.019706


In [225]:
temp2 = (ct_df *
         temp[["CD1_weight", "CD2_weight", "CD3_weight", "CD4_weight"]].rename(
             columns={c : c.replace("_weight", "") for c in temp.columns if c.endswith("_weight")}))
temp2

Unnamed: 0_level_0,CD1,CD2,CD3,CD4
CANDIDATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HARRIS,109.006326,104.339029,109.773998,75.880647
TRUMP,87.42381,85.528759,87.484543,112.562888
OTHER,19.337213,18.622812,20.665554,19.374422


In [226]:
temp2.sum(axis=1) / temp2.sum(axis=1).sum()

CANDIDATE
HARRIS    0.469412
TRUMP     0.438824
OTHER     0.091765
dtype: float64

---

## Basic Raking

In [227]:
expected_col = pop_df.sum(axis=1)
expected_row = pop_df[cd_cols].sum(axis=0)

In [228]:
expected_row

CD1    367372
CD2    355568
CD3    369110
CD4    366297
dtype: int64

In [229]:
expected_col

HARRIS    651251
TRUMP     786133
OTHER      20963
dtype: int64

In [230]:
temp = ct_df.copy()

In [231]:
with tqdm() as pbar:
    while True:
        rake_rows = pop_df.sum(axis=1) / temp.sum(axis=1)
        temp = temp.mul(rake_rows, axis=0)
        rake_cols = pop_df.sum(axis=0) / temp.sum(axis=0)
        temp = temp.mul(rake_cols, axis=1)
        if temp.sum(axis=1).astype(int).equals(expected_col) and temp.sum(axis=0).astype(int).equals(expected_row):
            break
        else:
            pbar.update()

9it [00:00, 410.44it/s]


In [232]:
temp

Unnamed: 0_level_0,CD1,CD2,CD3,CD4
CANDIDATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HARRIS,190872.775196,154185.724387,166999.202193,139193.298225
TRUMP,170920.61886,195981.471686,197146.966125,222083.943329
OTHER,5578.605945,5400.803927,4963.831682,5019.758446


In [233]:
f(temp.values)

1.4861372796912073

In [238]:
temp2 = ct_df * temp
temp2

Unnamed: 0_level_0,CD1,CD2,CD3,CD4
CANDIDATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HARRIS,21568620.0,15264390.0,16699920.0,12109820.0
TRUMP,13331810.0,19010200.0,17940370.0,23762980.0
OTHER,111572.1,113416.9,89348.97,95375.41


In [239]:
temp2.sum(axis=1) / temp2.sum(axis=1).sum()

CANDIDATE
HARRIS    0.468549
TRUMP     0.528526
OTHER     0.002924
dtype: float64