In [26]:
import pandas
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from cell_weighting import CellReweighter, RakeReweighter

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend for "retina" (high-DPI) figure output.
%config InlineBackend.figure_format = "retina"

# Plot styling in a single call: "ticks" axes style, fonts scaled 2x, and a
# 25x15 default figure size.
sns.set(style="ticks", font_scale=2, rc={"figure.figsize": (25, 15)})

In [13]:
# Load IPython's autoreload extension in mode 2, which reloads imported modules
# before each cell executes so edits to local modules are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
# Crosstab columns, organised into groups of related columns. Each inner list
# is processed as one unit when the per-group "UNK" columns are built.
CROSSTAB_COL_GROUPS = [
    ["CD1", "CD2", "CD3", "CD4"],
    ["VOTED_2020_TRUMP", "VOTED_2020_BIDEN", "VOTED_2020_DIDNT"],
    ["CHILD_LT_18"],
    ["EDU_NO_COLL", "EDU_COLL"],
]

# Flat list of every crosstab column, derived from the groups (same order) so
# the two definitions can never drift apart.
CROSSTAB_COLS = [col for group in CROSSTAB_COL_GROUPS for col in group]

# Columns selected alongside every group and used as the merge keys.
CONST_COLS = ["CANDIDATE", "LV"]

In [3]:
# source: https://sos.iowa.gov/elections/pdf/VRStatsArchive/2024/CongOct24.pdf
# Registration counts per congressional district (columns) and party (rows).
# Each entry is entered as a sum of figures taken from the source PDF
# (NOTE(review): presumably separate registration categories -- confirm
# against the PDF).
CD_REG_DF = pandas.DataFrame.from_dict(
    {
        "DEM": [129541 + 48380, 127321 + 42982, 134188 + 44986, 85143 + 38710],
        "REP": [149869 + 34385, 149972 + 30288, 151254 + 33128, 191956 + 45281],
        "OTHER": [
            2394 + 584 + 1670 + 549,
            2434 + 534 + 1509 + 528,
            2831 + 535 + 1674 + 514,
            2388 + 493 + 1754 + 572,
        ],
    },
    orient="index",
    columns=["CD1", "CD2", "CD3", "CD4"],
)

# Statewide share of each party row across all four districts.
CD_REG_DF.sum(axis=1).pipe(lambda party_totals: party_totals / party_totals.sum())

DEM      0.446568
REP      0.539058
OTHER    0.014374
dtype: float64

---

## Load Crosstab

In [5]:
# Read the raw crosstab CSV; the path is relative to the notebook's working
# directory.
crosstab_df = pandas.read_csv("data/iowa_2024/iowa_2024_crosstab.csv")

In [6]:
# Keep only the constant and crosstab columns, replace missing cells with 0,
# and index the rows by CANDIDATE.
crosstab_df = (
    crosstab_df.loc[:, CONST_COLS + CROSSTAB_COLS]
    .fillna(0)
    .set_index("CANDIDATE")
)

In [7]:
# LV value recorded on the "WGT" row of the crosstab.
total_wgt = crosstab_df.loc["WGT", "LV"]
total_wgt

849

In [8]:
# LV left over after removing the HARRIS and TRUMP rows from the WGT total.
total_wgt - crosstab_df.loc[["HARRIS", "TRUMP"], "LV"].sum()

78

In [9]:
# Collapse every row that is not HARRIS, TRUMP, or one of the two summary rows
# into a single "OTHER" row by summing column-wise.
other_df = crosstab_df[~crosstab_df.index.isin(["HARRIS", "TRUMP", "Total Unweighted Respondents", "WGT"])].copy()
other = other_df.sum(axis=0)
other.name = "OTHER"
# Define OTHER's LV as whatever remains of the WGT total after HARRIS and
# TRUMP, so the three rows always add up to WGT. The summed minor rows alone
# fall slightly short of that residual (presumably due to rounding in the
# published crosstab), so derive the value instead of hard-coding a correction.
other["LV"] = crosstab_df.loc["WGT", "LV"] - crosstab_df.loc[["HARRIS", "TRUMP"], "LV"].sum()
other

LV                  78.0
CD1                 20.0
CD2                 21.0
CD3                 18.0
CD4                 19.0
VOTED_2020_TRUMP    25.0
VOTED_2020_BIDEN    10.0
VOTED_2020_DIDNT    10.0
CHILD_LT_18         21.0
EDU_NO_COLL         51.0
EDU_COLL            24.0
Name: OTHER, dtype: float64

In [10]:
# Rebuild the crosstab as the HARRIS and TRUMP rows followed by the combined
# OTHER row, then move the candidate labels from the index into a CANDIDATE
# column.
crosstab_df = pandas.concat(
    [
        crosstab_df.loc[crosstab_df.index.isin(["HARRIS", "TRUMP"])],
        other.to_frame().T,
    ]
).reset_index(names=["CANDIDATE"])

In [11]:
# For every column group, take the constant columns plus the group's own
# columns, then stitch the per-group frames back together on CONST_COLS.
group_frames = []
for group in CROSSTAB_COL_GROUPS:
    group_df = crosstab_df[CONST_COLS + group].copy()
    # Every group except the CD one gets a "<PREFIX>_UNK" column holding the
    # part of LV not accounted for by the group's listed columns.
    if group[0] != "CD1":
        prefix = group[0].split("_")[0]
        group_df[prefix + "_" + "UNK"] = group_df["LV"] - group_df[group].sum(axis=1)
    group_frames.append(group_df)

# The first group's frame is the base; the rest are left-merged onto it in order.
merged, *remaining = group_frames
for group_df in remaining:
    merged = merged.merge(group_df, on=CONST_COLS, how="left")

crosstab_df = merged.copy()

In [12]:
# Display the rebuilt crosstab, including the derived *_UNK columns.
crosstab_df

Unnamed: 0,CANDIDATE,LV,CD1,CD2,CD3,CD4,VOTED_2020_TRUMP,VOTED_2020_BIDEN,VOTED_2020_DIDNT,VOTED_UNK,CHILD_LT_18,CHILD_UNK,EDU_NO_COLL,EDU_COLL,EDU_UNK
0,HARRIS,399.0,113.0,99.0,100.0,87.0,12.0,322.0,48.0,17.0,98.0,301.0,205.0,191.0,3.0
1,TRUMP,372.0,78.0,97.0,91.0,107.0,299.0,13.0,45.0,15.0,114.0,258.0,270.0,97.0,5.0
2,OTHER,78.0,20.0,21.0,18.0,19.0,25.0,10.0,10.0,33.0,21.0,57.0,51.0,24.0,3.0


In [13]:
# Each candidate's share of the summed LV column.
crosstab_df.set_index("CANDIDATE")["LV"].pipe(lambda lv: lv / lv.sum())

CANDIDATE
HARRIS    0.469965
TRUMP     0.438163
OTHER     0.091873
Name: LV, dtype: float64

In [14]:
# HARRIS-minus-TRUMP margin as a share of the summed LV column. Computed from
# crosstab_df rather than from figures copied out of an earlier cell's output,
# so it cannot go stale; rounded to 6 decimals to match the precision of the
# displayed shares.
crosstab_df.set_index("CANDIDATE")["LV"].pipe(
    lambda lv: round(float((lv["HARRIS"] - lv["TRUMP"]) / lv.sum()), 6)
)

0.031802

---

## Basic Cell-by-Cell Weighting

### Data Prep

In [39]:
# Toy crosstab used to exercise the reweighters. To run on the real crosstab
# instead, swap in:
#   cols = ["CD1", "CD2", "CD3", "CD4"]
#   ct_df = crosstab_df.set_index("CANDIDATE")[cols].copy()

cols = ["B1", "B2", "B3"]
index = ["A1", "A2", "A3", "A4"]

# One list of B1..B3 values per A-row, keyed by row label.
ct_rows = [[20, 40, 40], [50, 140, 310], [100, 50, 50], [30, 100, 70]]
ct_df = pandas.DataFrame.from_dict(dict(zip(index, ct_rows)), orient="index", columns=cols)

ct_df

Unnamed: 0,B1,B2,B3
A1,20,40,40
A2,50,140,310
A3,100,50,50
A4,30,100,70


In [40]:
# Toy sample table with the same row and column labels as ct_df.
sample_rows = [[80, 40, 55], [60, 150, 340], [170, 60, 200], [55, 165, 125]]
sample_df = pandas.DataFrame.from_dict(dict(zip(index, sample_rows)), orient="index", columns=cols)

sample_df

Unnamed: 0,B1,B2,B3
A1,80,40,55
A2,60,150,340
A3,170,60,200
A4,55,165,125


In [41]:
# Toy population table: every sample cell shifted up by 100. To use the real
# registration figures instead, swap in:
#   pop_df = CD_REG_DF.copy()
#   pop_df.index = ["HARRIS", "TRUMP", "OTHER"]

pop_df = sample_df.add(100)
pop_df

Unnamed: 0,B1,B2,B3
A1,180,140,155
A2,160,250,440
A3,270,160,300
A4,155,265,225


In [42]:
# Share of the sample total contributed by each row.
sample_df.sum(axis=1).pipe(lambda row_totals: row_totals / row_totals.sum())

A1    0.116667
A2    0.366667
A3    0.286667
A4    0.230000
dtype: float64

### Cell Weighting

In [23]:
# Build the cell-by-cell reweighter from the toy crosstab, sample, and
# population tables plus the column list.
# NOTE(review): argument meanings are defined in cell_weighting -- confirm there.
cr = CellReweighter(ct_df, sample_df, pop_df, cols)

In [24]:
# Run the cell reweighting and display the resulting table.
# NOTE(review): the "F = ..." lines in the output are presumably printed by
# reweight() itself -- confirm in cell_weighting.
reweighted_df = cr.reweight()
reweighted_df

Survey crosstab vs. survey sample F = 1.3066977054439586
Survey sample vs. population F = 1.0940603944609888


Unnamed: 0,B1,B2,B3
A1,720.0,140.0,213.125
A2,192.0,267.857143,482.580645
A3,459.0,192.0,1200.0
A4,284.166667,437.25,401.785714


In [25]:
# Share of the reweighted total contributed by each row.
reweighted_df.sum(axis=1).pipe(lambda row_totals: row_totals / row_totals.sum())

A1    0.215065
A2    0.188874
A3    0.370959
A4    0.225101
dtype: float64

---

## Basic Raking

In [27]:
# Build the raking reweighter from the same four inputs passed to the
# CellReweighter.
# NOTE(review): argument meanings are defined in cell_weighting -- confirm there.
rr = RakeReweighter(ct_df, sample_df, pop_df, cols)

In [36]:
# Run the raking procedure and keep the resulting table.
# NOTE(review): the progress bars and "F = ..." lines in the output are
# presumably emitted by reweight() -- confirm in cell_weighting.
raked_df = rr.reweight()

4it [00:00, 1170.04it/s]
Survey crosstab vs. survey sample F = 1.0729043729523149
4it [00:00, 1333.32it/s]
Survey sample vs. population F = 1.0532796180516186


In [37]:
# Display the raked table.
raked_df

Unnamed: 0,B1,B2,B3
A1,419.246971,166.288668,260.201605
A2,114.201586,226.482609,584.20751
A3,696.239215,194.932174,739.446725
A4,203.572993,484.466925,417.671478


In [38]:
# Share of the raked total contributed by each row.
raked_df.sum(axis=1).pipe(lambda row_totals: row_totals / row_totals.sum())

A1    0.187651
A2    0.205214
A3    0.361800
A4    0.245334
dtype: float64