In [None]:
# One-time runtime setup: install/upgrade the third-party packages used by this notebook.
# (torch is commented out of the install list.)
! pip install --upgrade codetiming ipython-autotime numpy pandas census us mechanicalsoup pandas_bokeh geopandas #torch
# Ask the kernel to shut down with restart=True, presumably so the upgraded packages are picked up.
# NOTE(review): if the lines below live in the same cell they may not run after the restart -- confirm.
get_ipython().kernel.do_shutdown(True)
# Mount Google Drive at /content/drive; the repository path used later lives under this mount.
from google.colab import drive
drive.mount('/content/drive')

In [1]:
# Per-session setup: switch to the repository directory so that `model` can be imported.
repo_path = '/content/drive/MyDrive/gerrymandering/2022-10/voting_predictor'
%cd {repo_path}
# Notebook extensions: Colab data tables, per-cell timing, and module auto-reloading.
%load_ext google.colab.data_table
%load_ext autotime
%load_ext autoreload
%autoreload
# Star import: the rest of the notebook relies on names that are not defined in it
# (e.g. device, listify, tensorify, prep, extract_dataset, MODEL_PATH, mkdir, cartesian).
# NOTE(review): torch, np and pd are also used without an explicit import here -- presumably re-exported by model.
from model import *
print('using device', device)
# Route DataFrame.plot through pandas_bokeh and render bokeh output inline.
pd.set_option('plotting.backend', 'pandas_bokeh')
pd.plotting.output_notebook()


class VotingPredictor(torch.nn.Module):
    """Feed-forward network that turns per-row features into two vote counts.

    For every input row the network emits one value per column of ``W``.
    Those values are multiplied element-wise by ``W`` and collapsed by the
    fixed 6x2 matrix ``self.P``: outputs 0-2 are summed into column 0 and
    outputs 3-5 into column 1.  ``predict`` labels column 0 as ``d`` and
    column 1 as ``r``.

    NOTE(review): W is assumed to have six columns (matching ``P``) --
    confirm against ``extract_dataset``.
    """

    def __init__(self, input_size, layer_size, activation, election='all'):
        """Build the layer stack.

        Args:
            input_size: number of feature columns fed to the first layer.
            layer_size: width, or sequence of widths, of the linear layers.
            activation: activation class, or sequence of classes; each is
                instantiated with no arguments after its linear layer.
            election: label of the election held out as the test set in
                ``main``; a label that matches no row (such as the default
                ``'all'``) means "train and evaluate on every row".

        Raises:
            AssertionError: if layer_size and activation cannot be paired.
        """
        # Initialise the Module machinery before touching any attribute.
        super().__init__()
        # Copy so that appending the output width below can never mutate a
        # list owned by the caller.
        self.layer_size = list(listify(layer_size))
        self.activation = list(listify(activation))
        self.election = election
        self.rmse_train = []
        self.rmse_test = []
        # Kept as a plain attribute (not a registered buffer) so state_dict
        # keys stay unchanged; it is created directly on ``device``.
        self.P = torch.FloatTensor([[1,1,1,0,0,0],[0,0,0,1,1,1,]]).T.to(device)
        # One more activation than widths: the final layer width is implied
        # and must be 6 to match the rows of P.
        if len(self.activation) - len(self.layer_size) == 1:
            self.layer_size.append(6)
        assert len(self.layer_size) == len(self.activation), 'layer_size and activation must have same length'
        layers = []
        fan_in = input_size
        for fan_out, act in zip(self.layer_size, self.activation):
            layers.append(torch.nn.Linear(fan_in, fan_out))
            layers.append(act())
            fan_in = fan_out
        self.nn = torch.nn.Sequential(*layers)

    def forward(self, W, X):
        """Return the weighted network output collapsed through ``P``.

        Note the argument order: weights first, features second.
        """
        prop = self.nn(tensorify(X))
        # squeeze() only changes the shape when a dimension has size 1.
        pred = ((prop * tensorify(W)) @ self.P).squeeze()
        return pred

    def predict(self, W, X, Y_true=None):
        """Predict per-row votes and summarise the totals.

        Args:
            W: weights passed to ``forward``.
            X: features passed to ``forward``.
            Y_true: optional true per-row votes (tensor or array-like with a
                ``sum(axis=0)`` method); when given, error columns are added.

        Returns:
            ``(df, Y_pred)`` where ``df`` is a one-row summary frame passed
            through ``prep`` and ``Y_pred`` is the per-row prediction as a
            NumPy array.  ``pct_err`` is reported for column 0 (``d``) only.
        """
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            Y_pred = self(W, X).detach().cpu().numpy()
        votes_pred = Y_pred.sum(axis=0).round().astype(int)
        pct_pred = 100.0 * votes_pred / votes_pred.sum()

        if Y_true is None:
            df = prep(pd.DataFrame(
                data   =[[self.election,  votes_pred[0],  votes_pred[1],  pct_pred[0],  pct_pred[1]]],
                columns= ['election'   , 'votes_pred_d', 'votes_pred_r', 'pct_pred_d', 'pct_pred_r']))
        else:
            if torch.is_tensor(Y_true):
                Y_true = Y_true.detach().cpu().numpy()

            votes_true = Y_true.sum(axis=0).round().astype(int)
            pct_true = 100.0 * votes_true / votes_true.sum()
            votes_err  = votes_pred - votes_true
            pct_err  = pct_pred - pct_true
            df = prep(pd.DataFrame(
                data   =[[self.election,  votes_pred[0],  votes_true[0],  votes_err[0],  votes_pred[1],  votes_true[1],  votes_err[1],  pct_pred[0],  pct_pred[1],  pct_true[0],  pct_true[1],  pct_err[0]]],
                columns= ['election'   , 'votes_pred_d', 'votes_true_d', 'votes_err_d', 'votes_pred_r', 'votes_true_r', 'votes_err_r', 'pct_pred_d', 'pct_pred_r', 'pct_true_d', 'pct_true_r', 'pct_err']))
        return df, Y_pred

    def main(self, W, X, Y, max_steps=1000, window=100, tol=0.0001):
        """Train with Adam on every election except ``self.election``.

        Rows whose ``election`` value equals ``self.election`` form the test
        set; if no row matches, the model is trained and evaluated on all
        rows.  Training stops after ``max_steps`` steps, or earlier once the
        variance-to-mean ratio of the last ``window`` test RMSE values drops
        below ``tol``.

        Args:
            W, X, Y: weights, features and targets; ``X.reset_index()`` must
                expose an ``election`` column, and all three must accept a
                boolean row mask.
            max_steps: upper bound on optimiser steps.
            window: number of trailing test RMSE values used for the
                convergence check.
            tol: convergence threshold.

        Side effects:
            Appends to ``self.rmse_train`` / ``self.rmse_test`` and sets
            ``self.results``, ``self.Y_pred`` and ``self.Y_true``.
        """
        test_mask = (X.reset_index()['election'] == self.election).values
        train_mask = ~test_mask
        if train_mask.all():
            # Nothing is held out: evaluate on the training rows.
            test_mask = train_mask
        W_test  = tensorify(W[ test_mask])
        X_test  = tensorify(X[ test_mask])
        Y_test  = tensorify(Y[ test_mask])
        W_train = tensorify(W[train_mask])
        X_train = tensorify(X[train_mask])
        Y_train = tensorify(Y[train_mask])

        # NOTE: the layers were already initialised in __init__, so this seed
        # does not by itself make the starting weights reproducible.
        torch.manual_seed(3)
        loss_fcn = torch.nn.MSELoss()
        optimizer = torch.optim.Adam(self.parameters())
        for _ in range(max_steps):
            optimizer.zero_grad()
            loss = loss_fcn(self(W_train, X_train), Y_train)
            loss.backward()
            optimizer.step()
            # Bookkeeping only: skip building autograd graphs.
            with torch.no_grad():
                self.rmse_train.append(np.sqrt(loss_fcn(self(W_train, X_train), Y_train).item()))
                self.rmse_test.append(np.sqrt(loss_fcn(self(W_test, X_test), Y_test).item()))
            recent = self.rmse_test[-window:]
            if len(recent) >= window and np.var(recent) / np.mean(recent) < tol:
                break
        # predict() returns exactly two values; the true votes are stored
        # separately in the same NumPy form as the predictions.
        self.results, self.Y_pred = self.predict(W_test, X_test, Y_test)
        self.Y_true = Y_test.detach().cpu().numpy()


def train(W, X, Y, elections, param_grid):
    """Grid-search VotingPredictor hyper-parameters with leave-one-election-out scoring.

    For every parameter combination produced by ``cartesian(param_grid)`` one
    model is trained per held-out election plus one final model with
    ``election='all'``.  Only that final model is kept and saved to
    ``MODEL_PATH / 'modelNNN.pt'``; its ``results`` attribute is replaced by
    the concatenated one-row results of all models of that combination.

    Args:
        W, X, Y: weights, features and targets forwarded to ``VotingPredictor.main``.
        elections: iterable of election labels to hold out one at a time.
        param_grid: mapping forwarded to ``cartesian``; each combination is
            passed as keyword arguments to ``VotingPredictor``.

    Returns:
        ``(models, results, summary)``: the kept models, the stacked results
        frame (indexed by combination number) and a per-combination summary
        sorted by ``rmse``.  Both frames are also written as CSV files into
        ``MODEL_PATH`` and displayed.

    Side effects:
        Calls ``mkdir(MODEL_PATH, overwrite=True)`` before training.
    """
    mkdir(MODEL_PATH, overwrite=True)
    # Accept any iterable of labels, not only a list.
    held_out = list(elections)
    models = []
    for k, kwargs in enumerate(cartesian(param_grid)):
        print(k, kwargs)
        fitted = []
        for election in held_out + ['all']:
            model = VotingPredictor(election=election, input_size=X.shape[1], **kwargs).to(device)
            model.main(W, X, Y)
            fitted.append(model)
        res = pd.concat([m.results for m in fitted], ignore_index=True)
        # Label every row of this combination with its number k.
        res.index = 0*res.index + k
        # `model` is the last one trained, i.e. the election='all' model.
        model.results = res
        models.append(model)
        torch.save(model, MODEL_PATH / f'model{str(k).rjust(3,"0")}.pt')

    results = pd.concat([model.results for model in models])
    summary = (results
        .assign(m = lambda x: x['election'].isin(held_out))
        .assign(e = lambda x: x['pct_err'].abs())
        # NaN (not 0) outside the held-out rows, so the 'all' row does not
        # dilute the mean: rmse is taken over held-out elections only.
        .assign(a = lambda x: np.where(x['m'], x['e'], np.nan))
        .assign(b = lambda x:~x['m'] * x['e'])
        .groupby(level=0).agg(
            rmse = ('a', lambda x: (x**2).mean()**(1/2)),
            max  = ('e', 'max'),
            all  = ('b', 'max'),
            ct   = ('m', 'sum'),
        )
    )
    summary.insert(0, 'activation', [[f.__name__ for f in model.activation] for model in models])
    summary.insert(0, 'layer_size', [model.layer_size for model in models])
    summary.sort_values('rmse', inplace=True)

    results.to_csv(MODEL_PATH / 'results.csv')
    summary.to_csv(MODEL_PATH / 'summary.csv')
    display(results.round(2))
    display(summary.round(2))
    return models, results, summary

def load_model(k):
    """Load the model saved by ``train`` for parameter combination ``k``.

    Args:
        k: combination number; it is left-padded with zeros to three
            characters to form the file name ``modelNNN.pt`` in MODEL_PATH.

    Returns:
        The object stored in that file, with its tensors mapped onto the
        current ``device`` so a checkpoint saved on a GPU runtime also loads
        on a CPU-only runtime.
    """
    path = MODEL_PATH / f'model{str(k).rjust(3,"0")}.pt'
    # SECURITY: torch.load unpickles arbitrary objects -- only load files
    # produced by this project.
    # NOTE(review): recent PyTorch releases may additionally require
    # weights_only=False to load a whole pickled model -- confirm for the
    # installed version.
    return torch.load(path, map_location=device)

# Election labels used for training/evaluation; train() holds each one out in turn.
elections = ['2016_general_President', '2018_general_USSen', '2018_general_Governor', '2020_general_USSen', '2020_general_President']
# Feature columns requested from extract_dataset (commented entries are currently disabled).
feat = [
    'dist_to_border', # 'aland', 'polsby_popper',
    # 'white_tot_pop'       , 'hisp_tot_pop'       , 'other_tot_pop'       ,
    # 'white_vap_pop'       , 'hisp_vap_pop'       , 'other_vap_pop'       ,
    'white_vap_density'   , 'hisp_vap_density'   , 'other_vap_density'   ,
    'white_vap_poverty'   , 'hisp_vap_poverty'   , 'other_vap_poverty'   ,
    'white_vap_elderly'   , 'hisp_vap_elderly'   , 'other_vap_elderly'   ,
    'white_vap_highschool', 'hisp_vap_highschool', 'other_vap_highschool',
    'white_vap_homeowner' , 'hisp_vap_homeowner' , 'other_vap_homeowner' ,
    'hisp_vap_spanish_at_home_english_well',
]
# Target columns (predict() labels them as the d and r vote counts).
targ = ['d', 'r']
# Weight columns multiplied with the network outputs.
# NOTE(review): VotingPredictor.P has six rows, so W presumably ends up with six columns -- confirm in extract_dataset.
weig = ['white_vap_pop', 'hisp_vap_pop', 'other_vap_pop']
# extract_dataset comes from `model`; it returns the full frame plus weights W, features X and targets Y.
df, W, X, Y = extract_dataset(elections, feat, targ, weig)

# Hyper-parameter grid for train(): 7 layer sizes x 2 activation lists.
# Each activation list has one more entry than a single layer_size, which makes
# VotingPredictor append the final width of 6 (assuming listify wraps a scalar in a list).
param_grid = {
    'layer_size': range(3, 10, 1),
    'activation': [
        [torch.nn.ReLU, torch.nn.Sigmoid],
        [torch.nn.ReLU, torch.nn.Tanh],
    ],
}
# Training is disabled here; later cells load a previously saved model instead.
# models, results, summary = train(W, X, Y, elections, param_grid)

/content/drive/.shortcut-targets-by-id/1KllOAyZGJmKAuI4yIyUAnFVqBdaUeO3v/gerrymandering/2022-10/voting_predictor
using device cuda
time: 5.57 s (started: 2022-10-29 04:07:02 +00:00)


In [12]:
# Forecast cell: apply a saved model to the 2020 data and relabel the result as the 2022 governor race.
predictor = load_model(2)  # load pretrained model
# we use model trained on ALL 5 elections, but predict based on 2020 ACS (most recent available)
elections = ['2020_general_President']  # gets 2020 ACS (most recent available)
df, W, X, Y = extract_dataset(elections, feat, targ, weig)
# No Y is passed, so `results` has only prediction columns; `votes` is the per-row prediction array.
results, votes = predictor.predict(W, X)
# Overwrite the label stored in the model with the election being forecast.
results['election'] = '2022_general_Governor'
display(results.round(2))

# Per-row predicted totals and two-party percentages (column 0 = d, column 1 = r).
df['votes'] = votes.sum(axis=1)
df['votes_d'] = votes[:,0]
df['votes_r'] = votes[:,1]
# NOTE(review): unlike the later cells there is no np.fmax/clip guard here, so rows with zero
# (or negative) predicted totals would give inf/NaN or out-of-range percentages -- confirm this cannot happen.
df['pct_d'] = df['votes_d'] / df['votes'] * 100.0
df['pct_r'] = df['votes_r'] / df['votes'] * 100.0
# Map pct_r from [0, 100] onto [-100, 100] to match colormap_range below.
df['clr'] = df['pct_r'] * 2 - 100

# Colour each geometry by `clr` using the RdBu colormap.
df.round().reset_index().plot_bokeh(
    category='clr',
    colormap="RdBu",
    line_width=0.01,
    colormap_range = [-100, 100],
    hovertool_columns = [
        'pct_d',
        'pct_r',
        'votes_d',
        'votes_r',
        'county',
        'vtd2020',
        ],
    )

Unnamed: 0_level_0,election,votes_pred_d,votes_pred_r,pct_pred_d,pct_pred_r
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2022_general_Governor,4394691,5082015,46.37,53.63


time: 14.8 s (started: 2022-10-29 04:18:40 +00:00)


In [11]:
# Evaluation cell: compare the saved model's predictions with the true votes on all five elections.
predictor = load_model(2)  # load pretrained model
elections = ['2016_general_President', '2018_general_USSen', '2018_general_Governor', '2020_general_USSen', '2020_general_President']
df, W, X, Y = extract_dataset(elections, feat, targ, weig)
# Passing Y adds the true/error columns to the one-row summary.
results, votes_pred = predictor.predict(W, X, Y)
votes_true = Y.values
display(results.round(2))

# Per-row r share in percent; the denominator is floored at 1 and the ratio clipped to [0, 1]
# so empty or negative rows cannot produce inf/NaN or out-of-range values.
df['pct_pred_r'] = 100.0 * (votes_pred[:,1] / np.fmax(votes_pred[:,0] + votes_pred[:,1], 1)).clip(0, 1)
df['pct_true_r'] = 100.0 * (votes_true[:,1] / np.fmax(votes_true[:,0] + votes_true[:,1], 1)).clip(0, 1)
# Replace each value by the mean over all rows sharing the first index level.
# NOTE(review): presumably this averages one geographic unit across elections -- confirm what index level 0 is.
df['pct_pred_r'] = df.groupby(level=0)['pct_pred_r'].transform('mean')
df['pct_true_r'] = df.groupby(level=0)['pct_true_r'].transform('mean')
# Sign convention: true minus predicted r share.
df['pct_err'] = df['pct_true_r'] - df['pct_pred_r']

# Plot only the rows of the last election in the list ('2020_general_President').
df.query('election == @elections[-1]').round().reset_index().plot_bokeh(
    category='pct_err',
    colormap="RdBu",
    line_width=0.01,
    colormap_range = [-100, 100],
    hovertool_columns = [
        'pct_true_r',
        'pct_pred_r',
        'pct_err',
        'county',
        'vtd2020',
        ],
    )

Unnamed: 0_level_0,election,votes_pred_d,votes_true_d,votes_err_d,votes_pred_r,votes_true_r,votes_err_r,pct_pred_d,pct_pred_r,pct_true_d,pct_true_r,pct_err
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,all,21376800,21612190,-235390,25048810,25449923,-401113,46.05,53.95,45.92,54.08,0.12


time: 17 s (started: 2022-10-29 04:17:56 +00:00)


In [2]:
# Evaluation cell (variant): same error computation, then geometries are dissolved per (vtd2020, county).
predictor = load_model(2)  # load pretrained model
elections = ['2016_general_President', '2018_general_USSen', '2018_general_Governor', '2020_general_USSen', '2020_general_President']
df, W, X, Y = extract_dataset(elections, feat, targ, weig)
results, votes_pred = predictor.predict(W, X, Y)
votes_true = Y.values
display(results.round(2))

# Per-row r share in percent; denominator floored at 1, ratio clipped to [0, 1].
df['pct_pred_r'] = 100.0 * (votes_pred[:,1] / np.fmax(votes_pred[:,0] + votes_pred[:,1], 1)).clip(0, 1)
df['pct_true_r'] = 100.0 * (votes_true[:,1] / np.fmax(votes_true[:,0] + votes_true[:,1], 1)).clip(0, 1)
# Sign convention: true minus predicted r share.
df['pct_err'] = df['pct_true_r'] - df['pct_pred_r']
# Merge rows that share (vtd2020, county), averaging the other columns.
df = df.dissolve(by=['vtd2020', 'county'], aggfunc='mean')
# Bare expression in the middle of a cell: its value is not displayed (debugging leftover).
df.crs
# df = df.reset_index().groupby(['vtd2020', 'county', 'geometry'])[['pct_true_r', 'pct_pred_r', 'pct_err']].mean()
# # .mean().clip(-100, 100)
display(df['pct_err'].describe())
# df.head(2)
# Only the first 1000 dissolved rows are plotted.
df.iloc[:1000].reset_index().plot_bokeh(
    category='pct_err',
    colormap="RdBu",
    line_width=0.1,
    colormap_range = [-100, 100],
    hovertool_columns = [
        'pct_true_r',
        'pct_pred_r',
        'pct_err',
        # 'pct_d',
        # 'pct_r',
        # 'votes_d',
        # 'votes_r',
        'county',
        'vtd2020',
        ],
    )
# # df.shape

Unnamed: 0_level_0,election,votes_pred_d,votes_true_d,votes_err_d,votes_pred_r,votes_true_r,votes_err_r,pct_pred_d,pct_pred_r,pct_true_d,pct_true_r,pct_err
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,all,21376800,21612190,-235390,25048810,25449923,-401113,46.05,53.95,45.92,54.08,0.12


count    8806.000000
mean        3.390421
std        13.588155
min       -76.230698
25%        -4.379894
50%         3.627317
75%        11.182392
max       100.000000
Name: pct_err, dtype: float64

time: 15.3 s (started: 2022-10-29 03:49:41 +00:00)


In [104]:
# Quick inspection of the current frame: its type, coordinate reference system and shape.
type(df), df.crs, df.shape
# df.head()

(geopandas.geodataframe.GeoDataFrame, <Geographic 2D CRS: EPSG:4326>
 Name: WGS 84
 Axis Info [ellipsoidal]:
 - Lat[north]: Geodetic latitude (degree)
 - Lon[east]: Geodetic longitude (degree)
 Area of Use:
 - name: World.
 - bounds: (-180.0, -90.0, 180.0, 90.0)
 Datum: World Geodetic System 1984 ensemble
 - Ellipsoid: WGS 84
 - Prime Meridian: Greenwich, (8806, 40))

time: 6.43 ms (started: 2022-10-29 03:47:22 +00:00)


|index|election|votes\_pred\_d|votes\_pred\_r|pct\_pred\_d|pct\_pred\_r|
|---|---|---|---|---|---|
|0|2022_general_Governor|4394691|5082015|46\.37|53\.63|

In [None]:
## old
# Legacy setup cell kept for reference: mounts Drive, switches to the repo and imports `model`.
mount_path = '/content/drive'
repo_path = f'{mount_path}/MyDrive/gerrymandering/2022-10/voting_predictor'
from google.colab import drive
drive.mount(mount_path)
%cd {repo_path}
%load_ext google.colab.data_table
%load_ext autoreload
%autoreload
from model import *
# NOTE(review): the lines below repeat the setup steps above verbatim -- looks like a paste duplicate.
%cd {repo_path}
%load_ext google.colab.data_table

%load_ext autoreload
%autoreload
from model import *

class Feedforward(torch.nn.Module):
    """Legacy fully-connected network built from (width, activation) pairs.

    ``forward`` and ``get_rmse`` rely on names that are looked up at module
    level when they run: ``B`` for the former; ``loss_fcn``, ``X_train``,
    ``W_train``, ``Y_train``, ``X_test``, ``W_test`` and ``Y_test`` for the
    latter.
    """

    def __init__(self, input_size, hidden_size, activation):
        """Stack one Linear layer plus one activation per entry.

        Args:
            input_size: number of input features.
            hidden_size: sequence of layer widths.
            activation: sequence of activation classes, one per width, each
                instantiated without arguments.

        Raises:
            AssertionError: if the two sequences differ in length.
        """
        super().__init__()
        assert len(hidden_size) == len(activation), 'hidden_size and activation must have same length'
        # Consecutive entries of `widths` give (in, out) for each Linear layer.
        widths = [input_size, *hidden_size]
        stack = []
        for fan_in, fan_out, make_act in zip(widths[:-1], widths[1:], activation):
            stack += [torch.nn.Linear(fan_in, fan_out), make_act()]
        self.nn = torch.nn.Sequential(*stack)
        # Per-step RMSE histories, filled by get_rmse().
        self.rmse_train, self.rmse_test = [], []

    def forward(self, X, W):
        """Return the network output weighted by ``W`` and collapsed by ``B``."""
        weighted = self.nn(X) * W
        return (weighted @ B).squeeze()

    def get_rmse(self):
        """Append the current train and test RMSE to the history lists."""
        train_mse = loss_fcn(self(X_train, W_train), Y_train).item()
        self.rmse_train.append(np.sqrt(train_mse))
        test_mse = loss_fcn(self(X_test, W_test), Y_test).item()
        self.rmse_test.append(np.sqrt(test_mse))


# Legacy dataset preparation (superseded above by extract_dataset).
# Feature columns; commented entries are disabled.
feat = [
    # 'aland',
    # 'polsby_popper',
    'dist_to_border',
    # 'white_tot_pop'       , 'hisp_tot_pop'       , 'other_tot_pop'       ,
    # 'white_vap_pop'       , 'hisp_vap_pop'       , 'other_vap_pop'       ,
    'white_vap_density'   , 'hisp_vap_density'   , 'other_vap_density'   ,
    'white_vap_poverty'   , 'hisp_vap_poverty'   , 'other_vap_poverty'   ,
    'white_vap_elderly'   , 'hisp_vap_elderly'   , 'other_vap_elderly'   ,
    'white_vap_highschool', 'hisp_vap_highschool', 'other_vap_highschool',
    'white_vap_homeowner' , 'hisp_vap_homeowner' , 'other_vap_homeowner' ,
    'hisp_vap_spanish_at_home_english_well',
]
# Target and weight columns.
targ = ['d', 'r']
weig = ['white_vap_pop', 'hisp_vap_pop', 'other_vap_pop']

elections = ['2020_general_President', '2016_general_President', '2018_general_USSen', '2020_general_USSen']
# One features(...) frame per election (the label is split on '_' into its arguments),
# restricted to the needed columns and shuffled row-wise by sample(frac=1).
df = pd.concat([features(*elec.split('_')) for elec in elections])[feat+targ+weig].sample(frac=1)
W = df[weig].astype(float)
# Duplicate the three weight columns with _d / _r suffixes -> six columns, matching the six rows of B.
W = W.join(W, lsuffix='_d', rsuffix='_r')
Y = df[targ].astype(float)
X = df[feat].astype(float)
# Min-max scale every feature column to [0, 1].
# NOTE(review): a constant column would divide by zero here -- confirm none exists.
X = (X - X.min()) / (X.max() - X.min())
# 6x2 matrix summing outputs 0-2 into column 0 and outputs 3-5 into column 1; read by Feedforward.forward.
B = torch.FloatTensor([[1,1,1,0,0,0],[0,0,0,1,1,1,]]).T.to(device)

def voting_predictor(election, activation, hidden=(50, 6), steps=10000, lag=100, tol=0.0001):
    """Train a legacy Feedforward model with one election held out.

    Rows of the module-level frames ``W``, ``X`` and ``Y`` whose ``election``
    equals ``election`` form the test set; all other rows are used for
    training.

    Args:
        election: label of the held-out election.
        activation: sequence of activation classes, one per entry of ``hidden``.
        hidden: layer widths; the last one must be 6.
        steps: maximum number of optimiser steps.
        lag: number of trailing test RMSE values used for the convergence check.
        tol: training stops once var/mean of those values falls below this.

    Returns:
        The trained model, annotated with ``election``, ``cnt_pred``,
        ``cnt_true``, ``cnt_err``, ``pct_pred``, ``pct_true`` and ``pct_err``.

    Raises:
        AssertionError: if ``hidden`` does not end with 6.
    """
    assert hidden[-1]==6, 'hidden must end with 6'
    W_test  = torch.FloatTensor(W.query('election == @election').values).to(device)
    X_test  = torch.FloatTensor(X.query('election == @election').values).to(device)
    Y_test  = torch.FloatTensor(Y.query('election == @election').values).to(device)
    W_train = torch.FloatTensor(W.query('election != @election').values).to(device)
    X_train = torch.FloatTensor(X.query('election != @election').values).to(device)
    Y_train = torch.FloatTensor(Y.query('election != @election').values).to(device)

    model = Feedforward(
        input_size = X_train.shape[1],
        hidden_size = hidden,
        activation = activation,
    ).to(device)
    loss_fcn = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters())

    for _ in range(steps):
        optimizer.zero_grad()  # Clear gradient
        loss = loss_fcn(model(X_train, W_train), Y_train) # Compute train loss
        loss.backward()  # Backward propagation
        optimizer.step()  # Learn
        # Track RMSE here rather than via model.get_rmse(): that method looks
        # up the tensors and the loss as module-level names, but they exist
        # only as locals of this function.
        with torch.no_grad():
            model.rmse_train.append(np.sqrt(loss_fcn(model(X_train, W_train), Y_train).item()))
            model.rmse_test.append(np.sqrt(loss_fcn(model(X_test, W_test), Y_test).item()))
        recent = model.rmse_test[-lag:]
        if len(recent) >= lag and np.var(recent) / np.mean(recent) < tol:
            break

    model.election = election
    model.cnt_pred = model(X_test, W_test).detach().cpu().numpy().sum(axis=0).round().astype(int)
    model.cnt_true = Y_test.detach().cpu().numpy().sum(axis=0).round().astype(int)
    model.cnt_err  = model.cnt_pred - model.cnt_true
    model.pct_pred = 100.0 * model.cnt_pred / model.cnt_pred.sum()
    model.pct_true = 100.0 * model.cnt_true / model.cnt_true.sum()
    model.pct_err  = model.pct_pred - model.pct_true
    return model
    # result = {
    #     'cnt_pred':cnt_pred, 'cnt_true':cnt_true, 'cnt_err':cnt_err,
    #     'pct_pred':pct_pred, 'pct_true':pct_true, 'pct_err':pct_err, }


    # print(elec, Y_pred, Y_true, Y_err, result_true.round(2), result_pred.round(2), result_err.round(2))
    
    # err = model(X_test, W_test) - Y_test
    # err_pct = (err.sum(axis=0) / Y_test.sum(axis=0)) * 100
    # print(elec, err_pct.detach().cpu().numpy().round(2))

# Legacy driver: train one held-out model per election for each activation list.
# Models are stored by election label only, so a second activation list would overwrite the first.
res = dict()
for activation in [[torch.nn.ReLU, torch.nn.Sigmoid]]:
    for election in elections:
        model = voting_predictor(election, activation, hidden=(50, 6))
        res[election] = model
        print(model.election)
