In [1]:
#Libraries
import pandas as pd
import numpy as np
import scipy.stats as stat

from mlgear.utils import show, display_columns
from surveyweights import run_weighting_iteration, run_weighting_scheme, normalize_weights, get_census

In [2]:
# Load the processed survey responses from CSV and preview the frame
survey = pd.read_csv(filepath_or_buffer='responses_processed.csv')
show(survey)

                      biden_approval         vote2016      vote2020  \
0     Neither approve nor disapprove  Hillary Clinton  Did not vote   
1                Strongly disapprove  Hillary Clinton  Donald Trump   
2                Somewhat disapprove            Other  Donald Trump   
3                   Somewhat approve     Did not vote  Did not vote   
4                   Somewhat approve            Other     Joe Biden   
...                              ...              ...           ...   
1957                         Approve     Did not vote     Joe Biden   
1958             Somewhat disapprove     Donald Trump  Did not vote   
1959                Somewhat approve  Hillary Clinton     Joe Biden   
1960                Strongly approve  Hillary Clinton     Joe Biden   
1961  Neither approve nor disapprove     Donald Trump  Did not vote   

                 gss_trust       gss_bible       gss_spanking  social_none  \
0     Can't be too careful    I don’t know           Disagree        

In [3]:
# US national weighting: columns to weight on, in the order they are passed
demographics = [
    'gender', 'race', 'education', 'urban_rural',
    'income', 'age', 'vote2016', 'region',
    'gss_trust', 'gss_bible', 'gss_spanking', 'vote2020',
]
# Kept as the cell's final expression so the returned dict (it includes
# 'errors', 'error_table' and 'weights') is shown as the cell output.
run_weighting_iteration(
    survey,
    census='US',
    weigh_on=demographics,
)

## gender ##
Female    1.262117
Male      1.187538
Other     0.550345
dtype: float64
ERROR: 0.22742666025407496
-
-
## race ##
Asian or Asian American      0.298301
Black or African American    1.681557
Hispanic or Latino           1.560691
Other                        0.571902
White or Caucasian           0.887550
dtype: float64
ERROR: 0.30526383083425235
-
-
## education ##
Completed graduate school     0.312206
Graduated from college        0.240856
Graduated from high school    0.891720
Less than high school         3.170770
Some college, no degree       0.384448
dtype: float64
ERROR: 0.6350241456260558
-
-
## urban_rural ##
Rural       1.070730
Suburban    0.982412
Urban       0.946858
dtype: float64
ERROR: 0.03584808493564909
-
-
## income ##
Between $100,000 and $150,000    1.395949
Between $15,000 and $49,999      0.599314
Between $50,000 and $74,999      0.621489
Between $75,000 and $99,999      0.722263
Over $150,000                    2.391797
Under $15,000                  

{'errors': [0.22742666025407496,
  0.30526383083425235,
  0.6350241456260558,
  0.03584808493564909,
  0.5085835398134239,
  0.5915827100111385,
  0.24484353028096892,
  0.137049632511535,
  0.06926352400616165,
  0.5707191021526505,
  0.6022492482819521,
  0.4719012303091271],
 'error_table': {'education': 2.170770496214128,
  'gss_spanking': 1.436519022785272,
  'income': 1.3917965098241192,
  'age': 1.2524038385590992,
  'gss_bible': 1.2460815113416146,
  'vote2020': 1.1898089057552488,
  'vote2016': 0.7167553796622541,
  'race': 0.7016991939151622,
  'region': 0.5827564100118945,
  'gender': 0.4496548332096322,
  'gss_trust': 0.0813920329660821,
  'urban_rural': 0.07072998252797591},
 'weights': {'gender': Female    1.262117
  Male      1.187538
  Other     0.550345
  dtype: float64,
  'race': Asian or Asian American      0.298301
  Black or African American    1.681557
  Hispanic or Latino           1.560691
  Other                        0.571902
  White or Caucasian           0.

In [4]:
# Run the weighting scheme with iters=50 and early termination switched off;
# verbose=1 prints one "ITER ..." progress line per step.
output = run_weighting_scheme(
    survey,
    weigh_on=demographics,
    census='US',
    iters=50,
    early_terminate=False,
    verbose=1,
)

ITER 1/1 - initialization - ERROR 50
ITER 2/50 - weight gender - ERROR 4.399755239016989
ITER 3/50 - weight race - ERROR 4.156310629227998
ITER 4/50 - weight education - ERROR 3.8225674887367425
ITER 5/50 - weight urban_rural - ERROR 3.211688007695111
ITER 6/50 - weight income - ERROR 3.1857820467082707
ITER 7/50 - weight age - ERROR 3.0581367450858155
ITER 8/50 - weight vote2016 - ERROR 3.0949712277250168
ITER 9/50 - weight region - ERROR 2.369019488317725
ITER 10/50 - weight gss_trust - ERROR 2.098156979659262
ITER 11/50 - weight gss_bible - ERROR 1.9869737038634732
ITER 12/50 - weight gss_spanking - ERROR 1.8443236360561084
ITER 13/50 - weight vote2020 - ERROR 1.9653052940251794
ITER 14/50 - weight gender - ERROR 1.0318631673688972
ITER 15/50 - weight education - ERROR 1.1488964530961285
ITER 16/50 - weight region - ERROR 0.9501368132996025
ITER 17/50 - weight age - ERROR 0.9362855846672701
ITER 18/50 - weight vote2016 - ERROR 0.9188743046721225
ITER 19/50 - weight income - ERROR 0.

In [5]:
# Continue with the frame returned by the weighting scheme, then run one more
# iteration against the same columns; the result is bound to `_` so the cell
# does not display it.
survey = output['final_df']
_ = run_weighting_iteration(
    survey,
    census='US',
    weigh_on=demographics,
)

## gender ##
Female    0.990695
Male      0.995048
Other     1.014257
dtype: float64
ERROR: 0.0072005509355196305
-
-
## race ##
Asian or Asian American      1.002342
Black or African American    1.006710
Hispanic or Latino           0.994187
Other                        1.019899
White or Caucasian           0.976862
dtype: float64
ERROR: 0.016706727137372698
-
-
## education ##
Completed graduate school     0.985830
Graduated from college        0.985131
Graduated from high school    0.999282
Less than high school         1.036789
Some college, no degree       0.992968
dtype: float64
ERROR: 0.010731578668145064
-
-
## urban_rural ##
Rural       0.967166
Suburban    0.996483
Urban       1.036351
dtype: float64
ERROR: 0.017841991058979566
-
-
## income ##
Between $100,000 and $150,000    1.023974
Between $15,000 and $49,999      0.972348
Between $50,000 and $74,999      0.992528
Between $75,000 and $99,999      0.987086
Over $150,000                    1.033103
Under $15,000            

In [6]:
# Write the nationally weighted responses to CSV without the row index
survey.to_csv(
    path_or_buf='responses_processed_national_weighted.csv',
    index=False,
)

In [7]:
# Alternate weighting: the national columns plus Twitter / Facebook usage
census = get_census()
survey_ = survey.copy()

# Target population shares for the two social-media columns. Each mapping has
# to sum to 1; the Twitter non-user share is 0.78 (a value of 0.8 next to 0.22
# would total 1.02).
social_targets = {
    'social_twitter': {'False': 0.78, 'True': 0.22},
    'social_fb': {'False': 0.31, 'True': 0.69},
}
for column, shares in social_targets.items():
    assert abs(sum(shares.values()) - 1.0) < 1e-9, \
        '{} target shares must sum to 1'.format(column)
    # Cast to str so the column values line up with the 'False'/'True' keys
    survey_[column] = survey_[column].astype(str)
    census[column] = shares

# NOTE(review): `census` is mutated above, but the scheme below is given the
# string 'US' rather than this dict - presumably get_census() hands back the
# same table that 'US' resolves to; confirm against surveyweights.
demographics = ['gender', 'race', 'education', 'urban_rural', 'income', 'age', 'vote2016', 'vote2020', 'region',
                'gss_trust', 'gss_bible', 'gss_spanking', 'social_twitter', 'social_fb']
output = run_weighting_scheme(survey_, iters=50, weigh_on=demographics, census='US', verbose=1, early_terminate=False)

# Store the alternate weights next to the national 'weight' column
survey['weight_alt'] = output['final_weights']

ITER 1/1 - initialization - ERROR 50
ITER 2/50 - weight gender - ERROR 5.064258115955265
ITER 3/50 - weight race - ERROR 4.8093054965552815
ITER 4/50 - weight education - ERROR 4.500683075370874
ITER 5/50 - weight urban_rural - ERROR 3.9716387754705287
ITER 6/50 - weight income - ERROR 3.9534352095566914
ITER 7/50 - weight age - ERROR 3.7105003440269124
ITER 8/50 - weight vote2016 - ERROR 3.7552308011049966
ITER 9/50 - weight vote2020 - ERROR 2.983848975229699
ITER 10/50 - weight region - ERROR 3.011614082198449
ITER 11/50 - weight gss_trust - ERROR 2.7509248393123067
ITER 12/50 - weight gss_bible - ERROR 2.6789755524622647
ITER 13/50 - weight gss_spanking - ERROR 2.6568174309591277
ITER 14/50 - weight social_twitter - ERROR 2.8286069741760134
ITER 15/50 - weight social_fb - ERROR 2.751072173368966
ITER 16/50 - weight gender - ERROR 1.9497677965138178
ITER 17/50 - weight vote2016 - ERROR 1.2019797047393133
ITER 18/50 - weight education - ERROR 1.4436133399271402
ITER 19/50 - weight inc

In [8]:
# Analysis of weights: for every survey column, print the unweighted category
# shares followed by the shares under each weight column (all in percent).

def weighted_share(frame, column, weight_column):
    """Return the weighted percentage of rows falling in each category.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data holding both `column` and `weight_column`.
    column : str
        Categorical column to tabulate.
    weight_column : str
        Column of numeric row weights.

    Returns
    -------
    pandas.Series
        Indexed by category; each value is that category's summed weight as a
        percentage of the total weight, so the result adds up to 100 whether
        or not the weights average to 1.
    """
    group_totals = frame.groupby(column)[weight_column].sum()
    return group_totals / frame[weight_column].sum() * 100


# Missing answers get an explicit label so they appear as their own category
survey_ = survey.fillna('NA - Not presented')

# weight column -> heading printed above its table
weight_labels = {'weight': 'weighted', 'weight_alt': 'alt weight'}

for var in sorted(survey.columns):
    if var in weight_labels:
        # Tabulating a weight column against itself would print one row per
        # distinct weight value, which is not a meaningful breakdown.
        continue
    print('## {} ##'.format(var))
    print('unweighted')
    print(survey_[var].value_counts(normalize=True).sort_index() * 100)
    print('-')
    for weight_column, label in weight_labels.items():
        print(label)
        print(weighted_share(survey_, var, weight_column))
        print('-')
    print('-')
    print('-')

## age ##
unweighted
18-24    22.018349
25-44    56.523955
45-64    17.329256
65+       4.128440
Name: age, dtype: float64
-
weighted
18-24    13.473369
25-44    35.069725
45-64    34.580525
65+      16.876381
dtype: float64
-
alt weight
18-24    13.516114
25-44    35.151103
45-64    34.738288
65+      16.594495
dtype: float64
-
-
-
## biden_approval ##
unweighted
Approve                           24.821611
Disapprove                         5.759429
Neither approve nor disapprove    15.596330
Somewhat approve                  22.935780
Somewhat disapprove                7.951070
Strongly approve                  13.404689
Strongly disapprove                9.531091
Name: biden_approval, dtype: float64
-
weighted
Approve                           19.975830
Disapprove                         5.378945
Neither approve nor disapprove    16.044088
Somewhat approve                  12.118302
Somewhat disapprove                6.449851
Strongly approve                  15.251213
Strongly disa

Name: vote2016, dtype: float64
-
weighted
Did not vote       36.560783
Donald Trump       31.260257
Hillary Clinton    28.741067
Other               3.437893
dtype: float64
-
alt weight
Did not vote       36.425863
Donald Trump       29.258220
Hillary Clinton    30.687313
Other               3.628604
dtype: float64
-
-
-
## vote2020 ##
unweighted
Did not vote    18.093782
Donald Trump    15.902141
Joe Biden       61.264016
Other            4.740061
Name: vote2020, dtype: float64
-
weighted
Did not vote    18.093782
Donald Trump    38.414016
Joe Biden       42.017890
Other            1.474312
dtype: float64
-
alt weight
Did not vote    17.954100
Donald Trump    36.923473
Joe Biden       43.639233
Other            1.483194
dtype: float64
-
-
-
## voted2016 ##
unweighted
False    36.697248
Name: voted2016, dtype: float64
-
weighted
False    36.560783
True     63.439217
dtype: float64
-
alt weight
False    36.425863
True     63.574137
dtype: float64
-
-
-
## voted2020 ##
unweighted
False  