In [41]:
import numpy as np
import pandas as pd

from mlgear.utils import show, display_columns
from surveyweights import run_weighting_iteration, run_weighting_scheme, normalize_weights
from survey_dud_detector import detect_straightlining, detect_low_incidence


def is_valid_prolific_id(pid):
    if len(pid) != 24:
        return False
    try:
        pid = int(pid, 16)
        return True
    except:
        return False


def sjoin(x):
    return ';'.join(x[x.notnull()].astype(str))


def transform_age(age):
    if age < 18:
        return 'Under 18'
    if age < 24:
        return '18-24'
    if age < 45:
        return '25-44'
    if age < 64:
        return '45-64'
    return '65+'


def transform_income(inc):
    if inc == 'Between $30,000 and $49,999' or inc == 'Between $15,000 and $29,999':
        return 'Between $15,000 and $49,999'
    else:
        return inc


def transform_education(educ):
    if educ in ['1st grade', '2nd grade', '3rd grade', '4th grade', '5th grade', '6th grade', '7th grade',
                '8th grade', '9th grade', '10th grade', '11th grade', 'Did not attend school']: 
        return 'Less than high school'
    if educ in ['1 year of college', '2 years of college', '3 years of college']:
        return 'Some college, no degree'
    if educ == 'Some graduate school':
        return 'Graduated from college'
    return educ


def transform_race(race):
    if race in ['Another race', 'American Indian or Alaska Native', 'Native Hawaiian or other Pacific Islander']:
        return 'Other'
    else:
        return race


def transform_region(state):
    if state in ['Illinois', 'Indiana', 'Iowa', 'Michigan', 'Minnesota', 'Ohio', 'Pennsylvania', 'Wisconsin']:
        return 'Midwest'
    elif state in ['Alaska', 'Idaho', 'Kansas', 'Montana', 'Nebraska', 'North Dakota', 'South Dakota',
                   'Utah', 'Wyoming', 'Oklahoma']:
        return 'Mountains'
    elif state in ['Connecticut', 'Delaware', 'District of Columbia (DC)', 'Maine', 'Maryland', 'Massachusetts',
                   'New Hampshire', 'New Jersey', 'New York', 'Rhode Island', 'Vermont']:
        return 'Northeast'
    elif state in ['California', 'Hawaii', 'Oregon', 'Washington', 'Guam', 'Puerto Rico', 'Virgin Islands']:
        return 'Pacific'
    elif state in ['Missouri', 'Tennessee', 'Alabama', 'Arkansas', 'Kentucky', 'Louisiana', 'Mississippi',
                   'Texas', 'Virginia', 'West Virginia']:
        return 'South'
    elif state in ['Arizona', 'Colorado', 'Nevada', 'New Mexico']:
        return 'Southwest'
    elif state in ['Florida', 'Georgia', 'North Carolina', 'South Carolina']:
        return 'Southeast'
    else:
        return 'Other'


def transform_2016_vote(vote):
    vote = vote.split(',')[0]
    if vote in ['Gary Johnson', 'Jill Stein', 'Another candidate']:
        vote = 'Other'
    return vote

def transform_2020_vote(vote):
    vote = vote.split(',')[0]
    if vote in ['Another candidate']:
        vote = 'Other'
    return vote


def transform_gss_trust(trust):
    if trust == 'You can’t be too careful':
        return 'Can\'t be too careful'
    elif trust == 'Most people can be trusted':
        return 'Can trust'
    else:
        return trust
    

def transform_gss_bible(bible):
    if bible == 'The Bible is an ancient book of fables, legends, history, and moral precepts recorded by man':
        return 'Book of fables'
    elif bible == 'The Bible is the actual word of God and it is to be taken literally, word for word':
        return 'Word of God'
    elif bible == 'The Bible is the inspired word of God but not everything should be taken literally, word for word':
        return 'Inspired word'
    else:
        return bible
    
    
def simplify_likert(likert):
    likert = likert.replace('’', '\'')
    if likert == 'Strongly agree':
        return 'Agree'
    elif likert == 'Strongly disagree':
        return 'Disagree'
    elif likert == 'Neither agree or disagree':
        return 'Don\'t know'
    elif likert == 'Neither agree nor disagree':
        return 'Don\'t know'
    else:
        return likert
    
    
    

In [42]:
variable_map = {'prolific_id': 'What is your Prolific ID?',
                'biden_approval': 'Do you approve or disapprove of the way Joe Biden is handling his job as President?',
                'vote2016': 'In the 2016 Presidential election, who did you vote for?',
                'vote2020': 'In the 2020 Presidential election, who did you vote for?',                             
                'gss_trust': 'Generally speaking, would you say that most people can be trusted or that you can\'t be too careful in dealing with people?',
                'gss_bible': 'Which of these statements comes closest to describing your feelings about the Bible?',
                'gss_spanking': 'How much do you agree or disagree with the following? “It is sometimes necessary to discipline a child with a good, hard spanking.”', # It is sometimes necessary to discipline a child with a good, hard spanking
                'social_none': 'Which of these social media networks do you use? (check all that apply)', # None of the above`
                'social_fb': 'Unnamed: 48', # Facebook
                'social_twitter': 'Unnamed: 49', # Twitter
                'social_instagram': 'Unnamed: 50', # Instagram
                'social_tiktok': 'Unnamed: 51', # TikTok
                'social_pinterest': 'Unnamed: 52', # Pinterest
                'social_attention_check': 'Unnamed: 53', # Yapyap
                'gender': 'What is your gender?',
                'birth_year': 'In what year were you born? Please write your answer as 4 digits only',
                'race': 'What is your race/ethnicity?',
                'education': 'What is the highest level of education you have completed?',
                'income': 'What is your annual income?',
                'urban_rural': 'Which of the following best describes the area in which you live?',
                'state': 'In what state or U.S. territory do you live?',
                'honesty': 'How honestly have you answered these questions? People depend on the honesty of your answers - if you admit to being dishonest, you will still be paid.'}

survey = pd.read_csv('farm_poll_2021_Raw.csv')
survey.columns = [c.replace('\xa0', '') for c in survey.columns]
variable_map = {v: k for k, v in variable_map.items()}
survey = survey[variable_map.keys()].rename(variable_map, axis=1)
survey = survey.drop(0)
None


In [43]:
print('Processing age...')
survey['birth_year'] = survey['birth_year'].astype(float)
survey['age'] = (2020 - survey['birth_year'].fillna(2020)).astype(float)
survey = survey.drop('birth_year', axis=1)
survey['age'] = survey['age'].apply(transform_age)
survey = survey[survey['age'] != 'Under 18']
None


Processing age...


In [44]:
#Data Quality

survey['valid_id'] = survey['prolific_id'].apply(is_valid_prolific_id)
survey['valid_id'].value_counts()


True     1985
False       9
Name: valid_id, dtype: int64

In [45]:
survey['prolific_id'].duplicated().sum()

2

In [46]:
survey['honesty'].value_counts()

Completely honestly    1691
Very honestly           277
Somewhat honestly        23
Not honestly at all       3
Name: honesty, dtype: int64

In [47]:
demographics = ['gender', 'race', 'education', 'urban_rural', 'income', 'age',
                'vote2016', 'vote2020', 'gss_trust', 'gss_bible', 'gss_spanking']

low_incidence_counts = detect_low_incidence(survey[demographics],
                                            low_incidence_threshold=0.04)
survey = survey.reset_index(drop=True)
survey['meta_low_incidence_count'] = low_incidence_counts.reset_index(drop=True)
low_incidence_counts.value_counts()

1614.624086    1631
128.505544      120
82.561168        77
62.230303        63
54.406721        38
22.163117        22
4.330151         13
2.781999          7
6.570921          7
4.952818          3
3.182045          3
2.096926          2
0.166891          2
0.746813          2
1.133275          1
1.763930          1
0.253254          1
0.038187          1
dtype: int64

In [48]:
(low_incidence_counts < 0.5).value_counts()

False    1990
True        4
dtype: int64

In [49]:
#More processing
print('Processing income...')
survey['income'] = survey['income'].astype(str).apply(transform_income)

print('Processing education...')
survey['education'] = survey['education'].astype(str).apply(transform_education)

for c in survey.columns:
    if  c.startswith('social_') :
        print('Processing {}...'.format(c))
        survey[c] = survey[c].apply(lambda x: isinstance(x, str))

for c in survey.columns:
    if c.startswith('social_'):
        survey[c] = survey[c].astype(bool)
    elif not c.startswith('meta_'):
        survey[c] = survey[c].astype(str)
        
survey['valid_id'] = survey['valid_id'].astype(bool)

print('Processing race...')
survey['race'] = survey['race'].apply(transform_race)

print('Processing gender...')
survey['gender'] = survey['gender'].fillna('Other')

print('Processing region...')
survey['region'] = survey['state'].apply(transform_region)

print('Processing 2016 vote...')
survey['vote2016'] = survey['vote2016'].apply(transform_2016_vote)

print('Processing 2020 vote...')
survey['vote2020'] = survey['vote2020'].apply(transform_2020_vote)

print('Processing GSS trust...')
survey['gss_trust'] = survey['gss_trust'].apply(transform_gss_trust)

print('Processing GSS Bible...')
survey['gss_bible'] = survey['gss_bible'].apply(transform_gss_bible)

print('Processing GSS Spanking...')
survey['gss_spanking'] = survey['gss_spanking'].apply(simplify_likert)

print('Processing voted...')
survey['voted2016'] = (survey['vote2016'] != 'Did not vote')

print('Processing voted...')
survey['voted2020'] = (survey['vote2020'] != 'Did not vote')

print('Processing noncollege white...')
survey['race_white'] = (survey['race'] == 'White or Caucasian')
survey['college'] = (~survey['education'].isin(['Graduated from high school',
                                                'Less than high school']))
survey['noncollege_white'] = (survey['race_white'] & ~survey['college'])
None

Processing income...
Processing education...
Processing social_none...
Processing social_fb...
Processing social_twitter...
Processing social_instagram...
Processing social_tiktok...
Processing social_pinterest...
Processing social_attention_check...
Processing race...
Processing gender...
Processing region...
Processing 2016 vote...
Processing 2020 vote...
Processing GSS trust...
Processing GSS Bible...
Processing GSS Spanking...
Processing voted...
Processing voted...
Processing noncollege white...


In [50]:
survey['social_attention_check'].value_counts()

False    1992
True        2
Name: social_attention_check, dtype: int64

In [51]:
print('Initial survey... N={}'.format(len(survey)))

survey = survey.reset_index(drop=True)
survey = survey[survey['valid_id']]
survey = survey.drop('valid_id', axis=1)
print('Dropping invalid ID... N={}'.format(len(survey)))

survey = survey.drop_duplicates('prolific_id', keep='last')
print('Dropping duplicate ID... N={}'.format(len(survey)))

survey = survey[~survey['honesty'].isin(['Somewhat honestly', 'Not honestly at all'])]
survey = survey.drop('honesty', axis=1)
print('Dropping dishonest... N={}'.format(len(survey)))

survey = survey[~survey['social_attention_check']]
survey = survey.drop('social_attention_check', axis=1)
print('Dropping failed news attention check... N={}'.format(len(survey)))

survey = survey[survey['meta_low_incidence_count'] > 0.5]
survey = survey.drop('meta_low_incidence_count', axis=1)
print('Dropping multiple low incidence... N={}'.format(len(survey)))


Initial survey... N=1994
Dropping invalid ID... N=1994
Dropping duplicate ID... N=1992
Dropping dishonest... N=1967
Dropping failed news attention check... N=1966
Dropping multiple low incidence... N=1962


In [52]:
survey = survey.drop('prolific_id', axis=1)
show(survey[sorted(list(survey.columns))])

        age                  biden_approval  college  \
0     18-24  Neither approve nor disapprove     True   
1     45-64             Strongly disapprove     True   
2     25-44             Somewhat disapprove     True   
3     25-44                Somewhat approve     True   
4     25-44                Somewhat approve     True   
...     ...                             ...      ...   
1989  45-64                         Approve     True   
1990  25-44             Somewhat disapprove     True   
1991  25-44                Somewhat approve     True   
1992  25-44                Strongly approve     True   
1993  25-44  Neither approve nor disapprove     True   

                      education  gender       gss_bible       gss_spanking  \
0       Some college, no degree  Female    I don’t know           Disagree   
1        Graduated from college  Female  Book of fables     Somewhat agree   
2        Graduated from college  Female   Inspired word           Disagree   
3        Gradua

In [53]:
display_columns(survey)

## age ##
25-44    56.523955
18-24    22.018349
45-64    17.329256
65+       4.128440
Name: age, dtype: float64
-
-
## biden_approval ##
Approve                           24.821611
Somewhat approve                  22.935780
Neither approve nor disapprove    15.596330
Strongly approve                  13.404689
Strongly disapprove                9.531091
Somewhat disapprove                7.951070
Disapprove                         5.759429
Name: biden_approval, dtype: float64
-
-
## college ##
True     85.015291
False    14.984709
Name: college, dtype: float64
-
-
## education ##
Graduated from college        37.716616
Some college, no degree       30.835882
Completed graduate school     16.462793
Graduated from high school    13.557594
Less than high school          1.427115
Name: education, dtype: float64
-
-
## gender ##
Male      49.847095
Female    48.827727
Other      1.325178
Name: gender, dtype: float64
-
-
## gss_bible ##
Book of fables    53.516820
Inspired word     31.39653

In [54]:
survey.to_csv('responses_processed.csv', index=False)