In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import savReaderWriter

In [2]:
# Read every case (row) of the Pew survey out of the SPSS .sav file.
survey_sav_path = 'Dataset - 2017 Pew Research Center Science and News Survey (2017).sav'

with savReaderWriter.SavReader(survey_sav_path) as reader:
    data = reader.all()

In [3]:
# Read the .sav header and pull out the variable (column) names.
with savReaderWriter.SavHeaderReader('Dataset - 2017 Pew Research Center Science and News Survey (2017).sav') as header:
    metadata = header.all()

# The names come back as bytes; decode each one to a utf-8 str.
# Slice assignment keeps `variables` and `metadata.varNames` the same list object,
# so both end up holding the decoded names.
variables = metadata.varNames
variables[:] = [raw_name.decode('utf-8') for raw_name in variables]

In [4]:
# Build the survey dataframe with the decoded variable names as column labels.
pew_df = pd.DataFrame(data, columns=variables)

# Keep only respondents whose PARTY code is neither -1 nor 4, i.e. those who
# identify as Rep, Dem, or Independent (this removes 296 respondents).
excluded_party_codes = [-1, 4]
party_df = pew_df.loc[~pew_df['PARTY'].isin(excluded_party_codes)]

Find n, age, education, race/ethnicity, gender, income, marital status, and employment by party for the demographic table.

In [7]:
# Number of respondents (n) in each party
party_df.groupby(by='PARTY').size()

PARTY
1.0    1158
2.0    1372
3.0    1198
dtype: int64

Get counts for age groups in each party

In [82]:
# PARTY codes:    1 = Rep, 2 = Dem, 3 = Ind
# ppagect4 codes: 1 = 18-29, 2 = 30-44, 3 = 45-59, 4 = 60+
party_df.groupby(by=['PARTY', 'ppagect4']).size()

PARTY  ppagect4
1.0    1.0         125
       2.0         244
       3.0         383
       4.0         406
2.0    1.0         223
       2.0         297
       3.0         402
       4.0         450
3.0    1.0         201
       2.0         271
       3.0         361
       4.0         365
dtype: int64

Get counts for education groups in each party

In [8]:
# PPEDUCAT codes: 1 = less than HS, 2 = HS, 3 = some college, 4 = Bachelors or higher
party_df.groupby(by=['PARTY', 'PPEDUCAT']).size()

PARTY  PPEDUCAT
1.0    1.0          51
       2.0         357
       3.0         336
       4.0         414
2.0    1.0         120
       2.0         357
       3.0         365
       4.0         530
3.0    1.0          97
       2.0         316
       3.0         357
       4.0         428
dtype: int64

Get counts for race/ethnicity groups in each party

In [9]:
# PPETHM codes: 1 = white, non-Hispanic; 2 = Black, non-Hispanic; 3 = Other, non-Hispanic;
#               4 = Hispanic; 5 = 2+races, non-Hispanic
party_df.groupby(by=['PARTY', 'PPETHM']).size()

PARTY  PPETHM
1.0    1.0       1021
       2.0          9
       3.0         28
       4.0         63
       5.0         37
2.0    1.0        737
       2.0        279
       3.0         73
       4.0        233
       5.0         50
3.0    1.0        892
       2.0         87
       3.0         57
       4.0        110
       5.0         52
dtype: int64

Get counts for gender in each party

In [10]:
# PPGENDER codes: 1 = Male, 2 = Female
party_df.groupby(by=['PARTY', 'PPGENDER']).size()

PARTY  PPGENDER
1.0    1.0         586
       2.0         572
2.0    1.0         620
       2.0         752
3.0    1.0         656
       2.0         542
dtype: int64

Get counts for income groups per party. The original data was given in 21 different income brackets; I converted them into 4 income brackets for easier reporting and interpretation.

In [8]:
# Collapse the 21 PPINCIMP income codes into 4 reporting brackets and count
# respondents per party in each bracket. See codebook for categories.
income_brackets = ['<$30,000', '$30,000-$49,999', '$50,000-$99,999', '$100,000+']
party_names = {1: 'Republican', 2: 'Democrat', 3: 'Independent'}

# First and last PPINCIMP code (inclusive) folded into each reporting bracket.
# These are the same groupings the positional slices [0:9], [9:13], [13:16], [16:]
# produced when all 21 codes are present: codes 1-9, 10-13, 14-16 and 17-21.
# NOTE(review): confirm against the codebook that these cut points line up with
# the dollar ranges named in `income_brackets`.
income_code_ranges = [(1, 9), (10, 13), (14, 16), (17, 21)]

# Party x income-code table of counts. Reindexing by label guarantees that all
# 3 parties and all 21 codes are present (zero-filled when unobserved), so the
# bracket sums below never depend on the position of a row in the groupby result.
# A (party, code) pair with no respondents can therefore no longer shift later
# counts into the wrong party or bracket.
income_counts21 = (
    party_df.groupby(['PARTY', 'PPINCIMP']).size()
    .unstack(fill_value=0)
    .reindex(index=list(party_names), columns=list(range(1, 22)), fill_value=0)
)

# Sum the per-code counts inside each bracket; .loc label slices are inclusive.
income_df = pd.DataFrame(
    {
        bracket: income_counts21.loc[:, first_code:last_code].sum(axis=1)
        for bracket, (first_code, last_code) in zip(income_brackets, income_code_ranges)
    },
    columns=income_brackets,
)
income_df.index = [party_names[party_code] for party_code in income_counts21.index]
income_df

Unnamed: 0,"<$30,000","$30,000-$49,999","$50,000-$99,999","$100,000+"
Republican,170,350,355,283
Democrat,340,377,362,293
Independent,303,314,301,280
