## 26_extended_census data
## This notebook handles making an extended subset of Census data for use in modeling 

In [1]:
# imports
import pandas as pd
import numpy as np
import requests
import pickle as pkl
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [2]:
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

### Select desired columns from Census data for a subset
- Voting data contains 'srprec' or voting precints.
- Statewidedatabase.org provides conversion tables between precints and census tracts.

#### Bring in census DP03 data and select columns.

In [3]:
dp03_18 = pd.read_pickle('./census_data/DP03_clean.pkl')
dp03_18.shape

(583, 143)

In [71]:
dp03_18.head(3)

Unnamed: 0,Geographic Area Name,GEO_ID,Estimate EMPLOYMENT STATUS Population 16 years and over,Estimate EMPLOYMENT STATUS Population 16 years and over In labor force,Estimate EMPLOYMENT STATUS Population 16 years and over In labor force Civilian labor force,Estimate EMPLOYMENT STATUS Population 16 years and over In labor force Civilian labor force Employed,Estimate EMPLOYMENT STATUS Population 16 years and over In labor force Civilian labor force Unemployed,Estimate EMPLOYMENT STATUS Population 16 years and over In labor force Armed Forces,Estimate EMPLOYMENT STATUS Population 16 years and over Not in labor force,Estimate EMPLOYMENT STATUS Civilian labor force,...,Estimate PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE PAST 12 MONTHS IS BELOW THE POVERTY LEVEL All people Under 18 years Related children of the householder under 18 years Related children of the householder 5 to 17 years,Estimate PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE PAST 12 MONTHS IS BELOW THE POVERTY LEVEL All people 18 years and over,Estimate PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE PAST 12 MONTHS IS BELOW THE POVERTY LEVEL All people 18 years and over 18 to 64 years,Estimate PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE PAST 12 MONTHS IS BELOW THE POVERTY LEVEL All people 18 years and over 65 years and over,Estimate PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE PAST 12 MONTHS IS BELOW THE POVERTY LEVEL People in families,Estimate PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE PAST 12 MONTHS IS BELOW THE POVERTY LEVEL Unrelated individuals 15 years and over,Geographic Area Name.1,state,county,tract
1,"Census Tract 626.44, Orange County, California",1400000US06059062644,6767,3967,3967,3811,156,0,2800,3967,...,-888888888.0,-888888888.0,-888888888.0,-888888888.0,-888888888.0,-888888888.0,"Census Tract 626.44, Orange County, California",6,59,62644
2,"Census Tract 626.40, Orange County, California",1400000US06059062640,3062,2446,2436,2320,116,10,616,2436,...,-888888888.0,-888888888.0,-888888888.0,-888888888.0,-888888888.0,-888888888.0,"Census Tract 626.40, Orange County, California",6,59,62640
3,"Census Tract 630.08, Orange County, California",1400000US06059063008,833,431,431,412,19,0,402,431,...,-888888888.0,-888888888.0,-888888888.0,-888888888.0,-888888888.0,-888888888.0,"Census Tract 630.08, Orange County, California",6,59,63008


In [4]:
dp03_18['tract'] = dp03_18['tract'].astype('int64')

In [5]:
dp03_18['tract'].dtype

dtype('int64')

In [6]:
# to select columns, read a (very long) list of column names, and select which to keep.
# helpful to copy into a .py file named 'variables' for future reference

In [13]:
#uncomment in order to see full list of columns names

# dp03_18.columns.tolist()             

In [28]:
dp03_cols = [
             'GEO_ID',
             'tract',
             'Estimate EMPLOYMENT STATUS Population 16 years and over In labor force Civilian labor force Employed',
             'Estimate EMPLOYMENT STATUS Population 16 years and over In labor force Civilian labor force Unemployed',
             'Estimate EMPLOYMENT STATUS Population 16 years and over In labor force Armed Forces',
             'Estimate EMPLOYMENT STATUS Population 16 years and over Not in labor force',
             'Estimate EMPLOYMENT STATUS Females 16 years and over In labor force Civilian labor force Employed',
             'Estimate EMPLOYMENT STATUS Own children of the householder under 6 years All parents in family in labor force',
             'Estimate EMPLOYMENT STATUS Own children of the householder 6 to 17 years All parents in family in labor force',
             'Estimate OCCUPATION Civilian employed population 16 years and over Management, business, science, and arts occupations',
             'Estimate OCCUPATION Civilian employed population 16 years and over Service occupations',
             'Estimate OCCUPATION Civilian employed population 16 years and over Sales and office occupations',
             'Estimate OCCUPATION Civilian employed population 16 years and over Natural resources, construction, and maintenance occupations',
             'Estimate OCCUPATION Civilian employed population 16 years and over Production, transportation, and material moving occupations',
             'Estimate INCOME AND BENEFITS (IN 2018 INFLATION-ADJUSTED DOLLARS) Total households Median household income (dollars)',
             'Estimate HEALTH INSURANCE COVERAGE Civilian noninstitutionalized population With health insurance coverage With private health insurance',
             'Estimate HEALTH INSURANCE COVERAGE Civilian noninstitutionalized population With health insurance coverage With public coverage',
             'Estimate HEALTH INSURANCE COVERAGE Civilian noninstitutionalized population No health insurance coverage'
             ]

In [29]:
dp03_18sub = dp03_18[dp03_cols]

In [30]:
dp03_18sub.head(3)

Unnamed: 0,GEO_ID,tract,Estimate EMPLOYMENT STATUS Population 16 years and over In labor force Civilian labor force Employed,Estimate EMPLOYMENT STATUS Population 16 years and over In labor force Civilian labor force Unemployed,Estimate EMPLOYMENT STATUS Population 16 years and over In labor force Armed Forces,Estimate EMPLOYMENT STATUS Population 16 years and over Not in labor force,Estimate EMPLOYMENT STATUS Females 16 years and over In labor force Civilian labor force Employed,Estimate EMPLOYMENT STATUS Own children of the householder under 6 years All parents in family in labor force,Estimate EMPLOYMENT STATUS Own children of the householder 6 to 17 years All parents in family in labor force,"Estimate OCCUPATION Civilian employed population 16 years and over Management, business, science, and arts occupations",Estimate OCCUPATION Civilian employed population 16 years and over Service occupations,Estimate OCCUPATION Civilian employed population 16 years and over Sales and office occupations,"Estimate OCCUPATION Civilian employed population 16 years and over Natural resources, construction, and maintenance occupations","Estimate OCCUPATION Civilian employed population 16 years and over Production, transportation, and material moving occupations",Estimate INCOME AND BENEFITS (IN 2018 INFLATION-ADJUSTED DOLLARS) Total households Median household income (dollars),Estimate HEALTH INSURANCE COVERAGE Civilian noninstitutionalized population With health insurance coverage With private health insurance,Estimate HEALTH INSURANCE COVERAGE Civilian noninstitutionalized population With health insurance coverage With public coverage,Estimate HEALTH INSURANCE COVERAGE Civilian noninstitutionalized population No health insurance coverage
1,1400000US06059062644,62644,3811,156,0,2800,1652,133,862,2547,262,819,91,92,146953,7854,1504,245
2,1400000US06059062640,62640,2320,116,10,616,1234,122,447,1202,287,563,63,205,84632,2687,889,263
3,1400000US06059063008,63008,412,19,0,402,239,4,43,255,22,104,0,31,100396,668,396,0


In [31]:
dp03_18sub.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 583 entries, 1 to 583
Data columns (total 18 columns):
 #   Column                                                                                                                                    Non-Null Count  Dtype 
---  ------                                                                                                                                    --------------  ----- 
 0   GEO_ID                                                                                                                                    583 non-null    object
 1   tract                                                                                                                                     583 non-null    int64 
 2   Estimate EMPLOYMENT STATUS Population 16 years and over In labor force Civilian labor force Employed                                      583 non-null    object
 3   Estimate EMPLOYMENT STATUS Population 16 years and over In l

In [32]:
# change multiple columns to numeric

num_cols = ['Estimate EMPLOYMENT STATUS Population 16 years and over In labor force Civilian labor force Employed',
             'Estimate EMPLOYMENT STATUS Population 16 years and over In labor force Civilian labor force Unemployed',
             'Estimate EMPLOYMENT STATUS Population 16 years and over In labor force Armed Forces',
             'Estimate EMPLOYMENT STATUS Population 16 years and over Not in labor force',
             'Estimate EMPLOYMENT STATUS Females 16 years and over In labor force Civilian labor force Employed',
             'Estimate EMPLOYMENT STATUS Own children of the householder under 6 years All parents in family in labor force',
             'Estimate EMPLOYMENT STATUS Own children of the householder 6 to 17 years All parents in family in labor force',
             'Estimate OCCUPATION Civilian employed population 16 years and over Management, business, science, and arts occupations',
             'Estimate OCCUPATION Civilian employed population 16 years and over Service occupations',
             'Estimate OCCUPATION Civilian employed population 16 years and over Sales and office occupations',
             'Estimate OCCUPATION Civilian employed population 16 years and over Natural resources, construction, and maintenance occupations',
             'Estimate OCCUPATION Civilian employed population 16 years and over Production, transportation, and material moving occupations',
             'Estimate INCOME AND BENEFITS (IN 2018 INFLATION-ADJUSTED DOLLARS) Total households Median household income (dollars)',
             'Estimate HEALTH INSURANCE COVERAGE Civilian noninstitutionalized population With health insurance coverage With private health insurance',
             'Estimate HEALTH INSURANCE COVERAGE Civilian noninstitutionalized population With health insurance coverage With public coverage',
             'Estimate HEALTH INSURANCE COVERAGE Civilian noninstitutionalized population No health insurance coverage']

In [33]:
for col in num_cols:
    dp03_18sub[col] = pd.to_numeric(dp03_18sub[col])

# will generate a warning (not an error) about making a change a copy of a slide of a dataframe.  
# we will just proceed.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dp03_18sub[col] = pd.to_numeric(dp03_18sub[col])


In [34]:
dp03_18sub.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 583 entries, 1 to 583
Data columns (total 18 columns):
 #   Column                                                                                                                                    Non-Null Count  Dtype 
---  ------                                                                                                                                    --------------  ----- 
 0   GEO_ID                                                                                                                                    583 non-null    object
 1   tract                                                                                                                                     583 non-null    int64 
 2   Estimate EMPLOYMENT STATUS Population 16 years and over In labor force Civilian labor force Employed                                      583 non-null    int64 
 3   Estimate EMPLOYMENT STATUS Population 16 years and over In l

#### create a dictionary to rename columns for readability

In [35]:
col_names = {'Estimate EMPLOYMENT STATUS Population 16 years and over In labor force Civilian labor force Employed': 'employed',
             'Estimate EMPLOYMENT STATUS Population 16 years and over In labor force Civilian labor force Unemployed':'unemployed',
             'Estimate EMPLOYMENT STATUS Population 16 years and over In labor force Armed Forces':'empl_military',
             'Estimate EMPLOYMENT STATUS Population 16 years and over Not in labor force': 'not_inlaborforce',
             'Estimate EMPLOYMENT STATUS Females 16 years and over In labor force Civilian labor force Employed': 'working_women',
             'Estimate EMPLOYMENT STATUS Own children of the householder under 6 years All parents in family in labor force': 'parents_work_under6',
             'Estimate EMPLOYMENT STATUS Own children of the householder 6 to 17 years All parents in family in labor force': 'parents_work_0617',
             'Estimate OCCUPATION Civilian employed population 16 years and over Management, business, science, and arts occupations': 'occ_mgmt_sci_art',
             'Estimate OCCUPATION Civilian employed population 16 years and over Service occupations': 'occ_service_sector',
             'Estimate OCCUPATION Civilian employed population 16 years and over Sales and office occupations': 'occ_sales_gen_office',
             'Estimate OCCUPATION Civilian employed population 16 years and over Natural resources, construction, and maintenance occupations': 'occ_constr_maintc',
             'Estimate OCCUPATION Civilian employed population 16 years and over Production, transportation, and material moving occupations': 'occ_manuf_transpo',
             'Estimate INCOME AND BENEFITS (IN 2018 INFLATION-ADJUSTED DOLLARS) Total households Median household income (dollars)': 'hh_med_income',
             'Estimate HEALTH INSURANCE COVERAGE Civilian noninstitutionalized population With health insurance coverage With private health insurance': 'hlthins_priv',
             'Estimate HEALTH INSURANCE COVERAGE Civilian noninstitutionalized population With health insurance coverage With public coverage': 'hlthins_public',
             'Estimate HEALTH INSURANCE COVERAGE Civilian noninstitutionalized population No health insurance coverage': 'hlthins_none'}

In [39]:
# df.rename(columns={"A": "a", "B": "c"})
dp03_18sub.rename(columns=col_names, inplace=True)

#will generate a warning (not an error) about "set on a copy of a slice from a DataFrame".  we will continue.  

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [40]:
dp03_18sub.head(3)

Unnamed: 0,GEO_ID,tract,employed,unemployed,empl_military,not_inlaborforce,working_women,parents_work_under6,parents_work_0617,occ_mgmt_sci_art,occ_service_sector,occ_sales_gen_office,occ_constr_maintc,occ_manuf_transpo,hh_med_income,hlthins_priv,hlthins_public,hlthins_none
1,1400000US06059062644,62644,3811,156,0,2800,1652,133,862,2547,262,819,91,92,146953,7854,1504,245
2,1400000US06059062640,62640,2320,116,10,616,1234,122,447,1202,287,563,63,205,84632,2687,889,263
3,1400000US06059063008,63008,412,19,0,402,239,4,43,255,22,104,0,31,100396,668,396,0


In [41]:
pd.set_option('display.max_rows', 10)
dp03_18sub['tract'].value_counts()

88702     1
21822     1
21802     1
52521     1
32040     1
         ..
86601     1
110109    1
99906     1
1601      1
52506     1
Name: tract, Length: 583, dtype: int64

In [42]:
#save subset of Census data to pickle file for enhancing voter data, and for modeling
dp03_18sub.to_pickle('./census_data/DP03_extended.pkl')