In [95]:
import numpy as np
import pandas as pd
import janitor

Data on nativity and citizenship by county, from ACS

In [96]:

nat_cit = pd.read_csv('nativity_citizenship_county.csv', skiprows=1)

In [128]:
nat_cit_clean = nat_cit.clean_names()
nat_cit_clean.head()

Unnamed: 0,geography,geographic_area_name,estimate!!total_,margin_of_error!!total_,estimate!!total_!!u_s_citizen_born_in_the_united_states,margin_of_error!!total_!!u_s_citizen_born_in_the_united_states,estimate!!total_!!u_s_citizen_born_in_puerto_rico_or_u_s_island_areas,margin_of_error!!total_!!u_s_citizen_born_in_puerto_rico_or_u_s_island_areas,estimate!!total_!!u_s_citizen_born_abroad_of_american_parent_s_,margin_of_error!!total_!!u_s_citizen_born_abroad_of_american_parent_s_,estimate!!total_!!u_s_citizen_by_naturalization,margin_of_error!!total_!!u_s_citizen_by_naturalization,estimate!!total_!!not_a_u_s_citizen,margin_of_error!!total_!!not_a_u_s_citizen,unnamed_14
0,0500000US01003,"Baldwin County, Alabama",261608.0,*****,247172.0,3402.0,163.0,247.0,1496.0,742.0,8219.0,1766.0,4558.0,2610.0,
1,0500000US01015,"Calhoun County, Alabama",116427.0,*****,109548.0,1559.0,269.0,283.0,1722.0,692.0,2608.0,1062.0,2280.0,970.0,
2,0500000US01043,"Cullman County, Alabama",92604.0,*****,90390.0,725.0,0.0,213.0,317.0,308.0,584.0,481.0,1313.0,691.0,
3,0500000US01049,"DeKalb County, Alabama",73122.0,*****,67142.0,2013.0,0.0,213.0,154.0,144.0,699.0,612.0,5127.0,1999.0,
4,0500000US01051,"Elmore County, Alabama",91042.0,*****,89124.0,986.0,109.0,104.0,548.0,503.0,492.0,397.0,769.0,731.0,


In [129]:
nat_cit_clean.drop('unnamed_14', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nat_cit_clean.drop('unnamed_14', axis=1, inplace=True)


What I want to do with this:
- Clean this data
- Merge with county-level shapefile
- Merge with AOR shapefile
- Merge with encounters data



Cleaning plan:
- Improve column names
- Drop NAs
- Create columns of the proportions vs total

In [130]:
# drop rows with NAs
print(nat_cit_clean.isna().sum())
nat_cit_clean = nat_cit_clean.dropna()
nat_cit_clean.head()


geography                                                                        0
geographic_area_name                                                             0
estimate!!total_                                                                18
margin_of_error!!total_                                                         18
estimate!!total_!!u_s_citizen_born_in_the_united_states                         18
margin_of_error!!total_!!u_s_citizen_born_in_the_united_states                  18
estimate!!total_!!u_s_citizen_born_in_puerto_rico_or_u_s_island_areas           18
margin_of_error!!total_!!u_s_citizen_born_in_puerto_rico_or_u_s_island_areas    18
estimate!!total_!!u_s_citizen_born_abroad_of_american_parent_s_                 18
margin_of_error!!total_!!u_s_citizen_born_abroad_of_american_parent_s_          18
estimate!!total_!!u_s_citizen_by_naturalization                                 18
margin_of_error!!total_!!u_s_citizen_by_naturalization                          18
esti

Unnamed: 0,geography,geographic_area_name,estimate!!total_,margin_of_error!!total_,estimate!!total_!!u_s_citizen_born_in_the_united_states,margin_of_error!!total_!!u_s_citizen_born_in_the_united_states,estimate!!total_!!u_s_citizen_born_in_puerto_rico_or_u_s_island_areas,margin_of_error!!total_!!u_s_citizen_born_in_puerto_rico_or_u_s_island_areas,estimate!!total_!!u_s_citizen_born_abroad_of_american_parent_s_,margin_of_error!!total_!!u_s_citizen_born_abroad_of_american_parent_s_,estimate!!total_!!u_s_citizen_by_naturalization,margin_of_error!!total_!!u_s_citizen_by_naturalization,estimate!!total_!!not_a_u_s_citizen,margin_of_error!!total_!!not_a_u_s_citizen
0,0500000US01003,"Baldwin County, Alabama",261608.0,*****,247172.0,3402.0,163.0,247.0,1496.0,742.0,8219.0,1766.0,4558.0,2610.0
1,0500000US01015,"Calhoun County, Alabama",116427.0,*****,109548.0,1559.0,269.0,283.0,1722.0,692.0,2608.0,1062.0,2280.0,970.0
2,0500000US01043,"Cullman County, Alabama",92604.0,*****,90390.0,725.0,0.0,213.0,317.0,308.0,584.0,481.0,1313.0,691.0
3,0500000US01049,"DeKalb County, Alabama",73122.0,*****,67142.0,2013.0,0.0,213.0,154.0,144.0,699.0,612.0,5127.0,1999.0
4,0500000US01051,"Elmore County, Alabama",91042.0,*****,89124.0,986.0,109.0,104.0,548.0,503.0,492.0,397.0,769.0,731.0


In [131]:
nat_cit_clean.columns = nat_cit_clean.columns.str.replace("total_", "", regex=False)
nat_cit_clean.columns = nat_cit_clean.columns.str.replace("!!", "_", regex=False)

In [132]:
nat_cit_clean.columns = nat_cit_clean.columns.str.replace("_u_s_", "", regex=False)
nat_cit_clean.columns = nat_cit_clean.columns.str.replace("the_united_states", "us", regex=False)
nat_cit_clean.columns = nat_cit_clean.columns.str.replace("__not_acitizen", "_not_a_citizen", regex=False)

In [133]:
nat_cit_clean.head()

Unnamed: 0,geography,geographic_area_name,estimate_,margin_of_error_,estimate_citizen_born_in_us,margin_of_error_citizen_born_in_us,estimate_citizen_born_in_puerto_rico_orisland_areas,margin_of_error_citizen_born_in_puerto_rico_orisland_areas,estimate_citizen_born_abroad_of_american_parent_s_,margin_of_error_citizen_born_abroad_of_american_parent_s_,estimate_citizen_by_naturalization,margin_of_error_citizen_by_naturalization,estimate_not_a_citizen,margin_of_error_not_a_citizen
0,0500000US01003,"Baldwin County, Alabama",261608.0,*****,247172.0,3402.0,163.0,247.0,1496.0,742.0,8219.0,1766.0,4558.0,2610.0
1,0500000US01015,"Calhoun County, Alabama",116427.0,*****,109548.0,1559.0,269.0,283.0,1722.0,692.0,2608.0,1062.0,2280.0,970.0
2,0500000US01043,"Cullman County, Alabama",92604.0,*****,90390.0,725.0,0.0,213.0,317.0,308.0,584.0,481.0,1313.0,691.0
3,0500000US01049,"DeKalb County, Alabama",73122.0,*****,67142.0,2013.0,0.0,213.0,154.0,144.0,699.0,612.0,5127.0,1999.0
4,0500000US01051,"Elmore County, Alabama",91042.0,*****,89124.0,986.0,109.0,104.0,548.0,503.0,492.0,397.0,769.0,731.0


In [134]:
def add_proportions(df, denom_col = "estimate_", prefix="estimate_", new_prefix="prop_"):
    est_cols = [col for col in df.columns
                if col.startswith(prefix) and col != denom_col]
    props = df[est_cols].div(df[denom_col], axis=0)
    props = props.rename(columns=lambda x: x.replace(prefix, new_prefix, 1))

    return df.join(props)

In [135]:
nat_cit_clean = add_proportions(nat_cit_clean)


In [136]:
nat_cit_clean.head()

Unnamed: 0,geography,geographic_area_name,estimate_,margin_of_error_,estimate_citizen_born_in_us,margin_of_error_citizen_born_in_us,estimate_citizen_born_in_puerto_rico_orisland_areas,margin_of_error_citizen_born_in_puerto_rico_orisland_areas,estimate_citizen_born_abroad_of_american_parent_s_,margin_of_error_citizen_born_abroad_of_american_parent_s_,estimate_citizen_by_naturalization,margin_of_error_citizen_by_naturalization,estimate_not_a_citizen,margin_of_error_not_a_citizen,prop_citizen_born_in_us,prop_citizen_born_in_puerto_rico_orisland_areas,prop_citizen_born_abroad_of_american_parent_s_,prop_citizen_by_naturalization,prop_not_a_citizen
0,0500000US01003,"Baldwin County, Alabama",261608.0,*****,247172.0,3402.0,163.0,247.0,1496.0,742.0,8219.0,1766.0,4558.0,2610.0,0.944818,0.000623,0.005718,0.031417,0.017423
1,0500000US01015,"Calhoun County, Alabama",116427.0,*****,109548.0,1559.0,269.0,283.0,1722.0,692.0,2608.0,1062.0,2280.0,970.0,0.940916,0.00231,0.01479,0.0224,0.019583
2,0500000US01043,"Cullman County, Alabama",92604.0,*****,90390.0,725.0,0.0,213.0,317.0,308.0,584.0,481.0,1313.0,691.0,0.976092,0.0,0.003423,0.006306,0.014179
3,0500000US01049,"DeKalb County, Alabama",73122.0,*****,67142.0,2013.0,0.0,213.0,154.0,144.0,699.0,612.0,5127.0,1999.0,0.918219,0.0,0.002106,0.009559,0.070116
4,0500000US01051,"Elmore County, Alabama",91042.0,*****,89124.0,986.0,109.0,104.0,548.0,503.0,492.0,397.0,769.0,731.0,0.978933,0.001197,0.006019,0.005404,0.008447


In [137]:
nat_cit_clean.to_csv('nat_cit_cleaned.csv', index=False)