In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np

# EXTRACT

#### All data is in folder named 'Potential Data Sources'
* Downloaded rural-urban codes CSV files from https://www.ers.usda.gov/data-products/rural-urban-continuum-codes/ and placed in folder named 'Rural_or_Urban_Designations'
* Downloaded age census data from https://www.census.gov/acs/www/data/data-tables-and-tools/ and placed in folder named 'US Census-Age"
* Downloaded race census data from https://www.census.gov/acs/www/data/data-tables-and-tools/ and placed in folder named 'US Census-Race"
* Downloaded health insurance census data from https://www.census.gov/acs/www/data/data-tables-and-tools/ and placed in folder named 'US Census-Health Insurance"
* Downloaded poverty census data from https://www.census.gov/acs/www/data/data-tables-and-tools/ and placed in folder named 'US Census-Poverty"

In [2]:
# Study data files
rural_filepath = "Potential Data Sources/Rural_or_Urban_Designations/ruralurbancodes2013.csv"
age_filepath = "Potential Data Sources/US Census-Age/ACSST1Y2019.S0101_data_with_overlays_2020-10-02T091901.csv"
race_filepath = "Potential Data Sources/US Census-Race/ACSDT1Y2018.B02001_data_with_overlays_2020-10-13T161059.csv"
insurance_filepath = "Potential Data Sources/US Census-Health Insurance/sahie_2018.csv"
poverty_filepath = "Potential Data Sources/US Census-Poverty/est18all (1).csv"

In [3]:
# Read the data into Pandas DataFrame
rural_df = pd.read_csv(rural_filepath)
age_df = pd.read_csv(age_filepath, skiprows=1)
race_df = pd.read_csv(race_filepath, skiprows=1)
insurance_df = pd.read_csv(insurance_filepath, skiprows=79)
poverty_df = pd.read_csv(poverty_filepath, skiprows=3)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


# TRANSFORM

## RURAL FILE

In [4]:
#Look at imported rural CSV
rural_df.head()

Unnamed: 0,FIPS,State,County_Name,Population_2010,RUCC_2013,Description
0,1001,AL,Autauga County,54571,2,"Metro - Counties in metro areas of 250,000 to ..."
1,1003,AL,Baldwin County,182265,3,Metro - Counties in metro areas of fewer than ...
2,1005,AL,Barbour County,27457,6,"Nonmetro - Urban population of 2,500 to 19,999..."
3,1007,AL,Bibb County,22915,1,Metro - Counties in metro areas of 1 million p...
4,1009,AL,Blount County,57322,1,Metro - Counties in metro areas of 1 million p...


### rural table

In [5]:
# Return only wanted columns for rural table
rural_columns = ['FIPS', 'State', 'County_Name', 'RUCC_2013', 'Description']
new_rural_df = rural_df[rural_columns].copy()

new_rural_df.head()

Unnamed: 0,FIPS,State,County_Name,RUCC_2013,Description
0,1001,AL,Autauga County,2,"Metro - Counties in metro areas of 250,000 to ..."
1,1003,AL,Baldwin County,3,Metro - Counties in metro areas of fewer than ...
2,1005,AL,Barbour County,6,"Nonmetro - Urban population of 2,500 to 19,999..."
3,1007,AL,Bibb County,1,Metro - Counties in metro areas of 1 million p...
4,1009,AL,Blount County,1,Metro - Counties in metro areas of 1 million p...


In [6]:
#Inspect rural table for column type
new_rural_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3221 entries, 0 to 3220
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   FIPS         3221 non-null   int64 
 1   State        3221 non-null   object
 2   County_Name  3221 non-null   object
 3   RUCC_2013    3221 non-null   int64 
 4   Description  3221 non-null   object
dtypes: int64(2), object(3)
memory usage: 125.9+ KB


In [7]:
# Rename the column headers
newer_rural_df = new_rural_df.rename(columns={'County_Name': 'county',
                                                     'State': 'state_abbr',
                                                     'FIPS': 'fips',
                                                     'Description': 'RUCC Description'})

In [8]:
newer_rural_df.head()

Unnamed: 0,fips,state_abbr,county,RUCC_2013,RUCC Description
0,1001,AL,Autauga County,2,"Metro - Counties in metro areas of 250,000 to ..."
1,1003,AL,Baldwin County,3,Metro - Counties in metro areas of fewer than ...
2,1005,AL,Barbour County,6,"Nonmetro - Urban population of 2,500 to 19,999..."
3,1007,AL,Bibb County,1,Metro - Counties in metro areas of 1 million p...
4,1009,AL,Blount County,1,Metro - Counties in metro areas of 1 million p...


## AGE FILE

In [9]:
#Look at imported age CSV
age_df.head()

Unnamed: 0,id,Geographic Area Name,Estimate!!Total!!Total population,Margin of Error!!Total!!Total population,Estimate!!Total!!Total population!!AGE!!Under 5 years,Margin of Error!!Total!!Total population!!AGE!!Under 5 years,Estimate!!Total!!Total population!!AGE!!5 to 9 years,Margin of Error!!Total!!Total population!!AGE!!5 to 9 years,Estimate!!Total!!Total population!!AGE!!10 to 14 years,Margin of Error!!Total!!Total population!!AGE!!10 to 14 years,...,Estimate!!Percent Female!!Total population!!SUMMARY INDICATORS!!Age dependency ratio,Margin of Error!!Percent Female!!Total population!!SUMMARY INDICATORS!!Age dependency ratio,Estimate!!Percent Female!!Total population!!SUMMARY INDICATORS!!Old-age dependency ratio,Margin of Error!!Percent Female!!Total population!!SUMMARY INDICATORS!!Old-age dependency ratio,Estimate!!Percent Female!!Total population!!SUMMARY INDICATORS!!Child dependency ratio,Margin of Error!!Percent Female!!Total population!!SUMMARY INDICATORS!!Child dependency ratio,Estimate!!Percent Female!!Total population!!PERCENT ALLOCATED!!Sex,Margin of Error!!Percent Female!!Total population!!PERCENT ALLOCATED!!Sex,Estimate!!Percent Female!!Total population!!PERCENT ALLOCATED!!Age,Margin of Error!!Percent Female!!Total population!!PERCENT ALLOCATED!!Age
0,0500000US01003,"Baldwin County, Alabama",223234,*****,10616,926,12826,2513,14373,2687,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
1,0500000US01015,"Calhoun County, Alabama",113605,*****,6699,200,5534,1301,7774,1319,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
2,0500000US01043,"Cullman County, Alabama",83768,*****,5310,610,4563,972,5906,1125,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
3,0500000US01049,"DeKalb County, Alabama",71513,*****,4578,552,4292,994,5519,835,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
4,0500000US01051,"Elmore County, Alabama",81209,*****,4272,557,6638,1274,3812,1049,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)


### age table

In [10]:
# Return only wanted columns
age_columns = ['id', 'Geographic Area Name', 'Estimate!!Total!!Total population!!SUMMARY INDICATORS!!Median age (years)',
                'Estimate!!Total!!Total population!!SUMMARY INDICATORS!!Sex ratio (males per 100 females)',
                'Estimate!!Percent!!Total population!!AGE!!Under 5 years',
                'Estimate!!Percent!!Total population!!AGE!!5 to 9 years',
                'Estimate!!Percent!!Total population!!AGE!!10 to 14 years',
                'Estimate!!Percent!!Total population!!AGE!!15 to 19 years',
                'Estimate!!Percent!!Total population!!AGE!!20 to 24 years',
                'Estimate!!Percent!!Total population!!AGE!!25 to 29 years',
                'Estimate!!Percent!!Total population!!AGE!!30 to 34 years',
                'Estimate!!Percent!!Total population!!AGE!!35 to 39 years',
                'Estimate!!Percent!!Total population!!AGE!!40 to 44 years',
                'Estimate!!Percent!!Total population!!AGE!!45 to 49 years',
                'Estimate!!Percent!!Total population!!AGE!!50 to 54 years',
                'Estimate!!Percent!!Total population!!AGE!!55 to 59 years',
                'Estimate!!Percent!!Total population!!AGE!!60 to 64 years',
                'Estimate!!Percent!!Total population!!AGE!!65 to 69 years',
                'Estimate!!Percent!!Total population!!AGE!!70 to 74 years',
                'Estimate!!Percent!!Total population!!AGE!!75 to 79 years',
                'Estimate!!Percent!!Total population!!AGE!!80 to 84 years',
                'Estimate!!Percent!!Total population!!AGE!!85 years and over']
new_age_df = age_df[age_columns].copy()

new_age_df.head()

Unnamed: 0,id,Geographic Area Name,Estimate!!Total!!Total population!!SUMMARY INDICATORS!!Median age (years),Estimate!!Total!!Total population!!SUMMARY INDICATORS!!Sex ratio (males per 100 females),Estimate!!Percent!!Total population!!AGE!!Under 5 years,Estimate!!Percent!!Total population!!AGE!!5 to 9 years,Estimate!!Percent!!Total population!!AGE!!10 to 14 years,Estimate!!Percent!!Total population!!AGE!!15 to 19 years,Estimate!!Percent!!Total population!!AGE!!20 to 24 years,Estimate!!Percent!!Total population!!AGE!!25 to 29 years,...,Estimate!!Percent!!Total population!!AGE!!40 to 44 years,Estimate!!Percent!!Total population!!AGE!!45 to 49 years,Estimate!!Percent!!Total population!!AGE!!50 to 54 years,Estimate!!Percent!!Total population!!AGE!!55 to 59 years,Estimate!!Percent!!Total population!!AGE!!60 to 64 years,Estimate!!Percent!!Total population!!AGE!!65 to 69 years,Estimate!!Percent!!Total population!!AGE!!70 to 74 years,Estimate!!Percent!!Total population!!AGE!!75 to 79 years,Estimate!!Percent!!Total population!!AGE!!80 to 84 years,Estimate!!Percent!!Total population!!AGE!!85 years and over
0,0500000US01003,"Baldwin County, Alabama",43.0,95.7,4.8,5.7,6.4,6.5,5.1,5.3,...,5.4,5.9,6.3,7.4,6.9,6.3,6.7,4.5,2.0,1.9
1,0500000US01015,"Calhoun County, Alabama",39.6,91.5,5.9,4.9,6.8,6.6,6.1,6.2,...,5.6,6.1,5.7,6.5,7.4,5.5,5.2,3.5,2.3,1.6
2,0500000US01043,"Cullman County, Alabama",41.9,94.0,6.3,5.4,7.1,5.5,5.5,6.4,...,7.1,7.3,6.2,6.0,7.3,4.3,6.6,3.4,2.3,1.9
3,0500000US01049,"DeKalb County, Alabama",37.7,99.6,6.4,6.0,7.7,7.2,7.0,6.0,...,6.6,6.1,6.2,7.0,5.4,5.4,4.5,4.3,1.2,1.3
4,0500000US01051,"Elmore County, Alabama",39.0,97.4,5.3,8.2,4.7,6.0,5.0,7.6,...,5.8,7.1,5.9,5.8,7.8,4.3,5.2,2.9,1.9,1.4


In [11]:
#Inspect age table for column type
new_age_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 840 entries, 0 to 839
Data columns (total 22 columns):
 #   Column                                                                                    Non-Null Count  Dtype  
---  ------                                                                                    --------------  -----  
 0   id                                                                                        840 non-null    object 
 1   Geographic Area Name                                                                      840 non-null    object 
 2   Estimate!!Total!!Total population!!SUMMARY INDICATORS!!Median age (years)                 840 non-null    float64
 3   Estimate!!Total!!Total population!!SUMMARY INDICATORS!!Sex ratio (males per 100 females)  840 non-null    float64
 4   Estimate!!Percent!!Total population!!AGE!!Under 5 years                                   840 non-null    float64
 5   Estimate!!Percent!!Total population!!AGE!!5 to 9 years   

In [12]:
# Rename the column headers
newer_age_df = new_age_df.rename(columns={'Geographic Area Name': 'area',
                                          'Estimate!!Total!!Total population!!SUMMARY INDICATORS!!Median age (years)': 'Median age',
                                          'Estimate!!Total!!Total population!!SUMMARY INDICATORS!!Sex ratio (males per 100 females)': 'Sex ratio (males per 100 females)',
                                          'Estimate!!Percent!!Total population!!AGE!!Under 5 years': '% < age 5',
                                          'Estimate!!Percent!!Total population!!AGE!!5 to 9 years': '% age 5-9',
                                          'Estimate!!Percent!!Total population!!AGE!!10 to 14 years': '% age 10-14',
                                          'Estimate!!Percent!!Total population!!AGE!!15 to 19 years': '% age 15-19',
                                          'Estimate!!Percent!!Total population!!AGE!!20 to 24 years': '% age 20-24',
                                          'Estimate!!Percent!!Total population!!AGE!!25 to 29 years': '% age 25-29',
                                          'Estimate!!Percent!!Total population!!AGE!!30 to 34 years': '% age 30-34',
                                          'Estimate!!Percent!!Total population!!AGE!!35 to 39 years': '% age 35-39',
                                          'Estimate!!Percent!!Total population!!AGE!!40 to 44 years': '% age 40-44',
                                          'Estimate!!Percent!!Total population!!AGE!!45 to 49 years': '% age 45-49',
                                          'Estimate!!Percent!!Total population!!AGE!!50 to 54 years': '% age 50-54',
                                          'Estimate!!Percent!!Total population!!AGE!!55 to 59 years': '% age 55-59',
                                          'Estimate!!Percent!!Total population!!AGE!!60 to 64 years': '% age 60-64',
                                          'Estimate!!Percent!!Total population!!AGE!!65 to 69 years': '% age 65-69',
                                          'Estimate!!Percent!!Total population!!AGE!!70 to 74 years': '% age 70-74',
                                          'Estimate!!Percent!!Total population!!AGE!!75 to 79 years': '% age 75-79',
                                          'Estimate!!Percent!!Total population!!AGE!!80 to 84 years': '% age 80-84',
                                          'Estimate!!Percent!!Total population!!AGE!!85 years and over': '% > age 84'})

# Adding state and county columns to the existing dataframe - splitting is done on the basis of comma 
newer_age_df[['county','state']] = newer_age_df.area.str.split(", ",expand=True)

# Adding fips id to df
newer_age_df['fips'] = newer_age_df['id'].str[-5:].astype('int')

newer_age_df.head()

Unnamed: 0,id,area,Median age,Sex ratio (males per 100 females),% < age 5,% age 5-9,% age 10-14,% age 15-19,% age 20-24,% age 25-29,...,% age 55-59,% age 60-64,% age 65-69,% age 70-74,% age 75-79,% age 80-84,% > age 84,county,state,fips
0,0500000US01003,"Baldwin County, Alabama",43.0,95.7,4.8,5.7,6.4,6.5,5.1,5.3,...,7.4,6.9,6.3,6.7,4.5,2.0,1.9,Baldwin County,Alabama,1003
1,0500000US01015,"Calhoun County, Alabama",39.6,91.5,5.9,4.9,6.8,6.6,6.1,6.2,...,6.5,7.4,5.5,5.2,3.5,2.3,1.6,Calhoun County,Alabama,1015
2,0500000US01043,"Cullman County, Alabama",41.9,94.0,6.3,5.4,7.1,5.5,5.5,6.4,...,6.0,7.3,4.3,6.6,3.4,2.3,1.9,Cullman County,Alabama,1043
3,0500000US01049,"DeKalb County, Alabama",37.7,99.6,6.4,6.0,7.7,7.2,7.0,6.0,...,7.0,5.4,5.4,4.5,4.3,1.2,1.3,DeKalb County,Alabama,1049
4,0500000US01051,"Elmore County, Alabama",39.0,97.4,5.3,8.2,4.7,6.0,5.0,7.6,...,5.8,7.8,4.3,5.2,2.9,1.9,1.4,Elmore County,Alabama,1051


In [13]:
# Drop area and id column
newest_age_df = newer_age_df.drop(columns=['area', 'id', 'county', 'state'])

newest_age_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 840 entries, 0 to 839
Data columns (total 21 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Median age                         840 non-null    float64
 1   Sex ratio (males per 100 females)  840 non-null    float64
 2   % < age 5                          840 non-null    float64
 3   % age 5-9                          840 non-null    float64
 4   % age 10-14                        840 non-null    float64
 5   % age 15-19                        840 non-null    float64
 6   % age 20-24                        840 non-null    float64
 7   % age 25-29                        840 non-null    float64
 8   % age 30-34                        840 non-null    float64
 9   % age 35-39                        840 non-null    float64
 10  % age 40-44                        840 non-null    float64
 11  % age 45-49                        840 non-null    float64

## RACE FILE

In [14]:
#Look at imported race CSV
race_df.head()

Unnamed: 0,id,Geographic Area Name,Estimate!!Total,Margin of Error!!Total,Estimate!!Total!!White alone,Margin of Error!!Total!!White alone,Estimate!!Total!!Black or African American alone,Margin of Error!!Total!!Black or African American alone,Estimate!!Total!!American Indian and Alaska Native alone,Margin of Error!!Total!!American Indian and Alaska Native alone,...,Estimate!!Total!!Native Hawaiian and Other Pacific Islander alone,Margin of Error!!Total!!Native Hawaiian and Other Pacific Islander alone,Estimate!!Total!!Some other race alone,Margin of Error!!Total!!Some other race alone,Estimate!!Total!!Two or more races,Margin of Error!!Total!!Two or more races,Estimate!!Total!!Two or more races!!Two races including Some other race,Margin of Error!!Total!!Two or more races!!Two races including Some other race,"Estimate!!Total!!Two or more races!!Two races excluding Some other race, and three or more races","Margin of Error!!Total!!Two or more races!!Two races excluding Some other race, and three or more races"
0,0500000US01003,"Baldwin County, Alabama",218022.0,*****,187759.0,1764.0,20554.0,446.0,1276.0,702.0,...,45.0,86.0,2586.0,1957.0,3464.0,1078.0,177.0,193.0,3287.0,1032.0
1,0500000US01015,"Calhoun County, Alabama",114277.0,*****,85046.0,1148.0,24737.0,642.0,107.0,138.0,...,0.0,201.0,1182.0,1239.0,2779.0,793.0,842.0,601.0,1937.0,594.0
2,0500000US01043,"Cullman County, Alabama",83442.0,*****,78643.0,1169.0,906.0,335.0,301.0,403.0,...,0.0,201.0,1367.0,1105.0,2059.0,806.0,335.0,528.0,1724.0,511.0
3,0500000US01049,"DeKalb County, Alabama",71385.0,*****,60157.0,1938.0,1335.0,201.0,868.0,614.0,...,0.0,201.0,7583.0,1943.0,1269.0,541.0,0.0,201.0,1269.0,541.0
4,0500000US01051,"Elmore County, Alabama",81887.0,*****,60622.0,957.0,17113.0,1682.0,345.0,292.0,...,0.0,201.0,1339.0,897.0,1946.0,1401.0,56.0,93.0,1890.0,1388.0


### race table

In [15]:
# Return only wanted columns
race_columns = ['id', 'Geographic Area Name', 'Estimate!!Total', 'Estimate!!Total!!White alone',
                'Estimate!!Total!!Black or African American alone',
                'Estimate!!Total!!American Indian and Alaska Native alone',
                'Estimate!!Total!!Native Hawaiian and Other Pacific Islander alone',
                'Estimate!!Total!!Some other race alone','Estimate!!Total!!Two or more races']
new_race_df = race_df[race_columns].copy()

new_race_df.head()

Unnamed: 0,id,Geographic Area Name,Estimate!!Total,Estimate!!Total!!White alone,Estimate!!Total!!Black or African American alone,Estimate!!Total!!American Indian and Alaska Native alone,Estimate!!Total!!Native Hawaiian and Other Pacific Islander alone,Estimate!!Total!!Some other race alone,Estimate!!Total!!Two or more races
0,0500000US01003,"Baldwin County, Alabama",218022.0,187759.0,20554.0,1276.0,45.0,2586.0,3464.0
1,0500000US01015,"Calhoun County, Alabama",114277.0,85046.0,24737.0,107.0,0.0,1182.0,2779.0
2,0500000US01043,"Cullman County, Alabama",83442.0,78643.0,906.0,301.0,0.0,1367.0,2059.0
3,0500000US01049,"DeKalb County, Alabama",71385.0,60157.0,1335.0,868.0,0.0,7583.0,1269.0
4,0500000US01051,"Elmore County, Alabama",81887.0,60622.0,17113.0,345.0,0.0,1339.0,1946.0


In [16]:
#Inspect race table for column type
new_race_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 838 entries, 0 to 837
Data columns (total 9 columns):
 #   Column                                                             Non-Null Count  Dtype  
---  ------                                                             --------------  -----  
 0   id                                                                 838 non-null    object 
 1   Geographic Area Name                                               838 non-null    object 
 2   Estimate!!Total                                                    801 non-null    float64
 3   Estimate!!Total!!White alone                                       801 non-null    float64
 4   Estimate!!Total!!Black or African American alone                   801 non-null    float64
 5   Estimate!!Total!!American Indian and Alaska Native alone           801 non-null    float64
 6   Estimate!!Total!!Native Hawaiian and Other Pacific Islander alone  801 non-null    float64
 7   Estimate!!Total!!Some other

In [17]:
# Drop rows with no data
newer_race_df = new_race_df.dropna()

newer_race_df.head()

Unnamed: 0,id,Geographic Area Name,Estimate!!Total,Estimate!!Total!!White alone,Estimate!!Total!!Black or African American alone,Estimate!!Total!!American Indian and Alaska Native alone,Estimate!!Total!!Native Hawaiian and Other Pacific Islander alone,Estimate!!Total!!Some other race alone,Estimate!!Total!!Two or more races
0,0500000US01003,"Baldwin County, Alabama",218022.0,187759.0,20554.0,1276.0,45.0,2586.0,3464.0
1,0500000US01015,"Calhoun County, Alabama",114277.0,85046.0,24737.0,107.0,0.0,1182.0,2779.0
2,0500000US01043,"Cullman County, Alabama",83442.0,78643.0,906.0,301.0,0.0,1367.0,2059.0
3,0500000US01049,"DeKalb County, Alabama",71385.0,60157.0,1335.0,868.0,0.0,7583.0,1269.0
4,0500000US01051,"Elmore County, Alabama",81887.0,60622.0,17113.0,345.0,0.0,1339.0,1946.0


In [18]:
# Rename the column headers
newest_race_df = newer_race_df.rename(columns={'Geographic Area Name': 'area','Estimate!!Total': 'Population',
                                                     'Estimate!!Total!!White alone': 'White alone',
                                                     'Estimate!!Total!!Black or African American alone': 'African American alone',
                                                     'Estimate!!Total!!American Indian and Alaska Native alone': 'American Indian and Alaska Native alone',
                                                     'Estimate!!Total!!Native Hawaiian and Other Pacific Islander alone': 'Native Hawaiian and Other Pacific Islander alone',
                                                     'Estimate!!Total!!Some other race alone': 'Some other race alone',
                                                     'Estimate!!Total!!Two or more races': 'Two or more races'})

# Adding state and county columns to the existing dataframe - splitting is done on the basis of comma 
newest_race_df[['county','state']] = newest_race_df.area.str.split(", ",expand=True)

# Adding fips id to df
newest_race_df['fips'] = newest_race_df['id'].str[-5:].astype('int')

newest_race_df.head()

Unnamed: 0,id,area,Population,White alone,African American alone,American Indian and Alaska Native alone,Native Hawaiian and Other Pacific Islander alone,Some other race alone,Two or more races,county,state,fips
0,0500000US01003,"Baldwin County, Alabama",218022.0,187759.0,20554.0,1276.0,45.0,2586.0,3464.0,Baldwin County,Alabama,1003
1,0500000US01015,"Calhoun County, Alabama",114277.0,85046.0,24737.0,107.0,0.0,1182.0,2779.0,Calhoun County,Alabama,1015
2,0500000US01043,"Cullman County, Alabama",83442.0,78643.0,906.0,301.0,0.0,1367.0,2059.0,Cullman County,Alabama,1043
3,0500000US01049,"DeKalb County, Alabama",71385.0,60157.0,1335.0,868.0,0.0,7583.0,1269.0,DeKalb County,Alabama,1049
4,0500000US01051,"Elmore County, Alabama",81887.0,60622.0,17113.0,345.0,0.0,1339.0,1946.0,Elmore County,Alabama,1051


In [19]:
# Drop area and id column
clean_race_df = newest_race_df.drop(columns=['area', 'id'])

clean_race_df.head()

Unnamed: 0,Population,White alone,African American alone,American Indian and Alaska Native alone,Native Hawaiian and Other Pacific Islander alone,Some other race alone,Two or more races,county,state,fips
0,218022.0,187759.0,20554.0,1276.0,45.0,2586.0,3464.0,Baldwin County,Alabama,1003
1,114277.0,85046.0,24737.0,107.0,0.0,1182.0,2779.0,Calhoun County,Alabama,1015
2,83442.0,78643.0,906.0,301.0,0.0,1367.0,2059.0,Cullman County,Alabama,1043
3,71385.0,60157.0,1335.0,868.0,0.0,7583.0,1269.0,DeKalb County,Alabama,1049
4,81887.0,60622.0,17113.0,345.0,0.0,1339.0,1946.0,Elmore County,Alabama,1051


In [20]:
# Convert all columns to %
clean_race_df['% White alone']=round(clean_race_df['White alone']/clean_race_df['Population']*100,2)
clean_race_df['% African American alone']=round(clean_race_df['African American alone']/clean_race_df['Population']*100,2)
clean_race_df['% American Indian and Alaska Native alone']=round(clean_race_df['American Indian and Alaska Native alone']/clean_race_df['Population']*100,2)
clean_race_df['% Native Hawaiian and Other Pacific Islander alone']=round(clean_race_df['Native Hawaiian and Other Pacific Islander alone']/clean_race_df['Population']*100,2)
clean_race_df['% Some other race alone']=round(clean_race_df['Some other race alone']/clean_race_df['Population']*100,2)
clean_race_df['% Two or more races']=round(clean_race_df['Two or more races']/clean_race_df['Population']*100,2)

#Drop Total Column
final_race_df = clean_race_df.drop(columns=['county', 'state', 'White alone','African American alone',
                                            'American Indian and Alaska Native alone',
                                            'Native Hawaiian and Other Pacific Islander alone','Some other race alone',
                                            'Two or more races'])

final_race_df.head()

Unnamed: 0,Population,fips,% White alone,% African American alone,% American Indian and Alaska Native alone,% Native Hawaiian and Other Pacific Islander alone,% Some other race alone,% Two or more races
0,218022.0,1003,86.12,9.43,0.59,0.02,1.19,1.59
1,114277.0,1015,74.42,21.65,0.09,0.0,1.03,2.43
2,83442.0,1043,94.25,1.09,0.36,0.0,1.64,2.47
3,71385.0,1049,84.27,1.87,1.22,0.0,10.62,1.78
4,81887.0,1051,74.03,20.9,0.42,0.0,1.64,2.38


## INSURANCE FILE

In [21]:
#Look at imported insurance CSV
insurance_df.head()

Unnamed: 0,year,version,statefips,countyfips,geocat,agecat,racecat,sexcat,iprcat,NIPR,...,pctui_moe,PCTIC,pctic_moe,PCTELIG,pctelig_moe,PCTLIIC,pctliic_moe,state_name,county_name,Unnamed: 25
0,2018,,1,0,40,0,0,0,0,3955117,...,0.3,88.1,0.3,11.9,0.3,88.1,0.3,Alabama ...,,
1,2018,,1,0,40,0,0,0,1,1460808,...,0.6,80.4,0.6,7.2,0.2,29.7,0.4,Alabama ...,,
2,2018,,1,0,40,0,0,0,2,1805111,...,0.6,81.5,0.6,8.4,0.3,37.2,0.4,Alabama ...,,
3,2018,,1,0,40,0,0,0,3,989540,...,0.8,79.4,0.8,5.2,0.2,19.9,0.3,Alabama ...,,
4,2018,,1,0,40,0,0,0,4,2679733,...,0.4,84.5,0.4,10.5,0.3,57.2,0.4,Alabama ...,,


### insurance table

In [22]:
new_insurance_df = insurance_df.loc[(insurance_df['geocat'] == 50) & (insurance_df['agecat'] == 0) & (insurance_df['racecat'] == 0)
                      & (insurance_df['sexcat'] == 0) & (insurance_df['iprcat'] == 0)]
new_insurance_df.head()

Unnamed: 0,year,version,statefips,countyfips,geocat,agecat,racecat,sexcat,iprcat,NIPR,...,pctui_moe,PCTIC,pctic_moe,PCTELIG,pctelig_moe,PCTLIIC,pctliic_moe,state_name,county_name,Unnamed: 25
366,2018,,1,1,50,0,0,0,0,46709,...,1.2,90.0,1.2,10.0,1.2,90.0,1.2,Alabama ...,Autauga County,
462,2018,,1,3,50,0,0,0,0,172071,...,1.2,86.8,1.2,13.2,1.2,86.8,1.2,Alabama ...,Baldwin County,
558,2018,,1,5,50,0,0,0,0,17106,...,1.6,86.5,1.6,13.5,1.6,86.5,1.6,Alabama ...,Barbour County,
654,2018,,1,7,50,0,0,0,0,16643,...,1.4,89.4,1.4,10.6,1.4,89.4,1.4,Alabama ...,Bibb County,
750,2018,,1,9,50,0,0,0,0,47053,...,1.4,85.9,1.4,14.1,1.4,85.9,1.4,Alabama ...,Blount County,


In [23]:
# return only wanted columns
insurance_columns = ['statefips', 'countyfips', 'PCTUI', 'PCTIC', 'state_name', 'county_name']
newer_insurance_df = new_insurance_df[insurance_columns].copy()

newer_insurance_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3142 entries, 366 to 320202
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   statefips    3142 non-null   int64 
 1   countyfips   3142 non-null   int64 
 2   PCTUI        3142 non-null   object
 3   PCTIC        3142 non-null   object
 4   state_name   3142 non-null   object
 5   county_name  3142 non-null   object
dtypes: int64(2), object(4)
memory usage: 171.8+ KB


In [24]:
newer_insurance_df.drop(newer_insurance_df[newer_insurance_df['PCTUI'] == '   . '].index, inplace = True)

In [25]:
# convert the 'PCTUI' and 'PCTIC' column to float format
newer_insurance_df = newer_insurance_df.astype({"PCTUI":'float', "PCTIC":'float'})

# Rename the column headers
newest_insurance_df = newer_insurance_df.rename(columns={'PCTUI': '% Uninsured',
                                                         'PCTIC': '% Insured',
                                                         'state_name': 'state',
                                                         'county_name': 'county'})

newest_insurance_df

Unnamed: 0,statefips,countyfips,% Uninsured,% Insured,state,county
366,1,1,10.0,90.0,Alabama ...,Autauga County
462,1,3,13.2,86.8,Alabama ...,Baldwin County
558,1,5,13.5,86.5,Alabama ...,Barbour County
654,1,7,10.6,89.4,Alabama ...,Bibb County
750,1,9,14.1,85.9,Alabama ...,Blount County
...,...,...,...,...,...,...
319818,56,37,11.5,88.5,Wyoming ...,Sweetwater County
319914,56,39,13.4,86.6,Wyoming ...,Teton County
320010,56,41,11.8,88.2,Wyoming ...,Uinta County
320106,56,43,14.8,85.2,Wyoming ...,Washakie County


In [26]:
# Create fips id with 2 columns as name
result = [] 
for value in newest_insurance_df["countyfips"]: 
    if len(str(value)) == 1: 
        result.append('00'+ str(value)) 
    elif len(str(value)) == 2: 
        result.append('0' + str(value))
    else: 
        result.append(str(value))
        
newest_insurance_df["Result"] = result

newest_insurance_df.loc[:,'fips']= newest_insurance_df.loc[:,'statefips'].astype(str) +\
    newest_insurance_df.loc[:,'Result']

In [27]:
newest_insurance_df

Unnamed: 0,statefips,countyfips,% Uninsured,% Insured,state,county,Result,fips
366,1,1,10.0,90.0,Alabama ...,Autauga County,001,1001
462,1,3,13.2,86.8,Alabama ...,Baldwin County,003,1003
558,1,5,13.5,86.5,Alabama ...,Barbour County,005,1005
654,1,7,10.6,89.4,Alabama ...,Bibb County,007,1007
750,1,9,14.1,85.9,Alabama ...,Blount County,009,1009
...,...,...,...,...,...,...,...,...
319818,56,37,11.5,88.5,Wyoming ...,Sweetwater County,037,56037
319914,56,39,13.4,86.6,Wyoming ...,Teton County,039,56039
320010,56,41,11.8,88.2,Wyoming ...,Uinta County,041,56041
320106,56,43,14.8,85.2,Wyoming ...,Washakie County,043,56043


In [28]:
#Keep only wanted columns
final_insurance_columns = ['fips', '% Uninsured', '% Insured']
final_insurance_df = newest_insurance_df[final_insurance_columns]
final_insurance_df.head()

Unnamed: 0,fips,% Uninsured,% Insured
366,1001,10.0,90.0
462,1003,13.2,86.8
558,1005,13.5,86.5
654,1007,10.6,89.4
750,1009,14.1,85.9


In [29]:
# convert the 'fips' column to integer format
final_insurance_df = final_insurance_df.astype({"fips":'int'})

final_insurance_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3141 entries, 366 to 320202
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   fips         3141 non-null   int32  
 1   % Uninsured  3141 non-null   float64
 2   % Insured    3141 non-null   float64
dtypes: float64(2), int32(1)
memory usage: 85.9 KB


## POVERTY FILE

In [30]:
#Look at imported poverty CSV
poverty_df.head()

Unnamed: 0,State FIPS Code,County FIPS Code,Postal Code,Name,"Poverty Estimate, All Ages",90% CI Lower Bound,90% CI Upper Bound,"Poverty Percent, All Ages",90% CI Lower Bound.1,90% CI Upper Bound.1,...,90% CI Upper Bound.5,Median Household Income,90% CI Lower Bound.6,90% CI Upper Bound.6,"Poverty Estimate, Age 0-4",90% CI Lower Bound.7,90% CI Upper Bound.7,"Poverty Percent, Age 0-4",90% CI Lower Bound.8,90% CI Upper Bound.8
0,0,0,US,United States,41852315,41619366,42085264,13.1,13.0,13.2,...,17.2,61937,61843,62031,3758704,3714862,3802546,19.5,19.3,19.7
1,1,0,AL,Alabama,801758,785668,817848,16.8,16.5,17.1,...,23.7,49881,49123,50639,73915,69990,77840,26.0,24.6,27.4
2,1,1,AL,Autauga County,7587,6334,8840,13.8,11.5,16.1,...,23.9,59338,53628,65048,.,.,.,.,.,.
3,1,3,AL,Baldwin County,21069,17390,24748,9.8,8.1,11.5,...,16.9,57588,54437,60739,.,.,.,.,.,.
4,1,5,AL,Barbour County,6788,5662,7914,30.9,25.8,36.0,...,45.9,34382,31157,37607,.,.,.,.,.,.


### poverty table

In [31]:
# return only wanted columns
poverty_columns = ['State FIPS Code', 'County FIPS Code', 'Poverty Percent, All Ages', 'Median Household Income', 'Postal Code', 'Name']
new_poverty_df = poverty_df[poverty_columns].copy()

new_poverty_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3194 entries, 0 to 3193
Data columns (total 6 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   State FIPS Code            3194 non-null   int64 
 1   County FIPS Code           3194 non-null   int64 
 2   Poverty Percent, All Ages  3194 non-null   object
 3   Median Household Income    3194 non-null   object
 4   Postal Code                3194 non-null   object
 5   Name                       3194 non-null   object
dtypes: int64(2), object(4)
memory usage: 149.8+ KB


In [32]:
new_poverty_df.drop(new_poverty_df[new_poverty_df['Median Household Income'] == '.'].index, inplace = True)

In [33]:
# convert the 'Poverty Percent, All Ages' and 'Median Household Income' column to float/int format
new_poverty_df['Median Household Income'] = new_poverty_df['Median Household Income'].str.replace(',', '').astype(int)
new_poverty_df['Poverty Percent, All Ages'] = new_poverty_df['Poverty Percent, All Ages'].astype(float)

# Rename the column headers
newer_poverty_df = new_poverty_df.rename(columns={'State FIPS Code': 'statefips',
                                                     'County FIPS Code': 'countyfips',
                                                     'Poverty Percent, All Ages': 'Poverty %, All Ages',
                                                     'Postal Code': 'state_abbr',
                                                     'Name': 'county'})

# drop any rows that are not county specific
newest_poverty_df = newer_poverty_df[newer_poverty_df.countyfips != 0]

newest_poverty_df

Unnamed: 0,statefips,countyfips,"Poverty %, All Ages",Median Household Income,state_abbr,county
2,1,1,13.8,59338,AL,Autauga County
3,1,3,9.8,57588,AL,Baldwin County
4,1,5,30.9,34382,AL,Barbour County
5,1,7,21.8,46064,AL,Bibb County
6,1,9,13.2,50412,AL,Blount County
...,...,...,...,...,...,...
3189,56,37,8.4,73315,WY,Sweetwater County
3190,56,39,6.3,99087,WY,Teton County
3191,56,41,10.0,63401,WY,Uinta County
3192,56,43,11.9,55190,WY,Washakie County


In [34]:
# Create fips id with 2 columns as name
result = [] 
for value in newest_poverty_df["countyfips"]: 
    if len(str(value)) == 1: 
        result.append('00'+ str(value)) 
    elif len(str(value)) == 2: 
        result.append('0' + str(value))
    else: 
        result.append(str(value))
        
newest_poverty_df["Result"] = result

newest_poverty_df.loc[:,'fips']= newest_poverty_df.loc[:,'statefips'].astype(str) +\
    newest_poverty_df.loc[:,'Result']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newest_poverty_df["Result"] = result
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [35]:
newest_poverty_df

Unnamed: 0,statefips,countyfips,"Poverty %, All Ages",Median Household Income,state_abbr,county,Result,fips
2,1,1,13.8,59338,AL,Autauga County,001,1001
3,1,3,9.8,57588,AL,Baldwin County,003,1003
4,1,5,30.9,34382,AL,Barbour County,005,1005
5,1,7,21.8,46064,AL,Bibb County,007,1007
6,1,9,13.2,50412,AL,Blount County,009,1009
...,...,...,...,...,...,...,...,...
3189,56,37,8.4,73315,WY,Sweetwater County,037,56037
3190,56,39,6.3,99087,WY,Teton County,039,56039
3191,56,41,10.0,63401,WY,Uinta County,041,56041
3192,56,43,11.9,55190,WY,Washakie County,043,56043


In [36]:
#Keep only wanted columns
final_poverty_columns = ['fips', 'Poverty %, All Ages', 'Median Household Income']
final_poverty_df = newest_poverty_df[final_poverty_columns]
final_poverty_df.head()

Unnamed: 0,fips,"Poverty %, All Ages",Median Household Income
2,1001,13.8,59338
3,1003,9.8,57588
4,1005,30.9,34382
5,1007,21.8,46064
6,1009,13.2,50412


In [37]:
# convert the 'fips' column to integer format
final_poverty_df = final_poverty_df.astype({"fips":'int'})

final_poverty_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3141 entries, 2 to 3193
Data columns (total 3 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   fips                     3141 non-null   int32  
 1   Poverty %, All Ages      3141 non-null   float64
 2   Median Household Income  3141 non-null   int32  
dtypes: float64(1), int32(2)
memory usage: 73.6 KB


## Join Tables

### rural and insurance combo

In [38]:
#join 2 tables based on fips
combined_df1 = newer_rural_df.join(final_insurance_df.set_index('fips'), on='fips')
combined_df1

Unnamed: 0,fips,state_abbr,county,RUCC_2013,RUCC Description,% Uninsured,% Insured
0,1001,AL,Autauga County,2,"Metro - Counties in metro areas of 250,000 to ...",10.0,90.0
1,1003,AL,Baldwin County,3,Metro - Counties in metro areas of fewer than ...,13.2,86.8
2,1005,AL,Barbour County,6,"Nonmetro - Urban population of 2,500 to 19,999...",13.5,86.5
3,1007,AL,Bibb County,1,Metro - Counties in metro areas of 1 million p...,10.6,89.4
4,1009,AL,Blount County,1,Metro - Counties in metro areas of 1 million p...,14.1,85.9
...,...,...,...,...,...,...,...
3216,72145,PR,Vega Baja Municipio,1,Metro - Counties in metro areas of 1 million p...,,
3217,72147,PR,Vieques Municipio,7,"Nonmetro - Urban population of 2,500 to 19,999...",,
3218,72149,PR,Villalba Municipio,2,"Metro - Counties in metro areas of 250,000 to ...",,
3219,72151,PR,Yabucoa Municipio,1,Metro - Counties in metro areas of 1 million p...,,


In [39]:
# Drop rows with no data
new_combined_df1 = combined_df1.dropna()

new_combined_df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3139 entries, 0 to 3142
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   fips              3139 non-null   int64  
 1   state_abbr        3139 non-null   object 
 2   county            3139 non-null   object 
 3   RUCC_2013         3139 non-null   int64  
 4   RUCC Description  3139 non-null   object 
 5   % Uninsured       3139 non-null   float64
 6   % Insured         3139 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 196.2+ KB


### adding poverty data

In [40]:
#join 2 tables based on fips
combined_df2 = new_combined_df1.join(final_poverty_df.set_index('fips'), on='fips')
combined_df2

Unnamed: 0,fips,state_abbr,county,RUCC_2013,RUCC Description,% Uninsured,% Insured,"Poverty %, All Ages",Median Household Income
0,1001,AL,Autauga County,2,"Metro - Counties in metro areas of 250,000 to ...",10.0,90.0,13.8,59338
1,1003,AL,Baldwin County,3,Metro - Counties in metro areas of fewer than ...,13.2,86.8,9.8,57588
2,1005,AL,Barbour County,6,"Nonmetro - Urban population of 2,500 to 19,999...",13.5,86.5,30.9,34382
3,1007,AL,Bibb County,1,Metro - Counties in metro areas of 1 million p...,10.6,89.4,21.8,46064
4,1009,AL,Blount County,1,Metro - Counties in metro areas of 1 million p...,14.1,85.9,13.2,50412
...,...,...,...,...,...,...,...,...,...
3138,56037,WY,Sweetwater County,5,"Nonmetro - Urban population of 20,000 or more,...",11.5,88.5,8.4,73315
3139,56039,WY,Teton County,7,"Nonmetro - Urban population of 2,500 to 19,999...",13.4,86.6,6.3,99087
3140,56041,WY,Uinta County,7,"Nonmetro - Urban population of 2,500 to 19,999...",11.8,88.2,10.0,63401
3141,56043,WY,Washakie County,7,"Nonmetro - Urban population of 2,500 to 19,999...",14.8,85.2,11.9,55190


### adding age data

In [42]:
#join 2 tables based on fips
combined_df3 = combined_df2.join(newest_age_df.set_index('fips'), on='fips')
combined_df3

Unnamed: 0,fips,state_abbr,county,RUCC_2013,RUCC Description,% Uninsured,% Insured,"Poverty %, All Ages",Median Household Income,Median age,...,% age 40-44,% age 45-49,% age 50-54,% age 55-59,% age 60-64,% age 65-69,% age 70-74,% age 75-79,% age 80-84,% > age 84
0,1001,AL,Autauga County,2,"Metro - Counties in metro areas of 250,000 to ...",10.0,90.0,13.8,59338,,...,,,,,,,,,,
1,1003,AL,Baldwin County,3,Metro - Counties in metro areas of fewer than ...,13.2,86.8,9.8,57588,43.0,...,5.4,5.9,6.3,7.4,6.9,6.3,6.7,4.5,2.0,1.9
2,1005,AL,Barbour County,6,"Nonmetro - Urban population of 2,500 to 19,999...",13.5,86.5,30.9,34382,,...,,,,,,,,,,
3,1007,AL,Bibb County,1,Metro - Counties in metro areas of 1 million p...,10.6,89.4,21.8,46064,,...,,,,,,,,,,
4,1009,AL,Blount County,1,Metro - Counties in metro areas of 1 million p...,14.1,85.9,13.2,50412,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3138,56037,WY,Sweetwater County,5,"Nonmetro - Urban population of 20,000 or more,...",11.5,88.5,8.4,73315,,...,,,,,,,,,,
3139,56039,WY,Teton County,7,"Nonmetro - Urban population of 2,500 to 19,999...",13.4,86.6,6.3,99087,,...,,,,,,,,,,
3140,56041,WY,Uinta County,7,"Nonmetro - Urban population of 2,500 to 19,999...",11.8,88.2,10.0,63401,,...,,,,,,,,,,
3141,56043,WY,Washakie County,7,"Nonmetro - Urban population of 2,500 to 19,999...",14.8,85.2,11.9,55190,,...,,,,,,,,,,


### adding race data

In [43]:
#join 2 tables based on fips
combined_df4 = combined_df3.join(final_race_df.set_index('fips'), on='fips')
combined_df4.head()

Unnamed: 0,fips,state_abbr,county,RUCC_2013,RUCC Description,% Uninsured,% Insured,"Poverty %, All Ages",Median Household Income,Median age,...,% age 75-79,% age 80-84,% > age 84,Population,% White alone,% African American alone,% American Indian and Alaska Native alone,% Native Hawaiian and Other Pacific Islander alone,% Some other race alone,% Two or more races
0,1001,AL,Autauga County,2,"Metro - Counties in metro areas of 250,000 to ...",10.0,90.0,13.8,59338,,...,,,,,,,,,,
1,1003,AL,Baldwin County,3,Metro - Counties in metro areas of fewer than ...,13.2,86.8,9.8,57588,43.0,...,4.5,2.0,1.9,218022.0,86.12,9.43,0.59,0.02,1.19,1.59
2,1005,AL,Barbour County,6,"Nonmetro - Urban population of 2,500 to 19,999...",13.5,86.5,30.9,34382,,...,,,,,,,,,,
3,1007,AL,Bibb County,1,Metro - Counties in metro areas of 1 million p...,10.6,89.4,21.8,46064,,...,,,,,,,,,,
4,1009,AL,Blount County,1,Metro - Counties in metro areas of 1 million p...,14.1,85.9,13.2,50412,,...,,,,,,,,,,


In [45]:
combined_df4.to_csv('Demographics_by_County.csv', index=False)