# Forming the county_demographics table

In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from secret import username, password

# EXTRACT

#### All data is in folder named 'Potential Data Sources'
* Downloaded rural-urban codes CSV files from https://www.ers.usda.gov/data-products/rural-urban-continuum-codes/ and placed in folder named 'Rural_or_Urban_Designations'
* Downloaded health insurance census data from https://www.census.gov/acs/www/data/data-tables-and-tools/ and placed in folder named 'US Census-Health Insurance"
* Downloaded all Texas CSV files from https://imis.county.org/iMIS/CountyInformationProgram/QueriesCIP.aspx and placed in folder named 'Texas Only Demographics'

In [2]:
# Study data files
rural_filepath = "../Potential Data Sources/Rural_or_Urban_Designations/ruralurbancodes2013.csv"
insurance_filepath = "../Potential Data Sources/US Census-Health Insurance/sahie_2018.csv"
age_filepath = "../Potential Data Sources/Texas Only Demographics/Texas Age.csv"
race_filepath = "../Potential Data Sources/Texas Only Demographics/Texas Race.csv"
poverty_filepath = "../Potential Data Sources/Texas Only Demographics/Texas Income_Poverty.csv"
general_filepath = "../Potential Data Sources/Texas Only Demographics/Texas General Information.csv"
education_filepath = "../Potential Data Sources/Texas Only Demographics/Texas Education_Unemployment.csv"
population_filepath = "../Potential Data Sources/Texas Only Demographics/Texas Population.csv"

In [3]:
# Read the data into Pandas DataFrame
rural_df = pd.read_csv(rural_filepath)
insurance_df = pd.read_csv(insurance_filepath, skiprows=79)
age_df = pd.read_csv(age_filepath)
race_df = pd.read_csv(race_filepath)
poverty_df = pd.read_csv(poverty_filepath)
general_df = pd.read_csv(general_filepath)
education_df = pd.read_csv(education_filepath)
population_df = pd.read_csv(population_filepath)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


# TRANSFORM

## RURAL FILE

In [4]:
#Look at imported rural CSV
rural_df.head()

Unnamed: 0,FIPS,State,County_Name,Population_2010,RUCC_2013,Description
0,1001,AL,Autauga County,54571,2,"Metro - Counties in metro areas of 250,000 to ..."
1,1003,AL,Baldwin County,182265,3,Metro - Counties in metro areas of fewer than ...
2,1005,AL,Barbour County,27457,6,"Nonmetro - Urban population of 2,500 to 19,999..."
3,1007,AL,Bibb County,22915,1,Metro - Counties in metro areas of 1 million p...
4,1009,AL,Blount County,57322,1,Metro - Counties in metro areas of 1 million p...


In [5]:
# Return only wanted columns for rural table
rural_columns = ['FIPS', 'State', 'County_Name', 'RUCC_2013', 'Description']
new_rural_df = rural_df[rural_columns].copy()

new_rural_df.head()

Unnamed: 0,FIPS,State,County_Name,RUCC_2013,Description
0,1001,AL,Autauga County,2,"Metro - Counties in metro areas of 250,000 to ..."
1,1003,AL,Baldwin County,3,Metro - Counties in metro areas of fewer than ...
2,1005,AL,Barbour County,6,"Nonmetro - Urban population of 2,500 to 19,999..."
3,1007,AL,Bibb County,1,Metro - Counties in metro areas of 1 million p...
4,1009,AL,Blount County,1,Metro - Counties in metro areas of 1 million p...


In [6]:
# Limit to only Texas
rural_Texas=new_rural_df.loc[new_rural_df['State']=='TX']

# Remove County from County_Name column
rural_Texas['County_Name'] = rural_Texas['County_Name'].str.rstrip('County')
rural_Texas['County_Name'] = rural_Texas['County_Name'].str.rstrip()

rural_Texas.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rural_Texas['County_Name'] = rural_Texas['County_Name'].str.rstrip('County')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rural_Texas['County_Name'] = rural_Texas['County_Name'].str.rstrip()


Unnamed: 0,FIPS,State,County_Name,RUCC_2013,Description
2523,48001,TX,Anderson,7,"Nonmetro - Urban population of 2,500 to 19,999..."
2524,48003,TX,Andrews,6,"Nonmetro - Urban population of 2,500 to 19,999..."
2525,48005,TX,Angelina,5,"Nonmetro - Urban population of 20,000 or more,..."
2526,48007,TX,Aransas,2,"Metro - Counties in metro areas of 250,000 to ..."
2527,48009,TX,Archer,3,Metro - Counties in metro areas of fewer than ...


In [7]:
#Remove extra spaces at end of Description
rural_Texas['Description'] = rural_Texas['Description'].str.rstrip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rural_Texas['Description'] = rural_Texas['Description'].str.rstrip()


In [8]:
#Inspect rural table for column type
rural_Texas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 254 entries, 2523 to 2776
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   FIPS         254 non-null    int64 
 1   State        254 non-null    object
 2   County_Name  254 non-null    object
 3   RUCC_2013    254 non-null    int64 
 4   Description  254 non-null    object
dtypes: int64(2), object(3)
memory usage: 11.9+ KB


In [9]:
# Rename the column headers
new_rural_Texas = rural_Texas.rename(columns={'County_Name': 'County','State': 'State_Abbr', 'Description': 'RUCC_Description'})

# Reset index
new_rural_Texas.reset_index(drop=True)

Unnamed: 0,FIPS,State_Abbr,County,RUCC_2013,RUCC_Description
0,48001,TX,Anderson,7,"Nonmetro - Urban population of 2,500 to 19,999..."
1,48003,TX,Andrews,6,"Nonmetro - Urban population of 2,500 to 19,999..."
2,48005,TX,Angelina,5,"Nonmetro - Urban population of 20,000 or more,..."
3,48007,TX,Aransas,2,"Metro - Counties in metro areas of 250,000 to ..."
4,48009,TX,Archer,3,Metro - Counties in metro areas of fewer than ...
...,...,...,...,...,...
249,48499,TX,Wood,6,"Nonmetro - Urban population of 2,500 to 19,999..."
250,48501,TX,Yoakum,7,"Nonmetro - Urban population of 2,500 to 19,999..."
251,48503,TX,Young,7,"Nonmetro - Urban population of 2,500 to 19,999..."
252,48505,TX,Zapata,6,"Nonmetro - Urban population of 2,500 to 19,999..."


## INSURANCE FILE

In [10]:
#Look at imported insurance CSV
insurance_df.head()

Unnamed: 0,year,version,statefips,countyfips,geocat,agecat,racecat,sexcat,iprcat,NIPR,...,pctui_moe,PCTIC,pctic_moe,PCTELIG,pctelig_moe,PCTLIIC,pctliic_moe,state_name,county_name,Unnamed: 25
0,2018,,1,0,40,0,0,0,0,3955117,...,0.3,88.1,0.3,11.9,0.3,88.1,0.3,Alabama ...,,
1,2018,,1,0,40,0,0,0,1,1460808,...,0.6,80.4,0.6,7.2,0.2,29.7,0.4,Alabama ...,,
2,2018,,1,0,40,0,0,0,2,1805111,...,0.6,81.5,0.6,8.4,0.3,37.2,0.4,Alabama ...,,
3,2018,,1,0,40,0,0,0,3,989540,...,0.8,79.4,0.8,5.2,0.2,19.9,0.3,Alabama ...,,
4,2018,,1,0,40,0,0,0,4,2679733,...,0.4,84.5,0.4,10.5,0.3,57.2,0.4,Alabama ...,,


In [11]:
new_insurance_df = insurance_df.loc[(insurance_df['geocat'] == 50) & (insurance_df['agecat'] == 0) & (insurance_df['racecat'] == 0)
                      & (insurance_df['sexcat'] == 0) & (insurance_df['iprcat'] == 0)]
new_insurance_df.head()

Unnamed: 0,year,version,statefips,countyfips,geocat,agecat,racecat,sexcat,iprcat,NIPR,...,pctui_moe,PCTIC,pctic_moe,PCTELIG,pctelig_moe,PCTLIIC,pctliic_moe,state_name,county_name,Unnamed: 25
366,2018,,1,1,50,0,0,0,0,46709,...,1.2,90.0,1.2,10.0,1.2,90.0,1.2,Alabama ...,Autauga County,
462,2018,,1,3,50,0,0,0,0,172071,...,1.2,86.8,1.2,13.2,1.2,86.8,1.2,Alabama ...,Baldwin County,
558,2018,,1,5,50,0,0,0,0,17106,...,1.6,86.5,1.6,13.5,1.6,86.5,1.6,Alabama ...,Barbour County,
654,2018,,1,7,50,0,0,0,0,16643,...,1.4,89.4,1.4,10.6,1.4,89.4,1.4,Alabama ...,Bibb County,
750,2018,,1,9,50,0,0,0,0,47053,...,1.4,85.9,1.4,14.1,1.4,85.9,1.4,Alabama ...,Blount County,


In [12]:
# return only wanted columns
insurance_columns = ['PCTUI', 'PCTIC', 'state_name', 'county_name']
newer_insurance_df = new_insurance_df[insurance_columns].copy()

newer_insurance_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3142 entries, 366 to 320202
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   PCTUI        3142 non-null   object
 1   PCTIC        3142 non-null   object
 2   state_name   3142 non-null   object
 3   county_name  3142 non-null   object
dtypes: object(4)
memory usage: 122.7+ KB


In [13]:
# Remove extra spaces in State column
newer_insurance_df['state_name'] = newer_insurance_df['state_name'].str.strip()

# Remove extra spaces in County column
newer_insurance_df['county_name'] = newer_insurance_df['county_name'].str.strip()

# Limit to only Texas
insurance_Texas=newer_insurance_df.loc[newer_insurance_df['state_name']=='Texas']

In [14]:
# convert the 'PCTUI' and 'PCTIC' column to float format
insurance_Texas = insurance_Texas.astype({"PCTUI":'float', "PCTIC":'float'})

# Rename the column headers
insurance_Texas = insurance_Texas.rename(columns={'PCTUI': 'Percent_Uninsured',
                                                         'PCTIC': 'Percent_Insured',
                                                         'state_name': 'State',
                                                         'county_name': 'County'})

# Remove County from 'County' column
insurance_Texas['County'] = insurance_Texas['County'].str.rstrip('County')
insurance_Texas['County'] = insurance_Texas['County'].str.rstrip()


# Reset index
insurance_Texas.reset_index(drop=True)

Unnamed: 0,Percent_Uninsured,Percent_Insured,State,County
0,19.9,80.1,Texas,Anderson
1,21.0,79.0,Texas,Andrews
2,21.3,78.7,Texas,Angelina
3,24.1,75.9,Texas,Aransas
4,16.5,83.5,Texas,Archer
...,...,...,...,...
249,20.0,80.0,Texas,Wood
250,22.4,77.6,Texas,Yoakum
251,22.0,78.0,Texas,Young
252,28.4,71.6,Texas,Zapata


## AGE FILE

In [15]:
#Look at imported age CSV
age_df.head()

Unnamed: 0,County,Median Age,% Age 17 and Under,% Age 65 and Older,% Age 85 and Older
0,Anderson,39.4,19.02,15.0,1.48
1,Andrews,31.5,31.02,9.8,1.11
2,Angelina,37.5,25.44,16.63,2.0
3,Aransas,51.2,17.73,28.89,2.9
4,Archer,44.1,21.55,19.61,2.02


In [16]:
# Rename the column headers
age_df = age_df.rename(columns={'Median Age':'Median_Age','% Age 17 and Under':'Percent_Age_17_and_Under',
                                '% Age 65 and Older':'Percent_Age_65_and_Older','% Age 85 and Older':'Percent_Age_85_and_Older'})

In [17]:
#Inspect age table for column type
age_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 254 entries, 0 to 253
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   County                    254 non-null    object 
 1   Median_Age                254 non-null    float64
 2   Percent_Age_17_and_Under  254 non-null    float64
 3   Percent_Age_65_and_Older  254 non-null    float64
 4   Percent_Age_85_and_Older  254 non-null    float64
dtypes: float64(4), object(1)
memory usage: 10.0+ KB


## RACE FILE

In [18]:
#Look at imported race CSV
race_df.head()

Unnamed: 0,County,Ethnicity: % Hispanic,Race: % White Alone,Race: % African American Alone,Race: % American Indian & Alaska Native Alone,Race: % Asian Alone,Race: % Native Hawaiian and Other Pacific Islander Alone,Race: % Multi-Racial
0,Anderson,18.02,74.68,21.83,0.7,0.92,0.16,1.71
1,Andrews,56.56,94.2,1.92,1.53,0.79,0.02,1.54
2,Angelina,22.41,81.24,15.49,0.75,1.06,0.07,1.39
3,Aransas,27.47,93.18,1.7,1.25,1.96,0.09,1.83
4,Archer,8.24,95.26,1.17,1.47,0.51,0.03,1.55


In [19]:
# Rename the column headers
race_df = race_df.rename(columns={'Ethnicity: % Hispanic':'Percent_Hispanic','Race: % White Alone':'Percent_White_Alone',
                                  'Race: % African American Alone':'Percent_African_American_Alone',
                                  'Race: % American Indian & Alaska Native Alone':'Percent_American_Indian_&_Alaska_Native_Alone',
                                  'Race: % Asian Alone':'Percent_Asian_Alone',
                                  'Race: % Native Hawaiian and Other Pacific Islander Alone':'Percent_Native_Hawaiian_and_Other_Pacific_Islander_Alone',
                                  'Race: % Multi-Racial':'Percent_Multi_Racial'})

In [20]:
#Inspect race table for column type
race_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 254 entries, 0 to 253
Data columns (total 8 columns):
 #   Column                                                    Non-Null Count  Dtype  
---  ------                                                    --------------  -----  
 0   County                                                    254 non-null    object 
 1   Percent_Hispanic                                          254 non-null    float64
 2   Percent_White_Alone                                       254 non-null    float64
 3   Percent_African_American_Alone                            254 non-null    float64
 4   Percent_American_Indian_&_Alaska_Native_Alone             254 non-null    float64
 5   Percent_Asian_Alone                                       254 non-null    float64
 6   Percent_Native_Hawaiian_and_Other_Pacific_Islander_Alone  254 non-null    float64
 7   Percent_Multi_Racial                                      254 non-null    float64
dtypes: float64(7), object

## POVERTY FILE

In [21]:
#Look at imported poverty CSV
poverty_df.head()

Unnamed: 0,County,Per Capita Income,Total Personal Income,Median Household Income,Average Annual Pay,% of Population in Poverty,% of Population Under 18 in Poverty
0,Anderson,"$34,242","$1,987,998,000","$45,969","$44,146",19.8,22.6
1,Andrews,"$50,011","$906,592,000","$84,946","$68,340",10.7,14.0
2,Angelina,"$38,897","$3,387,655,000","$46,653","$40,464",17.9,26.7
3,Aransas,"$48,389","$1,151,262,000","$46,912","$38,613",19.9,34.7
4,Archer,"$50,310","$442,022,000","$61,190","$38,231",10.6,14.3


In [22]:
# return only wanted columns
new_poverty_df = poverty_df.drop(columns=['Total Personal Income'])

new_poverty_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 254 entries, 0 to 253
Data columns (total 6 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   County                               254 non-null    object 
 1   Per Capita Income                    254 non-null    object 
 2   Median Household Income              254 non-null    object 
 3   Average Annual Pay                   254 non-null    object 
 4   % of Population in Poverty           254 non-null    float64
 5   % of Population Under 18 in Poverty  254 non-null    float64
dtypes: float64(2), object(4)
memory usage: 12.0+ KB


In [23]:
# convert the 'Poverty Percent, All Ages' and 'Median Household Income' column to int format
new_poverty_df['Per Capita Income'] = new_poverty_df['Per Capita Income'].str.replace('$', '').str.replace(',', '').astype(int)
new_poverty_df['Median Household Income'] = new_poverty_df['Median Household Income'].str.replace('$', '').str.replace(',', '').astype(int)
new_poverty_df['Average Annual Pay'] = new_poverty_df['Average Annual Pay'].str.replace('$', '').str.replace(',', '').astype(int)

new_poverty_df.head()

Unnamed: 0,County,Per Capita Income,Median Household Income,Average Annual Pay,% of Population in Poverty,% of Population Under 18 in Poverty
0,Anderson,34242,45969,44146,19.8,22.6
1,Andrews,50011,84946,68340,10.7,14.0
2,Angelina,38897,46653,40464,17.9,26.7
3,Aransas,48389,46912,38613,19.9,34.7
4,Archer,50310,61190,38231,10.6,14.3


In [24]:
# Rename the column headers
new_poverty_df = new_poverty_df.rename(columns={'Per Capita Income':'Per_Capita_Income',
                                                'Median Household Income':'Median_Household_Income',
                                                'Average Annual Pay':'Avg_Annual_Pay',
                                                '% of Population in Poverty':'Percent_Population_in_Poverty',
                                                '% of Population Under 18 in Poverty':'Percent_Population_under_18_in_Poverty'})

In [25]:
new_poverty_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 254 entries, 0 to 253
Data columns (total 6 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   County                                  254 non-null    object 
 1   Per_Capita_Income                       254 non-null    int32  
 2   Median_Household_Income                 254 non-null    int32  
 3   Avg_Annual_Pay                          254 non-null    int32  
 4   Percent_Population_in_Poverty           254 non-null    float64
 5   Percent_Population_under_18_in_Poverty  254 non-null    float64
dtypes: float64(2), int32(3), object(1)
memory usage: 9.1+ KB


## GENERAL FILE

In [26]:
#Look at imported general CSV
general_df.head()

Unnamed: 0,County,Land Area,Water Area,Total Area,Percent Urban,Percent Rural
0,Anderson,1062.6,15.4,1078.0,32.94,67.06
1,Andrews,1500.7,0.4,1501.1,83.5,16.5
2,Angelina,797.8,66.9,864.7,56.92,43.08
3,Aransas,252.1,275.9,528.0,72.74,27.26
4,Archer,903.3,22.3,925.6,11.01,88.99


In [27]:
# Rename the column headers
general_df = general_df.rename(columns={'Percent Urban':'Percent_Urban','Percent Rural': 'Percent_Rural'})

In [28]:
# return only wanted columns
new_general_df = general_df.drop(columns=['Land Area', 'Water Area', 'Total Area'])

new_general_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 254 entries, 0 to 253
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   County         254 non-null    object 
 1   Percent_Urban  254 non-null    float64
 2   Percent_Rural  254 non-null    float64
dtypes: float64(2), object(1)
memory usage: 6.1+ KB


## EDUCATION FILE

In [29]:
#Look at imported education CSV
education_df.head()

Unnamed: 0,County,% High School Graduate or Higher,% Bachelor s Degree or Higher,Unemployment Rate (%)
0,Anderson,80.2,11.8,3.6
1,Andrews,73.8,10.6,3.1
2,Angelina,79.9,15.7,5.1
3,Aransas,83.2,20.2,6.8
4,Archer,90.1,21.8,3.4


In [30]:
# Rename the column headers
new_education_df = education_df.rename(columns={'% High School Graduate or Higher':'Percent_HS_Graduate_or_Higher',
                                                '% Bachelor s Degree or Higher': 'Percent_Bachelors_Degree_or_Higher',
                                                'Unemployment Rate (%)':'Percent_Unemployed'})

In [31]:
new_education_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 254 entries, 0 to 253
Data columns (total 4 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   County                              254 non-null    object 
 1   Percent_HS_Graduate_or_Higher       254 non-null    float64
 2   Percent_Bachelors_Degree_or_Higher  254 non-null    float64
 3   Percent_Unemployed                  254 non-null    float64
dtypes: float64(3), object(1)
memory usage: 8.1+ KB


## POPULATION FILE

In [32]:
#Look at imported population CSV
population_df.head()

Unnamed: 0,County,County Population,Population Density Per Sq Mile,County Seat,County Seat Population
0,Anderson,57735,55.01,Palestine,18712
1,Andrews,18705,9.85,Andrews,11088
2,Angelina,86715,108.77,Lufkin,35067
3,Aransas,23510,91.87,Rockport,8766
4,Archer,8553,10.03,Archer City,1834


In [33]:
# return only wanted columns
new_population_df = population_df.drop(columns=['County Seat', 'County Seat Population'])

new_population_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 254 entries, 0 to 253
Data columns (total 3 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   County                           254 non-null    object
 1   County Population                254 non-null    object
 2    Population Density Per Sq Mile  254 non-null    object
dtypes: object(3)
memory usage: 6.1+ KB


In [34]:
# Rename the column headers
newer_population_df = new_population_df.rename(columns={'County Population':'County_Population',
                                                        ' Population Density Per Sq Mile': 'Population_Density_per_Sq_Mile'})

In [35]:
# convert the 'County Population' and 'Population Density Per Sq Mile' column to int format
newer_population_df['County_Population'] = newer_population_df['County_Population'].str.replace(',', '').astype(int)
newer_population_df['Population_Density_per_Sq_Mile'] = newer_population_df['Population_Density_per_Sq_Mile'].str.replace(',', '').astype(float)

newer_population_df.head()

Unnamed: 0,County,County_Population,Population_Density_per_Sq_Mile
0,Anderson,57735,55.01
1,Andrews,18705,9.85
2,Angelina,86715,108.77
3,Aransas,23510,91.87
4,Archer,8553,10.03


In [36]:
newer_population_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 254 entries, 0 to 253
Data columns (total 3 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   County                          254 non-null    object 
 1   County_Population               254 non-null    int32  
 2   Population_Density_per_Sq_Mile  254 non-null    float64
dtypes: float64(1), int32(1), object(1)
memory usage: 5.1+ KB


## Join Tables

### age and race data

In [37]:
#join 2 tables based on county
combined_df1 = age_df.join(race_df.set_index('County'), on='County')
combined_df1

Unnamed: 0,County,Median_Age,Percent_Age_17_and_Under,Percent_Age_65_and_Older,Percent_Age_85_and_Older,Percent_Hispanic,Percent_White_Alone,Percent_African_American_Alone,Percent_American_Indian_&_Alaska_Native_Alone,Percent_Asian_Alone,Percent_Native_Hawaiian_and_Other_Pacific_Islander_Alone,Percent_Multi_Racial
0,Anderson,39.4,19.02,15.00,1.48,18.02,74.68,21.83,0.70,0.92,0.16,1.71
1,Andrews,31.5,31.02,9.80,1.11,56.56,94.20,1.92,1.53,0.79,0.02,1.54
2,Angelina,37.5,25.44,16.63,2.00,22.41,81.24,15.49,0.75,1.06,0.07,1.39
3,Aransas,51.2,17.73,28.89,2.90,27.47,93.18,1.70,1.25,1.96,0.09,1.83
4,Archer,44.1,21.55,19.61,2.02,8.24,95.26,1.17,1.47,0.51,0.03,1.55
...,...,...,...,...,...,...,...,...,...,...,...,...
249,Wood,48.9,18.86,27.51,2.80,10.18,91.25,5.55,1.04,0.58,0.07,1.52
250,Yoakum,30.6,32.70,11.86,1.33,66.82,95.09,1.43,1.72,0.55,0.03,1.18
251,Young,41.6,23.66,20.96,2.82,18.94,94.85,1.57,1.26,0.78,0.07,1.48
252,Zapata,30.0,33.06,13.16,1.47,94.54,98.53,0.40,0.47,0.28,0.01,0.32


### adding poverty data

In [38]:
#join 2 tables based on County
combined_df2 = combined_df1.join(new_poverty_df.set_index('County'), on='County')
combined_df2

Unnamed: 0,County,Median_Age,Percent_Age_17_and_Under,Percent_Age_65_and_Older,Percent_Age_85_and_Older,Percent_Hispanic,Percent_White_Alone,Percent_African_American_Alone,Percent_American_Indian_&_Alaska_Native_Alone,Percent_Asian_Alone,Percent_Native_Hawaiian_and_Other_Pacific_Islander_Alone,Percent_Multi_Racial,Per_Capita_Income,Median_Household_Income,Avg_Annual_Pay,Percent_Population_in_Poverty,Percent_Population_under_18_in_Poverty
0,Anderson,39.4,19.02,15.00,1.48,18.02,74.68,21.83,0.70,0.92,0.16,1.71,34242,45969,44146,19.8,22.6
1,Andrews,31.5,31.02,9.80,1.11,56.56,94.20,1.92,1.53,0.79,0.02,1.54,50011,84946,68340,10.7,14.0
2,Angelina,37.5,25.44,16.63,2.00,22.41,81.24,15.49,0.75,1.06,0.07,1.39,38897,46653,40464,17.9,26.7
3,Aransas,51.2,17.73,28.89,2.90,27.47,93.18,1.70,1.25,1.96,0.09,1.83,48389,46912,38613,19.9,34.7
4,Archer,44.1,21.55,19.61,2.02,8.24,95.26,1.17,1.47,0.51,0.03,1.55,50310,61190,38231,10.6,14.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249,Wood,48.9,18.86,27.51,2.80,10.18,91.25,5.55,1.04,0.58,0.07,1.52,38152,48384,37726,15.3,24.9
250,Yoakum,30.6,32.70,11.86,1.33,66.82,95.09,1.43,1.72,0.55,0.03,1.18,41112,61560,62190,12.1,16.3
251,Young,41.6,23.66,20.96,2.82,18.94,94.85,1.57,1.26,0.78,0.07,1.48,50231,49301,42430,13.9,20.4
252,Zapata,30.0,33.06,13.16,1.47,94.54,98.53,0.40,0.47,0.28,0.01,0.32,28294,33160,50175,32.1,49.1


### adding general data

In [39]:
#join 2 tables based on County
combined_df3 = combined_df2.join(new_general_df.set_index('County'), on='County')
combined_df3

Unnamed: 0,County,Median_Age,Percent_Age_17_and_Under,Percent_Age_65_and_Older,Percent_Age_85_and_Older,Percent_Hispanic,Percent_White_Alone,Percent_African_American_Alone,Percent_American_Indian_&_Alaska_Native_Alone,Percent_Asian_Alone,Percent_Native_Hawaiian_and_Other_Pacific_Islander_Alone,Percent_Multi_Racial,Per_Capita_Income,Median_Household_Income,Avg_Annual_Pay,Percent_Population_in_Poverty,Percent_Population_under_18_in_Poverty,Percent_Urban,Percent_Rural
0,Anderson,39.4,19.02,15.00,1.48,18.02,74.68,21.83,0.70,0.92,0.16,1.71,34242,45969,44146,19.8,22.6,32.94,67.06
1,Andrews,31.5,31.02,9.80,1.11,56.56,94.20,1.92,1.53,0.79,0.02,1.54,50011,84946,68340,10.7,14.0,83.50,16.50
2,Angelina,37.5,25.44,16.63,2.00,22.41,81.24,15.49,0.75,1.06,0.07,1.39,38897,46653,40464,17.9,26.7,56.92,43.08
3,Aransas,51.2,17.73,28.89,2.90,27.47,93.18,1.70,1.25,1.96,0.09,1.83,48389,46912,38613,19.9,34.7,72.74,27.26
4,Archer,44.1,21.55,19.61,2.02,8.24,95.26,1.17,1.47,0.51,0.03,1.55,50310,61190,38231,10.6,14.3,11.01,88.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249,Wood,48.9,18.86,27.51,2.80,10.18,91.25,5.55,1.04,0.58,0.07,1.52,38152,48384,37726,15.3,24.9,25.80,74.20
250,Yoakum,30.6,32.70,11.86,1.33,66.82,95.09,1.43,1.72,0.55,0.03,1.18,41112,61560,62190,12.1,16.3,62.67,37.33
251,Young,41.6,23.66,20.96,2.82,18.94,94.85,1.57,1.26,0.78,0.07,1.48,50231,49301,42430,13.9,20.4,66.43,33.57
252,Zapata,30.0,33.06,13.16,1.47,94.54,98.53,0.40,0.47,0.28,0.01,0.32,28294,33160,50175,32.1,49.1,76.47,23.53


### adding education data

In [40]:
#join 2 tables based on County
combined_df4 = combined_df3.join(new_education_df.set_index('County'), on='County')
combined_df4

Unnamed: 0,County,Median_Age,Percent_Age_17_and_Under,Percent_Age_65_and_Older,Percent_Age_85_and_Older,Percent_Hispanic,Percent_White_Alone,Percent_African_American_Alone,Percent_American_Indian_&_Alaska_Native_Alone,Percent_Asian_Alone,...,Per_Capita_Income,Median_Household_Income,Avg_Annual_Pay,Percent_Population_in_Poverty,Percent_Population_under_18_in_Poverty,Percent_Urban,Percent_Rural,Percent_HS_Graduate_or_Higher,Percent_Bachelors_Degree_or_Higher,Percent_Unemployed
0,Anderson,39.4,19.02,15.00,1.48,18.02,74.68,21.83,0.70,0.92,...,34242,45969,44146,19.8,22.6,32.94,67.06,80.2,11.8,3.6
1,Andrews,31.5,31.02,9.80,1.11,56.56,94.20,1.92,1.53,0.79,...,50011,84946,68340,10.7,14.0,83.50,16.50,73.8,10.6,3.1
2,Angelina,37.5,25.44,16.63,2.00,22.41,81.24,15.49,0.75,1.06,...,38897,46653,40464,17.9,26.7,56.92,43.08,79.9,15.7,5.1
3,Aransas,51.2,17.73,28.89,2.90,27.47,93.18,1.70,1.25,1.96,...,48389,46912,38613,19.9,34.7,72.74,27.26,83.2,20.2,6.8
4,Archer,44.1,21.55,19.61,2.02,8.24,95.26,1.17,1.47,0.51,...,50310,61190,38231,10.6,14.3,11.01,88.99,90.1,21.8,3.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249,Wood,48.9,18.86,27.51,2.80,10.18,91.25,5.55,1.04,0.58,...,38152,48384,37726,15.3,24.9,25.80,74.20,85.1,17.1,4.7
250,Yoakum,30.6,32.70,11.86,1.33,66.82,95.09,1.43,1.72,0.55,...,41112,61560,62190,12.1,16.3,62.67,37.33,71.4,17.8,3.5
251,Young,41.6,23.66,20.96,2.82,18.94,94.85,1.57,1.26,0.78,...,50231,49301,42430,13.9,20.4,66.43,33.57,81.6,18.6,3.8
252,Zapata,30.0,33.06,13.16,1.47,94.54,98.53,0.40,0.47,0.28,...,28294,33160,50175,32.1,49.1,76.47,23.53,50.3,8.3,7.3


### adding population data

In [41]:
#join 2 tables based on County
combined_df5 = combined_df4.join(newer_population_df.set_index('County'), on='County')
combined_df5

Unnamed: 0,County,Median_Age,Percent_Age_17_and_Under,Percent_Age_65_and_Older,Percent_Age_85_and_Older,Percent_Hispanic,Percent_White_Alone,Percent_African_American_Alone,Percent_American_Indian_&_Alaska_Native_Alone,Percent_Asian_Alone,...,Avg_Annual_Pay,Percent_Population_in_Poverty,Percent_Population_under_18_in_Poverty,Percent_Urban,Percent_Rural,Percent_HS_Graduate_or_Higher,Percent_Bachelors_Degree_or_Higher,Percent_Unemployed,County_Population,Population_Density_per_Sq_Mile
0,Anderson,39.4,19.02,15.00,1.48,18.02,74.68,21.83,0.70,0.92,...,44146,19.8,22.6,32.94,67.06,80.2,11.8,3.6,57735,55.01
1,Andrews,31.5,31.02,9.80,1.11,56.56,94.20,1.92,1.53,0.79,...,68340,10.7,14.0,83.50,16.50,73.8,10.6,3.1,18705,9.85
2,Angelina,37.5,25.44,16.63,2.00,22.41,81.24,15.49,0.75,1.06,...,40464,17.9,26.7,56.92,43.08,79.9,15.7,5.1,86715,108.77
3,Aransas,51.2,17.73,28.89,2.90,27.47,93.18,1.70,1.25,1.96,...,38613,19.9,34.7,72.74,27.26,83.2,20.2,6.8,23510,91.87
4,Archer,44.1,21.55,19.61,2.02,8.24,95.26,1.17,1.47,0.51,...,38231,10.6,14.3,11.01,88.99,90.1,21.8,3.4,8553,10.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249,Wood,48.9,18.86,27.51,2.80,10.18,91.25,5.55,1.04,0.58,...,37726,15.3,24.9,25.80,74.20,85.1,17.1,4.7,45539,65.04
250,Yoakum,30.6,32.70,11.86,1.33,66.82,95.09,1.43,1.72,0.55,...,62190,12.1,16.3,62.67,37.33,71.4,17.8,3.5,8713,9.85
251,Young,41.6,23.66,20.96,2.82,18.94,94.85,1.57,1.26,0.78,...,42430,13.9,20.4,66.43,33.57,81.6,18.6,3.8,18010,20.29
252,Zapata,30.0,33.06,13.16,1.47,94.54,98.53,0.40,0.47,0.28,...,50175,32.1,49.1,76.47,23.53,50.3,8.3,7.3,14179,14.04


### adding insurance data

In [42]:
#join 2 tables based on County
combined_df6 = combined_df5.join(insurance_Texas.set_index('County'), on='County')
combined_df6

Unnamed: 0,County,Median_Age,Percent_Age_17_and_Under,Percent_Age_65_and_Older,Percent_Age_85_and_Older,Percent_Hispanic,Percent_White_Alone,Percent_African_American_Alone,Percent_American_Indian_&_Alaska_Native_Alone,Percent_Asian_Alone,...,Percent_Urban,Percent_Rural,Percent_HS_Graduate_or_Higher,Percent_Bachelors_Degree_or_Higher,Percent_Unemployed,County_Population,Population_Density_per_Sq_Mile,Percent_Uninsured,Percent_Insured,State
0,Anderson,39.4,19.02,15.00,1.48,18.02,74.68,21.83,0.70,0.92,...,32.94,67.06,80.2,11.8,3.6,57735,55.01,19.9,80.1,Texas
1,Andrews,31.5,31.02,9.80,1.11,56.56,94.20,1.92,1.53,0.79,...,83.50,16.50,73.8,10.6,3.1,18705,9.85,21.0,79.0,Texas
2,Angelina,37.5,25.44,16.63,2.00,22.41,81.24,15.49,0.75,1.06,...,56.92,43.08,79.9,15.7,5.1,86715,108.77,21.3,78.7,Texas
3,Aransas,51.2,17.73,28.89,2.90,27.47,93.18,1.70,1.25,1.96,...,72.74,27.26,83.2,20.2,6.8,23510,91.87,24.1,75.9,Texas
4,Archer,44.1,21.55,19.61,2.02,8.24,95.26,1.17,1.47,0.51,...,11.01,88.99,90.1,21.8,3.4,8553,10.03,16.5,83.5,Texas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249,Wood,48.9,18.86,27.51,2.80,10.18,91.25,5.55,1.04,0.58,...,25.80,74.20,85.1,17.1,4.7,45539,65.04,20.0,80.0,Texas
250,Yoakum,30.6,32.70,11.86,1.33,66.82,95.09,1.43,1.72,0.55,...,62.67,37.33,71.4,17.8,3.5,8713,9.85,22.4,77.6,Texas
251,Young,41.6,23.66,20.96,2.82,18.94,94.85,1.57,1.26,0.78,...,66.43,33.57,81.6,18.6,3.8,18010,20.29,22.0,78.0,Texas
252,Zapata,30.0,33.06,13.16,1.47,94.54,98.53,0.40,0.47,0.28,...,76.47,23.53,50.3,8.3,7.3,14179,14.04,28.4,71.6,Texas


### adding rural data

In [43]:
#join 2 tables based on County
combined_df7 = combined_df6.join(new_rural_Texas.set_index('County'), on='County')
combined_df7

Unnamed: 0,County,Median_Age,Percent_Age_17_and_Under,Percent_Age_65_and_Older,Percent_Age_85_and_Older,Percent_Hispanic,Percent_White_Alone,Percent_African_American_Alone,Percent_American_Indian_&_Alaska_Native_Alone,Percent_Asian_Alone,...,Percent_Unemployed,County_Population,Population_Density_per_Sq_Mile,Percent_Uninsured,Percent_Insured,State,FIPS,State_Abbr,RUCC_2013,RUCC_Description
0,Anderson,39.4,19.02,15.00,1.48,18.02,74.68,21.83,0.70,0.92,...,3.6,57735,55.01,19.9,80.1,Texas,48001,TX,7,"Nonmetro - Urban population of 2,500 to 19,999..."
1,Andrews,31.5,31.02,9.80,1.11,56.56,94.20,1.92,1.53,0.79,...,3.1,18705,9.85,21.0,79.0,Texas,48003,TX,6,"Nonmetro - Urban population of 2,500 to 19,999..."
2,Angelina,37.5,25.44,16.63,2.00,22.41,81.24,15.49,0.75,1.06,...,5.1,86715,108.77,21.3,78.7,Texas,48005,TX,5,"Nonmetro - Urban population of 20,000 or more,..."
3,Aransas,51.2,17.73,28.89,2.90,27.47,93.18,1.70,1.25,1.96,...,6.8,23510,91.87,24.1,75.9,Texas,48007,TX,2,"Metro - Counties in metro areas of 250,000 to ..."
4,Archer,44.1,21.55,19.61,2.02,8.24,95.26,1.17,1.47,0.51,...,3.4,8553,10.03,16.5,83.5,Texas,48009,TX,3,Metro - Counties in metro areas of fewer than ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249,Wood,48.9,18.86,27.51,2.80,10.18,91.25,5.55,1.04,0.58,...,4.7,45539,65.04,20.0,80.0,Texas,48499,TX,6,"Nonmetro - Urban population of 2,500 to 19,999..."
250,Yoakum,30.6,32.70,11.86,1.33,66.82,95.09,1.43,1.72,0.55,...,3.5,8713,9.85,22.4,77.6,Texas,48501,TX,7,"Nonmetro - Urban population of 2,500 to 19,999..."
251,Young,41.6,23.66,20.96,2.82,18.94,94.85,1.57,1.26,0.78,...,3.8,18010,20.29,22.0,78.0,Texas,48503,TX,7,"Nonmetro - Urban population of 2,500 to 19,999..."
252,Zapata,30.0,33.06,13.16,1.47,94.54,98.53,0.40,0.47,0.28,...,7.3,14179,14.04,28.4,71.6,Texas,48505,TX,6,"Nonmetro - Urban population of 2,500 to 19,999..."


In [44]:
combined_df7.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 254 entries, 0 to 253
Data columns (total 31 columns):
 #   Column                                                    Non-Null Count  Dtype  
---  ------                                                    --------------  -----  
 0   County                                                    254 non-null    object 
 1   Median_Age                                                254 non-null    float64
 2   Percent_Age_17_and_Under                                  254 non-null    float64
 3   Percent_Age_65_and_Older                                  254 non-null    float64
 4   Percent_Age_85_and_Older                                  254 non-null    float64
 5   Percent_Hispanic                                          254 non-null    float64
 6   Percent_White_Alone                                       254 non-null    float64
 7   Percent_African_American_Alone                            254 non-null    float64
 8   Percent_American_Ind

## Export to CSV

In [45]:
combined_df7.to_csv('Texas_Demographics_by_County.csv', index=False)

In [46]:
connection_string = f'{username}:{password}@localhost:5432/Covid_19'
engine = create_engine(f'postgresql://{connection_string}')

In [47]:
# Confirm tables
engine.table_names()

['county_demographics', 'county_daily_data']

In [48]:
combined_df7.to_sql(name='county_demographics', con=engine, if_exists='append', index=False)