In [209]:
#Import dependencies
import pandas as pd
from sqlalchemy import create_engine
import psycopg2

from config import db_password

In [210]:
# Remove the cap on displayed columns so wide DataFrames render every column
pd.options.display.max_columns = None

In [211]:
# Load the Vaccine Hesitancy dataset from a CSV given by a relative path,
# then preview the first rows to confirm it parsed as expected.
file_path = "Vaccine_Hesitancy_for_COVID-19__County_and_local_estimates.csv"
vaccine_hesitancy_df = pd.read_csv(file_path)
vaccine_hesitancy_df.head()

Unnamed: 0,FIPS Code,County Name,State,Estimated hesitant,Estimated strongly hesitant,Social Vulnerability Index (SVI),SVI Category,Ability to handle a COVID-19 outbreak (CVAC),CVAC Category,Percent adults fully vaccinated against COVID-19,Percent Hispanic,Percent non-Hispanic American Indian/Alaska Native,Percent non-Hispanic Asian,Percent non-Hispanic Black,Percent non-Hispanic Native Hawaiian/Pacific Islander,Percent non-Hispanic White,Geographical Point,State Code,County Boundary,State Boundary
0,1123,"Tallapoosa County, Alabama",ALABAMA,0.23,0.12,0.89,Very High Vulnerability,0.64,High Vulnerability,0.161,0.0242,0.0022,0.0036,0.2697,0.0,0.6887,POINT (-86.844516 32.756889),AL,"MULTIPOLYGON (((-85.841259 33.104456, -85.8409...","MULTIPOLYGON (((-88.139988 34.581703, -88.1352..."
1,1121,"Talladega County, Alabama",ALABAMA,0.23,0.11,0.87,Very High Vulnerability,0.84,Very High Vulnerability,0.133,0.0229,0.0043,0.0061,0.3237,0.0003,0.6263,POINT (-86.844516 32.756889),AL,"MULTIPOLYGON (((-86.303069 33.46316, -86.30306...","MULTIPOLYGON (((-88.139988 34.581703, -88.1352..."
2,1131,"Wilcox County, Alabama",ALABAMA,0.23,0.11,0.93,Very High Vulnerability,0.94,Very High Vulnerability,0.228,0.0053,0.0009,0.0003,0.6938,0.0,0.2684,POINT (-86.844516 32.756889),AL,"MULTIPOLYGON (((-87.52534299999999 32.132773, ...","MULTIPOLYGON (((-88.139988 34.581703, -88.1352..."
3,1129,"Washington County, Alabama",ALABAMA,0.23,0.11,0.73,High Vulnerability,0.82,Very High Vulnerability,0.192,0.0146,0.0731,0.0025,0.2354,0.0,0.6495,POINT (-86.844516 32.756889),AL,"MULTIPOLYGON (((-88.45317899999999 31.505388, ...","MULTIPOLYGON (((-88.139988 34.581703, -88.1352..."
4,1133,"Winston County, Alabama",ALABAMA,0.22,0.11,0.7,High Vulnerability,0.8,High Vulnerability,0.085,0.0315,0.0034,0.0016,0.0073,0.0005,0.937,POINT (-86.844516 32.756889),AL,"MULTIPOLYGON (((-87.63656399999999 34.120908, ...","MULTIPOLYGON (((-88.139988 34.581703, -88.1352..."


In [212]:
# Count the records per state; value_counts lists the most frequent state first
vaccine_hesitancy_df["State"].value_counts()

TEXAS                   254
GEORGIA                 159
VIRGINIA                133
KENTUCKY                120
MISSOURI                115
KANSAS                  105
ILLINOIS                102
NORTH CAROLINA          100
IOWA                     99
TENNESSEE                95
NEBRASKA                 93
INDIANA                  92
OHIO                     88
MINNESOTA                87
MICHIGAN                 83
MISSISSIPPI              82
OKLAHOMA                 77
ARKANSAS                 75
WISCONSIN                72
ALABAMA                  67
PENNSYLVANIA             67
FLORIDA                  67
SOUTH DAKOTA             66
LOUISIANA                64
COLORADO                 64
NEW YORK                 62
CALIFORNIA               58
MONTANA                  56
WEST VIRGINIA            55
NORTH DAKOTA             53
SOUTH CAROLINA           46
IDAHO                    44
WASHINGTON               39
OREGON                   36
NEW MEXICO               33
ALASKA              

In [213]:
# Count the missing (null) values in each column
vaccine_hesitancy_df.isnull().sum()

FIPS Code                                                  0
County Name                                                0
State                                                      0
Estimated hesitant                                         0
Estimated strongly hesitant                                0
Social Vulnerability Index (SVI)                           1
SVI Category                                               0
Ability to handle a COVID-19 outbreak (CVAC)               0
CVAC Category                                              0
Percent adults fully vaccinated against COVID-19         316
Percent Hispanic                                           0
Percent non-Hispanic American Indian/Alaska Native         0
Percent non-Hispanic Asian                                 0
Percent non-Hispanic Black                                 0
Percent non-Hispanic Native Hawaiian/Pacific Islander      0
Percent non-Hispanic White                                 0
Geographical Point      

In [214]:
# NOTE(review): this filter, which would drop rows with a missing SVI value, is
# commented out, so the null re-check in the next cell reports unchanged counts.
# Confirm whether it should be re-enabled or removed.
#vaccine_hesitancy_df = vaccine_hesitancy_df[vaccine_hesitancy_df['Social Vulnerability Index (SVI)'].notna()]

In [215]:
# Re-count missing values per column. The SVI filter in the preceding cell is
# commented out, so these counts match the earlier check.
vaccine_hesitancy_df.isnull().sum()

FIPS Code                                                  0
County Name                                                0
State                                                      0
Estimated hesitant                                         0
Estimated strongly hesitant                                0
Social Vulnerability Index (SVI)                           1
SVI Category                                               0
Ability to handle a COVID-19 outbreak (CVAC)               0
CVAC Category                                              0
Percent adults fully vaccinated against COVID-19         316
Percent Hispanic                                           0
Percent non-Hispanic American Indian/Alaska Native         0
Percent non-Hispanic Asian                                 0
Percent non-Hispanic Black                                 0
Percent non-Hispanic Native Hawaiian/Pacific Islander      0
Percent non-Hispanic White                                 0
Geographical Point      

In [216]:
# List every column label in the vaccine hesitancy DataFrame
vaccine_hesitancy_df.columns

Index(['FIPS Code', 'County Name', 'State', 'Estimated hesitant',
       'Estimated strongly hesitant', 'Social Vulnerability Index (SVI)',
       'SVI Category', 'Ability to handle a COVID-19 outbreak (CVAC)',
       'CVAC Category', 'Percent adults fully vaccinated against COVID-19',
       'Percent Hispanic',
       'Percent non-Hispanic American Indian/Alaska Native',
       'Percent non-Hispanic Asian', 'Percent non-Hispanic Black',
       'Percent non-Hispanic Native Hawaiian/Pacific Islander',
       'Percent non-Hispanic White', 'Geographical Point', 'State Code',
       'County Boundary', 'State Boundary'],
      dtype='object')

In [217]:
# Show the data type pandas inferred for each column
vaccine_hesitancy_df.dtypes

FIPS Code                                                  int64
County Name                                               object
State                                                     object
Estimated hesitant                                       float64
Estimated strongly hesitant                              float64
Social Vulnerability Index (SVI)                         float64
SVI Category                                              object
Ability to handle a COVID-19 outbreak (CVAC)             float64
CVAC Category                                             object
Percent adults fully vaccinated against COVID-19         float64
Percent Hispanic                                         float64
Percent non-Hispanic American Indian/Alaska Native       float64
Percent non-Hispanic Asian                               float64
Percent non-Hispanic Black                               float64
Percent non-Hispanic Native Hawaiian/Pacific Islander    float64
Percent non-Hispanic Whit

In [218]:
# Count the non-null entries in each column
vaccine_hesitancy_df.count()

FIPS Code                                                3142
County Name                                              3142
State                                                    3142
Estimated hesitant                                       3142
Estimated strongly hesitant                              3142
Social Vulnerability Index (SVI)                         3141
SVI Category                                             3142
Ability to handle a COVID-19 outbreak (CVAC)             3142
CVAC Category                                            3142
Percent adults fully vaccinated against COVID-19         2826
Percent Hispanic                                         3142
Percent non-Hispanic American Indian/Alaska Native       3142
Percent non-Hispanic Asian                               3142
Percent non-Hispanic Black                               3142
Percent non-Hispanic Native Hawaiian/Pacific Islander    3142
Percent non-Hispanic White                               3142
Geograph

In [219]:
 # Rename the columns in the vaccine_hesitancy DataFrame.
# Each key is an original CSV header and each value is its short snake_case
# replacement; rename() returns a new frame that is bound back to the same name.
vaccine_hesitancy_df = vaccine_hesitancy_df.rename(
    columns={
        "FIPS Code": "fips_code",
        "County Name": "county",
        "State": "state",
        "Estimated hesitant": "est_hesitant",
        "Estimated strongly hesitant": "est_strongly_hesitant",
        "Social Vulnerability Index (SVI)": "svi",
        "SVI Category": "svi_category",
        "Ability to handle a COVID-19 outbreak (CVAC)": "cvac",
        "CVAC Category": "cvac_category",
        "Percent adults fully vaccinated against COVID-19": "percent_fully_vaccinated",
        "Percent Hispanic": "percent_hispanic",
        "Percent non-Hispanic American Indian/Alaska Native": "percent_american_indian_alaska_native",
        "Percent non-Hispanic Asian": "percent_asian",
        "Percent non-Hispanic Black": "percent_black",
        "Percent non-Hispanic Native Hawaiian/Pacific Islander": "percent_hawaiian_pacific",
        "Percent non-Hispanic White": "percent_white",
        "Geographical Point": "geographical_point",
        "State Code": "state_code",
        "County Boundary": "county_boundary",
        "State Boundary": "state_boundary",
    }
)

In [220]:
# Confirm the renamed columns; dtypes lists each column name alongside its type
vaccine_hesitancy_df.dtypes

fips_code                                  int64
county                                    object
state                                     object
est_hesitant                             float64
est_strongly_hesitant                    float64
svi                                      float64
svi_category                              object
cvac                                     float64
cvac_category                             object
percent_fully_vaccinated                 float64
percent_hispanic                         float64
percent_american_indian_alaska_native    float64
percent_asian                            float64
percent_black                            float64
percent_hawaiian_pacific                 float64
percent_white                            float64
geographical_point                        object
state_code                                object
county_boundary                           object
state_boundary                            object
dtype: object

In [221]:
# Sort the rows by ascending FIPS code (original index labels are kept), then preview
vaccine_hesitancy_df = vaccine_hesitancy_df.sort_values("fips_code")
vaccine_hesitancy_df.head()

Unnamed: 0,fips_code,county,state,est_hesitant,est_strongly_hesitant,svi,svi_category,cvac,cvac_category,percent_fully_vaccinated,percent_hispanic,percent_american_indian_alaska_native,percent_asian,percent_black,percent_hawaiian_pacific,percent_white,geographical_point,state_code,county_boundary,state_boundary
273,1001,"Autauga County, Alabama",ALABAMA,0.22,0.1,0.44,Moderate Vulnerability,0.61,High Vulnerability,0.114,0.0283,0.0025,0.0103,0.19,0.0001,0.746,POINT (-86.844516 32.756889),AL,MULTIPOLYGON (((-86.90309599999999 32.54062599...,"MULTIPOLYGON (((-88.139988 34.581703, -88.1352..."
282,1003,"Baldwin County, Alabama",ALABAMA,0.2,0.1,0.22,Low Vulnerability,0.23,Low Vulnerability,0.176,0.0456,0.0065,0.0092,0.0917,0.0,0.8307,POINT (-86.844516 32.756889),AL,"MULTIPOLYGON (((-87.990684 30.55549, -87.98783...","MULTIPOLYGON (((-88.139988 34.581703, -88.1352..."
309,1005,"Barbour County, Alabama",ALABAMA,0.23,0.11,1.0,Very High Vulnerability,0.89,Very High Vulnerability,0.128,0.0436,0.0029,0.0048,0.4744,0.0,0.4581,POINT (-86.844516 32.756889),AL,"MULTIPOLYGON (((-85.429819 32.045983, -85.4303...","MULTIPOLYGON (((-88.139988 34.581703, -88.1352..."
338,1007,"Bibb County, Alabama",ALABAMA,0.24,0.12,0.6,High Vulnerability,0.76,High Vulnerability,0.115,0.0257,0.0013,0.0012,0.2214,0.0,0.7453,POINT (-86.844516 32.756889),AL,"MULTIPOLYGON (((-87.312265 33.086219, -87.3121...","MULTIPOLYGON (((-88.139988 34.581703, -88.1352..."
375,1009,"Blount County, Alabama",ALABAMA,0.23,0.11,0.42,Moderate Vulnerability,0.8,High Vulnerability,0.095,0.0926,0.0007,0.0037,0.0153,0.0004,0.8689,POINT (-86.844516 32.756889),AL,"MULTIPOLYGON (((-86.74918799999999 33.997596, ...","MULTIPOLYGON (((-88.139988 34.581703, -88.1352..."


In [222]:
# Load the county statistics (presidential election data) dataset from a CSV
# given by a relative path, then preview the first ten rows.
file = "county_statistics.csv"
voting_pres_df = pd.read_csv(file)
voting_pres_df.head(10)

Unnamed: 0.1,Unnamed: 0,county,state,percentage16_Donald_Trump,percentage16_Hillary_Clinton,total_votes16,votes16_Donald_Trump,votes16_Hillary_Clinton,percentage20_Donald_Trump,percentage20_Joe_Biden,total_votes20,votes20_Donald_Trump,votes20_Joe_Biden,lat,long,cases,deaths,TotalPop,Men,Women,Hispanic,White,Black,Native,Asian,Pacific,VotingAgeCitizen,Income,IncomeErr,IncomePerCap,IncomePerCapErr,Poverty,ChildPoverty,Professional,Service,Office,Construction,Production,Drive,Carpool,Transit,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
0,0,Abbeville,SC,0.629,0.346,10724.0,6742.0,3712.0,0.661,0.33,12433.0,8215.0,4101.0,34.223334,-82.461707,805.0,17.0,24788.0,12044.0,12744.0,1.3,68.9,27.6,0.1,0.3,0.0,19452.0,35254.0,2259.0,19234.0,799.0,22.7,32.1,27.2,20.7,20.8,10.6,20.7,78.3,11.1,0.5,1.8,1.8,6.5,25.8,9505.0,78.8,13.3,7.8,0.1,9.4
1,1,Acadia,LA,0.773,0.206,27386.0,21159.0,5638.0,0.795,0.191,28425.0,22596.0,5443.0,30.295065,-92.414197,3182.0,102.0,62607.0,30433.0,32174.0,2.4,77.5,17.6,0.1,0.1,0.0,45197.0,40492.0,2544.0,21591.0,1002.0,21.5,27.6,27.6,16.9,25.7,15.0,14.8,83.2,10.3,0.2,1.6,2.2,2.5,27.6,24982.0,80.0,12.1,7.6,0.3,8.9
2,2,Accomack,VA,0.545,0.428,15755.0,8582.0,6737.0,0.542,0.447,16938.0,9172.0,7578.0,37.767072,-75.632346,1227.0,19.0,32840.0,16079.0,16761.0,8.8,60.3,28.3,0.3,0.7,0.0,24408.0,42260.0,2253.0,24266.0,1564.0,19.8,31.8,31.1,17.7,18.8,15.1,17.3,80.0,10.6,0.5,2.6,1.8,4.5,22.0,13837.0,74.6,18.1,7.1,0.2,5.4
3,3,Ada,ID,0.479,0.387,195587.0,93748.0,75676.0,0.504,0.465,259389.0,130699.0,120539.0,43.452658,-116.241552,17451.0,181.0,435117.0,217999.0,217118.0,7.9,85.2,1.2,0.4,2.6,0.1,316189.0,60151.0,1294.0,31642.0,725.0,11.8,13.1,43.0,16.6,25.0,6.9,8.4,80.7,7.7,0.5,1.5,2.8,6.9,20.4,214984.0,78.3,15.0,6.6,0.1,4.3
4,4,Adair,IA,0.653,0.3,3759.0,2456.0,1127.0,0.697,0.286,4183.0,2917.0,1197.0,41.330756,-94.471059,222.0,1.0,7192.0,3552.0,3640.0,1.7,96.6,0.3,0.0,0.4,0.0,5572.0,49477.0,2633.0,28861.0,2055.0,9.5,12.1,28.2,16.9,20.0,17.3,17.6,77.9,12.4,0.3,2.8,0.4,6.2,22.3,3680.0,73.8,15.3,10.4,0.5,3.0
5,5,Adair,KY,0.806,0.161,8231.0,6637.0,1323.0,0.83,0.159,8766.0,7275.0,1391.0,37.104598,-85.281297,517.0,22.0,19304.0,9632.0,9672.0,1.8,93.4,3.6,0.1,0.1,0.0,15280.0,36575.0,3426.0,18408.0,1010.0,21.5,27.1,28.5,15.9,19.7,12.2,23.8,84.5,9.0,0.0,2.6,0.5,3.4,22.2,7988.0,74.1,15.8,9.9,0.1,6.2
6,6,Adair,MO,0.594,0.345,10137.0,6019.0,3495.0,0.618,0.358,10337.0,6391.0,3705.0,40.190586,-92.600782,578.0,0.0,25437.0,12013.0,13424.0,2.3,90.5,2.4,0.2,2.3,0.1,20169.0,38750.0,2130.0,21778.0,1702.0,26.2,20.7,36.8,18.2,24.1,9.4,11.5,77.3,12.1,0.1,4.0,2.6,4.0,17.1,11274.0,73.6,20.9,5.3,0.2,5.5
7,7,Adair,OK,0.735,0.212,6468.0,4753.0,1374.0,0.786,0.195,7108.0,5585.0,1387.0,35.884942,-94.658593,855.0,11.0,22136.0,10987.0,11149.0,6.4,40.8,0.3,41.7,0.6,0.2,16050.0,33366.0,1560.0,16576.0,820.0,30.8,43.0,23.9,17.6,19.4,14.0,25.2,84.4,8.5,0.1,2.8,1.0,3.2,23.1,8130.0,71.6,20.4,7.5,0.5,5.5
8,8,Adams,CO,0.421,0.494,175125.0,73807.0,86471.0,0.404,0.567,234599.0,94874.0,132951.0,39.874321,-104.336258,15538.0,263.0,487850.0,245840.0,242010.0,39.3,51.1,3.0,0.5,3.8,0.1,306416.0,64087.0,991.0,27487.0,366.0,12.2,16.4,29.8,18.9,24.1,13.7,13.5,77.7,11.2,3.8,1.2,1.1,5.0,29.2,246450.0,83.6,11.2,5.1,0.1,5.1
9,9,Adams,IA,0.669,0.271,2082.0,1393.0,565.0,0.708,0.273,2158.0,1528.0,590.0,41.029036,-94.699326,87.0,1.0,3785.0,1870.0,1915.0,1.2,96.8,0.4,0.1,0.6,0.0,2992.0,49745.0,3996.0,27022.0,1948.0,11.2,8.6,37.4,14.3,17.0,13.1,18.3,80.7,9.6,0.2,3.3,0.8,5.3,19.6,1796.0,72.0,12.1,15.5,0.3,4.2


In [223]:
# List every column label in the election DataFrame
voting_pres_df.columns

Index(['Unnamed: 0', 'county', 'state', 'percentage16_Donald_Trump',
       'percentage16_Hillary_Clinton', 'total_votes16', 'votes16_Donald_Trump',
       'votes16_Hillary_Clinton', 'percentage20_Donald_Trump',
       'percentage20_Joe_Biden', 'total_votes20', 'votes20_Donald_Trump',
       'votes20_Joe_Biden', 'lat', 'long', 'cases', 'deaths', 'TotalPop',
       'Men', 'Women', 'Hispanic', 'White', 'Black', 'Native', 'Asian',
       'Pacific', 'VotingAgeCitizen', 'Income', 'IncomeErr', 'IncomePerCap',
       'IncomePerCapErr', 'Poverty', 'ChildPoverty', 'Professional', 'Service',
       'Office', 'Construction', 'Production', 'Drive', 'Carpool', 'Transit',
       'Walk', 'OtherTransp', 'WorkAtHome', 'MeanCommute', 'Employed',
       'PrivateWork', 'PublicWork', 'SelfEmployed', 'FamilyWork',
       'Unemployment'],
      dtype='object')

In [224]:
# Remove the leftover "Unnamed: 0" column and keep only the rows that have a
# 2020 Trump vote share (rows where percentage20_Donald_Trump is null are dropped).
# - drop(..., errors="ignore") replaces the in-place drop, so re-running this
#   cell does not raise KeyError once the column is already gone.
# - .copy() makes the filtered result an independent frame, so later column
#   assignments on voting_pres_df do not raise SettingWithCopyWarning.
voting_pres_df = voting_pres_df.drop(columns="Unnamed: 0", errors="ignore")
voting_pres_df = voting_pres_df[voting_pres_df["percentage20_Donald_Trump"].notna()].copy()

In [225]:
# Count the missing (null) values remaining in each column after the filter
voting_pres_df.isnull().sum()

county                             0
state                              0
percentage16_Donald_Trump       1404
percentage16_Hillary_Clinton    1404
total_votes16                   1404
votes16_Donald_Trump            1404
votes16_Hillary_Clinton         1404
percentage20_Donald_Trump          0
percentage20_Joe_Biden             0
total_votes20                      0
votes20_Donald_Trump               0
votes20_Joe_Biden                  0
lat                             1442
long                            1442
cases                           1442
deaths                          1442
TotalPop                        1402
Men                             1402
Women                           1402
Hispanic                        1402
White                           1402
Black                           1402
Native                          1402
Asian                           1402
Pacific                         1402
VotingAgeCitizen                1402
Income                          1402
I

In [226]:
# Show the data type pandas inferred for each column
voting_pres_df.dtypes

county                           object
state                            object
percentage16_Donald_Trump       float64
percentage16_Hillary_Clinton    float64
total_votes16                   float64
votes16_Donald_Trump            float64
votes16_Hillary_Clinton         float64
percentage20_Donald_Trump       float64
percentage20_Joe_Biden          float64
total_votes20                   float64
votes20_Donald_Trump            float64
votes20_Joe_Biden               float64
lat                             float64
long                            float64
cases                           float64
deaths                          float64
TotalPop                        float64
Men                             float64
Women                           float64
Hispanic                        float64
White                           float64
Black                           float64
Native                          float64
Asian                           float64
Pacific                         float64


In [227]:
# List the distinct values found in the state column
voting_pres_df["state"].unique()

array(['SC', 'LA', 'VA', 'ID', 'IA', 'KY', 'MO', 'OK', 'CO', 'IL', 'IN',
       'MS', 'ND', 'NE', 'OH', 'PA', 'WA', 'WI', 'VT', 'MN', 'FL', 'NC',
       'CA', 'NY', 'WY', 'MI', 'MD', 'KS', 'TN', 'TX', 'AZ', 'GA', 'AR',
       'NJ', 'SD', 'AL', 'OR', 'WV', 'MA', 'UT', 'MT', 'NM', 'RI', 'NH',
       'NV', 'ME', 'DC', 'CT', 'HI', 'DE', 'AK'], dtype=object)

In [228]:
# Count the records per state code; value_counts lists the most frequent first
voting_pres_df["state"].value_counts()

ME    375
MA    333
TX    254
VT    245
NH    237
CT    169
GA    159
VA    133
KY    120
MO    115
KS    105
IL    102
NC    100
IA     99
TN     95
NE     93
IN     92
OH     88
MN     87
MI     83
MS     82
OK     77
AR     75
WI     72
FL     67
AL     67
PA     67
SD     66
CO     64
LA     64
NY     62
CA     58
MT     56
WV     55
ND     53
SC     46
ID     44
AK     40
RI     39
WA     39
OR     36
NM     33
UT     29
MD     24
WY     23
NJ     21
NV     17
AZ     15
DC      8
HI      4
DE      3
Name: state, dtype: int64

In [229]:
# Lookup tables for converting between two-letter postal codes and full names
# (50 states plus the District of Columbia and Puerto Rico).

# Postal code -> full name
us_state_full = {
    "AL": "Alabama",
    "AK": "Alaska",
    "AZ": "Arizona",
    "AR": "Arkansas",
    "CA": "California",
    "CO": "Colorado",
    "CT": "Connecticut",
    "DE": "Delaware",
    "DC": "District of Columbia",
    "FL": "Florida",
    "GA": "Georgia",
    "HI": "Hawaii",
    "ID": "Idaho",
    "IL": "Illinois",
    "IN": "Indiana",
    "IA": "Iowa",
    "KS": "Kansas",
    "KY": "Kentucky",
    "LA": "Louisiana",
    "ME": "Maine",
    "MD": "Maryland",
    "MA": "Massachusetts",
    "MI": "Michigan",
    "MN": "Minnesota",
    "MS": "Mississippi",
    "MO": "Missouri",
    "MT": "Montana",
    "NE": "Nebraska",
    "NV": "Nevada",
    "NH": "New Hampshire",
    "NJ": "New Jersey",
    "NM": "New Mexico",
    "NY": "New York",
    "NC": "North Carolina",
    "ND": "North Dakota",
    "OH": "Ohio",
    "OK": "Oklahoma",
    "OR": "Oregon",
    "PA": "Pennsylvania",
    "PR": "Puerto Rico",
    "RI": "Rhode Island",
    "SC": "South Carolina",
    "SD": "South Dakota",
    "TN": "Tennessee",
    "TX": "Texas",
    "UT": "Utah",
    "VT": "Vermont",
    "VA": "Virginia",
    "WA": "Washington",
    "WV": "West Virginia",
    "WI": "Wisconsin",
    "WY": "Wyoming",
}

# Full name -> postal code, built by inverting the table above (same entry order)
us_state_abbrev = {name: code for code, name in us_state_full.items()}

# (Not needed) Disabled: would replace the codes in the state column with full names
#voting_pres_df["state"] = voting_pres_df["state"].map(us_state_full).fillna(voting_pres_df["state"])

In [230]:
# Inspect the state column; the values are still two-letter codes because the
# mapping to full names in the previous cell is commented out.
voting_pres_df["state"]

0       SC
1       LA
2       VA
3       ID
4       IA
        ..
4651    RI
4652    VT
4654    MA
4655    MA
4656    ME
Name: state, Length: 4490, dtype: object

In [231]:
# Append " County" to every county name and lowercase the result
# (e.g. "Abbeville" becomes "abbeville county").
# NOTE(review): the suffix is applied to every row regardless of state;
# confirm this matches the naming in the dataset it will be joined with.
voting_pres_df["county"] = (voting_pres_df["county"] + " County").str.lower()

In [232]:
# Preview the first ten rows to verify the dropped column and the reformatted county names
voting_pres_df.head(10)

Unnamed: 0,county,state,percentage16_Donald_Trump,percentage16_Hillary_Clinton,total_votes16,votes16_Donald_Trump,votes16_Hillary_Clinton,percentage20_Donald_Trump,percentage20_Joe_Biden,total_votes20,votes20_Donald_Trump,votes20_Joe_Biden,lat,long,cases,deaths,TotalPop,Men,Women,Hispanic,White,Black,Native,Asian,Pacific,VotingAgeCitizen,Income,IncomeErr,IncomePerCap,IncomePerCapErr,Poverty,ChildPoverty,Professional,Service,Office,Construction,Production,Drive,Carpool,Transit,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
0,abbeville county,SC,0.629,0.346,10724.0,6742.0,3712.0,0.661,0.33,12433.0,8215.0,4101.0,34.223334,-82.461707,805.0,17.0,24788.0,12044.0,12744.0,1.3,68.9,27.6,0.1,0.3,0.0,19452.0,35254.0,2259.0,19234.0,799.0,22.7,32.1,27.2,20.7,20.8,10.6,20.7,78.3,11.1,0.5,1.8,1.8,6.5,25.8,9505.0,78.8,13.3,7.8,0.1,9.4
1,acadia county,LA,0.773,0.206,27386.0,21159.0,5638.0,0.795,0.191,28425.0,22596.0,5443.0,30.295065,-92.414197,3182.0,102.0,62607.0,30433.0,32174.0,2.4,77.5,17.6,0.1,0.1,0.0,45197.0,40492.0,2544.0,21591.0,1002.0,21.5,27.6,27.6,16.9,25.7,15.0,14.8,83.2,10.3,0.2,1.6,2.2,2.5,27.6,24982.0,80.0,12.1,7.6,0.3,8.9
2,accomack county,VA,0.545,0.428,15755.0,8582.0,6737.0,0.542,0.447,16938.0,9172.0,7578.0,37.767072,-75.632346,1227.0,19.0,32840.0,16079.0,16761.0,8.8,60.3,28.3,0.3,0.7,0.0,24408.0,42260.0,2253.0,24266.0,1564.0,19.8,31.8,31.1,17.7,18.8,15.1,17.3,80.0,10.6,0.5,2.6,1.8,4.5,22.0,13837.0,74.6,18.1,7.1,0.2,5.4
3,ada county,ID,0.479,0.387,195587.0,93748.0,75676.0,0.504,0.465,259389.0,130699.0,120539.0,43.452658,-116.241552,17451.0,181.0,435117.0,217999.0,217118.0,7.9,85.2,1.2,0.4,2.6,0.1,316189.0,60151.0,1294.0,31642.0,725.0,11.8,13.1,43.0,16.6,25.0,6.9,8.4,80.7,7.7,0.5,1.5,2.8,6.9,20.4,214984.0,78.3,15.0,6.6,0.1,4.3
4,adair county,IA,0.653,0.3,3759.0,2456.0,1127.0,0.697,0.286,4183.0,2917.0,1197.0,41.330756,-94.471059,222.0,1.0,7192.0,3552.0,3640.0,1.7,96.6,0.3,0.0,0.4,0.0,5572.0,49477.0,2633.0,28861.0,2055.0,9.5,12.1,28.2,16.9,20.0,17.3,17.6,77.9,12.4,0.3,2.8,0.4,6.2,22.3,3680.0,73.8,15.3,10.4,0.5,3.0
5,adair county,KY,0.806,0.161,8231.0,6637.0,1323.0,0.83,0.159,8766.0,7275.0,1391.0,37.104598,-85.281297,517.0,22.0,19304.0,9632.0,9672.0,1.8,93.4,3.6,0.1,0.1,0.0,15280.0,36575.0,3426.0,18408.0,1010.0,21.5,27.1,28.5,15.9,19.7,12.2,23.8,84.5,9.0,0.0,2.6,0.5,3.4,22.2,7988.0,74.1,15.8,9.9,0.1,6.2
6,adair county,MO,0.594,0.345,10137.0,6019.0,3495.0,0.618,0.358,10337.0,6391.0,3705.0,40.190586,-92.600782,578.0,0.0,25437.0,12013.0,13424.0,2.3,90.5,2.4,0.2,2.3,0.1,20169.0,38750.0,2130.0,21778.0,1702.0,26.2,20.7,36.8,18.2,24.1,9.4,11.5,77.3,12.1,0.1,4.0,2.6,4.0,17.1,11274.0,73.6,20.9,5.3,0.2,5.5
7,adair county,OK,0.735,0.212,6468.0,4753.0,1374.0,0.786,0.195,7108.0,5585.0,1387.0,35.884942,-94.658593,855.0,11.0,22136.0,10987.0,11149.0,6.4,40.8,0.3,41.7,0.6,0.2,16050.0,33366.0,1560.0,16576.0,820.0,30.8,43.0,23.9,17.6,19.4,14.0,25.2,84.4,8.5,0.1,2.8,1.0,3.2,23.1,8130.0,71.6,20.4,7.5,0.5,5.5
8,adams county,CO,0.421,0.494,175125.0,73807.0,86471.0,0.404,0.567,234599.0,94874.0,132951.0,39.874321,-104.336258,15538.0,263.0,487850.0,245840.0,242010.0,39.3,51.1,3.0,0.5,3.8,0.1,306416.0,64087.0,991.0,27487.0,366.0,12.2,16.4,29.8,18.9,24.1,13.7,13.5,77.7,11.2,3.8,1.2,1.1,5.0,29.2,246450.0,83.6,11.2,5.1,0.1,5.1
9,adams county,IA,0.669,0.271,2082.0,1393.0,565.0,0.708,0.273,2158.0,1528.0,590.0,41.029036,-94.699326,87.0,1.0,3785.0,1870.0,1915.0,1.2,96.8,0.4,0.1,0.6,0.0,2992.0,49745.0,3996.0,27022.0,1948.0,11.2,8.6,37.4,14.3,17.0,13.1,18.3,80.7,9.6,0.2,3.3,0.8,5.3,19.6,1796.0,72.0,12.1,15.5,0.3,4.2


In [233]:
# Load the census demographic dataset from a CSV given by a relative path,
# then preview the first ten rows.
new_file = "acs2017_county_data.csv"
census_demographic_df = pd.read_csv(new_file)
census_demographic_df.head(10)

Unnamed: 0,CountyId,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,Asian,Pacific,VotingAgeCitizen,Income,IncomeErr,IncomePerCap,IncomePerCapErr,Poverty,ChildPoverty,Professional,Service,Office,Construction,Production,Drive,Carpool,Transit,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
0,1001,Alabama,Autauga County,55036,26899,28137,2.7,75.4,18.9,0.3,0.9,0.0,41016,55317,2838,27824,2024,13.7,20.1,35.3,18.0,23.2,8.1,15.4,86.0,9.6,0.1,0.6,1.3,2.5,25.8,24112,74.1,20.2,5.6,0.1,5.2
1,1003,Alabama,Baldwin County,203360,99527,103833,4.4,83.1,9.5,0.8,0.7,0.0,155376,52562,1348,29364,735,11.8,16.1,35.7,18.2,25.6,9.7,10.8,84.7,7.6,0.1,0.8,1.1,5.6,27.0,89527,80.7,12.9,6.3,0.1,5.5
2,1005,Alabama,Barbour County,26201,13976,12225,4.2,45.7,47.8,0.2,0.6,0.0,20269,33368,2551,17561,798,27.2,44.9,25.0,16.8,22.6,11.5,24.1,83.4,11.1,0.3,2.2,1.7,1.3,23.4,8878,74.1,19.1,6.5,0.3,12.4
3,1007,Alabama,Bibb County,22580,12251,10329,2.4,74.6,22.0,0.4,0.0,0.0,17662,43404,3431,20911,1889,15.2,26.6,24.4,17.6,19.7,15.9,22.4,86.4,9.5,0.7,0.3,1.7,1.5,30.0,8171,76.0,17.4,6.3,0.3,8.2
4,1009,Alabama,Blount County,57667,28490,29177,9.0,87.4,1.5,0.3,0.1,0.0,42513,47412,2630,22021,850,15.6,25.4,28.5,12.9,23.3,15.8,19.5,86.8,10.2,0.1,0.4,0.4,2.1,35.0,21380,83.9,11.9,4.0,0.1,4.9
5,1011,Alabama,Bullock County,10478,5616,4862,0.3,21.6,75.6,1.0,0.7,0.0,8212,29655,5376,20856,2355,28.5,50.4,19.7,17.1,18.6,14.0,30.6,73.1,15.7,0.3,6.2,1.7,3.0,29.8,4290,81.4,13.6,5.0,0.0,12.1
6,1013,Alabama,Butler County,20126,9416,10710,0.3,52.2,44.7,0.1,1.1,0.0,15459,36326,2701,19004,943,24.4,34.8,26.9,17.3,18.5,11.6,25.7,83.6,12.6,0.0,0.9,0.9,2.0,23.2,7727,79.1,15.3,5.3,0.3,7.6
7,1015,Alabama,Calhoun County,115527,55593,59934,3.6,72.7,20.4,0.2,1.0,0.0,88383,43686,1491,23638,793,18.6,26.6,29.0,17.5,23.7,10.4,19.4,85.0,9.2,0.2,1.3,1.1,3.2,24.8,47392,74.9,19.9,5.1,0.1,10.1
8,1017,Alabama,Chambers County,33895,16320,17575,2.2,56.2,39.3,0.3,1.0,0.0,26259,37342,2011,22002,1205,18.8,29.1,24.3,13.5,23.0,11.6,27.6,87.1,9.7,0.2,0.6,0.5,2.0,23.6,14527,84.5,11.8,3.7,0.0,6.4
9,1019,Alabama,Cherokee County,25855,12862,12993,1.6,91.8,5.0,0.5,0.1,0.0,20620,40041,2316,23010,1354,16.1,20.0,28.8,14.8,18.1,11.9,26.5,85.0,12.1,0.4,0.3,0.3,2.0,26.5,9879,74.8,17.1,8.1,0.0,5.3


In [234]:
# Count the non-null entries in each column
census_demographic_df.count()

CountyId            3220
State               3220
County              3220
TotalPop            3220
Men                 3220
Women               3220
Hispanic            3220
White               3220
Black               3220
Native              3220
Asian               3220
Pacific             3220
VotingAgeCitizen    3220
Income              3220
IncomeErr           3220
IncomePerCap        3220
IncomePerCapErr     3220
Poverty             3220
ChildPoverty        3219
Professional        3220
Service             3220
Office              3220
Construction        3220
Production          3220
Drive               3220
Carpool             3220
Transit             3220
Walk                3220
OtherTransp         3220
WorkAtHome          3220
MeanCommute         3220
Employed            3220
PrivateWork         3220
PublicWork          3220
SelfEmployed        3220
FamilyWork          3220
Unemployment        3220
dtype: int64

In [235]:
# Count the records per state; value_counts lists the most frequent state first
census_demographic_df["State"].value_counts()

Texas                   254
Georgia                 159
Virginia                133
Kentucky                120
Missouri                115
Kansas                  105
Illinois                102
North Carolina          100
Iowa                     99
Tennessee                95
Nebraska                 93
Indiana                  92
Ohio                     88
Minnesota                87
Michigan                 83
Mississippi              82
Puerto Rico              78
Oklahoma                 77
Arkansas                 75
Wisconsin                72
Alabama                  67
Pennsylvania             67
Florida                  67
South Dakota             66
Louisiana                64
Colorado                 64
New York                 62
California               58
Montana                  56
West Virginia            55
North Dakota             53
South Carolina           46
Idaho                    44
Washington               39
Oregon                   36
New Mexico          

In [236]:
# List every column label in the census demographic DataFrame
census_demographic_df.columns

Index(['CountyId', 'State', 'County', 'TotalPop', 'Men', 'Women', 'Hispanic',
       'White', 'Black', 'Native', 'Asian', 'Pacific', 'VotingAgeCitizen',
       'Income', 'IncomeErr', 'IncomePerCap', 'IncomePerCapErr', 'Poverty',
       'ChildPoverty', 'Professional', 'Service', 'Office', 'Construction',
       'Production', 'Drive', 'Carpool', 'Transit', 'Walk', 'OtherTransp',
       'WorkAtHome', 'MeanCommute', 'Employed', 'PrivateWork', 'PublicWork',
       'SelfEmployed', 'FamilyWork', 'Unemployment'],
      dtype='object')

In [237]:
# Inspect the data type pandas assigned to each column
census_demographic_df.dtypes

CountyId              int64
State                object
County               object
TotalPop              int64
Men                   int64
Women                 int64
Hispanic            float64
White               float64
Black               float64
Native              float64
Asian               float64
Pacific             float64
VotingAgeCitizen      int64
Income                int64
IncomeErr             int64
IncomePerCap          int64
IncomePerCapErr       int64
Poverty             float64
ChildPoverty        float64
Professional        float64
Service             float64
Office              float64
Construction        float64
Production          float64
Drive               float64
Carpool             float64
Transit             float64
Walk                float64
OtherTransp         float64
WorkAtHome          float64
MeanCommute         float64
Employed              int64
PrivateWork         float64
PublicWork          float64
SelfEmployed        float64
FamilyWork          

In [238]:
# Count the missing entries in each column of the census dataframe
census_demographic_df.isna().sum()

CountyId            0
State               0
County              0
TotalPop            0
Men                 0
Women               0
Hispanic            0
White               0
Black               0
Native              0
Asian               0
Pacific             0
VotingAgeCitizen    0
Income              0
IncomeErr           0
IncomePerCap        0
IncomePerCapErr     0
Poverty             0
ChildPoverty        1
Professional        0
Service             0
Office              0
Construction        0
Production          0
Drive               0
Carpool             0
Transit             0
Walk                0
OtherTransp         0
WorkAtHome          0
MeanCommute         0
Employed            0
PrivateWork         0
PublicWork          0
SelfEmployed        0
FamilyWork          0
Unemployment        0
dtype: int64

In [239]:
# Load the county FIPS code lookup table
new_file_path = "County_Codes_FIPS.csv"
cols = ['county', 'state', 'fips_code']

# Rename the columns to match the database, then keep only those columns in database order.
# .copy() gives an independent frame so later column assignments do not act on a slice.
county_state_df = (
    pd.read_csv(new_file_path)
    .rename(columns={'FIPS': 'fips_code', 'Name': 'county', 'State': 'state'})
    .loc[:, cols]
    .copy()
)

In [240]:
# Append " County" to every county name and lower-case the result.
# Note: re-running this cell appends the suffix again.
county_state_df["county"] = (county_state_df["county"] + ' County').str.lower()

In [241]:
# Preview the first ten rows of county_state_df after the renaming and lower-casing
county_state_df.head(10)

Unnamed: 0,county,state,fips_code
0,anderson county,TX,48001
1,andrews county,TX,48003
2,angelina county,TX,48005
3,aransas county,TX,48007
4,archer county,TX,48009
5,armstrong county,TX,48011
6,atascosa county,TX,48013
7,austin county,TX,48015
8,bailey county,TX,48017
9,bandera county,TX,48019


In [242]:
# Count the missing entries in each column of the county/state lookup
county_state_df.isna().sum()

county       0
state        0
fips_code    0
dtype: int64

In [243]:
# Build a county/state/FIPS lookup from the census data: pick the three columns and rename them
census_county_states_df = (
    census_demographic_df
    .filter(items=["County", "State", "CountyId"])
    .rename(columns={'County': 'county', 'State': 'state', 'CountyId': 'fips_code'})
)

# Translate state values through us_state_abbrev (values with no mapping are kept as they are)
# and lower-case the county names
census_county_states_df = census_county_states_df.assign(
    state=census_county_states_df["state"].map(us_state_abbrev).fillna(census_county_states_df["state"]),
    county=census_county_states_df["county"].str.lower(),
)
census_county_states_df

Unnamed: 0,county,state,fips_code
0,autauga county,AL,1001
1,baldwin county,AL,1003
2,barbour county,AL,1005
3,bibb county,AL,1007
4,blount county,AL,1009
...,...,...,...
3215,vega baja municipio,PR,72145
3216,vieques municipio,PR,72147
3217,villalba municipio,PR,72149
3218,yabucoa municipio,PR,72151


In [244]:
# Stack the FIPS lookup rows on top of the census-derived rows (original index labels are kept),
# then drop rows that are exact duplicates, keeping the first occurrence.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
merged_county_codes_df = pd.concat([county_state_df, census_county_states_df]).drop_duplicates(keep='first')
merged_county_codes_df

Unnamed: 0,county,state,fips_code
0,anderson county,TX,48001
1,andrews county,TX,48003
2,angelina county,TX,48005
3,aransas county,TX,48007
4,archer county,TX,48009
...,...,...,...
3215,vega baja municipio,PR,72145
3216,vieques municipio,PR,72147
3217,villalba municipio,PR,72149
3218,yabucoa municipio,PR,72151


In [245]:
# Left-join voting_pres_df onto the county codes on (county, state), then keep only
# the rows that have a value in 'percentage20_Donald_Trump'
merged_voting_pres_df = (
    merged_county_codes_df
    .merge(voting_pres_df, how="left", on=["county", "state"])
    .dropna(subset=['percentage20_Donald_Trump'])
)
merged_voting_pres_df

Unnamed: 0,county,state,fips_code,percentage16_Donald_Trump,percentage16_Hillary_Clinton,total_votes16,votes16_Donald_Trump,votes16_Hillary_Clinton,percentage20_Donald_Trump,percentage20_Joe_Biden,total_votes20,votes20_Donald_Trump,votes20_Joe_Biden,lat,long,cases,deaths,TotalPop,Men,Women,Hispanic,White,Black,Native,Asian,Pacific,VotingAgeCitizen,Income,IncomeErr,IncomePerCap,IncomePerCapErr,Poverty,ChildPoverty,Professional,Service,Office,Construction,Production,Drive,Carpool,Transit,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
0,anderson county,TX,48001,0.780,0.199,16887.0,13165.0,3358.0,0.786,0.205,19155.0,15062.0,3934.0,31.815347,-95.653548,2979.0,39.0,57747.0,35292.0,22455.0,17.3,59.7,20.9,0.3,0.5,0.0,44095.0,42313.0,2337.0,17466.0,894.0,15.7,23.0,24.4,23.5,23.4,12.9,15.9,85.1,9.9,0.2,0.5,0.7,3.6,23.7,19102.0,73.2,20.4,6.2,0.2,4.3
1,andrews county,TX,48003,0.797,0.170,4926.0,3925.0,836.0,0.843,0.145,5856.0,4937.0,849.0,32.304686,-102.637655,667.0,10.0,17577.0,8980.0,8597.0,55.4,41.1,1.5,0.1,0.2,0.1,10695.0,70753.0,6115.0,29903.0,3147.0,11.9,15.5,21.3,13.0,24.2,22.7,18.8,79.8,16.8,0.0,0.0,0.8,2.5,19.8,8054.0,85.3,8.7,5.7,0.3,4.5
2,angelina county,TX,48005,0.725,0.252,29870.0,21666.0,7538.0,0.724,0.264,34628.0,25070.0,9136.0,31.254573,-94.609015,2317.0,93.0,87700.0,42707.0,44993.0,21.5,61.2,14.7,0.1,1.1,0.0,60533.0,46472.0,1452.0,21974.0,758.0,19.1,27.1,28.2,20.9,24.0,12.3,14.6,82.9,11.9,0.4,1.7,0.9,2.2,18.7,36164.0,77.2,16.7,6.0,0.1,7.6
3,aransas county,TX,48007,0.739,0.235,10467.0,7730.0,2458.0,0.752,0.237,12241.0,9210.0,2896.0,28.105562,-96.999505,351.0,18.0,24832.0,12448.0,12384.0,27.0,67.7,1.6,0.0,2.0,0.0,19441.0,44601.0,3698.0,29999.0,2914.0,17.8,26.8,28.2,23.3,22.1,16.5,9.8,81.4,8.4,0.0,2.8,0.5,6.8,20.8,10387.0,72.9,12.3,14.6,0.2,6.7
4,archer county,TX,48009,0.887,0.092,4269.0,3785.0,394.0,0.897,0.093,4796.0,4300.0,446.0,33.615700,-98.687546,165.0,1.0,8793.0,4335.0,4458.0,8.3,88.6,0.8,0.3,0.4,0.0,6712.0,63192.0,4765.0,31103.0,2291.0,8.8,10.1,34.2,13.4,25.3,12.9,14.2,86.0,6.9,0.3,1.0,1.1,4.7,20.4,4344.0,75.6,14.5,9.8,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3350,st. lawrence county,NY,36089,0.525,0.417,35154.0,18450.0,14659.0,0.616,0.367,35519.0,21862.0,13036.0,44.497618,-75.065500,423.0,4.0,110817.0,56381.0,54436.0,2.3,92.1,2.2,0.7,1.1,0.1,85770.0,48330.0,1867.0,23554.0,606.0,19.4,29.0,34.2,22.6,21.4,11.4,10.4,77.1,9.8,0.5,6.1,1.3,5.1,20.9,44013.0,68.3,24.0,7.5,0.3,8.8
3351,lamoure county,ND,38045,0.697,0.236,2115.0,1475.0,500.0,0.741,0.237,2219.0,1645.0,527.0,46.456811,-98.535424,224.0,11.0,4106.0,2129.0,1977.0,1.0,97.1,0.1,0.7,0.0,0.0,3175.0,57463.0,3896.0,36653.0,3293.0,10.7,15.3,40.0,12.7,18.1,14.5,14.7,69.9,6.6,0.0,9.8,0.7,13.0,17.5,2019.0,64.5,13.4,20.5,1.6,0.8
3352,oglala lakota county,SD,46102,,,,,,0.093,0.884,3200.0,297.0,2829.0,43.337492,-102.555550,1051.0,9.0,14291.0,6934.0,7357.0,3.7,4.1,0.1,90.3,0.0,0.0,8940.0,27804.0,2880.0,9334.0,703.0,51.9,59.8,33.4,32.2,16.5,7.0,10.9,64.8,12.2,2.0,11.9,4.5,4.6,17.0,3219.0,31.1,64.8,4.1,0.0,22.7
3353,dewitt county,TX,48123,0.808,0.170,6822.0,5510.0,1161.0,0.809,0.184,8118.0,6567.0,1494.0,29.081019,-97.356812,1087.0,44.0,20474.0,10697.0,9777.0,34.5,55.6,9.0,0.0,0.0,0.0,15285.0,50960.0,5037.0,28116.0,2303.0,16.0,20.2,30.5,18.3,19.8,15.5,15.9,85.0,7.6,0.1,2.3,1.0,3.9,23.8,7965.0,76.9,16.1,6.7,0.2,6.5


In [246]:
# Count the missing entries in each column of the merged dataframe
merged_voting_pres_df.isna().sum()

county                           0
state                            0
fips_code                        0
percentage16_Donald_Trump        1
percentage16_Hillary_Clinton     1
total_votes16                    1
votes16_Donald_Trump             1
votes16_Hillary_Clinton          1
percentage20_Donald_Trump        0
percentage20_Joe_Biden           0
total_votes20                    0
votes20_Donald_Trump             0
votes20_Joe_Biden                0
lat                             39
long                            39
cases                           39
deaths                          39
TotalPop                         0
Men                              0
Women                            0
Hispanic                         0
White                            0
Black                            0
Native                           0
Asian                            0
Pacific                          0
VotingAgeCitizen                 0
Income                           0
IncomeErr           

In [247]:
# Sort the rows by FIPS code, lowest first, and preview the result
merged_voting_pres_df = merged_voting_pres_df.sort_values('fips_code')
merged_voting_pres_df.head()

Unnamed: 0,county,state,fips_code,percentage16_Donald_Trump,percentage16_Hillary_Clinton,total_votes16,votes16_Donald_Trump,votes16_Hillary_Clinton,percentage20_Donald_Trump,percentage20_Joe_Biden,total_votes20,votes20_Donald_Trump,votes20_Joe_Biden,lat,long,cases,deaths,TotalPop,Men,Women,Hispanic,White,Black,Native,Asian,Pacific,VotingAgeCitizen,Income,IncomeErr,IncomePerCap,IncomePerCapErr,Poverty,ChildPoverty,Professional,Service,Office,Construction,Production,Drive,Carpool,Transit,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
2110,autauga county,AL,1001,0.734,0.24,24661.0,18110.0,5908.0,0.715,0.27,27639.0,19764.0,7450.0,32.539527,-86.644082,2173.0,31.0,55036.0,26899.0,28137.0,2.7,75.4,18.9,0.3,0.9,0.0,41016.0,55317.0,2838.0,27824.0,2024.0,13.7,20.1,35.3,18.0,23.2,8.1,15.4,86.0,9.6,0.1,0.6,1.3,2.5,25.8,24112.0,74.1,20.2,5.6,0.1,5.2
2111,baldwin county,AL,1003,0.774,0.196,94090.0,72780.0,18409.0,0.762,0.223,108945.0,83055.0,24344.0,30.72775,-87.722071,6966.0,71.0,203360.0,99527.0,103833.0,4.4,83.1,9.5,0.8,0.7,0.0,155376.0,52562.0,1348.0,29364.0,735.0,11.8,16.1,35.7,18.2,25.6,9.7,10.8,84.7,7.6,0.1,0.8,1.1,5.6,27.0,89527.0,80.7,12.9,6.3,0.1,5.5
2112,barbour county,AL,1005,0.523,0.467,10390.0,5431.0,4848.0,0.536,0.456,10457.0,5605.0,4772.0,31.868263,-85.387129,1061.0,9.0,26201.0,13976.0,12225.0,4.2,45.7,47.8,0.2,0.6,0.0,20269.0,33368.0,2551.0,17561.0,798.0,27.2,44.9,25.0,16.8,22.6,11.5,24.1,83.4,11.1,0.3,2.2,1.7,1.3,23.4,8878.0,74.1,19.1,6.5,0.3,12.4
2113,bibb county,AL,1007,0.77,0.214,8748.0,6733.0,1874.0,0.784,0.207,9573.0,7508.0,1982.0,32.996421,-87.125115,878.0,15.0,22580.0,12251.0,10329.0,2.4,74.6,22.0,0.4,0.0,0.0,17662.0,43404.0,3431.0,20911.0,1889.0,15.2,26.6,24.4,17.6,19.7,15.9,22.4,86.4,9.5,0.7,0.3,1.7,1.5,30.0,8171.0,76.0,17.4,6.3,0.3,8.2
2114,blount county,AL,1009,0.899,0.085,25384.0,22808.0,2150.0,0.896,0.096,27459.0,24595.0,2627.0,33.982109,-86.567906,2095.0,25.0,57667.0,28490.0,29177.0,9.0,87.4,1.5,0.3,0.1,0.0,42513.0,47412.0,2630.0,22021.0,850.0,15.6,25.4,28.5,12.9,23.3,15.8,19.5,86.8,10.2,0.1,0.4,0.4,2.1,35.0,21380.0,83.9,11.9,4.0,0.1,4.9


In [248]:
# Load the county size dataset (used for the rural vs urban classification)
# from the Excel workbook and preview the first ten rows
file = "NCHSURCodes2013.xlsx"
county_size_df = pd.read_excel(file, index_col=False)
county_size_df.head(10)

Unnamed: 0,FIPS code,State Abr.,County name,CBSA title,CBSA 2012 pop,County 2012 pop,2013 code,2006 code,1990-based code,Unnamed: 9
0,1001,AL,Autauga County,"Montgomery, AL",377149,55514,3,3,3,
1,1003,AL,Baldwin County,"Daphne-Fairhope-Foley, AL",190790,190790,4,5,3,
2,1005,AL,Barbour County,,.,27201,6,5,5,
3,1007,AL,Bibb County,"Birmingham-Hoover, AL",1136650,22597,2,2,6,
4,1009,AL,Blount County,"Birmingham-Hoover, AL",1136650,57826,2,2,3,
5,1011,AL,Bullock County,,.,10474,6,6,6,
6,1013,AL,Butler County,,.,20307,6,6,6,
7,1015,AL,Calhoun County,"Anniston-Oxford-Jacksonville, AL",117296,117296,4,4,4,
8,1017,AL,Chambers County,"Valley, AL",34064,34064,5,5,6,
9,1019,AL,Cherokee County,,.,26021,6,6,6,


In [249]:
# Keep only the columns whose label does not start with "Unnamed".
# Code from https://stackoverflow.com/questions/49645135/python-pandas-display-extra-unnamed-columns-for-an-excel-file
county_size_df = county_size_df.filter(regex='^(?!Unnamed)')

# Remove the columns that are not needed
county_size_df = county_size_df.drop(
    columns=['CBSA title', 'CBSA 2012 pop', '2006 code', '1990-based code']
)

In [250]:
# Rename the columns, then select them in the desired order
size_cols = ['fips_code', 'county', 'state', 'population', 'density']
county_size_df = county_size_df.rename(
    columns={
        'FIPS code': 'fips_code',
        'State Abr.': 'state',
        'County name': 'county',
        'County 2012 pop': 'population',
        '2013 code': 'density',
    }
)[size_cols]


In [251]:
# Recode the density column (the '2013 code' values): 1-4 become 'urban' and 5-6 become 'rural'.
# NOTE(review): code 4 is mapped to 'urban' here, which matches the NCHS metropolitan (1-4) vs
# non-metropolitan (5-6) grouping - confirm this is the intended split.
density_dict = {1: 'urban', 2: 'urban', 3: 'urban', 4: 'urban', 5: 'rural', 6: 'rural'}

# Values that are not keys of density_dict are left unchanged by replace()
county_size_df = county_size_df.assign(density=county_size_df['density'].replace(density_dict))


In [252]:
# Count the missing entries per column (a first check for unwanted values in the data)
county_size_df.isna().sum()

fips_code     0
county        0
state         0
population    0
density       0
dtype: int64

In [253]:
# The population column contains '.' entries instead of numbers;
# list the value frequencies to see how many there are
county_size_df['population'].value_counts()


.        4
14008    3
55190    2
61475    2
7780     2
        ..
16996    1
58723    1
13668    1
13798    1
23807    1
Name: population, Length: 3092, dtype: int64

In [254]:
# Locate the rows whose population is the '.' placeholder
county_size_df[county_size_df['population'] == '.']

Unnamed: 0,fips_code,county,state,population,density
89,2201,Prince of Wales-Outer Ket,AK,.,rural
92,2232,Skagway-Hoonah-Angoon Census Area,AK,.,rural
97,2280,Wrangell-Petersburg Census Area,AK,.,rural
2926,51560,Clifton Forge city,VA,.,rural


In [255]:
# These seem to be census outliers according to google. Dropping them.
# Filter on the '.' placeholder itself instead of hard-coded index labels, so the cell keeps
# working if row labels change and can be re-run without raising a KeyError.
county_size_df = county_size_df.loc[county_size_df['population'] != '.']

In [256]:
# Confirm that no rows with the '.' placeholder remain (an empty frame is expected)
county_size_df[county_size_df['population'] == '.']

Unnamed: 0,fips_code,county,state,population,density


In [257]:
# Create SQL engine.
# The URL scheme must be "postgresql": the legacy "postgres" alias was removed in
# SQLAlchemy 1.4 and makes create_engine raise NoSuchModuleError.
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/vaccine_hesitancy"
engine = create_engine(db_string)

# Write all 4 dataframes to the Postgres DB.
# if_exists='append' adds rows to a table that already exists, so re-running this cell
# inserts the same rows again.
census_demographic_df.to_sql(name='census_demographic', con=engine, index=False, if_exists='append')
vaccine_hesitancy_df.to_sql(name='vaccine_hesitancy_covid', con=engine, index=False, if_exists='append')
merged_voting_pres_df.to_sql(name='county_statistics', con=engine, index=False, if_exists='append')
county_size_df.to_sql(name='county_size', con=engine, index=False, if_exists='append')