1. Prepare Datasets

Import the S3 data into SageMaker

In [2]:
import boto3
import sagemaker
import pandas as pd

sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
account_id = boto3.client("sts").get_caller_identity().get("Account")

sm = boto3.Session().client(service_name="sagemaker", region_name=region)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


Store S3 locations

In [23]:
s3_public_path_kaggle = "s3://collegeaffordability317/Kaggle/"
s3_public_path_tuition = "s3://collegeaffordability317/TuitionTracker/"
s3_public_path_usda = "s3://collegeaffordability317/USDA/"
s3_public_path_usde = "s3://collegeaffordability317/USDE/"

In [24]:
%store s3_public_path_kaggle
%store s3_public_path_tuition
%store s3_public_path_usda
%store s3_public_path_usde

Stored 's3_public_path_kaggle' (str)
Stored 's3_public_path_tuition' (str)
Stored 's3_public_path_usda' (str)
Stored 's3_public_path_usde' (str)


Show all the data files for the project

In [26]:
!aws s3 ls $s3_public_path_kaggle

2024-03-17 16:07:03   15143141 college_data.csv


In [28]:
!aws s3 ls $s3_public_path_tuition --recursive

2024-03-24 19:25:25          0 TuitionTracker/DataDictionary/
2024-03-24 19:25:37      21527 TuitionTracker/DataDictionary/DataDictionary.xlsx
2024-03-24 19:24:22          0 TuitionTracker/GradRates/
2024-03-24 19:24:45     838246 TuitionTracker/GradRates/GradRates.csv
2024-03-24 19:24:55          0 TuitionTracker/NetPrice/
2024-03-24 19:25:12    1319314 TuitionTracker/NetPrice/NetPrice.csv
2024-03-24 19:23:55          0 TuitionTracker/RetentionRates/
2024-03-24 19:24:08    1382164 TuitionTracker/RetentionRates/RetentionRates.csv
2024-03-24 19:20:50          0 TuitionTracker/StickerPrice/
2024-03-24 19:21:49     653000 TuitionTracker/StickerPrice/StickerPrice.csv


In [29]:
!aws s3 ls $s3_public_path_usda --recursive

2024-03-24 19:26:33          0 USDA/Education/
2024-03-24 19:26:58    1610608 USDA/Education/Education.xlsx
2024-03-24 19:28:03          0 USDA/PovertyEstimates/
2024-03-24 19:28:14     450599 USDA/PovertyEstimates/PovertyEstimates.csv
2024-03-24 19:28:24          0 USDA/Unemployment/
2024-03-24 19:28:34    2187886 USDA/Unemployment/Unemployment.csv


In [8]:
!aws s3 ls $s3_public_path_usde

2024-03-17 16:06:52      59904 FedStudentAidPortfolioByAge.xls
2024-03-17 16:06:53      68096 FedStudentAidPortfolioByDebtSize.xls
2024-03-17 16:06:53      62976 FedStudentAidPortfolioByLoanType.xls
2024-03-17 16:06:53      50176 FedStudentAidPortfolioByLocation.xls
2024-03-17 16:06:54      60928 FedStudentAidPortfolioBySchoolType.xls
2024-03-17 16:06:54      61952 FedStudentAidPortfolioSummary.xls
2024-03-17 16:06:55      52224 PortfolioByAgeAndDebtSize.xls
2024-03-17 16:06:55      91648 PortfolioByDelinquencyStatus.xls
2024-03-17 16:06:56      57856 PortfolioByLocationAndAge.xls
2024-03-17 16:06:56      58368 PortfolioByLocationAndDebtSize.xls
2024-03-17 16:06:56     147456 PortfolioByRepaymentPlan.xls


2. Athena DB

Data Wrangling

In [11]:
from pyathena import connect

# Set S3 staging directory -- this is a temporary directory used for Athena queries
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)

In [12]:
# Set Athena parameters
database_name = 'collegeaffordability317'
table_name = 'college_data'

In [13]:
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

In [14]:
statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name)
print(statement)
pd.read_sql(statement, conn)

CREATE DATABASE IF NOT EXISTS collegeaffordability317


In [15]:
statement = "SHOW DATABASES"

df_show = pd.read_sql(statement, conn)
df_show.head(10)

Unnamed: 0,database_name
0,collegeaffordability317
1,default
2,dsoaws
3,sagemaker_featurestore


Populate Tables

In [16]:
tuitiontracker_dir = 's3://collegeaffordability317/TuitionTracker/'

In [31]:
# Drop the table if it already exists
table_name_1 = 'StickerPrice'
pd.read_sql(f'DROP TABLE IF EXISTS {database_name}.{table_name_1}', conn)

# Define the CREATE TABLE statement with data types in lowercase
create_table = f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {database_name}.{table_name_1} (
    unit_id INT,
    institution_name STRING,
    sector INT,
    total_price_in_state_on_campus_2021_2022 FLOAT,
    total_price_in_state_off_campus_wo_fam_2021_2022 FLOAT,
    total_price_in_state_off_campus_w_fam_2021_2022 FLOAT,
    total_price_in_state_on_campus_2020_2021 FLOAT,
    total_price_in_state_off_campus_wo_fam_2020_2021 FLOAT,
    total_price_in_state_off_campus_w_fam_2020_2021 FLOAT,
    total_price_in_state_on_campus_2019_2020 FLOAT,
    total_price_in_state_off_campus_wo_fam_2019_2020 FLOAT,
    total_price_in_state_off_campus_w_fam_2019_2020 FLOAT,
    total_price_in_state_on_campus_2018_2019 FLOAT,
    total_price_in_state_off_campus_wo_fam_2018_2019 FLOAT,
    total_price_in_state_off_campus_w_fam_2018_2019 FLOAT,
    total_price_in_state_on_campus_2017_2018 FLOAT,
    total_price_in_state_off_campus_wo_fam_2017_2018 FLOAT,
    total_price_in_state_off_campus_w_fam_2017_2018 FLOAT,
    total_price_in_state_on_campus_2016_2017 FLOAT,
    total_price_in_state_off_campus_wo_fam_2016_2017 FLOAT,
    total_price_in_state_off_campus_w_fam_2016_2017 FLOAT,
    total_price_in_state_on_campus_2015_2016 FLOAT,
    total_price_in_state_off_campus_wo_fam_2015_2016 FLOAT,
    total_price_in_state_off_campus_w_fam_2015_2016 FLOAT,
    total_price_in_state_on_campus_2014_2015 FLOAT,
    total_price_in_state_off_campus_wo_fam_2014_2015 FLOAT,
    total_price_in_state_off_campus_w_fam_2014_2015 FLOAT,
    total_price_in_state_on_campus_2013_2014 FLOAT,
    total_price_in_state_off_campus_wo_fam_2013_2014 FLOAT,
    total_price_in_state_off_campus_w_fam_2013_2014 FLOAT,
    total_price_in_state_on_campus_2012_2013 FLOAT,
    total_price_in_state_off_campus_wo_fam_2012_2013 FLOAT,
    total_price_in_state_off_campus_w_fam_2012_2013 FLOAT,
    total_price_in_state_on_campus_2011_2012 FLOAT,
    total_price_in_state_off_campus_wo_fam_2011_2012 FLOAT,
    total_price_in_state_off_campus_w_fam_2011_2012 FLOAT
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    LOCATION '{tuitiontracker_dir}/{table_name_1}'
    TBLPROPERTIES ('skip.header.line.count'='1')
"""

# Execute create table statement
pd.read_sql(create_table, conn)

pd.read_sql(f'SELECT * FROM {database_name}.{table_name_1} LIMIT 10', conn)

Unnamed: 0,unit_id,institution_name,sector,total_price_in_state_on_campus_2021_2022,total_price_in_state_off_campus_wo_fam_2021_2022,total_price_in_state_off_campus_w_fam_2021_2022,total_price_in_state_on_campus_2020_2021,total_price_in_state_off_campus_wo_fam_2020_2021,total_price_in_state_off_campus_w_fam_2020_2021,total_price_in_state_on_campus_2019_2020,...,total_price_in_state_off_campus_w_fam_2014_2015,total_price_in_state_on_campus_2013_2014,total_price_in_state_off_campus_wo_fam_2013_2014,total_price_in_state_off_campus_w_fam_2013_2014,total_price_in_state_on_campus_2012_2013,total_price_in_state_off_campus_wo_fam_2012_2013,total_price_in_state_off_campus_w_fam_2012_2013,total_price_in_state_on_campus_2011_2012,total_price_in_state_off_campus_wo_fam_2011_2012,total_price_in_state_off_campus_w_fam_2011_2012
0,180203,Aaniiih Nakoda College,1,,17030.0,8510.0,,17030.0,8510.0,,...,8510.0,,17030.0,8510.0,,17030.0,8510.0,,17030.0,8510.0
1,222178,Abilene Christian University,2,55500.0,55500.0,43872.0,53672.0,53672.0,42322.0,51887.0,...,34100.0,41800.0,41800.0,33000.0,39900.0,39900.0,31250.0,38250.0,,
2,497037,Abilene Christian University-Undergraduate Online,2,,30670.0,19042.0,,,,,...,,,,,,,,,,
3,138558,Abraham Baldwin Agricultural College,1,15727.0,13965.0,7765.0,15575.0,13865.0,7665.0,15479.0,...,6894.0,17503.0,13188.0,7578.0,16550.0,12619.0,7009.0,12347.0,,
4,488031,Abraham Lincoln University,3,,27133.0,11365.0,,25576.0,11176.0,,...,,,,,,,,,,
5,172866,Academy College,3,,38118.0,29579.0,,38182.0,29643.0,,...,28772.0,,30165.0,22965.0,,31504.0,24304.0,,30821.0,23621.0
6,412173,Academy for Nursing and Health Occupations,5,,,,,,,,...,,,,,,,,,,
7,108232,Academy of Art University,3,49222.0,48772.0,31032.0,47174.0,47078.0,30020.0,46724.0,...,25752.0,37828.0,36386.0,25226.0,36884.0,35436.0,24592.0,36202.0,34628.0,23891.0
8,487658,Academy of Interactive Entertainment,5,,36621.0,27513.0,,37522.0,27142.0,,...,,,,,,,,,,
9,439969,Acupuncture and Massage College,3,,41073.0,17761.0,,39281.0,17561.0,,...,16221.0,,23843.0,15355.0,,,,,30111.0,15603.0


In [33]:
# Drop the table if it already exists
table_name_2 = 'NetPrice'
pd.read_sql(f'DROP TABLE IF EXISTS {database_name}.{table_name_2}', conn)

# Define the CREATE TABLE statement with data types in lowercase
create_table = f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {database_name}.{table_name_2} (
    unit_id int,
    institution_name string,
    sector int,
    avg_net_price_grants_scholarship_2020_2021 float,
    avg_net_price_income_0_30k_titleiv_fed_finaid_2020_2021 float,
    avg_net_price_income_30k_48k_titleiv_fed_finaid_2020_2021 float,
    avg_net_price_income_48k_75k_titleiv_fed_finaid_2020_2021 float,
    avg_net_price_income_75k_110k_titleiv_fed_finaid_2020_2021 float,
    avg_net_price_income_over_110k_titleiv_fed_finaid_2020_2021 float,
    avg_net_price_grants_scholarship_2019_2020 float,
    avg_net_price_income_0_30k_titleiv_fed_finaid_2019_2020 float,
    avg_net_price_income_30k_48k_titleiv_fed_finaid_2019_2020 float,
    avg_net_price_income_48k_75k_titleiv_fed_finaid_2019_2020 float,
    avg_net_price_income_75k_110k_titleiv_fed_finaid_2019_2020 float,
    avg_net_price_income_over_110k_titleiv_fed_finaid_2019_2020 float,
    avg_net_price_grants_scholarship_2018_2019 float,
    avg_net_price_income_0_30k_titleiv_fed_finaid_2018_2019 float,
    avg_net_price_income_30k_48k_titleiv_fed_finaid_2018_2019 float,
    avg_net_price_income_48k_75k_titleiv_fed_finaid_2018_2019 float,
    avg_net_price_income_75k_110k_titleiv_fed_finaid_2018_2019 float,
    avg_net_price_income_over_110k_titleiv_fed_finaid_2018_2019 float,
    avg_net_price_grants_scholarship_2017_2018 float,
    avg_net_price_income_0_30k_titleiv_fed_finaid_2017_2018 float,
    avg_net_price_income_30k_48k_titleiv_fed_finaid_2017_2018 float,
    avg_net_price_income_48k_75k_titleiv_fed_finaid_2017_2018 float,
    avg_net_price_income_75k_110k_titleiv_fed_finaid_2017_2018 float,
    avg_net_price_income_over_110k_titleiv_fed_finaid_2017_2018 float,
    avg_net_price_grants_scholarship_2016_2017 float,
    avg_net_price_income_0_30k_titleiv_fed_finaid_2016_2017 float,
    avg_net_price_income_30k_48k_titleiv_fed_finaid_2016_2017 float,
    avg_net_price_income_48k_75k_titleiv_fed_finaid_2016_2017 float,
    avg_net_price_income_75k_110k_titleiv_fed_finaid_2016_2017 float,
    avg_net_price_income_over_110k_titleiv_fed_finaid_2016_2017 float,
    avg_net_price_grants_scholarship_2015_2016 float,
    avg_net_price_income_0_30k_titleiv_fed_finaid_2015_2016 float,
    avg_net_price_income_30k_48k_titleiv_fed_finaid_2015_2016 float,
    avg_net_price_income_48k_75k_titleiv_fed_finaid_2015_2016 float,
    avg_net_price_income_75k_110k_titleiv_fed_finaid_2015_2016 float,
    avg_net_price_income_over_110k_titleiv_fed_finaid_2015_2016 float,
    avg_net_price_grants_scholarship_2014_2015 float,
    avg_net_price_income_0_30k_titleiv_fed_finaid_2014_2015 float,
    avg_net_price_income_30k_48k_titleiv_fed_finaid_2014_2015 float,
    avg_net_price_income_48k_75k_titleiv_fed_finaid_2014_2015 float,
    avg_net_price_income_75k_110k_titleiv_fed_finaid_2014_2015 float,
    avg_net_price_income_over_110k_titleiv_fed_finaid_2014_2015 float,
    avg_net_price_grants_scholarship_2013_2014 float,
    avg_net_price_income_0_30k_titleiv_fed_finaid_2013_2014 float,
    avg_net_price_income_30k_48k_titleiv_fed_finaid_2013_2014 float,
    avg_net_price_income_48k_75k_titleiv_fed_finaid_2013_2014 float,
    avg_net_price_income_75k_110k_titleiv_fed_finaid_2013_2014 float,
    avg_net_price_income_over_110k_titleiv_fed_finaid_2013_2014 float,
    avg_net_price_grants_scholarship_2012_2013 float,
    avg_net_price_income_0_30k_titleiv_fed_finaid_2012_2013 float,
    avg_net_price_income_30k_48k_titleiv_fed_finaid_2012_2013 float,
    avg_net_price_income_48k_75k_titleiv_fed_finaid_2012_2013 float,
    avg_net_price_income_75k_110k_titleiv_fed_finaid_2012_2013 float,
    avg_net_price_income_over_110k_titleiv_fed_finaid_2012_2013 float,
    avg_net_price_grants_scholarship_2011_2012 float,
    avg_net_price_income_0_30k_titleiv_fed_finaid_2011_2012 float,
    avg_net_price_income_30k_48k_titleiv_fed_finaid_2011_2012 float,
    avg_net_price_income_48k_75k_titleiv_fed_finaid_2011_2012 float,
    avg_net_price_income_75k_110k_titleiv_fed_finaid_2011_2012 float,
    avg_net_price_income_over_110k_titleiv_fed_finaid_2011_2012 float
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    LOCATION '{tuitiontracker_dir}/{table_name_2}'
    TBLPROPERTIES ('skip.header.line.count'='1')
"""

# Execute create table statement
pd.read_sql(create_table, conn)

pd.read_sql(f'SELECT * FROM {database_name}.{table_name_2} LIMIT 10', conn)

Unnamed: 0,unit_id,institution_name,sector,avg_net_price_grants_scholarship_2020_2021,avg_net_price_income_0_30k_titleiv_fed_finaid_2020_2021,avg_net_price_income_30k_48k_titleiv_fed_finaid_2020_2021,avg_net_price_income_48k_75k_titleiv_fed_finaid_2020_2021,avg_net_price_income_75k_110k_titleiv_fed_finaid_2020_2021,avg_net_price_income_over_110k_titleiv_fed_finaid_2020_2021,avg_net_price_grants_scholarship_2019_2020,...,avg_net_price_income_30k_48k_titleiv_fed_finaid_2012_2013,avg_net_price_income_48k_75k_titleiv_fed_finaid_2012_2013,avg_net_price_income_75k_110k_titleiv_fed_finaid_2012_2013,avg_net_price_income_over_110k_titleiv_fed_finaid_2012_2013,avg_net_price_grants_scholarship_2011_2012,avg_net_price_income_0_30k_titleiv_fed_finaid_2011_2012,avg_net_price_income_30k_48k_titleiv_fed_finaid_2011_2012,avg_net_price_income_48k_75k_titleiv_fed_finaid_2011_2012,avg_net_price_income_75k_110k_titleiv_fed_finaid_2011_2012,avg_net_price_income_over_110k_titleiv_fed_finaid_2011_2012
0,180203,Aaniiih Nakoda College,1,8381.0,8119.0,8326.0,10138.0,,,7777.0,...,5024.0,3359.0,,,13201.0,13133.0,13769.0,14069.0,,
1,222178,Abilene Christian University,2,,,,,,,,...,,,,,,,,,,
2,497037,Abilene Christian University-Undergraduate Online,2,,,,,,,,...,,,,,,,,,,
3,138558,Abraham Baldwin Agricultural College,1,7744.0,4784.0,5862.0,8408.0,10953.0,10568.0,8106.0,...,8862.0,10959.0,12342.0,12946.0,7518.0,6026.0,6895.0,9511.0,11080.0,11182.0
4,488031,Abraham Lincoln University,3,,,,,,,,...,,,,,,,,,,
5,172866,Academy College,3,,,,,,,,...,,,,,,,,,,
6,412173,Academy for Nursing and Health Occupations,5,,,,,,,,...,,,,,,,,,,
7,108232,Academy of Art University,3,,,,,,,,...,,,,,,,,,,
8,487658,Academy of Interactive Entertainment,5,,,,,,,,...,,,,,,,,,,
9,439969,Acupuncture and Massage College,3,,,,,,,,...,,,,,,,,,,


In [49]:
usda_dir = 's3://collegeaffordability317/USDA/'

In [55]:
# Drop the table if it already exists
table_name_3 = 'PovertyEstimates'
pd.read_sql(f'DROP TABLE IF EXISTS {database_name}.{table_name_3}', conn)

# Define the CREATE TABLE statement with data types in lowercase
create_table = f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {database_name}.{table_name_3} (
    FIPS_Code INT,
    Stabr STRING,
    Area_name STRING,
    Rural_urban_Continuum_Code_2003 STRING,
    Urban_Influence_Code_2003 STRING,
    Rural_urban_Continuum_Code_2013 STRING,
    Urban_Influence_Code_2013 STRING,
    POVALL_2021 STRING,
    CI90LBALL_2021 STRING,
    CI90UBALL_2021 STRING,
    PCTPOVALL_2021 STRING,
    CI90LBALLP_2021 STRING,
    CI90UBALLP_2021 STRING,
    POV017_2021 STRING,
    CI90LB017_2021 STRING,
    CI90UB017_2021 STRING,
    PCTPOV017_2021 STRING,
    CI90LB017P_2021 STRING,
    CI90UB017P_2021 STRING,
    POV517_2021 STRING,
    CI90LB517_2021 STRING,
    CI90UB517_2021 STRING,
    PCTPOV517_2021 STRING,
    CI90LB517P_2021 STRING,
    CI90UB517P_2021 STRING,
    MEDHHINC_2021 STRING,
    CI90LBINC_2021 STRING,
    CI90UBINC_2021 STRING,
    POV04_2021 STRING,
    CI90LB04_2021 STRING,
    CI90UB04_2021 STRING,
    PCTPOV04_2021 STRING,
    CI90LB04P_2021 STRING,
    CI90UB04P_2021 STRING
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    LOCATION '{usda_dir}/{table_name_3}'
    TBLPROPERTIES ('skip.header.line.count'='5')
"""

# Execute create table statement
pd.read_sql(create_table, conn)

pd.read_sql(f'SELECT * FROM {database_name}.{table_name_3} LIMIT 10', conn)

Unnamed: 0,fips_code,stabr,area_name,rural_urban_continuum_code_2003,urban_influence_code_2003,rural_urban_continuum_code_2013,urban_influence_code_2013,povall_2021,ci90lball_2021,ci90uball_2021,...,ci90ub517p_2021,medhhinc_2021,ci90lbinc_2021,ci90ubinc_2021,pov04_2021,ci90lb04_2021,ci90ub04_2021,pctpov04_2021,ci90lb04p_2021,ci90ub04p_2021
0,0,US,United States,,,,,41393176,41149497,41636855,...,16.3,69717,69583,69851,3349149.0,3299669.0,3398629.0,18.3,18.0,18.6
1,1000,AL,Alabama,,,,,800848,782169,819527,...,22.5,53990,53218,54762,71220.0,66888.0,75552.0,25.1,23.6,26.6
2,1001,AL,Autauga County,2.0,2.0,2.0,2.0,6296,4772,7820,...,20.4,66444,60061,72827,,,,,,
3,1003,AL,Baldwin County,4.0,5.0,3.0,2.0,25526,21599,29453,...,18.5,65658,60723,70593,,,,,,
4,1005,AL,Barbour County,6.0,6.0,6.0,6.0,5089,3773,6405,...,44.6,38649,34308,42990,,,,,,
5,1007,AL,Bibb County,1.0,1.0,1.0,1.0,4204,3324,5084,...,35.4,48454,42438,54470,,,,,,
6,1009,AL,Blount County,1.0,1.0,1.0,1.0,6992,5516,8468,...,20.7,56894,52632,61156,,,,,,
7,1011,AL,Bullock County,6.0,6.0,6.0,6.0,2764,2161,3367,...,54.3,32027,28131,35923,,,,,,
8,1013,AL,Butler County,6.0,6.0,6.0,6.0,4226,3385,5067,...,43.7,39442,34974,43910,,,,,,
9,1015,AL,Calhoun County,3.0,2.0,3.0,2.0,21630,18671,24589,...,30.4,48166,43479,52853,,,,,,


In [59]:
# Drop the table if it already exists
table_name_4 = 'Unemployment'
pd.read_sql(f'DROP TABLE IF EXISTS {database_name}.{table_name_4}', conn)

# Define the CREATE TABLE statement with data types in lowercase
create_table = f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {database_name}.{table_name_4} (
    FIPS_Code INT,
    State STRING,
    Area_Name STRING,  
    Rural_Urban_Continuum_Code_2013 INT,
    Urban_Influence_Code_2013 INT,
    Metro_2013 INT,
    Civilian_labor_force_2000 INT,
    Employed_2000 INT,
    Unemployed_2000 INT,
    Unemployment_rate_2000 FLOAT,
    Civilian_labor_force_2001 INT,
    Employed_2001 INT,
    Unemployed_2001 INT,
    Unemployment_rate_2001 FLOAT,
    Civilian_labor_force_2002 INT,
    Employed_2002 INT,
    Unemployed_2002 INT,
    Unemployment_rate_2002 FLOAT,
    Civilian_labor_force_2003 INT,
    Employed_2003 INT,
    Unemployed_2003 INT,
    Unemployment_rate_2003 FLOAT,
    Civilian_labor_force_2004 INT,
    Employed_2004 INT,
    Unemployed_2004 INT, 
    Unemployment_rate_2004 FLOAT,
    Civilian_labor_force_2005 INT,
    Employed_2005 INT,
    Unemployed_2005 INT,
    Unemployment_rate_2005 FLOAT,
    Civilian_labor_force_2006 INT,
    Employed_2006 INT,
    Unemployed_2006 INT,
    Unemployment_rate_2006 FLOAT,
    Civilian_labor_force_2007 INT,
    Employed_2007 INT,
    Unemployed_2007 INT,
    Unemployment_rate_2007 FLOAT,
    Civilian_labor_force_2008 INT,
    Employed_2008 INT,
    Unemployed_2008 INT,
    Unemployment_rate_2008 FLOAT,
    Civilian_labor_force_2009 INT,
    Employed_2009 INT,
    Unemployed_2009 INT,
    Unemployment_rate_2009 FLOAT,
    Civilian_labor_force_2010 INT,
    Employed_2010 INT,
    Unemployed_2010 INT,
    Unemployment_rate_2010 FLOAT,
    Civilian_labor_force_2011 INT,
    Employed_2011 INT,
    Unemployed_2011 INT,
    Unemployment_rate_2011 FLOAT,
    Civilian_labor_force_2012 INT,
    Employed_2012 INT,
    Unemployed_2012 INT,
    Unemployment_rate_2012 FLOAT,
    Civilian_labor_force_2013 INT,
    Employed_2013 INT,
    Unemployed_2013 INT,
    Unemployment_rate_2013 FLOAT,
    Civilian_labor_force_2014 INT,
    Employed_2014 INT,
    Unemployed_2014 INT,
    Unemployment_rate_2014 FLOAT,
    Civilian_labor_force_2015 INT,
    Employed_2015 INT,
    Unemployed_2015 INT,
    Unemployment_rate_2015 FLOAT,
    Civilian_labor_force_2016 INT,
    Employed_2016 INT,
    Unemployed_2016 INT,
    Unemployment_rate_2016 FLOAT,
    Civilian_labor_force_2017 INT,
    Employed_2017 INT,
    Unemployed_2017 INT,
    Unemployment_rate_2017 FLOAT,
    Civilian_labor_force_2018 INT,
    Employed_2018 INT,
    Unemployed_2018 INT,
    Unemployment_rate_2018 FLOAT,
    Civilian_labor_force_2019 INT,
    Employed_2019 INT,
    Unemployed_2019 INT,
    Unemployment_rate_2019 FLOAT,
    Civilian_labor_force_2020 INT,
    Employed_2020 INT,
    Unemployed_2020 INT,
    Unemployment_rate_2020 FLOAT,
    Civilian_labor_force_2021 INT,
    Employed_2021 INT,
    Unemployed_2021 INT,
    Unemployment_rate_2021 FLOAT,
    Civilian_labor_force_2022 INT,
    Employed_2022 INT,
    Unemployed_2022 INT,
    Unemployment_rate_2022 FLOAT,
    Median_Household_Income_2021 INT,
    Med_HH_Income_Percent_of_State_Total_2021 FLOAT
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    LOCATION 's3://collegeaffordability317/USDA/Unemployment/'
    TBLPROPERTIES ('skip.header.line.count'='5')
"""

# Execute create table statement
pd.read_sql(create_table, conn)

pd.read_sql(f'SELECT * FROM {database_name}.{table_name_4} LIMIT 10', conn)


Unnamed: 0,fips_code,state,area_name,rural_urban_continuum_code_2013,urban_influence_code_2013,metro_2013,civilian_labor_force_2000,employed_2000,unemployed_2000,unemployment_rate_2000,...,civilian_labor_force_2021,employed_2021,unemployed_2021,unemployment_rate_2021,civilian_labor_force_2022,employed_2022,unemployed_2022,unemployment_rate_2022,median_household_income_2021,med_hh_income_percent_of_state_total_2021
0,0,US,United States,,,,,601.0,,,...,860.0,,,900.0,,5.0,,825.0,,
1,1000,AL,Alabama,,,,,147.0,,,...,,938.0,,,,10.0,,197.0,,
2,1001,AL,"""Autauga County",,2.0,2.0,1.0,,,,...,,,7.0,,,,,,,6.3
3,1003,AL,"""Baldwin County",,3.0,2.0,1.0,,,,...,9.0,,,,,,,7.7,,
4,1005,AL,"""Barbour County",,6.0,6.0,0.0,,,,...,,,,944.0,10.0,,,,,929.0
5,1007,AL,"""Bibb County",,1.0,1.0,1.0,,,,...,,699.0,8.0,,,,,614.0,7.0,
6,1009,AL,"""Blount County",,1.0,1.0,1.0,,,,...,,,7.0,,,,,,,6.4
7,1011,AL,"""Bullock County",,6.0,6.0,0.0,,,,...,9.0,,,,,415.0,8.0,,,
8,1013,AL,"""Butler County",,6.0,6.0,0.0,,,,...,,,,955.0,10.0,,,,,789.0
9,1015,AL,"""Calhoun County",,3.0,2.0,1.0,,,,...,10.0,,,,,,,9.1,,
