# 1. Prepare Datasets

## Import the S3 data into SageMaker

In [38]:
import matplotlib.pyplot as plt
import seaborn as sns

In [39]:
import boto3
import sagemaker
import pandas as pd

sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
account_id = boto3.client("sts").get_caller_identity().get("Account")

sm = boto3.Session().client(service_name="sagemaker", region_name=region)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


## Store S3 locations

In [40]:
s3_public_path_kaggle = "s3://collegeaffordability317/Kaggle/"
s3_public_path_tuition = "s3://collegeaffordability317/TuitionTracker/"
s3_public_path_usda = "s3://collegeaffordability317/USDA/"
s3_public_path_usde = "s3://collegeaffordability317/USDE/"

In [41]:
%store s3_public_path_kaggle
%store s3_public_path_tuition
%store s3_public_path_usda
%store s3_public_path_usde

Stored 's3_public_path_kaggle' (str)
Stored 's3_public_path_tuition' (str)
Stored 's3_public_path_usda' (str)
Stored 's3_public_path_usde' (str)


## Show all the data files for the project

In [42]:
!aws s3 ls $s3_public_path_kaggle

2024-03-17 16:07:03   15143141 college_data.csv


In [43]:
!aws s3 ls $s3_public_path_tuition

2024-03-17 16:07:00      21527 DataDictionary.xlsx
2024-03-17 16:07:00     838246 GradRates.csv
2024-03-17 16:07:01    1319314 NetPrice.csv
2024-03-17 16:07:01    1382164 RetentionRates.csv
2024-03-17 16:07:02     653000 StickerPrice.csv


In [44]:
!aws s3 ls $s3_public_path_usda

2024-03-17 16:06:57    1610608 Education.xlsx
2024-03-21 01:19:41     450599 PovertyEstimates.csv
2024-03-21 01:19:41    2187886 Unemployment.csv


In [45]:
!aws s3 ls $s3_public_path_usde

2024-03-17 16:06:52      59904 FedStudentAidPortfolioByAge.xls
2024-03-17 16:06:53      68096 FedStudentAidPortfolioByDebtSize.xls
2024-03-17 16:06:53      62976 FedStudentAidPortfolioByLoanType.xls
2024-03-17 16:06:53      50176 FedStudentAidPortfolioByLocation.xls
2024-03-17 16:06:54      60928 FedStudentAidPortfolioBySchoolType.xls
2024-03-17 16:06:54      61952 FedStudentAidPortfolioSummary.xls
2024-03-17 16:06:55      52224 PortfolioByAgeAndDebtSize.xls
2024-03-17 16:06:55      91648 PortfolioByDelinquencyStatus.xls
2024-03-17 16:06:56      57856 PortfolioByLocationAndAge.xls
2024-03-17 16:06:56      58368 PortfolioByLocationAndDebtSize.xls
2024-03-17 16:06:56     147456 PortfolioByRepaymentPlan.xls


# 2. Athena DB

#### Data Wrangling

In [46]:
from pyathena import connect

# Set S3 staging directory -- this is a temporary directory used for Athena queries
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)

In [47]:
# Set Athena parameters
database_name = 'collegeaffordability317'
table_name = 'college_data'

In [48]:
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

In [49]:
statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name)
print(statement)
pd.read_sql(statement, conn)

CREATE DATABASE IF NOT EXISTS collegeaffordability317


In [50]:
statement = "SHOW DATABASES"

df_show = pd.read_sql(statement, conn)
df_show.head(10)

Unnamed: 0,database_name
0,collegeaffordability317
1,default
2,dsoaws


#### Populate Tables Being Used

In [51]:
tuitiontracker_dir = 's3://collegeaffordability317/TuitionTracker/'

In [52]:
# Drop the table if it already exists
table_name_1 = 'StickerPrice'
pd.read_sql(f'DROP TABLE IF EXISTS {database_name}.{table_name_1}', conn)

# Define the CREATE TABLE statement with data types in lowercase
create_table = f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {database_name}.{table_name_1} (
    unit_id int,
    institution_name string,
    sector int,
    total_price_in_state_on_campus_2021_2022 float,
    total_price_in_state_off_campus_wo_fam_2021_2022 float,
    total_price_in_state_off_campus_w_fam_2021_2022 float,
    total_price_in_state_on_campus_2020_2021 float,
    total_price_in_state_off_campus_wo_fam_2020_2021 float,
    total_price_in_state_off_campus_w_fam_2020_2021 float,
    total_price_in_state_on_campus_2019_2020 float,
    total_price_in_state_off_campus_wo_fam_2019_2020 float,
    total_price_in_state_off_campus_w_fam_2019_2020 float,
    total_price_in_state_on_campus_2018_2019 float,
    total_price_in_state_off_campus_wo_fam_2018_2019 float,
    total_price_in_state_off_campus_w_fam_2018_2019 float,
    total_price_in_state_on_campus_2017_2018 float,
    total_price_in_state_off_campus_wo_fam_2017_2018 float,
    total_price_in_state_off_campus_w_fam_2017_2018 float,
    total_price_in_state_on_campus_2016_2017 float,
    total_price_in_state_off_campus_wo_fam_2016_2017 float,
    total_price_in_state_off_campus_w_fam_2016_2017 float,
    total_price_in_state_on_campus_2015_2016 float,
    total_price_in_state_off_campus_wo_fam_2015_2016 float,
    total_price_in_state_off_campus_w_fam_2015_2016 float,
    total_price_in_state_on_campus_2014_2015 float,
    total_price_in_state_off_campus_wo_fam_2014_2015 float,
    total_price_in_state_off_campus_w_fam_2014_2015 float,
    total_price_in_state_on_campus_2013_2014 float,
    total_price_in_state_off_campus_wo_fam_2013_2014 float,
    total_price_in_state_off_campus_w_fam_2013_2014 float,
    total_price_in_state_on_campus_2012_2013 float,
    total_price_in_state_off_campus_wo_fam_2012_2013 float,
    total_price_in_state_off_campus_w_fam_2012_2013 float,
    total_price_in_state_on_campus_2011_2012 float,
    total_price_in_state_off_campus_wo_fam_2011_2012 float,
    total_price_in_state_off_campus_w_fam_2011_2012 float
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    LOCATION '{tuitiontracker_dir}/{table_name_1}'
"""

# Execute create table statement
pd.read_sql(create_table, conn)

pd.read_sql(f'SELECT * FROM {database_name}.{table_name_1} LIMIT 10', conn)

Unnamed: 0,unit_id,institution_name,sector,total_price_in_state_on_campus_2021_2022,total_price_in_state_off_campus_wo_fam_2021_2022,total_price_in_state_off_campus_w_fam_2021_2022,total_price_in_state_on_campus_2020_2021,total_price_in_state_off_campus_wo_fam_2020_2021,total_price_in_state_off_campus_w_fam_2020_2021,total_price_in_state_on_campus_2019_2020,...,total_price_in_state_off_campus_w_fam_2014_2015,total_price_in_state_on_campus_2013_2014,total_price_in_state_off_campus_wo_fam_2013_2014,total_price_in_state_off_campus_w_fam_2013_2014,total_price_in_state_on_campus_2012_2013,total_price_in_state_off_campus_wo_fam_2012_2013,total_price_in_state_off_campus_w_fam_2012_2013,total_price_in_state_on_campus_2011_2012,total_price_in_state_off_campus_wo_fam_2011_2012,total_price_in_state_off_campus_w_fam_2011_2012


In [53]:
# Drop the table if it already exists
table_name_2 = 'NetPrice'
pd.read_sql(f'DROP TABLE IF EXISTS {database_name}.{table_name_2}', conn)

# Define the CREATE TABLE statement with data types in lowercase
create_table = f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {database_name}.{table_name_2} (
    unit_id int,
    institution_name string,
    sector int,
    avg_net_price_grants_scholarship_2020_2021 float,
    avg_net_price_income_0_30k_titleiv_fed_finaid_2020_2021 float,
    avg_net_price_income_30k_48k_titleiv_fed_finaid_2020_2021 float,
    avg_net_price_income_48k_75k_titleiv_fed_finaid_2020_2021 float,
    avg_net_price_income_75k_110k_titleiv_fed_finaid_2020_2021 float,
    avg_net_price_income_over_110k_titleiv_fed_finaid_2020_2021 float,
    avg_net_price_grants_scholarship_2019_2020 float,
    avg_net_price_income_0_30k_titleiv_fed_finaid_2019_2020 float,
    avg_net_price_income_30k_48k_titleiv_fed_finaid_2019_2020 float,
    avg_net_price_income_48k_75k_titleiv_fed_finaid_2019_2020 float,
    avg_net_price_income_75k_110k_titleiv_fed_finaid_2019_2020 float,
    avg_net_price_income_over_110k_titleiv_fed_finaid_2019_2020 float,
    avg_net_price_grants_scholarship_2018_2019 float,
    avg_net_price_income_0_30k_titleiv_fed_finaid_2018_2019 float,
    avg_net_price_income_30k_48k_titleiv_fed_finaid_2018_2019 float,
    avg_net_price_income_48k_75k_titleiv_fed_finaid_2018_2019 float,
    avg_net_price_income_75k_110k_titleiv_fed_finaid_2018_2019 float,
    avg_net_price_income_over_110k_titleiv_fed_finaid_2018_2019 float,
    avg_net_price_grants_scholarship_2017_2018 float,
    avg_net_price_income_0_30k_titleiv_fed_finaid_2017_2018 float,
    avg_net_price_income_30k_48k_titleiv_fed_finaid_2017_2018 float,
    avg_net_price_income_48k_75k_titleiv_fed_finaid_2017_2018 float,
    avg_net_price_income_75k_110k_titleiv_fed_finaid_2017_2018 float,
    avg_net_price_income_over_110k_titleiv_fed_finaid_2017_2018 float,
    avg_net_price_grants_scholarship_2016_2017 float,
    avg_net_price_income_0_30k_titleiv_fed_finaid_2016_2017 float,
    avg_net_price_income_30k_48k_titleiv_fed_finaid_2016_2017 float,
    avg_net_price_income_48k_75k_titleiv_fed_finaid_2016_2017 float,
    avg_net_price_income_75k_110k_titleiv_fed_finaid_2016_2017 float,
    avg_net_price_income_over_110k_titleiv_fed_finaid_2016_2017 float,
    avg_net_price_grants_scholarship_2015_2016 float,
    avg_net_price_income_0_30k_titleiv_fed_finaid_2015_2016 float,
    avg_net_price_income_30k_48k_titleiv_fed_finaid_2015_2016 float,
    avg_net_price_income_48k_75k_titleiv_fed_finaid_2015_2016 float,
    avg_net_price_income_75k_110k_titleiv_fed_finaid_2015_2016 float,
    avg_net_price_income_over_110k_titleiv_fed_finaid_2015_2016 float,
    avg_net_price_grants_scholarship_2014_2015 float,
    avg_net_price_income_0_30k_titleiv_fed_finaid_2014_2015 float,
    avg_net_price_income_30k_48k_titleiv_fed_finaid_2014_2015 float,
    avg_net_price_income_48k_75k_titleiv_fed_finaid_2014_2015 float,
    avg_net_price_income_75k_110k_titleiv_fed_finaid_2014_2015 float,
    avg_net_price_income_over_110k_titleiv_fed_finaid_2014_2015 float,
    avg_net_price_grants_scholarship_2013_2014 float,
    avg_net_price_income_0_30k_titleiv_fed_finaid_2013_2014 float,
    avg_net_price_income_30k_48k_titleiv_fed_finaid_2013_2014 float,
    avg_net_price_income_48k_75k_titleiv_fed_finaid_2013_2014 float,
    avg_net_price_income_75k_110k_titleiv_fed_finaid_2013_2014 float,
    avg_net_price_income_over_110k_titleiv_fed_finaid_2013_2014 float,
    avg_net_price_grants_scholarship_2012_2013 float,
    avg_net_price_income_0_30k_titleiv_fed_finaid_2012_2013 float,
    avg_net_price_income_30k_48k_titleiv_fed_finaid_2012_2013 float,
    avg_net_price_income_48k_75k_titleiv_fed_finaid_2012_2013 float,
    avg_net_price_income_75k_110k_titleiv_fed_finaid_2012_2013 float,
    avg_net_price_income_over_110k_titleiv_fed_finaid_2012_2013 float,
    avg_net_price_grants_scholarship_2011_2012 float,
    avg_net_price_income_0_30k_titleiv_fed_finaid_2011_2012 float,
    avg_net_price_income_30k_48k_titleiv_fed_finaid_2011_2012 float,
    avg_net_price_income_48k_75k_titleiv_fed_finaid_2011_2012 float,
    avg_net_price_income_75k_110k_titleiv_fed_finaid_2011_2012 float,
    avg_net_price_income_over_110k_titleiv_fed_finaid_2011_2012 float
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    LOCATION '{tuitiontracker_dir}/{table_name_2}'
"""

# Execute create table statement
pd.read_sql(create_table, conn)

pd.read_sql(f'SELECT * FROM {database_name}.{table_name_2} LIMIT 10', conn)

Unnamed: 0,unit_id,institution_name,sector,avg_net_price_grants_scholarship_2020_2021,avg_net_price_income_0_30k_titleiv_fed_finaid_2020_2021,avg_net_price_income_30k_48k_titleiv_fed_finaid_2020_2021,avg_net_price_income_48k_75k_titleiv_fed_finaid_2020_2021,avg_net_price_income_75k_110k_titleiv_fed_finaid_2020_2021,avg_net_price_income_over_110k_titleiv_fed_finaid_2020_2021,avg_net_price_grants_scholarship_2019_2020,...,avg_net_price_income_30k_48k_titleiv_fed_finaid_2012_2013,avg_net_price_income_48k_75k_titleiv_fed_finaid_2012_2013,avg_net_price_income_75k_110k_titleiv_fed_finaid_2012_2013,avg_net_price_income_over_110k_titleiv_fed_finaid_2012_2013,avg_net_price_grants_scholarship_2011_2012,avg_net_price_income_0_30k_titleiv_fed_finaid_2011_2012,avg_net_price_income_30k_48k_titleiv_fed_finaid_2011_2012,avg_net_price_income_48k_75k_titleiv_fed_finaid_2011_2012,avg_net_price_income_75k_110k_titleiv_fed_finaid_2011_2012,avg_net_price_income_over_110k_titleiv_fed_finaid_2011_2012


In [56]:
poverty_dir = 's3://collegeaffordability317/PovertyEstimates/'

In [58]:
# Drop the table if it already exists
table_name_3 = 'Poverty'
pd.read_sql(f'DROP TABLE IF EXISTS {database_name}.{table_name_3}', conn)

# Define the CREATE TABLE statement with data types in lowercase
create_table = f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {database_name}.{table_name_3} (
    FIPS_Code string,
    Stabr string,
    Area_name string,
    Rural-urban_Continuum_Code_2003 string,
    Urban_Influence_Code_2003 string,
    Rural-urban_Continuum_Code_2013 string,
    Urban_Influence_Code_ 2013 string,
    POVALL_2021	CI90LBALL_2021 string,
    CI90UBALL_2021 string,
    PCTPOVALL_2021 string,
    CI90LBALLP_2021 string,
    CI90UBALLP_2021 string,
    POV017_2021	CI90LB017_2021 string,
    CI90UB017_2021 string,
    PCTPOV017_2021 string,
    CI90LB017P_2021 string,
    CI90UB017P_2021 string,
    POV517_2021 string,
    CI90LB517_2021 string,
    CI90UB517_2021 string,
    PCTPOV517_2021 string,
    CI90LB517P_2021 string,
    CI90UB517P_2021	MEDHHINC_2021 string,
    CI90LBINC_2021 string,
    CI90UBINC_2021 string,
    POV04_2021 string,
    CI90LB04_2021 string,
    CI90UB04_2021 string,
    PCTPOV04_2021 string,
    CI90LB04P_2021 string,
    CI90UB04P_2021 string
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    LOCATION '{poverty_dir}/{table_name_3}'
"""

# Execute create table statement
pd.read_sql(create_table, conn)

pd.read_sql(f'SELECT * FROM {database_name}.{table_name_3} LIMIT 10', conn)

Failed to execute query.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/site-packages/pyathena/common.py", line 575, in _execute
    query_id = retry_api_call(
  File "/opt/conda/lib/python3.8/site-packages/pyathena/util.py", line 84, in retry_api_call
    return retry(func, *args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/tenacity/__init__.py", line 379, in __call__
    do = self.iter(retry_state=retry_state)
  File "/opt/conda/lib/python3.8/site-packages/tenacity/__init__.py", line 314, in iter
    return fut.result()
  File "/opt/conda/lib/python3.8/concurrent/futures/_base.py", line 437, in result
    return self.__get_result()
  File "/opt/conda/lib/python3.8/concurrent/futures/_base.py", line 389, in __get_result
    raise self._exception
  File "/opt/conda/lib/python3.8/site-packages/tenacity/__init__.py", line 382, in __call__
    result = fn(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/botocore/client.py", line 553, in _ap

DatabaseError: Execution failed on sql: 
CREATE EXTERNAL TABLE IF NOT EXISTS collegeaffordability317.Poverty (
    FIPS_Code string,
    Stabr string,
    Area_name string,
    Rural-urban_Continuum_Code_2003 string,
    Urban_Influence_Code_2003 string,
    Rural-urban_Continuum_Code_2013 string,
    Urban_Influence_Code_ 2013 string,
    POVALL_2021	CI90LBALL_2021 string,
    CI90UBALL_2021 string,
    PCTPOVALL_2021 string,
    CI90LBALLP_2021 string,
    CI90UBALLP_2021 string,
    POV017_2021	CI90LB017_2021 string,
    CI90UB017_2021 string,
    PCTPOV017_2021 string,
    CI90LB017P_2021 string,
    CI90UB017P_2021 string,
    POV517_2021 string,
    CI90LB517_2021 string,
    CI90UB517_2021 string,
    PCTPOV517_2021 string,
    CI90LB517P_2021 string,
    CI90UB517P_2021	MEDHHINC_2021 string,
    CI90LBINC_2021 string,
    CI90UBINC_2021 string,
    POV04_2021 string,
    CI90LB04_2021 string,
    CI90UB04_2021 string,
    PCTPOV04_2021 string,
    CI90LB04P_2021 string,
    CI90UB04P_2021 string
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    LOCATION 's3://collegeaffordability317/PovertyEstimates//Poverty'

An error occurred (InvalidRequestException) when calling the StartQueryExecution operation: line 1:8: mismatched input 'EXTERNAL'. Expecting: 'MATERIALIZED', 'MULTI', 'OR', 'PROTECTED', 'ROLE', 'SCHEMA', 'TABLE', 'VIEW'
unable to rollback

In [59]:
unemployment_dir = 's3://collegeaffordability317/Unemployment/'

In [60]:
# Drop the table if it already exists
table_name_4 = 'Unemployment'
pd.read_sql(f'DROP TABLE IF EXISTS {database_name}.{table_name_4}', conn)

# Define the CREATE TABLE statement with data types in lowercase
create_table = f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {database_name}.{table_name_4} (
    FIPS_Code,
    State,
    Area_Name,
    Rural_Urban_Continuum_Code_2013,
    Urban_Influence_Code_2013,
    Metro_2013,
    Civilian_labor_force_2000,
    Employed_2000,
    Unemployed_2000,
    Unemployment_rate_2000,
    Civilian_labor_force_2001,
    Employed_2001,
    Unemployed_2001,
    Unemployment_rate_2001,
    Civilian_labor_force_2002,
    Employed_2002,
    Unemployed_2002,
    Unemployment_rate_2002,
    Civilian_labor_force_2003,
    Employed_2003,
    Unemployed_2003,
    Unemployment_rate_2003,
    Civilian_labor_force_2004,
    Employed_2004,
    Unemployed_2004,
    Unemployment_rate_2004,
    Civilian_labor_force_2005,
    Employed_2005,
    Unemployed_2005,
    Unemployment_rate_2005,
    Civilian_labor_force_2006,
    Employed_2006,
    Unemployed_2006,
    Unemployment_rate_2006,
    Civilian_labor_force_2007,
    Employed_2007,
    Unemployed_2007,
    Unemployment_rate_2007,
    Civilian_labor_force_2008,
    Employed_2008,
    Unemployed_2008,
    Unemployment_rate_2008,
    Civilian_labor_force_2009,
    Employed_2009,	
    Unemployed_2009,
    Unemployment_rate_2009,
    Civilian_labor_force_2010,
    Employed_2010,
    Unemployed_2010,
    Unemployment_rate_2010,
    Civilian_labor_force_2011,
    Employed_2011,
    Unemployed_2011,
    Unemployment_rate_2011,
    Civilian_labor_force_2012,
    Employed_2012,
    Unemployed_2012,
    Unemployment_rate_2012,
    Civilian_labor_force_2013,
    Employed_2013,
    Unemployed_2013,
    Unemployment_rate_2013,
    Civilian_labor_force_2014,
    Employed_2014,
    Unemployed_2014,
    Unemployment_rate_2014,
    Civilian_labor_force_2015,
    Employed_2015,
    Unemployed_2015,
    Unemployment_rate_2015,
    Civilian_labor_force_2016,
    Employed_2016,
    Unemployed_2016,
    Unemployment_rate_2016,
    Civilian_labor_force_2017,
    Employed_2017,
    Unemployed_2017,
    Unemployment_rate_2017,
    Civilian_labor_force_2018,
    Employed_2018,
    Unemployed_2018,
    Unemployment_rate_2018,
    Civilian_labor_force_2019,
    Employed_2019,
    Unemployed_2019,
    Unemployment_rate_2019,
    Civilian_labor_force_2020,
    Employed_2020,
    Unemployed_2020,
    Unemployment_rate_2020,
    Civilian_labor_force_2021,
    Employed_2021,
    Unemployed_2021,
    Unemployment_rate_2021,
    Civilian_labor_force_2022,
    Employed_2022,
    Unemployed_2022,
    Unemployment_rate_2022,
    Median_Household_Income_2021,
    Med_HH_Income_Percent_of_State_Total_2021
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    LOCATION '{unemployment_dir}/{table_name_3}'
"""

# Execute create table statement
pd.read_sql(create_table, conn)

pd.read_sql(f'SELECT * FROM {database_name}.{table_name_3} LIMIT 10', conn)

Failed to execute query.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/site-packages/pyathena/common.py", line 575, in _execute
    query_id = retry_api_call(
  File "/opt/conda/lib/python3.8/site-packages/pyathena/util.py", line 84, in retry_api_call
    return retry(func, *args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/tenacity/__init__.py", line 379, in __call__
    do = self.iter(retry_state=retry_state)
  File "/opt/conda/lib/python3.8/site-packages/tenacity/__init__.py", line 314, in iter
    return fut.result()
  File "/opt/conda/lib/python3.8/concurrent/futures/_base.py", line 437, in result
    return self.__get_result()
  File "/opt/conda/lib/python3.8/concurrent/futures/_base.py", line 389, in __get_result
    raise self._exception
  File "/opt/conda/lib/python3.8/site-packages/tenacity/__init__.py", line 382, in __call__
    result = fn(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/botocore/client.py", line 553, in _ap

DatabaseError: Execution failed on sql: 
CREATE EXTERNAL TABLE IF NOT EXISTS collegeaffordability317.Unemployment (
    FIPS_Code,
    State,
    Area_Name,
    Rural_Urban_Continuum_Code_2013,
    Urban_Influence_Code_2013,
    Metro_2013,
    Civilian_labor_force_2000,
    Employed_2000,
    Unemployed_2000,
    Unemployment_rate_2000,
    Civilian_labor_force_2001,
    Employed_2001,
    Unemployed_2001,
    Unemployment_rate_2001,
    Civilian_labor_force_2002,
    Employed_2002,
    Unemployed_2002,
    Unemployment_rate_2002,
    Civilian_labor_force_2003,
    Employed_2003,
    Unemployed_2003,
    Unemployment_rate_2003,
    Civilian_labor_force_2004,
    Employed_2004,
    Unemployed_2004,
    Unemployment_rate_2004,
    Civilian_labor_force_2005,
    Employed_2005,
    Unemployed_2005,
    Unemployment_rate_2005,
    Civilian_labor_force_2006,
    Employed_2006,
    Unemployed_2006,
    Unemployment_rate_2006,
    Civilian_labor_force_2007,
    Employed_2007,
    Unemployed_2007,
    Unemployment_rate_2007,
    Civilian_labor_force_2008,
    Employed_2008,
    Unemployed_2008,
    Unemployment_rate_2008,
    Civilian_labor_force_2009,
    Employed_2009,	
    Unemployed_2009,
    Unemployment_rate_2009,
    Civilian_labor_force_2010,
    Employed_2010,
    Unemployed_2010,
    Unemployment_rate_2010,
    Civilian_labor_force_2011,
    Employed_2011,
    Unemployed_2011,
    Unemployment_rate_2011,
    Civilian_labor_force_2012,
    Employed_2012,
    Unemployed_2012,
    Unemployment_rate_2012,
    Civilian_labor_force_2013,
    Employed_2013,
    Unemployed_2013,
    Unemployment_rate_2013,
    Civilian_labor_force_2014,
    Employed_2014,
    Unemployed_2014,
    Unemployment_rate_2014,
    Civilian_labor_force_2015,
    Employed_2015,
    Unemployed_2015,
    Unemployment_rate_2015,
    Civilian_labor_force_2016,
    Employed_2016,
    Unemployed_2016,
    Unemployment_rate_2016,
    Civilian_labor_force_2017,
    Employed_2017,
    Unemployed_2017,
    Unemployment_rate_2017,
    Civilian_labor_force_2018,
    Employed_2018,
    Unemployed_2018,
    Unemployment_rate_2018,
    Civilian_labor_force_2019,
    Employed_2019,
    Unemployed_2019,
    Unemployment_rate_2019,
    Civilian_labor_force_2020,
    Employed_2020,
    Unemployed_2020,
    Unemployment_rate_2020,
    Civilian_labor_force_2021,
    Employed_2021,
    Unemployed_2021,
    Unemployment_rate_2021,
    Civilian_labor_force_2022,
    Employed_2022,
    Unemployed_2022,
    Unemployment_rate_2022,
    Median_Household_Income_2021,
    Med_HH_Income_Percent_of_State_Total_2021
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    LOCATION 's3://collegeaffordability317/Unemployment//Poverty'

An error occurred (InvalidRequestException) when calling the StartQueryExecution operation: line 1:8: mismatched input 'EXTERNAL'. Expecting: 'MATERIALIZED', 'MULTI', 'OR', 'PROTECTED', 'ROLE', 'SCHEMA', 'TABLE', 'VIEW'
unable to rollback

# 2. Exploratory Data Analysis

In [None]:
import csv

df_colleges = pd.read_csv(s3_public_path_kaggle + 
                          "college_data.csv")
df_colleges.shape

In [None]:
df_gradrate = pd.read_csv(s3_public_path_tuition + 
                          "GradRates.csv")
df_netprice = pd.read_csv(s3_public_path_tuition + 
                          "NetPrice.csv")
df_retentionrate = pd.read_csv(s3_public_path_tuition + 
                          "RetentionRates.csv")
df_stickerprice = pd.read_csv(s3_public_path_tuition + 
                          "StickerPrice.csv")

In [None]:
df_stickerprice.head()

#### Data Types and NULLS

In [None]:
#### WE NEED TO GET FINAL FILES NAILED DOWN AND THEN THIS CODE CAN BE RAN ON THE FINAL DATA SET

##(I am not going to run it on every table and read in every table with pandas, it will be counter-intuitive)

In [None]:
# get number of rows and columns
print('Number of Rows - ', df_colleges.shape[0])
print('Number of Columns - ', df_colleges.shape[1], '\n')

# inspect datatypes and nulls
data_types = df_colleges.dtypes
data_types = pd.DataFrame(data_types)
data_types = data_types.assign(Null_Values = 
                               df_colleges.isnull().sum())
data_types.reset_index(inplace = True)
data_types.rename(columns={0:'Data Type',
                          'index': 'Field',
                          'Null_Values': "Nulls"})

#### Summary Statistics

In [None]:
### Again, this will  be capable of running on the entire dataset, need to reduce files

In [None]:
#summary statistics
stats = pd.DataFrame(df_colleges.describe()).T
stats

#### Multicollinearity Test

In [None]:
# assign correlation function to new variable
corr = df_colleges.corr()
corr

# Create a heatmap using seaborn
plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5, mask=corr.isnull(), cbar_kws={'label': 'Correlation'})
plt.title('Correlation Matrix')
plt.show()

## Release SageMaker Resources

In [None]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [None]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}