# Setup S3 Buckets

In [10]:
# List the contents of the public source bucket (s3://ads508-group7/) to see
# which dataset files are available before they are copied further down.
!aws s3 ls s3://ads508-group7/

2025-03-15 21:02:43     546709 2023_Registered_Foreclosure_Properties.csv
2025-03-15 21:02:44     552165 2024_Registered_Foreclosure_Properties.csv
2025-03-15 21:40:42   92038326 Arrest_Data_from_2020_to_Present.csv
2025-03-15 21:40:42  255509498 Crime_Data_from_2020_to_Present.csv


In [11]:
import boto3
import sagemaker
import pandas as pd

# Resolve the AWS region once from a single boto3 session and reuse that session below
boto_session = boto3.Session()
region = boto_session.region_name

# SageMaker session, its default bucket, and the execution role of this notebook
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sess.default_bucket()

# AWS account id of the caller, looked up through STS
account_id = boto3.client("sts").get_caller_identity().get("Account")

# Low-level SageMaker API client for the resolved region
sm = boto_session.client(service_name="sagemaker", region_name=region)

In [12]:
# S3 source location: the public bucket that holds the raw dataset files.
# No trailing slash here; the shell commands below append "/" themselves.
s3_public_path = "s3://ads508-group7"

In [13]:
# Persist s3_public_path with IPython's %store so it can be restored later via `%store -r`
%store s3_public_path

Stored 's3_public_path' (str)


In [14]:
# S3 destination location: the "ads508-group7" prefix inside the private default bucket
s3_private_path = f"s3://{bucket}/ads508-group7"
print(s3_private_path)

s3://sagemaker-us-east-1-026652244786/ads508-group7


In [15]:
# Persist s3_private_path with IPython's %store so it can be restored later via `%store -r`
%store s3_private_path

Stored 's3_private_path' (str)


In [None]:
# Copy data from the public S3 bucket to our private S3 bucket in this account

In [16]:
!aws s3 cp --recursive $s3_public_path/ $s3_private_path/ --exclude "*" --include "2023_Registered_Foreclosure_Properties.csv"
!aws s3 cp --recursive $s3_public_path/ $s3_private_path/ --exclude "*" --include "2024_Registered_Foreclosure_Properties.csv"
!aws s3 cp --recursive $s3_public_path/ $s3_private_path/ --exclude "*" --include "Arrest_Data_from_2020_to_Present.csv"
!aws s3 cp --recursive $s3_public_path/ $s3_private_path/ --exclude "*" --include "Crime_Data_from_2020_to_Present.csv"

copy: s3://ads508-group7/2023_Registered_Foreclosure_Properties.csv to s3://sagemaker-us-east-1-026652244786/ads508-group7/2023_Registered_Foreclosure_Properties.csv
copy: s3://ads508-group7/2024_Registered_Foreclosure_Properties.csv to s3://sagemaker-us-east-1-026652244786/ads508-group7/2024_Registered_Foreclosure_Properties.csv
copy: s3://ads508-group7/Arrest_Data_from_2020_to_Present.csv to s3://sagemaker-us-east-1-026652244786/ads508-group7/Arrest_Data_from_2020_to_Present.csv
copy: s3://ads508-group7/Crime_Data_from_2020_to_Present.csv to s3://sagemaker-us-east-1-026652244786/ads508-group7/Crime_Data_from_2020_to_Present.csv


In [17]:
# Show the private S3 destination path (its contents are listed in the next cell)
print(s3_private_path)

s3://sagemaker-us-east-1-026652244786/ads508-group7


In [18]:
# List the files now present under the private destination prefix
!aws s3 ls $s3_private_path/

2025-03-17 07:52:19     546709 2023_Registered_Foreclosure_Properties.csv
2025-03-17 07:52:20     552165 2024_Registered_Foreclosure_Properties.csv
2025-03-17 07:52:21   92038326 Arrest_Data_from_2020_to_Present.csv
2025-03-17 07:52:23  255509498 Crime_Data_from_2020_to_Present.csv


In [21]:
from IPython.display import display, HTML

# Render a clickable link to the private bucket prefix in the S3 console.
# - `IPython.display` is the public import path; importing `display` from
#   `IPython.core.display` triggers a deprecation warning.
# - The bucket name is taken from `bucket` (the same value used to build
#   s3_private_path) instead of being rebuilt from region and account id.
# - `target="_blank"` is the HTML keyword for "open in a new tab"; plain "blank"
#   is just a window name.
display(
    HTML(
        '<b>Review <a target="_blank" href="https://s3.console.aws.amazon.com/s3/buckets/{}/ads508-group7/?region={}&tab=overview">S3 Bucket</a></b>'.format(
            bucket, region
        )
    )
)

  from IPython.core.display import display, HTML


In [24]:
# List the variables currently persisted with %store.
# A bare `%store` only displays what is already saved; it does not store anything new.
%store

Stored variables and their in-db values:
s3_private_path                       -> 's3://sagemaker-us-east-1-026652244786/ads508-grou
s3_public_path                        -> 's3://ads508-group7'
setup_dependencies_passed             -> True


# Create Athena Database Schema

In [25]:
# Status flag for this section: starts False and is set to True further down
# only if the new database shows up in the SHOW DATABASES result.
ingest_create_athena_db_passed = False

In [26]:
# Restore s3_public_path from IPython's %store into this kernel's namespace
%store -r s3_public_path

In [27]:
# Confirm that s3_public_path is defined before it is used below. If it is not,
# tell the reader what to re-run. The message points at the "Setup S3 Buckets"
# section of this notebook, which is where the variable is created and stored.
# This is a warning only: execution is deliberately not stopped.
try:
    s3_public_path
except NameError:
    print("*****************************************************************************")
    print("[ERROR] PLEASE RE-RUN THE 'SETUP S3 BUCKETS' SECTION OF THIS NOTEBOOK. ******")
    print("[ERROR] THIS NOTEBOOK WILL NOT RUN PROPERLY. ********************************")
    print("*****************************************************************************")

In [28]:
# Show the public source path that was just restored
print(s3_public_path)

s3://ads508-group7


In [29]:
# Restore s3_private_path from IPython's %store into this kernel's namespace
%store -r s3_private_path

In [31]:
# Confirm that s3_private_path is defined before it is used below. If it is not,
# tell the reader what to re-run. The message points at the "Setup S3 Buckets"
# section of this notebook, which is where the variable is created and stored.
# This is a warning only: execution is deliberately not stopped.
try:
    s3_private_path
except NameError:
    print("*****************************************************************************")
    print("[ERROR] PLEASE RE-RUN THE 'SETUP S3 BUCKETS' SECTION OF THIS NOTEBOOK. ******")
    print("[ERROR] THIS NOTEBOOK WILL NOT RUN PROPERLY. ********************************")
    print("*****************************************************************************")

In [33]:
# Show the private destination path that was just restored
print(s3_private_path)

s3://sagemaker-us-east-1-026652244786/ads508-group7


# Import PyAthena

In [34]:
!pip install --disable-pip-version-check -q PyAthena==2.1.0
from pyathena import connect

# Create Athena Database

In [35]:
# Name of the Athena database that the CREATE DATABASE statement below creates if it is missing
database_name = "crime_foreclosure_db"

In [36]:
# S3 staging directory: a temporary location in the default bucket used for Athena queries
s3_staging_dir = f"s3://{bucket}/athena/staging"

In [37]:
# Open a PyAthena connection for the notebook's region, using the staging directory set above
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

In [38]:
# DDL that creates the database only when it does not already exist
statement = f"CREATE DATABASE IF NOT EXISTS {database_name}"
print(statement)

CREATE DATABASE IF NOT EXISTS crime_foreclosure_db


In [39]:
# Run the CREATE DATABASE statement through the PyAthena connection.
# pd.read_sql is used here only to execute the statement; its result is not kept.
pd.read_sql(statement, conn)

  pd.read_sql(statement, conn)


In [40]:
# Query the list of databases so the creation above can be verified
statement = "SHOW DATABASES"

df_show = pd.read_sql(statement, conn)
# Preview at most the first five rows of the result
df_show.head(5)

  df_show = pd.read_sql(statement, conn)


Unnamed: 0,database_name
0,crime_foreclosure_db
1,default


In [41]:
# Mark this section as passed only when the new database appears in the SHOW DATABASES result
database_listed = database_name in df_show.values
if database_listed:
    ingest_create_athena_db_passed = True

In [42]:
# Persist the section status flag with IPython's %store
%store ingest_create_athena_db_passed

Stored 'ingest_create_athena_db_passed' (bool)


In [44]:
# List the variables currently persisted with %store.
# A bare `%store` only displays what is already saved; it does not store anything new.
%store

Stored variables and their in-db values:
ingest_create_athena_db_passed             -> True
s3_private_path                            -> 's3://sagemaker-us-east-1-026652244786/ads508-grou
s3_public_path                             -> 's3://ads508-group7'
setup_dependencies_passed                  -> True
