# Setup S3 Buckets

In [1]:
# List the S3 public bucket content
!aws s3 ls s3://ads508-group7/

2025-03-15 21:02:43     546709 2023_Registered_Foreclosure_Properties.csv
2025-03-15 21:02:44     552165 2024_Registered_Foreclosure_Properties.csv
2025-03-15 21:40:42   92038326 Arrest_Data_from_2020_to_Present.csv
2025-03-15 21:40:42  255509498 Crime_Data_from_2020_to_Present.csv


In [2]:
import boto3
import sagemaker
import pandas as pd

# create sagemaker session
sess =  sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
account_id = boto3.client("sts").get_caller_identity().get("Account")

sm = boto3.Session().client(service_name="sagemaker", region_name=region)



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [70]:
# Set S3 Source Location (Public S3 Bucket)
s3_public_path = "s3://ads508-group7"

In [71]:
%store s3_public_path

Stored 's3_public_path' (str)


In [5]:
# Set S3 Destination Location (Private S3 Bucket)
s3_private_path = "s3://{}/ads508-group7".format(bucket)
print(s3_private_path)

s3://sagemaker-us-east-1-643862913351/ads508-group7


In [6]:
%store s3_private_path

Stored 's3_private_path' (str)


In [None]:
#Copy Data From the Public S3 Bucket to our Private S3 Bucket in this Account

In [7]:
!aws s3 cp --recursive $s3_public_path/ $s3_private_path/ --exclude "*" --include "2023_Registered_Foreclosure_Properties.csv"
!aws s3 cp --recursive $s3_public_path/ $s3_private_path/ --exclude "*" --include "2024_Registered_Foreclosure_Properties.csv"
!aws s3 cp --recursive $s3_public_path/ $s3_private_path/ --exclude "*" --include "Arrest_Data_from_2020_to_Present.csv"
!aws s3 cp --recursive $s3_public_path/ $s3_private_path/ --exclude "*" --include "Crime_Data_from_2020_to_Present.csv"

copy: s3://ads508-group7/2023_Registered_Foreclosure_Properties.csv to s3://sagemaker-us-east-1-643862913351/ads508-group7/2023_Registered_Foreclosure_Properties.csv
copy: s3://ads508-group7/2024_Registered_Foreclosure_Properties.csv to s3://sagemaker-us-east-1-643862913351/ads508-group7/2024_Registered_Foreclosure_Properties.csv
copy: s3://ads508-group7/Arrest_Data_from_2020_to_Present.csv to s3://sagemaker-us-east-1-643862913351/ads508-group7/Arrest_Data_from_2020_to_Present.csv
copy: s3://ads508-group7/Crime_Data_from_2020_to_Present.csv to s3://sagemaker-us-east-1-643862913351/ads508-group7/Crime_Data_from_2020_to_Present.csv


In [8]:
# List Files in our Private S3 Bucket in this Account
print(s3_private_path)

s3://sagemaker-us-east-1-643862913351/ads508-group7


In [9]:
!aws s3 ls $s3_private_path/

2025-03-20 00:50:08     546709 2023_Registered_Foreclosure_Properties.csv
2025-03-20 00:50:10     552165 2024_Registered_Foreclosure_Properties.csv
2025-03-20 00:50:12   92038326 Arrest_Data_from_2020_to_Present.csv
2025-03-20 00:50:15  255509498 Crime_Data_from_2020_to_Present.csv


In [10]:
from IPython.core.display import display, HTML

display(
    HTML(
        '<b>Review <a target="blank" href="https://s3.console.aws.amazon.com/s3/buckets/sagemaker-{}-{}/ads508-group7/?region={}&tab=overview">S3 Bucket</a></b>'.format(
            region, account_id, region
        )
    )
)

  from IPython.core.display import display, HTML


In [11]:
# store variables for the next notebook
%store

Stored variables and their in-db values:
s3_private_path             -> 's3://sagemaker-us-east-1-643862913351/ads508-grou
s3_public_path              -> 's3://ads508-group7'


# Setup Athena Database

## Create Database

In [18]:
#setup pyathena
!pip install --disable-pip-version-check -q PyAthena==2.1.0
from pyathena import connect

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
amazon-sagemaker-sql-execution 0.1.6 requires pyathena<4,>=3.3.0, but you have pyathena 2.1.0 which is incompatible.
amazon-sagemaker-sql-magic 0.1.3 requires sqlparse==0.5.0, but you have sqlparse 0.5.3 which is incompatible.[0m[31m
[0m

In [32]:
database_name = "crime_foreclosure_db"
# Set S3 staging directory -- this is a temporary directory used for Athena queries
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)
statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name)
print(statement)

CREATE DATABASE IF NOT EXISTS crime_foreclosure_db


In [33]:
pd.read_sql(statement, conn)

  pd.read_sql(statement, conn)


## Validate Database Set Up Correctly

In [34]:
statement = "SHOW DATABASES"

df_show = pd.read_sql(statement, conn)
df_show.head(5)

  df_show = pd.read_sql(statement, conn)


Unnamed: 0,database_name
0,crime_foreclosure_db
1,default


In [35]:
if database_name in df_show.values:
    ingest_create_athena_db_passed = True

In [36]:
%store ingest_create_athena_db_passed

Stored 'ingest_create_athena_db_passed' (bool)


In [37]:
# Store Variables for the Next Notebooks
%store

Stored variables and their in-db values:
ingest_create_athena_db_passed             -> True
s3_private_path                            -> 's3://sagemaker-us-east-1-643862913351/ads508-grou
s3_public_path                             -> 's3://ads508-group7'


## Create tables in Athena

### Arrest Data

In [72]:
#move arrest data into own folder 
!aws s3 mv s3://sagemaker-us-east-1-643862913351/ads508-group7/Arrest_Data_from_2020_to_Present.csv s3://sagemaker-us-east-1-643862913351/ads508-group7/arrests/Arrest_Data_from_2020_to_Present.csv

move: s3://sagemaker-us-east-1-643862913351/ads508-group7/Arrest_Data_from_2020_to_Present.csv to s3://sagemaker-us-east-1-643862913351/ads508-group7/arrests/Arrest_Data_from_2020_to_Present.csv


In [76]:
# Define the Athena database, table, and S3 file location
database_name = "crime_foreclosure_db"
table_name = "arrests"
# IMPORTANT: It's best to have your CSV in its own folder.
s3_location = "s3://sagemaker-us-east-1-643862913351/ads508-group7/arrests/"

# SQL statement to create the arrests table
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{} (
    report_id string,
    report_type string,
    arrest_date string,
    time int,
    area_id int,
    area_name string,
    reporting_district int,
    age int,
    sex_code string,
    descent_code string,
    charge_group_code int,
    charge_group_description string,
    arrest_type_code string,
    charge string,
    charge_description string,
    disposition_description string,
    address string,
    cross_street string,
    lat double,
    lon double,
    location string,
    booking_date string,
    booking_time int,
    booking_location string,
    booking_location_code int
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
  "separatorChar" = ",",
  "quoteChar"     = '\"'
)
LOCATION '{}'
TBLPROPERTIES (
  'skip.header.line.count'='1'
)""".format(database_name, table_name, s3_location)

cursor.execute(statement) #executes sql statement

<pyathena.cursor.Cursor at 0x7f9efb127010>

In [79]:
# Define the Athena database, table, and S3 file location
database_name = "crime_foreclosure_db"
table_name = "arrests"
s3_location = "s3://sagemaker-us-east-1-643862913351/ads508-group7/arrests/"

# SQL statement to create the arrests table using a tab delimiter
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{} (
    report_id string,
    report_type string,
    arrest_date string,
    time int,
    area_id int,
    area_name string,
    reporting_district int,
    age int,
    sex_code string,
    descent_code string,
    charge_group_code int,
    charge_group_description string,
    arrest_type_code string,
    charge string,
    charge_description string,
    disposition_description string,
    address string,
    cross_street string,
    lat double,
    lon double,
    location string,
    booking_date string,
    booking_time int,
    booking_location string,
    booking_location_code int
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
  "separatorChar" = "\t",
  "quoteChar"     = '\"'
)
LOCATION '{}'
TBLPROPERTIES (
  'skip.header.line.count'='1'
)""".format(database_name, table_name, s3_location)

print(statement)
cursor.execute(statement)


CREATE EXTERNAL TABLE IF NOT EXISTS crime_foreclosure_db.arrests (
    report_id string,
    report_type string,
    arrest_date string,
    time int,
    area_id int,
    area_name string,
    reporting_district int,
    age int,
    sex_code string,
    descent_code string,
    charge_group_code int,
    charge_group_description string,
    arrest_type_code string,
    charge string,
    charge_description string,
    disposition_description string,
    address string,
    cross_street string,
    lat double,
    lon double,
    location string,
    booking_date string,
    booking_time int,
    booking_location string,
    booking_location_code int
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
  "separatorChar" = "	",
  "quoteChar"     = '"'
)
LOCATION 's3://sagemaker-us-east-1-643862913351/ads508-group7/arrests/'
TBLPROPERTIES (
  'skip.header.line.count'='1'
)


<pyathena.cursor.Cursor at 0x7f9efb127010>

In [74]:
#validate table created
statement = "SHOW TABLES IN crime_foreclosure_db"
df_show = pd.read_sql(statement, conn)
df_show

  df_show = pd.read_sql(statement, conn)


Unnamed: 0,tab_name
0,arrests


In [83]:
#validate that data loaded
query = "SELECT * FROM crime_foreclosure_db.arrests LIMIT 5"
arrest_sample = pd.read_sql(query, conn)
arrest_sample

  arrest_sample = pd.read_sql(query, conn)


Unnamed: 0,report_id,report_type,arrest_date,time,area_id,area_name,reporting_district,age,sex_code,descent_code,...,disposition_description,address,cross_street,lat,lon,location,booking_date,booking_time,booking_location,booking_location_code


# EDA

## Arrest Data

In [95]:
import awswrangler as wr
import pandas as pd

# Read the CSV file directly from S3
df_arrest = wr.s3.read_csv("s3://sagemaker-us-east-1-643862913351/ads508-group7/arrests/Arrest_Data_from_2020_to_Present.csv")

# Display the first few rows
df_arrest.head()

Unnamed: 0,Report ID,Report Type,Arrest Date,Time,Area ID,Area Name,Reporting District,Age,Sex Code,Descent Code,...,Disposition Description,Address,Cross Street,LAT,LON,Location,Booking Date,Booking Time,Booking Location,Booking Location Code
0,6636966,BOOKING,07/06/2023 12:00:00 AM,2250.0,8,West LA,817,46,M,B,...,MISDEMEANOR COMPLAINT FILED,900 GAYLEY AV,,34.0637,-118.4482,POINT (-118.4482 34.0637),07/07/2023 12:00:00 AM,143.0,METRO - JAIL DIVISION,4273.0
1,6637119,BOOKING,07/07/2023 12:00:00 AM,1000.0,3,Southwest,396,39,M,B,...,MISDEMEANOR COMPLAINT FILED,40TH PL,VERMONT,34.01,-118.2915,POINT (-118.2915 34.01),07/07/2023 12:00:00 AM,1156.0,77TH ST,4212.0
2,6624479,BOOKING,06/15/2023 12:00:00 AM,1850.0,7,Wilshire,724,33,F,H,...,MISDEMEANOR COMPLAINT FILED,100 THE GROVE DR,,34.0736,-118.3563,POINT (-118.3563 34.0736),06/15/2023 12:00:00 AM,2251.0,77TH ST,4212.0
3,6636128,BOOKING,07/05/2023 12:00:00 AM,1550.0,2,Rampart,218,30,F,B,...,MISDEMEANOR COMPLAINT FILED,1000 ECHO PARK AV,,34.0741,-118.259,POINT (-118.259 34.0741),07/05/2023 12:00:00 AM,1940.0,METRO - JAIL DIVISION,4273.0
4,6636650,BOOKING,07/06/2023 12:00:00 AM,1335.0,12,77th Street,1258,31,M,H,...,,7800 S BROADWAY,,33.9689,-118.2783,POINT (-118.2783 33.9689),07/06/2023 12:00:00 AM,1345.0,77TH ST,4212.0
