# Setup S3 Buckets

In [1]:
# List the S3 public bucket content
!aws s3 ls s3://ads508-group7/

                           PRE Arrest/
                           PRE Crime/
                           PRE Foreclosures/
                           PRE LAPD/
                           PRE Unsaved/
2025-03-15 21:02:43     546709 2023_Registered_Foreclosure_Properties.csv
2025-03-15 21:02:44     552165 2024_Registered_Foreclosure_Properties.csv
2025-03-15 21:40:42   92038326 Arrest_Data_from_2020_to_Present.csv
2025-03-15 21:40:42  255509498 Crime_Data_from_2020_to_Present.csv


In [2]:
import boto3
import sagemaker
import pandas as pd

# create sagemaker session
sess =  sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
account_id = boto3.client("sts").get_caller_identity().get("Account")

sm = boto3.Session().client(service_name="sagemaker", region_name=region)

ImportError: cannot import name 'DEFAULT_CIPHERS' from 'urllib3.util.ssl_' (/opt/conda/lib/python3.11/site-packages/urllib3/util/ssl_.py)

In [3]:
# Set S3 Source Location (Public S3 Bucket)
s3_public_path = "s3://ads508-group7"

In [4]:
%store s3_public_path

Stored 's3_public_path' (str)


In [5]:
# Set S3 Destination Location (Private S3 Bucket)
s3_private_path = "s3://{}/ads508-group7".format(bucket)
print(s3_private_path)

s3://sagemaker-us-east-1-817789163072/ads508-group7


In [6]:
%store s3_private_path

Stored 's3_private_path' (str)


In [7]:
#Copy Data From the Public S3 Bucket to our Private S3 Bucket in this Account

In [8]:
!aws s3 cp --recursive $s3_public_path/ $s3_private_path/ --exclude "*" --include "2023_Registered_Foreclosure_Properties.csv"
!aws s3 cp --recursive $s3_public_path/ $s3_private_path/ --exclude "*" --include "2024_Registered_Foreclosure_Properties.csv"
!aws s3 cp --recursive $s3_public_path/ $s3_private_path/ --exclude "*" --include "Arrest_Data_from_2020_to_Present.csv"
!aws s3 cp --recursive $s3_public_path/ $s3_private_path/ --exclude "*" --include "Crime_Data_from_2020_to_Present.csv"

copy: s3://ads508-group7/2023_Registered_Foreclosure_Properties.csv to s3://sagemaker-us-east-1-817789163072/ads508-group7/2023_Registered_Foreclosure_Properties.csv
copy: s3://ads508-group7/2024_Registered_Foreclosure_Properties.csv to s3://sagemaker-us-east-1-817789163072/ads508-group7/2024_Registered_Foreclosure_Properties.csv
copy: s3://ads508-group7/Arrest_Data_from_2020_to_Present.csv to s3://sagemaker-us-east-1-817789163072/ads508-group7/Arrest_Data_from_2020_to_Present.csv
copy: s3://ads508-group7/Crime_Data_from_2020_to_Present.csv to s3://sagemaker-us-east-1-817789163072/ads508-group7/Crime_Data_from_2020_to_Present.csv


In [9]:
# List Files in our Private S3 Bucket in this Account
print(s3_private_path)

s3://sagemaker-us-east-1-817789163072/ads508-group7


In [10]:
!aws s3 ls $s3_private_path/

2025-03-26 03:42:58     546709 2023_Registered_Foreclosure_Properties.csv
2025-03-26 03:42:59     552165 2024_Registered_Foreclosure_Properties.csv
2025-03-26 03:43:01   92038326 Arrest_Data_from_2020_to_Present.csv
2025-03-26 03:43:03  255509498 Crime_Data_from_2020_to_Present.csv


In [11]:
from IPython.core.display import display, HTML

display(
    HTML(
        '<b>Review <a target="blank" href="https://s3.console.aws.amazon.com/s3/buckets/sagemaker-{}-{}/ads508-group7/?region={}&tab=overview">S3 Bucket</a></b>'.format(
            region, account_id, region
        )
    )
)

  from IPython.core.display import display, HTML


In [12]:
# store variables for the next notebook
%store

Stored variables and their in-db values:
s3_private_path                         -> 's3://sagemaker-us-east-1-817789163072/ads508-grou
s3_public_path                          -> 's3://ads508-group7'
setup_dependencies_passed               -> True
setup_instance_check_passed             -> True


# EDA

In [13]:
!pip install awswrangler



In [15]:
#import libraries for EDA
import awswrangler as wr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Arrest Data

In [16]:
# Read the CSV file directly from S3
df_arrest = wr.s3.read_csv("s3://sagemaker-us-east-1-643862913351/ads508-group7/arrests/Arrest_Data_from_2020_to_Present.csv")

# Display the first few rows
df_arrest.head()

2025-03-26 03:43:48,194	INFO worker.py:1786 -- Started a local Ray instance.


OSError: When getting information for key 'ads508-group7/arrests/Arrest_Data_from_2020_to_Present.csv' in bucket 'sagemaker-us-east-1-643862913351': AWS Error ACCESS_DENIED during HeadObject operation: No response body.

### Basic Data Information

In [None]:
df_arrest.shape

In [None]:
#are there duplicates?
df_arrest.duplicated().sum()

In [None]:
#get datatypes
df_arrest.dtypes

### Data Quality Report - Continous

In [None]:
#identify continuous features
conf = df_arrest.select_dtypes(include=['float64', 'int64']).columns.tolist()
conf

In [None]:
dqr_conf = pd.DataFrame({
    'Feature': conf,
    'Count': df_arrest[conf].count().values,
    'Missing Values': df_arrest[conf].isnull().sum().values,
    'Cardinality': df_arrest[conf].nunique().values,
    'Min': df_arrest[conf].min().values,
    '1st Quartile': df_arrest[conf].quantile(0.25).values,
    'Mean': df_arrest[conf].mean().values,
    'Median': df_arrest[conf].median().values,
    '3rd Quartile': df_arrest[conf].quantile(0.75).values,
    'Max': df_arrest[conf].max().values,
    'Standard Deviation': df_arrest[conf].std().values,
})
dqr_conf

### Data Quality Report - Categorical

In [None]:
#identify categorical features
catf = df_arrest.select_dtypes(include=['object']).columns.tolist()
catf

In [None]:
#create list to store modes and frequencies
modes = []
mode_freqs = []
second_modes = []
second_mode_freqs = []
mode_percentages = []
second_mode_percentages = []

In [None]:
# Calculate mode and frequency for each categorical feature
for feature in catf:
    count = df_arrest[feature].count()
    mode = df_arrest[feature].mode().iloc[0]
    mode_freq = df_arrest[feature].value_counts().iloc[0]
    modes.append(mode)
    mode_freqs.append(mode_freq)
    mode_percentages.append((mode_freq / count) * 100 if count > 0 else 0)

    # Calculate second mode and its frequency
    if len(df_arrest[feature].value_counts()) > 1:
        second_mode = df_arrest[feature].value_counts().index[1]
        second_mode_freq = df_arrest[feature].value_counts().iloc[1]
    else:
        second_mode = None
        second_mode_freq = 0

    second_modes.append(second_mode)
    second_mode_freqs.append(second_mode_freq)
    second_mode_percentages.append((second_mode_freq / count) * 100 if count > 0 else 0)

# Setup Athena Database

## Create Database

In [None]:
#setup pyathena
!pip install --disable-pip-version-check -q PyAthena==2.1.0
from pyathena import connect

In [None]:
database_name = "crime_foreclosure_db"
# Set S3 staging directory -- this is a temporary directory used for Athena queries
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)
statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name)
print(statement)

In [None]:
pd.read_sql(statement, conn)

## Validate Database Set Up Correctly

In [None]:
statement = "SHOW DATABASES"

df_show = pd.read_sql(statement, conn)
df_show.head(5)

In [None]:
if database_name in df_show.values:
    ingest_create_athena_db_passed = True

In [None]:
%store ingest_create_athena_db_passed

In [None]:
# Store Variables for the Next Notebooks
%store

## Create tables in Athena

### Arrest Data

In [None]:
#used ChatGPT for SERDEPROPERTIES refinement. Recieved errors for nulls, ChatGPT used to troubleshoot.
from pyathena import connect

# connection
database_name = "crime_foreclosure_db"
table_name = "arrests"
s3_data_location = "s3://sagemaker-us-east-1-643862913351/ads508-group7/arrests/"
s3_staging_dir = "s3://sagemaker-us-east-1-643862913351/ads508-group7/query-results/"  # Athena’s output
conn = connect(
    s3_staging_dir=s3_staging_dir,
    region_name="us-east-1"
)
cursor = conn.cursor()

#SQL Create table, created all as strings to ensure everything loaded. Received errors with int. 
statement = f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {database_name}.{table_name} (
    report_id string,
    report_type string,
    arrest_date string,
    time string,
    area_id string,
    area_name string,
    reporting_district string,
    age string,
    sex_code string,
    descent_code string,
    charge_group_code string,
    charge_group_description string,
    arrest_type_code string,
    charge string,
    charge_description string,
    disposition_description string,
    address string,
    cross_street string,
    lat string,
    lon string,
    location string,
    booking_date string,
    booking_time string,
    booking_location string,
    booking_location_code string
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
  'separatorChar' = ',',
  'quoteChar'     = '\"', 
  'serialization.null.format' = ''

)
LOCATION '{s3_data_location}'
TBLPROPERTIES ('skip.header.line.count'='1')
"""

# execute sql
cursor.execute(statement)

In [None]:
#validate table created
statement = "SHOW TABLES IN crime_foreclosure_db"
df_show = pd.read_sql(statement, conn)
df_show

In [None]:
#validate that data loaded
query = "SELECT * FROM crime_foreclosure_db.arrests LIMIT 5"
arrest_sample = pd.read_sql(query, conn)
arrest_sample

In [None]:
if table_name in df_show.values:
    ingest_create_athena_table_arrests_passed = True

In [None]:
%store ingest_create_athena_table_arrests_passed

In [None]:
#AWS Glue Catalog
from IPython.core.display import display, HTML

display(
    HTML(
        '<b>Review <a target="top" href="https://us-east-1.console.aws.amazon.com/glue/home?region=us-east-1#/v2/data-catalog/databases/view/crime_foreclosure_db?catalogId=643862913351">AWS Glue Catalog</a></b>'.format(
            region
        )
    )
)



In [None]:
#build quality report table
# Build the quality report DataFrame
dqr_catf = pd.DataFrame({
    'Feature': catf,
    'Count': df_arrest[catf].count().values,
    'Missing Values': df_arrest[catf].isnull().sum().values,
    'Cardinality': df_arrest[catf].nunique().values,
    'Mode': modes,
    'Mode Frequency': mode_freqs,
    'Mode %': mode_percentages,
    '2nd Mode': second_modes,
    '2nd Mode Frequency': second_mode_freqs,
    '2nd Mode %': second_mode_percentages,
})
dqr_catf