# Wildfire Risk - Ingest - S3 Setup
__Team 3 - Dave Friesen, John Chen, and Kyle Dalope__<br>
__ADS-508-02-SP23__<br><br>
__GitHub link: https://github.com/davefriesen/wildfire-risk__

In [2]:
# Notebook metadata, kept as module-style dunder attributes.
# __authors__ and __contact__ are parallel lists (same order: name -> e-mail).
__authors__ = ['Dave Friesen', 'John Chen', 'Kyle Dalope']
__contact__ = ['dfriesen@sandiego.edu', 'johnchen@sandiego.edu', 'kdalope@sandiego.edu']
__date__ = '2023-03-20'
__license__ = 'MIT'
__version__ = '1.0.1'

# Setup Basics

In [3]:
# Import basic libraries
import boto3
import sagemaker

# Import data access libraries
import pandas as pd
from profiler import profile, profile_cat

In [4]:
# Establish session fundamentals
sess = sagemaker.Session()
bucket = sess.default_bucket()          # name of the session's default S3 bucket
role = sagemaker.get_execution_role()   # IAM execution role of this notebook
region = boto3.Session().region_name    # region configured for the default boto3 session

# SageMaker API client for the session's region
sm = boto3.Session().client(service_name='sagemaker', region_name=region)

# AWS account ID of the calling identity, resolved through STS.
# (A SageMaker client object is not an account ID, so it must not be bound to this name.)
account_id = boto3.client('sts', region_name=region).get_caller_identity()['Account']

# Setup S3

In [5]:
# Set public path and store as SageMaker variable
s3_public_path = 's3://wildfire-risk/'
%store s3_public_path

# Set private path ("my bucket")
s3_private_path = 's3://{}/widfire-risk/csv/'.format(bucket)
%store s3_private_path
print(s3_private_path)

Stored 's3_public_path' (str)
Stored 's3_private_path' (str)
s3://sagemaker-us-east-1-857283526476/widfire-risk/csv/


In [6]:
# List the objects in the public S3 bucket (the source of the CSV files).
# IPython substitutes {s3_public_path} from the Python namespace before the
# shell command runs.
!aws s3 ls {s3_public_path}

2023-03-24 02:49:22    8623294 conditions.csv
2023-03-18 23:31:25   10821874 fires.csv
2023-03-18 23:31:21   38317925 weather.csv


In [7]:
# Now copy public bucket-based data to private (local) bucket
!aws s3 cp --recursive $s3_public_path $s3_private_path --exclude '*' --include 'fires.csv'
!aws s3 cp --recursive $s3_public_path $s3_private_path --exclude '*' --include 'weather.csv'
!aws s3 cp --recursive $s3_public_path $s3_private_path --exclude '*' --include 'conditions.csv'

copy: s3://wildfire-risk/fires.csv to s3://sagemaker-us-east-1-857283526476/widfire-risk/csv/fires.csv
copy: s3://wildfire-risk/weather.csv to s3://sagemaker-us-east-1-857283526476/widfire-risk/csv/weather.csv
copy: s3://wildfire-risk/conditions.csv to s3://sagemaker-us-east-1-857283526476/widfire-risk/csv/conditions.csv


In [8]:
# List contents of private S3 bucket to confirm copy
!aws s3 ls $s3_private_path

2023-03-27 03:20:27    8623294 conditions.csv
2023-03-27 03:20:24   10821874 fires.csv
2023-03-27 03:20:25   38317925 weather.csv


# Load and Validate Data (*traditional Pandas - as check*)

In [9]:
# Read the copied CSVs back from the *private* S3 prefix and profile each frame
# as a sanity check. profile() is the helper imported from the local `profiler`
# module; see the output below for the per-column summary it produces.
# NOTE(review): conditions.csv is copied to the private prefix as well but is not
# loaded or profiled here - confirm whether that is intentional.
fires_df = pd.read_csv(f'{s3_private_path}fires.csv')
weather_df = pd.read_csv(f'{s3_private_path}weather.csv')
profile(fires_df)
profile(weather_df)

Unnamed: 0,Dtype,count,unique,na,na%,mean,std,min,max,skew(>=3),<v0.01,VIF(>=10),examples
ContainmentDateTime,object,35315,33108,53847.0,60.4,,,,,,,,2015/08/26 19:40:00+
ControlDateTime,object,32570,30401,56592.0,63.5,,,,,,,,2015/08/27 15:59:59+
DiscoveryAcres,float64,72195,348,16967.0,19.0,5.8,500.0,,115997.0,191.8,,,0.25__0.1__nan__0.1_
EstimatedCostToDate,float64,2128,1040,87034.0,97.6,8193477.0,34440012.7,,800000000.0,13.1,,,nan__nan__nan__nan__
FinalAcres,float64,2631,345,86531.0,97.0,14.9,318.1,,13440.0,34.3,,,nan__nan__nan__nan__
FireBehaviorGeneral,object,1498,4,87664.0,98.3,,,,,,,,nan__nan__nan__nan__
FireBehaviorGeneral1,object,1223,16,87939.0,98.6,,,,,,,,nan__nan__nan__nan__
FireBehaviorGeneral2,object,1164,16,87998.0,98.7,,,,,,,,nan__nan__nan__nan__
FireBehaviorGeneral3,object,910,16,88252.0,99.0,,,,,,,,nan__nan__nan__nan__
FireCause,object,77967,4,11195.0,12.6,,,,,,,,Human__Undetermined_


Unnamed: 0,Dtype,count,unique,na,na%,mean,std,min,max,skew(>=3),<v0.01,VIF(>=10),examples
STATION,object,175300,610,,,,,,,,,,USR0000CHOO__USR0000
DATE,object,175300,461,,,,,,,,,,2001-09__2000-07__20
LATITUDE,float64,175300,600,,,40.1,3.9,32.6,49.0,,,131.0,41.0478__38.8825__42
LONGITUDE,float64,175300,590,,,-118.0,6.3,-124.4,-82.4,,,266.0,-123.6714__-121.2683
ELEVATION,float64,175300,472,,,1214.0,734.3,,3694.2,,,,114.3__61.0__975.4__
NAME,object,175300,610,,,,,,,,,,"HOOPA CALIFORNIA, CA"
CDSD,float64,148117,11831,27183.0,15.5,182.8,315.5,,3218.1,3.1,,,383.2__426.2__23.4__
CDSD_ATTRIBUTES,object,143784,1,31516.0,18.0,,,,,,,,U__U__U__U__U
CLDD,float64,174374,4080,926.0,0.5,34.0,67.2,,601.0,,,23.0,71.2__167.7__20.6__5
CLDD_ATTRIBUTES,object,174374,6,926.0,0.5,,,,,,,,",U__,U__,U__1,U__,U"


# Store Variables and Close Session

In [10]:
# List everything currently persisted by %store for subsequent notebooks.
# A bare `%store` only displays the stored variables; the actual saving was done
# earlier with `%store <name>`. Entries from other notebooks that share this
# IPython profile show up in the listing too.
%store

Stored variables and their in-db values:
autopilot_train_s3_uri                                -> 's3://sagemaker-us-east-1-857283526476/data/amazon
balanced_bias_data_jsonlines_s3_uri                   -> 's3://sagemaker-us-east-1-857283526476/bias-detect
balanced_bias_data_s3_uri                             -> 's3://sagemaker-us-east-1-857283526476/bias-detect
bias_data_s3_uri                                      -> 's3://sagemaker-us-east-1-857283526476/bias-detect
ingest_create_athena_db_passed                        -> True
ingest_create_athena_table_parquet_passed             -> True
ingest_create_athena_table_passed                     -> True
ingest_create_athena_table_tsv_passed                 -> True
s3_private_path                                       -> 's3://sagemaker-us-east-1-857283526476/widfire-ris
s3_private_path_tsv                                   -> 's3://sagemaker-us-east-1-857283526476/amazon-revi
s3_public_path                                        -> 's3://

In [11]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<!-- Hidden button bound to the "kernelmenu:shutdown" command; the script below clicks it. -->
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>

<script>
// Best-effort shutdown: click the first hidden command button.
// Any failure (e.g. no matching element) is deliberately ignored so the cell never errors.
try {
    // Block-scoped on purpose: an undeclared assignment would leak a global onto `window`.
    const shutdownButtons = document.getElementsByClassName("sm-command-button");
    shutdownButtons[0].click();
}
catch (err) {
    // NoOp
}
</script>

In [12]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}

<IPython.core.display.Javascript object>