# Wildfire Risk - Ingest
__Team 3 - Dave Friesen, John Chen, and Kyle Dalope__<br>
__ADS-508-02-SP23__<br><br>
__GitHub link: https://github.com/davefriesen/wildfire-risk__

In [3]:
__authors__ = ['Dave Friesen', 'John Chen', 'Kyle Dalope']
__contact__ = ['dfriesen@sandiego.edu', 'johnchen@sandiego.edu', 'kdalope@sandiego.edu']
__date__ = '2023-03-20'
__license__ = 'MIT'
__version__ = '1.0.3'

# Setup Basics

In [4]:
# Import basic libraries
import boto3
import sagemaker

# Import data access libraries
import pandas as pd
from profiler import profile, profile_cat
!pip install --disable-pip-version-check -q PyAthena==2.1.0
from pyathena import connect

# Import utility libraries
from IPython.core.display import display, HTML

[0m

In [5]:
# Establish session fundamentals
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
account_id = boto3.Session().client(service_name='sagemaker', region_name=region)

# Setup S3

In [6]:
# Set public path and store as SageMaker variable
s3_public_path = 's3://wildfire-risk/'
%store s3_public_path

# Set private path ("my bucket")
s3_private_path = 's3://{}/widfire-risk/data/'.format(bucket)
%store s3_private_path
print(s3_private_path)

Stored 's3_public_path' (str)
Stored 's3_private_path' (str)
s3://sagemaker-us-east-1-074876746575/widfire-risk/data/


In [7]:
# Get contents of Public S3 bucket (file source)
!aws s3 ls $s3_public_path

2023-03-28 02:58:16   15132159 conditions.csv
2023-03-28 02:57:48    8623294 conditions_original.csv
2023-03-28 02:58:02   12030972 fires.csv
2023-03-28 02:57:18   10821874 fires_original.csv
2023-04-01 16:35:06   17461986 merged.csv
2023-03-28 02:58:30   41122733 weather.csv
2023-03-28 02:57:33   38317925 weather_original.csv


In [8]:
# Now copy public bucket-based data to private (local) bucket
!aws s3 cp --recursive $s3_public_path $s3_private_path'fires'/ --exclude '*' --include 'fires.csv'
!aws s3 cp --recursive $s3_public_path $s3_private_path'weather'/ --exclude '*' --include 'weather.csv'
!aws s3 cp --recursive $s3_public_path $s3_private_path'conditions'/ --exclude '*' --include 'conditions.csv'
!aws s3 cp --recursive $s3_public_path $s3_private_path'merged'/ --exclude '*' --include 'merged.csv'

copy: s3://wildfire-risk/fires.csv to s3://sagemaker-us-east-1-074876746575/widfire-risk/data/fires/fires.csv
copy: s3://wildfire-risk/weather.csv to s3://sagemaker-us-east-1-074876746575/widfire-risk/data/weather/weather.csv
copy: s3://wildfire-risk/conditions.csv to s3://sagemaker-us-east-1-074876746575/widfire-risk/data/conditions/conditions.csv
copy: s3://wildfire-risk/merged.csv to s3://sagemaker-us-east-1-074876746575/widfire-risk/data/merged/merged.csv


In [9]:
# List contents of private S3 bucket to confirm copy
!aws s3 ls $s3_private_path'fires'/
!aws s3 ls $s3_private_path'weather'/
!aws s3 ls $s3_private_path'conditions'/
!aws s3 ls $s3_private_path'merged'/

2023-04-01 18:07:18   12030972 fires.csv
2023-04-01 18:07:19   41122733 weather.csv
2023-04-01 18:07:21   15132159 conditions.csv
2023-04-01 18:07:22   17461986 merged.csv


# Load and Validate Data (*traditional Pandas - as check*)

In [10]:
# Load and check base data files from public S3 bucket
fires_df = pd.read_csv(s3_private_path+'fires/fires.csv')
weather_df = pd.read_csv(s3_private_path+'weather/weather.csv')
conditions_df = pd.read_csv(s3_private_path+'conditions/conditions.csv')
merged_df = pd.read_csv(s3_private_path+'merged/merged.csv')
profile(fires_df)
profile(weather_df)
profile(conditions_df)
profile(merged_df)

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,Dtype,count,unique,na,na%,mean,std,min,max,skew(>=3),<v0.01,VIF(>=10),examples
ContainmentDateTime,object,35352,33145,53942.0,60.4,,,,,,,,nan__2016/09/22 17:2
ControlDateTime,object,32598,30429,56696.0,63.5,,,,,,,,nan__2016/09/22 17:2
DiscoveryAcres,float64,72321,348,16973.0,19.0,5.8,499.5,,115997.0,191.9,,,40.0__0.1__0.01__0.1
EstimatedCostToDate,float64,2128,1040,87166.0,97.6,8193477.0,34440012.7,,800000000.0,13.1,,,nan__nan__nan__nan__
FinalAcres,float64,2645,345,86649.0,97.0,14.8,317.3,,13440.0,34.3,,,nan__nan__nan__nan__
FireBehaviorGeneral,object,1498,4,87796.0,98.3,,,,,,,,nan__nan__nan__nan__
FireBehaviorGeneral1,object,1223,16,88071.0,98.6,,,,,,,,nan__nan__nan__nan__
FireBehaviorGeneral2,object,1164,16,88130.0,98.7,,,,,,,,nan__nan__nan__nan__
FireBehaviorGeneral3,object,910,16,88384.0,99.0,,,,,,,,nan__nan__nan__nan__
FireCause,object,78093,4,11201.0,12.5,,,,,,,,Unknown__Human__Unde


Unnamed: 0,Dtype,count,unique,na,na%,mean,std,min,max,skew(>=3),<v0.01,VIF(>=10),examples
STATION,object,175300,610,,,,,,,,,,USR0000CKON__USR0000
DATE,object,175300,461,,,,,,,,,,1998-07__2007-01__20
LATITUDE,float64,175300,600,,,40.1,3.9,32.6,49.0,,,131.0,38.9119__39.1267__34
LONGITUDE,float64,175300,590,,,-118.0,6.3,-124.4,-82.4,,,266.0,-122.7064__-107.2847
ELEVATION,float64,175300,472,,,1214.0,734.3,,3694.2,,,,659.3__2748.7__389.5
NAME,object,175300,610,,,,,,,,,,"KONOCTI CALIFORNIA,"
CDSD,float64,148117,11831,27183.0,15.5,182.8,315.5,,3218.1,3.1,,,261.3__0.0__1131.4__
CDSD_ATTRIBUTES,object,143784,1,31516.0,18.0,,,,,,,,U__U__U__U__U
CLDD,float64,174374,4080,926.0,0.5,34.0,67.2,,601.0,,,23.0,208.9__0.0__151.0__0
CLDD_ATTRIBUTES,object,174374,6,926.0,0.5,,,,,,,,",U__2,U__,U__,U__,U"


Unnamed: 0,Dtype,count,unique,na,na%,mean,std,min,max,skew(>=3),<v0.01,VIF(>=10),examples
PLT_CN,int64,120208,94231,,,127745766191751.4,174962050684977.5,1.0,635060431126144.0,,,,174763194020004__402
INVYR,int64,120208,21,,,2008.4,6.9,1994.0,2019.0,,,,2013__2010__2009__20
COND_STATUS_CD,int64,120208,5,,,1.8,1.0,1.0,5.0,,,,1__2__1__2__1
MAPDEN,float64,48111,3,72097.0,60.0,1.0,0.1,1.0,3.0,8.5,,20.0,1.0__nan__1.0__nan__
STDAGE,float64,50653,488,69555.0,57.9,93.5,95.6,,9999.0,23.7,,,65.0__nan__41.0__nan
STDSZCD,float64,52176,4,68032.0,56.6,1.5,1.0,1.0,5.0,,,,1.0__nan__1.0__nan__
FLDSZCD,float64,48111,6,72097.0,60.0,2.7,1.0,,5.0,,,10.6,3.0__nan__2.0__nan__
SITECLCD,float64,54682,7,65526.0,54.5,4.6,1.6,1.0,7.0,,,,3.0__nan__4.0__nan__
SICOND,float64,45612,175,74596.0,62.1,90.7,26.9,7.0,192.0,,,,153.0__nan__144.0__n
STDORGCD,float64,48111,2,72097.0,60.0,0.2,0.4,,1.0,,,,0.0__nan__0.0__nan__


  self.variances_ = np.nanvar(X, axis=0)
  return 1 - self.ssr/self.centered_tss


Unnamed: 0,Dtype,count,unique,na,na%,mean,std,min,max,skew(>=3),<v0.01,VIF(>=10),examples
STATION,object,42289,418,,,,,,,,,,USR0000CFOU__USR0000
LATITUDE,float64,42289,416,,,40.9,4.3,32.6,49.0,,,1837.5,35.8922__34.2711__32
LONGITUDE,float64,42289,409,,,-120.7,2.2,-124.3,-114.5,,,104.0,-118.915__-118.1525_
ELEVATION,float64,42289,332,,,1046.3,542.1,,2748.7,,,,64.0__914.4__1049.4_
NAME,object,42289,418,,,,,,,,,,FOUNTAIN SPRINGS CAL
...,...,...,...,...,...,...,...,...,...,...,...,...,...
POOState,object,9730,3,32559,77.0,,,,,,,,nan__nan__nan__US-WA
PredominantFuelModel,object,28,8,42261,99.9,,,,,,,,nan__nan__nan__nan__
PrimaryFuelModel,object,164,8,42125,99.6,,,,,,,,nan__nan__nan__nan__
ym_date,object,9730,4165,32559,77.0,,,,,,,,nan__nan__nan__2016-


# Setup Athena Database

In [11]:
# Set Athena database name
database_name = 'dsoaws'

# Set S3 staging directory (temp directory for Athena queries)
s3_staging_dir = 's3://{0}/athena/staging'.format(bucket)

# Establish S3 connection
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

# Create Athena database
statement = 'CREATE DATABASE IF NOT EXISTS {}'.format(database_name)
pd.read_sql(statement, conn)

# Verify database creation
statement = 'SHOW DATABASES'
df_show = pd.read_sql(statement, conn)

if database_name in df_show.values:
    ingest_create_athena_db_passed = True
%store ingest_create_athena_db_passed    

df_show.head(5)

Stored 'ingest_create_athena_db_passed' (bool)


Unnamed: 0,database_name
0,default
1,dsoaws
2,sagemaker_featurestore


## Register fires.csv as Athena table

In [12]:
fires_csv = 'fires.csv'
fires_tb = 'fires'

# Force table creation
statement = """DROP TABLE IF EXISTS {}.{}""".format(
    database_name, fires_tb
)    
pd.read_sql(statement, conn)

# Create table
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
    ContainmentDateTime string,
    ControlDateTime string,
    DiscoveryAcres double,
    EstimatedCostToDate double,
    FinalAcres string,
    FireBehaviorGeneral string,
    FireBehaviorGeneral1 string,
    FireBehaviorGeneral2 string,
    FireBehaviorGeneral3 string,
    FireCause string,
    FireCauseGeneral string,
    FireCauseSpecific string,
    FireDiscoveryDateTime string,
    FireOutDateTime string,
    GACC string,
    IncidentName string,
    IncidentShortDescription string,
    InitialLatitude double,
    InitialLongitude double,
    IsFireCauseInvestigated double,
    IsTrespass double,
    POOCity string,
    POOState string,
    PredominantFuelModel string,
    PrimaryFuelModel string,
    geohash string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n' LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database_name, fires_tb, s3_private_path+fires_tb+'/'
)
print(statement)

pd.read_sql(statement, conn)

CREATE EXTERNAL TABLE IF NOT EXISTS dsoaws.fires(
    ContainmentDateTime string,
    ControlDateTime string,
    DiscoveryAcres double,
    EstimatedCostToDate double,
    FinalAcres string,
    FireBehaviorGeneral string,
    FireBehaviorGeneral1 string,
    FireBehaviorGeneral2 string,
    FireBehaviorGeneral3 string,
    FireCause string,
    FireCauseGeneral string,
    FireCauseSpecific string,
    FireDiscoveryDateTime string,
    FireOutDateTime string,
    GACC string,
    IncidentName string,
    IncidentShortDescription string,
    InitialLatitude double,
    InitialLongitude double,
    IsFireCauseInvestigated double,
    IsTrespass double,
    POOCity string,
    POOState string,
    PredominantFuelModel string,
    PrimaryFuelModel string,
    geohash string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' LOCATION 's3://sagemaker-us-east-1-074876746575/widfire-risk/data/fires/'
TBLPROPERTIES ('skip.header.line.count'='1')


## Register weather.csv as Athena table

In [13]:
weather_csv = 'weather.csv'
weather_tb = 'weather'

# Force table creation
statement = """DROP TABLE IF EXISTS {}.{}""".format(
    database_name, weather_tb
)    
pd.read_sql(statement, conn)

# Create table
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
    Station string,
    WDate string,
    Latitude double,
    Longitude double,
    Elevation double,
    Name string,
    CDSD double,
    CDSD_attributes string,
    CLDD double,
    CLDD_attributes string,
    DT00 double,
    DT00_attributes string,
    DT32 double,
    DT32_attributes string,
    DX32 double,
    DX32_attributes string,
    DX70 double,
    DX70_attributes string,
    DX90 double,
    DX90_attributes string,
    EMNT double,
    EMNT_attributes string,
    EMXT double,
    EMXT_attributes string,
    HDSD double,
    HDSD_attributes string,
    HTDD double,
    HTDD_attributes string,
    TAVG double,
    TAVG_attributes string,
    TMAX double,
    TMAX_attributes string,
    TMIN double,
    TMIN_attributes string,
    geohash string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n' LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database_name, weather_tb, s3_private_path+weather_tb+'/'
)
print(statement)

pd.read_sql(statement, conn)

CREATE EXTERNAL TABLE IF NOT EXISTS dsoaws.weather(
    Station string,
    WDate string,
    Latitude double,
    Longitude double,
    Elevation double,
    Name string,
    CDSD double,
    CDSD_attributes string,
    CLDD double,
    CLDD_attributes string,
    DT00 double,
    DT00_attributes string,
    DT32 double,
    DT32_attributes string,
    DX32 double,
    DX32_attributes string,
    DX70 double,
    DX70_attributes string,
    DX90 double,
    DX90_attributes string,
    EMNT double,
    EMNT_attributes string,
    EMXT double,
    EMXT_attributes string,
    HDSD double,
    HDSD_attributes string,
    HTDD double,
    HTDD_attributes string,
    TAVG double,
    TAVG_attributes string,
    TMAX double,
    TMAX_attributes string,
    TMIN double,
    TMIN_attributes string,
    geohash string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' LOCATION 's3://sagemaker-us-east-1-074876746575/widfire-risk/data/weather/'
TBLPROPERTIES ('skip.header.li

## Register conditions.csv as Athena table

In [14]:
conditions_csv = 'conditions.csv'
conditions_tb = 'conditions'

# Force table creation
statement = """DROP TABLE IF EXISTS {}.{}""".format(
    database_name, conditions_tb
)    
pd.read_sql(statement, conn)

# Create table
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
    PLT_CN integer,
    INVYR integer,
    COND_STATUS_CD integer,
    MAPDEN double,
    STDAGE double,
    STDSZCD double,
    FLDSZCD double,
    SITECLCD double,
    SICOND double,
    STDORGCD double,
    SLOPE double,
    PHYSCLCD double,
    GSSTKCD double,
    DSTRBCD1 double,
    TRTCD1 double,
    PRESNFCD double,
    FLDAGE double,
    CARBON_DOWN_DEAD double,
    CARBON_LITTER double,
    CARBON_SOIL_ORG double,
    CARBON_STANDING_DEAD double,
    CARBON_UNDERSTORY_AG double,
    CARBON_UNDERSTORY_BG double,
    STATE string,
    WATERCD double,
    LAT double,
    LON double,
    geohash string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n' LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database_name, conditions_tb, s3_private_path+conditions_tb+'/'
)
print(statement)

pd.read_sql(statement, conn)

CREATE EXTERNAL TABLE IF NOT EXISTS dsoaws.conditions(
    PLT_CN integer,
    INVYR integer,
    COND_STATUS_CD integer,
    MAPDEN double,
    STDAGE double,
    STDSZCD double,
    FLDSZCD double,
    SITECLCD double,
    SICOND double,
    STDORGCD double,
    SLOPE double,
    PHYSCLCD double,
    GSSTKCD double,
    DSTRBCD1 double,
    TRTCD1 double,
    PRESNFCD double,
    FLDAGE double,
    CARBON_DOWN_DEAD double,
    CARBON_LITTER double,
    CARBON_SOIL_ORG double,
    CARBON_STANDING_DEAD double,
    CARBON_UNDERSTORY_AG double,
    CARBON_UNDERSTORY_BG double,
    STATE string,
    WATERCD double,
    LAT double,
    LON double,
    geohash string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' LOCATION 's3://sagemaker-us-east-1-074876746575/widfire-risk/data/conditions/'
TBLPROPERTIES ('skip.header.line.count'='1')


## Verify table creation

In [15]:
statement = "SHOW TABLES in {}".format(database_name)
df_show = pd.read_sql(statement, conn)

if [fires_tb, weather_tb] in df_show.values:
    ingest_create_athena_table_passed = True
%store ingest_create_athena_table_passed

df_show.head(5)

Stored 'ingest_create_athena_table_passed' (bool)


Unnamed: 0,tab_name
0,amazon_reviews_tsv
1,conditions
2,fires
3,weather


## Run sample queries

In [16]:
statement = """SELECT * FROM {}.{}
    WHERE FireCause = 'Natural' LIMIT 50""".format(
    database_name, fires_tb
)
print(statement)

df = pd.read_sql(statement, conn)
df.head(5)

SELECT * FROM dsoaws.fires
    WHERE FireCause = 'Natural' LIMIT 50


Unnamed: 0,containmentdatetime,controldatetime,discoveryacres,estimatedcosttodate,finalacres,firebehaviorgeneral,firebehaviorgeneral1,firebehaviorgeneral2,firebehaviorgeneral3,firecause,...,incidentshortdescription,initiallatitude,initiallongitude,isfirecauseinvestigated,istrespass,poocity,poostate,predominantfuelmodel,primaryfuelmodel,geohash
0,,,3.0,,,,,,,Natural,...,,43.28994,-122.5181,,0.0,,US-OR,,,87281a126ffffff
1,,,,0.0,,Active,,,,Natural,...,Tiller,,,,,Tiller,US-OR,,Timber (Grass and Understory),
2,,,1.5,,,,,,,Natural,...,,42.19094,-122.4942,,0.0,,US-OR,,,87281e88cffffff
3,,,0.1,,,,,,,Natural,...,,42.63354,-122.5394,,0.0,,US-OR,,,87281e0cdffffff
4,,,,,,,,,,Natural,...,,,,,0.0,,US-CA,,,


In [17]:
statement = """SELECT * FROM {}.{} LIMIT 10""".format(
    database_name, weather_tb
)
print(statement)

df = pd.read_sql(statement, conn)
df.head(5)

SELECT * FROM dsoaws.weather LIMIT 10


Unnamed: 0,station,wdate,latitude,longitude,elevation,name,cdsd,cdsd_attributes,cldd,cldd_attributes,...,hdsd_attributes,htdd,htdd_attributes,tavg,tavg_attributes,tmax,tmax_attributes,tmin,tmin_attributes,geohash
0,USR0000CRTL,2005-08,36.4069,-118.4217,2621.3,"""RATTLESNAKE CALIFORNIA",,,,4.0,...,"""",,6.7,,U,25.0,"+""",27.8,"""",U
1,USR0000CRTL,2005-09,36.4069,-118.4217,2621.3,"""RATTLESNAKE CALIFORNIA",,,,0.0,...,"""",,-2.8,,U,11.0,"""",25.6,"""",U
2,USR0000CRTL,2006-08,36.4069,-118.4217,2621.3,"""RATTLESNAKE CALIFORNIA",,,,0.6,...,"""1",,4.4,,U,1.0,"""",26.7,"""1",U
3,USR0000CRTL,2006-09,36.4069,-118.4217,2621.3,"""RATTLESNAKE CALIFORNIA",,,,0.0,...,"""",,-3.9,,U,16.0,"""",26.1,"""",U
4,USR0000CRTL,2006-10,36.4069,-118.4217,2621.3,"""RATTLESNAKE CALIFORNIA",,,,0.0,...,"""",,-4.4,,U,17.0,"""",18.9,"""",U


In [18]:
statement = """SELECT * FROM {}.{} LIMIT 10""".format(
    database_name, conditions_tb
)
print(statement)

df = pd.read_sql(statement, conn)
df.head(5)

SELECT * FROM dsoaws.conditions LIMIT 10


Unnamed: 0,plt_cn,invyr,cond_status_cd,mapden,stdage,stdszcd,fldszcd,siteclcd,sicond,stdorgcd,...,carbon_litter,carbon_soil_org,carbon_standing_dead,carbon_understory_ag,carbon_understory_bg,state,watercd,lat,lon,geohash
0,,2019,2,,,,,,,,...,,,,,,CA,1.0,40.148215,-121.785534,872815c84ffffff
1,,2019,1,1.0,30.0,2.0,2.0,5.0,103.0,1.0,...,8.104615,18.434625,0.927575,2.274576,0.252731,CA,0.0,40.330908,-121.771399,872815502ffffff
2,,2019,1,1.0,,3.0,1.0,4.0,106.0,0.0,...,14.140916,12.305077,0.404615,1.797825,0.199758,CA,0.0,40.094837,-122.715747,872815ba5ffffff
3,,2019,2,,,,,,,,...,,,,,,CA,0.0,39.470493,-120.128735,872989a98ffffff
4,,2019,2,,,,,,,,...,,,,,,CA,0.0,37.991883,-120.289126,87283648affffff


# Setup Parquet

In [19]:
# Set Parquet basics
s3_parquet_dir = 's3://{0}/parquet'.format(bucket)

## Create Parquet file from fires.csv

In [20]:
fires_tb_parquet = 'fires_parquet'

# Force table creation
statement = """DROP TABLE IF EXISTS {}.{}""".format(
    database_name, fires_tb_parquet
)    
##pd.read_sql(statement, conn)

# SKIP PARTITIONING INITIALLY: WITH (format = 'PARQUET', external_location = '{}', partitioned_by = ARRAY['firecause']) AS
# Create table
statement = """CREATE TABLE IF NOT EXISTS {}.{}
WITH (format = 'PARQUET', external_location = '{}') AS
SELECT
    ContainmentDateTime,
    ControlDateTime,
    DiscoveryAcres,
    EstimatedCostToDate,
    FinalAcres,
    FireBehaviorGeneral,
    FireBehaviorGeneral1,
    FireBehaviorGeneral2,
    FireBehaviorGeneral3,
    FireCause,
    FireCauseGeneral,
    FireCauseSpecific,
    FireDiscoveryDateTime,
    FireOutDateTime,
    GACC,
    IncidentName,
    IncidentShortDescription,
    InitialLatitude,
    InitialLongitude,
    IsFireCauseInvestigated,
    IsTrespass,
    POOCity,
    POOState,
    PredominantFuelModel,
    PrimaryFuelModel,
    geohash
FROM {}.{}""".format(
    database_name, fires_tb_parquet, s3_private_path+fires_tb_parquet+'/', database_name, fires_tb
)
print(statement)

##pd.read_sql(statement, conn)

CREATE TABLE IF NOT EXISTS dsoaws.fires_parquet
WITH (format = 'PARQUET', external_location = 's3://sagemaker-us-east-1-074876746575/widfire-risk/data/fires_parquet/') AS
SELECT
    ContainmentDateTime,
    ControlDateTime,
    DiscoveryAcres,
    EstimatedCostToDate,
    FinalAcres,
    FireBehaviorGeneral,
    FireBehaviorGeneral1,
    FireBehaviorGeneral2,
    FireBehaviorGeneral3,
    FireCause,
    FireCauseGeneral,
    FireCauseSpecific,
    FireDiscoveryDateTime,
    FireOutDateTime,
    GACC,
    IncidentName,
    IncidentShortDescription,
    InitialLatitude,
    InitialLongitude,
    IsFireCauseInvestigated,
    IsTrespass,
    POOCity,
    POOState,
    PredominantFuelModel,
    PrimaryFuelModel,
    geohash
FROM dsoaws.fires


In [21]:
# Load partitions
statement = "MSCK REPAIR TABLE {}.{}".format(database_name, fires_tb_parquet)
print(statement)

df = pd.read_sql(statement, conn)
df.head(5)

MSCK REPAIR TABLE dsoaws.fires_parquet


In [22]:
# Show partitions
statement = "SHOW PARTITIONS {}.{}".format(database_name, fires_tb_parquet)
print(statement)

#df_partitions = pd.read_sql(statement, conn)
#df_partitions.head(5)

SHOW PARTITIONS dsoaws.fires_parquet


In [23]:
# Show tables
statement = "SHOW TABLES in {}".format(database_name)
df_tables = pd.read_sql(statement, conn)

if fires_tb_parquet in df_tables.values:
    ingest_create_athena_table_parquet_passed = True
%store ingest_create_athena_table_parquet_passed

df_tables.head(5)

UsageError: Unknown variable 'ingest_create_athena_table_parquet_passed'


In [None]:
# Run sample query
statement = """SELECT * FROM {}.{}
    WHERE FireCause = 'Natural' LIMIT 50""".format(
    database_name, fires_tb_parquet
)
print(statement)

df = pd.read_sql(statement, conn)
df.head(5)

# Store Variables and Close Session

In [None]:
# Store variables for subsequent notebooks
%store

In [None]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [None]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}