# Wildfire Risk - Data Pre-Prep
__Team 3 - Dave Friesen, John Chen, and Kyle Dalope__<br>
__ADS-508-02-SP23__<br><br>
__GitHub link: https://github.com/davefriesen/wildfire-risk__

In [2]:
__authors__ = ['Dave Friesen', 'John Chen', 'Kyle Dalope']
__contact__ = ['dfriesen@sandiego.edu', 'johnchen@sandiego.edu', 'kdalope@sandiego.edu']
__date__ = '2023-03-20'
__license__ = 'MIT'
__version__ = '1.0.1'

# Setup

In [3]:
# Import basic and data access libraries
import pandas as pd
from profiler import profile, profile_cat

# Import utility libraries
import requests
from bs4 import BeautifulSoup

In [4]:
#clone Git Repo to local machine to run the codes below
!git clone https://github.com/davefriesen/wildfire-risk.git


fatal: destination path 'wildfire-risk' already exists and is not an empty directory.


In [5]:
#check datasets are located within data subdirectory
!ls wildfire-risk/data

Wildland_Fire_Incident_Locations.xlsx  fires.csv  weather.csv


# Data Load and Validation

## Fire Data - Filter for Select Characteristics across Western States

In [6]:
# Create dataframe from select (relevant) columns
cols = ['ContainmentDateTime',
        'ControlDateTime',
        'DiscoveryAcres',
        'EstimatedCostToDate',
        'FinalAcres',
        'FireBehaviorGeneral',
        'FireBehaviorGeneral1',
        'FireBehaviorGeneral2',
        'FireBehaviorGeneral3',
        'FireCause',
        'FireCauseGeneral',
        'FireCauseSpecific',
        'FireDiscoveryDateTime',
        'FireOutDateTime',
        'GACC',
        'IncidentName',
        'IncidentShortDescription',
        'InitialLatitude',
        'InitialLongitude',
        'IsFireCauseInvestigated',
        'IsTrespass',
        'POOCity',
        'POOState',
        'PredominantFuelModel',
        'PrimaryFuelModel']
wfil_df = pd.read_csv('wildfire-risk/data/fires.csv',
                      usecols=cols, low_memory=False)

In [7]:
# Profile selected dataframe
profile(wfil_df)

Unnamed: 0,Dtype,count,unique,na,na%,mean,std,min,max,skew(>=3),<v0.01,VIF(>=10),examples
ContainmentDateTime,object,35315,33108,53847.0,60.4,,,,,,,,nan__nan__nan__2019/
ControlDateTime,object,32570,30401,56592.0,63.5,,,,,,,,nan__nan__nan__2019/
DiscoveryAcres,float64,72195,348,16967.0,19.0,5.8,500.0,,115997.0,191.8,,,0.1__0.1__0.1__0.1__
EstimatedCostToDate,float64,2128,1040,87034.0,97.6,8193477.0,34440012.7,,800000000.0,13.1,,,nan__nan__nan__nan__
FinalAcres,float64,2631,345,86531.0,97.0,14.9,318.1,,13440.0,34.3,,,nan__nan__nan__nan__
FireBehaviorGeneral,object,1498,4,87664.0,98.3,,,,,,,,nan__nan__nan__nan__
FireBehaviorGeneral1,object,1223,16,87939.0,98.6,,,,,,,,nan__nan__nan__nan__
FireBehaviorGeneral2,object,1164,16,87998.0,98.7,,,,,,,,nan__nan__nan__nan__
FireBehaviorGeneral3,object,910,16,88252.0,99.0,,,,,,,,nan__nan__nan__nan__
FireCause,object,77967,4,11195.0,12.6,,,,,,,,Undetermined__Undete


In [8]:
# Profile states for further selection
profile_cat(wfil_df, ['POOState'])


POOState - 
US-CA 68.054777
US-OR 17.786725
US-WA 14.158498


In [9]:
# Subset dataframe for select states
wfil_s_df = wfil_df[wfil_df['POOState'].isin(['US-CA', 'US-OR', 'US-WA'])]
profile(wfil_s_df)
profile_cat(wfil_s_df, ['POOState'])

Unnamed: 0,Dtype,count,unique,na,na%,mean,std,min,max,skew(>=3),<v0.01,VIF(>=10),examples
ContainmentDateTime,object,35315,33108,53847.0,60.4,,,,,,,,nan__2022/05/16 00:2
ControlDateTime,object,32570,30401,56592.0,63.5,,,,,,,,nan__2022/05/16 00:4
DiscoveryAcres,float64,72195,348,16967.0,19.0,5.8,500.0,,115997.0,191.8,,,nan__0.1__0.1__0.1__
EstimatedCostToDate,float64,2128,1040,87034.0,97.6,8193477.0,34440012.7,,800000000.0,13.1,,,nan__nan__nan__nan__
FinalAcres,float64,2631,345,86531.0,97.0,14.9,318.1,,13440.0,34.3,,,nan__nan__nan__nan__
FireBehaviorGeneral,object,1498,4,87664.0,98.3,,,,,,,,nan__nan__nan__nan__
FireBehaviorGeneral1,object,1223,16,87939.0,98.6,,,,,,,,nan__nan__nan__nan__
FireBehaviorGeneral2,object,1164,16,87998.0,98.7,,,,,,,,nan__nan__nan__nan__
FireBehaviorGeneral3,object,910,16,88252.0,99.0,,,,,,,,nan__nan__nan__nan__
FireCause,object,77967,4,11195.0,12.6,,,,,,,,nan__Human__Unknown_



POOState - 
US-CA 68.054777
US-OR 17.786725
US-WA 14.158498


In [10]:
# Write filtered dataframe
wfil_s_df.to_csv('../data/fires.csv', index=False)

## Weather Data - Filter For and Combine Western States

In [11]:
# Not used - station retrieval
#colw = [11, 9, 10, 7, 3, 32, 3, 10]
#coln = ['ID', 'b', 'c', 'd', 'State', 'f', 'g', 'h']
#wstn_df = pd.read_fwf('../data/ghcnd-stations.txt', header=None, widths=colw, names=coln)
#profile_cat(wstn_df, ['State'])
#
#wstn_s_df = wstn_df[wstn_df['State'].isin(['CA', 'OR', 'WA'])]
#profile(wstn_s_df)
#profile_cat(wstn_s_df, ['State'])
#wstn_s_df

In [12]:
# Set base URL for NOAA weather files
url = 'https://www.ncei.noaa.gov/data/global-summary-of-the-month/access/'

# Fetch HTML content of the web page - all file nbames
response = requests.get(url)
response.raise_for_status()

# Parse HTML content
soup = BeautifulSoup(response.text, 'html.parser')

In [13]:
# Extract file names from the anchor tags in the HTML FOR SELECT STATES
file_names = [fn['href'] \
              for fn in soup.find_all('a')
                  if fn['href'].startswith(('USR0000C', 'USR0000O', 'USR0000W'))]

In [14]:
# Iterate select files and create combined dataframe
tfn = len(file_names)
print(f'Retrieving {tfn:d} files: ', end='')
n = 1
df = pd.DataFrame()
for f in file_names:
    print('.', end='')
    if n % 10 == 0: print(n, end='')
    df = pd.concat([df, pd.read_csv(url + f)])
    n = n + 1

# Write combined dataframe        
df.to_csv('../data/weather.csv', index=False)
profile(df)

Retrieving 610 files: ..........10..........20..........30..........40..........50..........60..........70..........80..........90..........100..........110..........120..........130..........140..........150..........160..........170..........180..........190..........200..........210..........220..........230..........240..........250..........260..........270..........280..........290..........300..........310..........320..........330..........340..........350..........360..........370..........380..........390..........400..........410..........420..........430..........440..........450..........460..........470..........480..........490..........500..........510..........520..........530..........540..........550..........560..........570..........580..........590..........600..........610

Unnamed: 0,Dtype,count,unique,na,na%,mean,std,min,max,skew(>=3),<v0.01,VIF(>=10),examples
STATION,object,175300,610,,,,,,,,,,USR0000OZAL__USR0000
DATE,object,175300,461,,,,,,,,,,2011-08__2008-12__19
LATITUDE,float64,175300,600,,,40.1,3.9,32.6,49.0,,,131.0,39.2753__48.1156__38
LONGITUDE,float64,175300,590,,,-118.0,6.3,-124.4,-82.4,,,266.0,-82.385__-120.1031__
ELEVATION,float64,175300,472,,,1214.0,734.3,,3694.2,,,,222.5__1085.1__2365.
NAME,object,175300,610,,,,,,,,,,"ZALESKI OHIO, OH US_"
CDSD,float64,148117,11831,27183.0,15.5,182.8,315.5,,3218.1,3.1,,,588.6__196.9__nan__0
CDSD_ATTRIBUTES,object,143784,1,31516.0,18.0,,,,,,,,U__U__nan__U__U
CLDD,float64,174374,4080,926.0,0.5,34.0,67.2,,601.0,,,23.0,152.2__0.0__2.2__0.0
CLDD_ATTRIBUTES,object,174374,6,926.0,0.5,,,,,,,,",U__,U__1,U__,U__,U"


# Release Resources

In [15]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [16]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}

<IPython.core.display.Javascript object>