# Wildfire Risk - Data Pre-Prep
__Team 3 - Dave Friesen, John Chen, and Kyle Dalope__<br>
__ADS-508-02-SP23__<br><br>
__GitHub link: https://github.com/davefriesen/wildfire-risk__

In [2]:
__authors__ = ['Dave Friesen', 'John Chen', 'Kyle Dalope']
__contact__ = ['dfriesen@sandiego.edu', 'johnchen@sandiego.edu', 'kdalope@sandiego.edu']
__date__ = '2023-03-20'
__license__ = 'MIT'
__version__ = '1.0.1'

# Setup

In [3]:
# Import basic and data access libraries
import pandas as pd
from profiler import profile, profile_cat

# Import utility libraries
import requests
from bs4 import BeautifulSoup

In [4]:
#clone Git Repo to local machine to run the codes below
!git clone https://github.com/davefriesen/wildfire-risk.git

fatal: destination path 'wildfire-risk' already exists and is not an empty directory.


In [5]:
#check datasets are located within data subdirectory
!ls wildfire-risk/data

CA_COND.csv  OR_PLOT.csv  Wildland_Fire_Incident_Locations.xlsx  weather.csv
CA_PLOT.csv  WA_COND.csv  conditions.csv
OR_COND.csv  WA_PLOT.csv  fires.csv


# Data Load and Validation

## Fire Data - Filter for Select Characteristics across Western States

In [6]:
# Create dataframe from select (relevant) columns
cols = ['ContainmentDateTime',
        'ControlDateTime',
        'DiscoveryAcres',
        'EstimatedCostToDate',
        'FinalAcres',
        'FireBehaviorGeneral',
        'FireBehaviorGeneral1',
        'FireBehaviorGeneral2',
        'FireBehaviorGeneral3',
        'FireCause',
        'FireCauseGeneral',
        'FireCauseSpecific',
        'FireDiscoveryDateTime',
        'FireOutDateTime',
        'GACC',
        'IncidentName',
        'IncidentShortDescription',
        'InitialLatitude',
        'InitialLongitude',
        'IsFireCauseInvestigated',
        'IsTrespass',
        'POOCity',
        'POOState',
        'PredominantFuelModel',
        'PrimaryFuelModel']
wfil_df = pd.read_csv('../data/fires.csv',
                      usecols=cols, low_memory=False)

In [7]:
# Profile selected dataframe
profile(wfil_df)

Unnamed: 0,Dtype,count,unique,na,na%,mean,std,min,max,skew(>=3),<v0.01,VIF(>=10),examples
ContainmentDateTime,object,35352,33145,53942.0,60.4,,,,,,,,nan__2021/07/17 21:5
ControlDateTime,object,32598,30429,56696.0,63.5,,,,,,,,nan__2021/07/17 23:0
DiscoveryAcres,float64,72321,348,16973.0,19.0,5.8,499.5,,115997.0,191.9,,,0.1__1.0__nan__0.01_
EstimatedCostToDate,float64,2128,1040,87166.0,97.6,8193477.0,34440012.7,,800000000.0,13.1,,,nan__nan__nan__nan__
FinalAcres,float64,2645,345,86649.0,97.0,14.8,317.3,,13440.0,34.3,,,nan__nan__nan__nan__
FireBehaviorGeneral,object,1498,4,87796.0,98.3,,,,,,,,nan__nan__nan__nan__
FireBehaviorGeneral1,object,1223,16,88071.0,98.6,,,,,,,,nan__nan__nan__nan__
FireBehaviorGeneral2,object,1164,16,88130.0,98.7,,,,,,,,nan__nan__nan__nan__
FireBehaviorGeneral3,object,910,16,88384.0,99.0,,,,,,,,nan__nan__nan__nan__
FireCause,object,78093,4,11201.0,12.5,,,,,,,,Unknown__Undetermine


In [8]:
# Profile states for further selection
profile_cat(wfil_df, ['POOState'])


POOState - 
US-CA 68.068403
US-OR 17.764911
US-WA 14.166685


In [9]:
# Subset dataframe for select states
wfil_s_df = wfil_df[wfil_df['POOState'].isin(['US-CA', 'US-OR', 'US-WA'])]
profile(wfil_s_df)
profile_cat(wfil_s_df, ['POOState'])

Unnamed: 0,Dtype,count,unique,na,na%,mean,std,min,max,skew(>=3),<v0.01,VIF(>=10),examples
ContainmentDateTime,object,35352,33145,53942.0,60.4,,,,,,,,2022/08/25 02:00:59+
ControlDateTime,object,32598,30429,56696.0,63.5,,,,,,,,2022/08/25 03:43:59+
DiscoveryAcres,float64,72321,348,16973.0,19.0,5.8,499.5,,115997.0,191.9,,,1.0__0.01__0.1__0.1_
EstimatedCostToDate,float64,2128,1040,87166.0,97.6,8193477.0,34440012.7,,800000000.0,13.1,,,nan__nan__12224243.0
FinalAcres,float64,2645,345,86649.0,97.0,14.8,317.3,,13440.0,34.3,,,nan__nan__nan__nan__
FireBehaviorGeneral,object,1498,4,87796.0,98.3,,,,,,,,nan__nan__nan__nan__
FireBehaviorGeneral1,object,1223,16,88071.0,98.6,,,,,,,,nan__nan__nan__nan__
FireBehaviorGeneral2,object,1164,16,88130.0,98.7,,,,,,,,nan__nan__nan__nan__
FireBehaviorGeneral3,object,910,16,88384.0,99.0,,,,,,,,nan__nan__nan__nan__
FireCause,object,78093,4,11201.0,12.5,,,,,,,,Undetermined__Undete



POOState - 
US-CA 68.068403
US-OR 17.764911
US-WA 14.166685


In [10]:
# Write filtered dataframe
wfil_s_df.to_csv('../data/fires.csv', index=False)

## Weather Data - Filter For and Combine Western States

In [11]:
# Not used - station retrieval
#colw = [11, 9, 10, 7, 3, 32, 3, 10]
#coln = ['ID', 'b', 'c', 'd', 'State', 'f', 'g', 'h']
#wstn_df = pd.read_fwf('../data/ghcnd-stations.txt', header=None, widths=colw, names=coln)
#profile_cat(wstn_df, ['State'])
#
#wstn_s_df = wstn_df[wstn_df['State'].isin(['CA', 'OR', 'WA'])]
#profile(wstn_s_df)
#profile_cat(wstn_s_df, ['State'])
#wstn_s_df

In [12]:
# Set base URL for NOAA weather files
url = 'https://www.ncei.noaa.gov/data/global-summary-of-the-month/access/'

# Fetch HTML content of the web page - all file nbames
response = requests.get(url)
response.raise_for_status()

# Parse HTML content
soup = BeautifulSoup(response.text, 'html.parser')

In [13]:
# Extract file names from the anchor tags in the HTML FOR SELECT STATES
file_names = [fn['href'] \
              for fn in soup.find_all('a')
                  if fn['href'].startswith(('USR0000C', 'USR0000O', 'USR0000W'))]

In [14]:
# Iterate select files and create combined dataframe
tfn = len(file_names)
print(f'Retrieving {tfn:d} files: ', end='')
n = 1
wthr_df = pd.DataFrame()
for f in file_names:
    print('.', end='')
    if n % 10 == 0: print(n, end='')
    wthr_df = pd.concat([wthr_df, pd.read_csv(url + f)])
    n = n + 1

# Write combined dataframe        
wthr_df.to_csv('../data/weather.csv', index=False)
profile(wthr_df)

Retrieving 610 files: ..........10..........20..........30..........40..........50..........60..........70..........80..........90..........100..........110..........120..........130..........140..........150..........160..........170..........180..........190..........200..........210..........220..........230..........240..........250..........260..........270..........280..........290..........300..........310..........320..........330..........340..........350..........360..........370..........380..........390..........400..........410..........420..........430..........440..........450..........460..........470..........480..........490..........500..........510..........520..........530..........540..........550..........560..........570..........580..........590..........600..........610

Unnamed: 0,Dtype,count,unique,na,na%,mean,std,min,max,skew(>=3),<v0.01,VIF(>=10),examples
STATION,object,175300,610,,,,,,,,,,USR0000CSQS__USR0000
DATE,object,175300,461,,,,,,,,,,2013-12__2012-08__20
LATITUDE,float64,175300,600,,,40.1,3.9,32.6,49.0,,,131.0,35.37__37.8331__40.3
LONGITUDE,float64,175300,590,,,-118.0,6.3,-124.4,-82.4,,,266.0,-117.5683__-120.5__-
ELEVATION,float64,175300,472,,,1214.0,734.3,,3694.2,,,,1103.4__310.9__579.1
NAME,object,175300,610,,,,,,,,,,SQUAW SPRINGS CALIFO
CDSD,float64,148117,11831,27183.0,15.5,182.8,315.5,,3218.1,3.1,,,1313.4__736.5__0.0__
CDSD_ATTRIBUTES,object,143784,1,31516.0,18.0,,,,,,,,U__U__U__U__U
CLDD,float64,174374,4080,926.0,0.5,34.0,67.2,,601.0,,,23.0,0.0__293.5__0.0__202
CLDD_ATTRIBUTES,object,174374,6,926.0,0.5,,,,,,,,",U__,U__,U__,U__,U"


## Forest Condition Data - Filter and Combine for Western States

In [15]:
# Get the individual files
ca_plot_df = pd.read_csv('../data/CA_PLOT.csv', low_memory=False)
or_plot_df = pd.read_csv('../data/OR_PLOT.csv', low_memory=False)
wa_plot_df = pd.read_csv('../data/WA_PLOT.csv', low_memory=False)
ca_cond_df = pd.read_csv('../data/CA_COND.csv', low_memory=False)
or_cond_df = pd.read_csv('../data/OR_COND.csv', low_memory=False)
wa_cond_df = pd.read_csv('../data/WA_COND.csv', low_memory=False)

# Combine 'em
plot_df = pd.concat([ca_plot_df, or_plot_df, wa_plot_df])
cond_df = pd.concat([ca_cond_df, or_cond_df, wa_cond_df])

# Subset 'em to relevant columns
cols = ['PLT_CN', 'INVYR', 'STATECD', 'COND_STATUS_CD', 'MAPDEN', 'STDAGE',
        'STDSZCD', 'FLDSZCD', 'SITECLCD', 'SICOND', 'STDORGCD', 'SLOPE',
        'PHYSCLCD', 'GSSTKCD', 'DSTRBCD1', 'TRTCD1', 'PRESNFCD', 'FLDAGE',
        'CARBON_DOWN_DEAD', 'CARBON_LITTER', 'CARBON_SOIL_ORG',
        'CARBON_STANDING_DEAD', 'CARBON_UNDERSTORY_AG', 'CARBON_UNDERSTORY_BG']
cond_df = cond_df.loc[:, cols]

# Convert state codes to abbreviations
smap = {6: 'CA', 41: 'OR', 53: 'WA'}
cond_df['STATE'] = cond_df['STATECD'].map(smap)
cond_df = cond_df.drop(columns=['STATECD'])

# Join plot and cond dataframes to get location (latitude and longitude)
plot_df = plot_df.reset_index()
cond_df = pd.merge(cond_df, plot_df[['CN', 'WATERCD', 'LAT', 'LON']],
                   left_on='PLT_CN', right_on='CN', how='left')
cond_df = cond_df.drop(columns=['CN'])

# Profile and write
cond_df.to_csv('../data/conditions.csv', index=False)
profile(cond_df)

Unnamed: 0,Dtype,count,unique,na,na%,mean,std,min,max,skew(>=3),<v0.01,VIF(>=10),examples
PLT_CN,int64,120208,94231,,,127745766191751.4,174962050684977.5,1.0,635060431126144.0,,,,22929974010497__2988
INVYR,int64,120208,21,,,2008.4,6.9,1994.0,2019.0,,,,2008__2009__2015__20
COND_STATUS_CD,int64,120208,5,,,1.8,1.0,1.0,5.0,,,,4__5__2__1__5
MAPDEN,float64,48111,3,72097.0,60.0,1.0,0.1,1.0,3.0,8.5,,20.0,nan__nan__nan__1.0__
STDAGE,float64,50653,488,69555.0,57.9,93.5,95.6,,9999.0,23.7,,,nan__nan__nan__142.0
STDSZCD,float64,52176,4,68032.0,56.6,1.5,1.0,1.0,5.0,,,,nan__nan__nan__1.0__
FLDSZCD,float64,48111,6,72097.0,60.0,2.7,1.0,,5.0,,,10.6,nan__nan__nan__3.0__
SITECLCD,float64,54682,7,65526.0,54.5,4.6,1.6,1.0,7.0,,,,nan__nan__nan__5.0__
SICOND,float64,45612,175,74596.0,62.1,90.7,26.9,7.0,192.0,,,,nan__nan__nan__87.0_
STDORGCD,float64,48111,2,72097.0,60.0,0.2,0.4,,1.0,,,,nan__nan__nan__0.0__


# Release Resources

In [16]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [17]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}

<IPython.core.display.Javascript object>