# Wildfire Risk - Data Pre-Prep
__Team 3 - Dave Friesen, John Chen, and Kyle Dalope__<br>
__ADS-508-02-SP23__<br><br>
__GitHub link: https://github.com/davefriesen/wildfire-risk__

In [1]:
__authors__ = ['Dave Friesen', 'John Chen', 'Kyle Dalope']
__contact__ = ['dfriesen@sandiego.edu', 'johnchen@sandiego.edu', 'kdalope@sandiego.edu']
__date__ = '2023-03-20'
__license__ = 'MIT'
__version__ = '1.0.1'

# Setup

In [2]:
# Import basic and data access libraries
import pandas as pd
from profiler import profile, profile_cat

# Import utility libraries
import requests
from bs4 import BeautifulSoup

# Data Load and Validation

## Fire Data - Filter for Select Characteristics across Western States

In [3]:
# Create dataframe from select (relevant) columns
cols = ['ContainmentDateTime',
        'ControlDateTime',
        'DiscoveryAcres',
        'EstimatedCostToDate',
        'FinalAcres',
        'FireBehaviorGeneral',
        'FireBehaviorGeneral1',
        'FireBehaviorGeneral2',
        'FireBehaviorGeneral3',
        'FireCause',
        'FireCauseGeneral',
        'FireCauseSpecific',
        'FireDiscoveryDateTime',
        'FireOutDateTime',
        'GACC',
        'IncidentName',
        'IncidentShortDescription',
        'InitialLatitude',
        'InitialLongitude',
        'IsFireCauseInvestigated',
        'IsTrespass',
        'POOCity',
        'POOState',
        'PredominantFuelModel',
        'PrimaryFuelModel']
wfil_df = pd.read_csv('../data/Wildland_Fire_Incident_Locations.csv',
                      usecols=cols, low_memory=False)

In [4]:
# Profile selected dataframe
profile(wfil_df)

Unnamed: 0,Dtype,count,unique,na,na%,mean,std,min,max,skew(>=3),<v0.01,VIF(>=10),examples
ContainmentDateTime,object,152010,128838,100224.0,39.7,,,,,,,,nan__2022/06/04 20:2
ControlDateTime,object,137117,111736,115117.0,45.6,,,,,,,,nan__2022/06/05 16:1
DiscoveryAcres,float64,187608,1748,64626.0,25.6,20.5,838.5,,250000.0,195.0,,,nan__0.1__0.1__0.1__
EstimatedCostToDate,float64,14793,3910,237441.0,94.1,1661896.9,14017525.2,,800000000.0,30.4,,,nan__nan__nan__nan__
FinalAcres,float64,18292,1682,233942.0,92.7,353.4,5411.5,,318156.0,36.8,,,nan__nan__nan__nan__
FireBehaviorGeneral,object,16089,4,236145.0,93.6,,,,,,,,nan__nan__nan__nan__
FireBehaviorGeneral1,object,7173,16,245061.0,97.2,,,,,,,,nan__nan__nan__nan__
FireBehaviorGeneral2,object,6026,16,246208.0,97.6,,,,,,,,nan__nan__nan__nan__
FireBehaviorGeneral3,object,3869,16,248365.0,98.5,,,,,,,,nan__nan__nan__nan__
FireCause,object,221463,4,30771.0,12.2,,,,,,,,nan__Human__Human__U


In [5]:
# Profile states for further selection
profile_cat(wfil_df, ['POOState'])


POOState - 
 US-CA  24.084382
 US-AZ   6.855935
 US-MT   6.777437
 US-OR   6.289001
 US-WA   5.013598
 US-UT   4.232974
 US-ID    4.22901
 US-CO    3.68626
 US-NM    3.52054
 US-TX   3.485256
 US-WY   3.473758
 US-ND   3.427373
 US-OK   3.151042
 US-MN   2.944092
 US-SD   2.503231
 US-NV   2.155538
 US-FL   2.105981
 US-AK   1.833615
 US-ME   1.476803
 US-MS   0.947533
 US-AR   0.735032
 US-MI   0.680717
 US-MA   0.583189
 US-MO   0.533631
 US-AL   0.500726
 US-TN   0.490814
 US-KY   0.461476
 US-SC   0.439671
 US-NC   0.371877
 US-LA   0.345711
 US-IL    0.30329
 US-NE   0.260869
 US-VA     0.2462
 US-KS   0.243028
 US-OH   0.242632
 US-GA   0.232324
 US-WI   0.202193
 US-IN   0.136381
 US-WV   0.127659
 US-PA   0.123298
 US-IA   0.108233
 US-PR   0.097528
 US-NY   0.080084
 US-NH   0.079688
 US-MD   0.043214
 US-VT   0.042421
 US-NJ   0.040835
 US-HI   0.019823
 US-DC   0.011497
 US-GU   0.005154
MX-BCN   0.003965
MX-SON   0.003568
 MX-BN   0.001982
 US-CT   0.001982
 CA-BC   0.0011

In [6]:
# Subset dataframe for select states
wfil_s_df = wfil_df[wfil_df['POOState'].isin(['US-CA', 'US-OR', 'US-WA'])]
profile(wfil_s_df)
profile_cat(wfil_s_df, ['POOState'])

Unnamed: 0,Dtype,count,unique,na,na%,mean,std,min,max,skew(>=3),<v0.01,VIF(>=10),examples
ContainmentDateTime,object,35342,33135,53916.0,60.4,,,,,,,,nan__2014/08/24 18:5
ControlDateTime,object,32592,30423,56666.0,63.5,,,,,,,,nan__2014/08/24 18:5
DiscoveryAcres,float64,72286,348,16972.0,19.0,5.8,499.6,,115997.0,191.9,,,nan__nan__0.1__2.0__
EstimatedCostToDate,float64,2128,1040,87130.0,97.6,8193477.0,34440012.7,,800000000.0,13.1,,,nan__nan__nan__nan__
FinalAcres,float64,2639,345,86619.0,97.0,14.8,317.6,,13440.0,34.3,,,nan__nan__nan__nan__
FireBehaviorGeneral,object,1498,4,87760.0,98.3,,,,,,,,nan__nan__nan__nan__
FireBehaviorGeneral1,object,1223,16,88035.0,98.6,,,,,,,,nan__nan__nan__nan__
FireBehaviorGeneral2,object,1164,16,88094.0,98.7,,,,,,,,nan__nan__nan__nan__
FireBehaviorGeneral3,object,910,16,88348.0,99.0,,,,,,,,nan__nan__nan__nan__
FireCause,object,78058,4,11200.0,12.5,,,,,,,,Natural__Human__Unde



POOState - 
US-CA  68.060006
US-OR  17.772076
US-WA  14.167918


In [7]:
# Write filtered dataframe
wfil_s_df.to_csv('../data/fires.csv', index=False)

## Weather Data - Filter For and Combine Western States

In [8]:
# Not used - station retrieval
#colw = [11, 9, 10, 7, 3, 32, 3, 10]
#coln = ['ID', 'b', 'c', 'd', 'State', 'f', 'g', 'h']
#wstn_df = pd.read_fwf('../data/ghcnd-stations.txt', header=None, widths=colw, names=coln)
#profile_cat(wstn_df, ['State'])
#
#wstn_s_df = wstn_df[wstn_df['State'].isin(['CA', 'OR', 'WA'])]
#profile(wstn_s_df)
#profile_cat(wstn_s_df, ['State'])
#wstn_s_df

In [9]:
# Set base URL for NOAA weather files
url = 'https://www.ncei.noaa.gov/data/global-summary-of-the-month/access/'

# Fetch HTML content of the web page - all file nbames
response = requests.get(url)
response.raise_for_status()

# Parse HTML content
soup = BeautifulSoup(response.text, 'html.parser')

In [10]:
# Extract file names from the anchor tags in the HTML FOR SELECT STATES
file_names = [fn['href'] \
              for fn in soup.find_all('a')
                  if fn['href'].startswith(('USR0000C', 'USR0000O', 'USR0000W'))]

In [11]:
# Iterate select files and create combined dataframe
tfn = len(file_names)
print(f'Retrieving {tfn:d} files: ', end='')
n = 1
wthr_df = pd.DataFrame()
for f in file_names:
    print('.', end='')
    if n % 10 == 0: print(n, end='')
    wthr_df = pd.concat([wthr_df, pd.read_csv(url + f)])
    n = n + 1

# Write combined dataframe        
wthr_df.to_csv('../data/weather.csv', index=False)
profile(wthr_df)

Retrieving 610 files: ..........10..........20..........30..........40..........50..........60..........70..........80..........90..........100..........110..........120..........130..........140..........150..........160..........170..........180..........190..........200..........210..........220..........230..........240..........250..........260..........270..........280..........290..........300..........310..........320..........330..........340..........350..........360..........370..........380..........390..........400..........410..........420..........430..........440..........450..........460..........470..........480..........490..........500..........510..........520..........530..........540..........550..........560..........570..........580..........590..........600..........610

Unnamed: 0,Dtype,count,unique,na,na%,mean,std,min,max,skew(>=3),<v0.01,VIF(>=10),examples
STATION,object,175300,610,,,,,,,,,,USR0000OTUP__USR0000
DATE,object,175300,461,,,,,,,,,,1988-06__1993-10__20
LATITUDE,float64,175300,600,,,40.1,3.9,32.6,49.0,,,131.0,45.0708__48.4028__39
LONGITUDE,float64,175300,590,,,-118.0,6.3,-124.4,-82.4,,,266.0,-119.49__-121.7903__
ELEVATION,float64,175300,472,,,1214.0,734.3,,3694.2,,,,1219.2__579.1__1652.
NAME,object,175300,610,,,,,,,,,,"TUPPER OREGON, OR US"
CDSD,float64,148117,11831,27183.0,15.5,182.8,315.5,,3218.1,3.1,,,19.9__nan__0.0__1.7_
CDSD_ATTRIBUTES,object,143784,1,31516.0,18.0,,,,,,,,U__nan__U__U__U
CLDD,float64,174374,4080,926.0,0.5,34.0,67.2,,601.0,,,23.0,17.0__0.7__0.0__1.7_
CLDD_ATTRIBUTES,object,174374,6,926.0,0.5,,,,,,,,",U__,U__,U__,U__,U"


## Forest Condition Data - Filter and Combine for Western States

In [12]:
# Get the individual files
ca_plot_df = pd.read_csv('../data/CA_PLOT.csv', low_memory=False)
or_plot_df = pd.read_csv('../data/OR_PLOT.csv', low_memory=False)
wa_plot_df = pd.read_csv('../data/WA_PLOT.csv', low_memory=False)
ca_cond_df = pd.read_csv('../data/CA_COND.csv', low_memory=False)
or_cond_df = pd.read_csv('../data/OR_COND.csv', low_memory=False)
wa_cond_df = pd.read_csv('../data/WA_COND.csv', low_memory=False)

# Combine 'em
plot_df = pd.concat([ca_plot_df, or_plot_df, wa_plot_df])
cond_df = pd.concat([ca_cond_df, or_cond_df, wa_cond_df])

# Subset 'em to relevant columns
cols = ['PLT_CN', 'INVYR', 'STATECD', 'COND_STATUS_CD', 'MAPDEN', 'STDAGE',
        'STDSZCD', 'FLDSZCD', 'SITECLCD', 'SICOND', 'STDORGCD', 'SLOPE',
        'PHYSCLCD', 'GSSTKCD', 'DSTRBCD1', 'TRTCD1', 'PRESNFCD', 'FLDAGE',
        'CARBON_DOWN_DEAD', 'CARBON_LITTER', 'CARBON_SOIL_ORG',
        'CARBON_STANDING_DEAD', 'CARBON_UNDERSTORY_AG', 'CARBON_UNDERSTORY_BG']
cond_df = cond_df.loc[:, cols]

# Convert state codes to abbreviations
smap = {6: 'CA', 41: 'OR', 53: 'WA'}
cond_df['STATE'] = cond_df['STATECD'].map(smap)
cond_df = cond_df.drop(columns=['STATECD'])

# Join plot and cond dataframes to get location (latitude and longitude)
plot_df = plot_df.reset_index()
cond_df = pd.merge(cond_df, plot_df[['CN', 'WATERCD', 'LAT', 'LON']],
                   left_on='PLT_CN', right_on='CN', how='left')
cond_df = cond_df.drop(columns=['CN'])

# Profile and write
cond_df.to_csv('../data/conditions.csv', index=False)
profile(cond_df)

Unnamed: 0,Dtype,count,unique,na,na%,mean,std,min,max,skew(>=3),<v0.01,VIF(>=10),examples
PLT_CN,int64,120208,94231,,,127745766191751.4,174962050684977.5,1.0,635060431126144.0,,,,22954109010497__2909
INVYR,int64,120208,21,,,2008.4,6.9,1994.0,2019.0,,,,2007__1994__2002__20
COND_STATUS_CD,int64,120208,5,,,1.8,1.0,1.0,5.0,,,,1__4__2__1__1
MAPDEN,float64,48111,3,72097.0,60.0,1.0,0.1,1.0,3.0,8.5,,20.0,1.0__nan__nan__1.0__
STDAGE,float64,50653,488,69555.0,57.9,93.5,95.6,,9999.0,23.7,,,60.0__nan__nan__135.
STDSZCD,float64,52176,4,68032.0,56.6,1.5,1.0,1.0,5.0,,,,1.0__nan__nan__1.0__
FLDSZCD,float64,48111,6,72097.0,60.0,2.7,1.0,,5.0,,,10.6,3.0__nan__nan__2.0__
SITECLCD,float64,54682,7,65526.0,54.5,4.6,1.6,1.0,7.0,,,,7.0__nan__nan__6.0__
SICOND,float64,45612,175,74596.0,62.1,90.7,26.9,7.0,192.0,,,,nan__nan__nan__57.0_
STDORGCD,float64,48111,2,72097.0,60.0,0.2,0.4,,1.0,,,,0.0__nan__nan__0.0__


## Experiment with Logical Joins across Dataframes

In [43]:
# Show date range on fires table
wfil_s_df = wfil_s_df.copy()
wfil_s_df.loc[:, 'year'] = wfil_s_df['FireDiscoveryDateTime'].apply(lambda x: x[:4])
year_counts = wfil_s_df.groupby('year')['FireDiscoveryDateTime'].count().sort_values(ascending=False)
print(year_counts)

year
2022    17544
2021    16335
2020    14335
2019    10072
2017     7780
2018     7212
2015     5811
2014     5097
2016     3809
2023     1260
2011        2
2004        1
Name: FireDiscoveryDateTime, dtype: int64


In [44]:
# Show date range on weather table
wthr_df['year'] = wthr_df['DATE'].apply(lambda x: x[:4])
year_counts = wthr_df.groupby(wthr_df['DATE'].str[:4]).size().sort_index(ascending=False)
print(year_counts)

DATE
2023     959
2022    5792
2021    1463
2020    5877
2019    5838
2018    5891
2017    5846
2016    5830
2015    5908
2014    5937
2013    5775
2012    6035
2011    6077
2010    6171
2009    6178
2008    6046
2007    6227
2006    6189
2005    6255
2004    6273
2003    6237
2002    5728
2001    5420
2000    5131
1999    4808
1998    4295
1997    4180
1996    4021
1995    3807
1994    3564
1993    3122
1992    3292
1991    2833
1990    2295
1989    1715
1988    1396
1987    1303
1986     910
1985     643
1984      29
1983       4
dtype: int64


In [45]:
# Show date range on conditions table
cond_df[['STATE', 'INVYR']].drop_duplicates().\
    sort_values(by=['INVYR', 'STATE'],
                ascending=[False, True]).reset_index(drop=True)

Unnamed: 0,STATE,INVYR
0,CA,2019
1,OR,2019
2,WA,2019
3,CA,2018
4,OR,2018
5,WA,2018
6,CA,2017
7,OR,2017
8,WA,2017
9,CA,2016
