# Wildfire Risk - Data Pre-Prep
__Team 3 - Dave Friesen, John Chen, and Kyle Dalope__<br>
__ADS-508-02-SP23__<br><br>
__GitHub link: https://github.com/davefriesen/wildfire-risk__

In [1]:
# Notebook metadata: authors, contact e-mail addresses, date, license, version
__authors__ = ['Dave Friesen', 'John Chen', 'Kyle Dalope']
__contact__ = ['dfriesen@sandiego.edu', 'johnchen@sandiego.edu', 'kdalope@sandiego.edu']
__date__ = '2023-03-20'
__license__ = 'MIT'
__version__ = '1.0.1'

# Setup

In [2]:
# Import basic and data access libraries
import pandas as pd
from profiler import profile, profile_cat

# Import utility libraries
import requests
from bs4 import BeautifulSoup

# Data Load and Validation

## Fire Data - Filter for Select Characteristics across Western States

In [3]:
# Columns to keep from the wildland fire incident export; every other column
# in the CSV is skipped at parse time via `usecols`
cols = [
    'ContainmentDateTime', 'ControlDateTime',
    'DiscoveryAcres', 'EstimatedCostToDate', 'FinalAcres',
    'FireBehaviorGeneral', 'FireBehaviorGeneral1',
    'FireBehaviorGeneral2', 'FireBehaviorGeneral3',
    'FireCause', 'FireCauseGeneral', 'FireCauseSpecific',
    'FireDiscoveryDateTime', 'FireOutDateTime',
    'GACC', 'IncidentName', 'IncidentShortDescription',
    'InitialLatitude', 'InitialLongitude',
    'IsFireCauseInvestigated', 'IsTrespass',
    'POOCity', 'POOState',
    'PredominantFuelModel', 'PrimaryFuelModel',
]

# Load the selected columns into the fire-incident dataframe
wfil_df = pd.read_csv(
    '../data/Wildland_Fire_Incident_Locations.csv',
    usecols=cols,
    low_memory=False,
)

In [4]:
# Profile selected dataframe
# (`profile` is imported from the `profiler` module in Setup; its output table
# lists, per column, dtype, count, unique, na, na%, mean, std, min, max, etc.)
profile(wfil_df)

Unnamed: 0,Dtype,count,unique,na,na%,mean,std,min,max,skew(>=3),<v0.01,VIF(>=10),examples
ContainmentDateTime,object,152073,128890,100290.0,39.7,,,,,,,,nan__2018/07/11 15:3
ControlDateTime,object,137147,111764,115216.0,45.7,,,,,,,,2022/08/12 18:49:59+
DiscoveryAcres,float64,187721,1750,64642.0,25.6,20.5,838.3,,250000.0,195.1,,,0.1__0.5__0.3__0.1__
EstimatedCostToDate,float64,14805,3910,237558.0,94.1,1660555.3,14011922.0,,800000000.0,30.5,,,nan__nan__nan__nan__
FinalAcres,float64,18330,1683,234033.0,92.7,352.8,5405.9,,318156.0,36.9,,,nan__nan__nan__nan__
FireBehaviorGeneral,object,16111,4,236252.0,93.6,,,,,,,,nan__nan__nan__nan__
FireBehaviorGeneral1,object,7173,16,245190.0,97.2,,,,,,,,nan__nan__nan__nan__
FireBehaviorGeneral2,object,6028,16,246335.0,97.6,,,,,,,,nan__nan__nan__nan__
FireBehaviorGeneral3,object,3869,16,248494.0,98.5,,,,,,,,nan__nan__nan__nan__
FireCause,object,221578,4,30785.0,12.2,,,,,,,,Natural__Human__Unkn


In [5]:
# Profile states for further selection
# (`profile_cat` is imported from the `profiler` module in Setup; the output
# lists each POOState value with what appears to be its percentage share of
# rows - TODO confirm against profile_cat's implementation)
profile_cat(wfil_df, ['POOState'])


POOState - 
 US-CA  24.084751
 US-AZ    6.85362
 US-MT   6.773972
 US-OR   6.285787
 US-WA   5.012621
 US-UT    4.23081
 US-ID   4.227244
 US-CO   3.684375
 US-NM   3.519533
 US-TX   3.485455
 US-WY   3.471983
 US-ND   3.425621
 US-OK   3.151017
 US-MN   2.942587
 US-SD   2.501952
 US-NV   2.154833
 US-FL   2.107678
 US-AK   1.832678
 US-ME   1.476841
 US-MS    0.94903
 US-AR   0.735052
 US-MI   0.680369
 US-MA   0.587249
 US-MO   0.533359
 US-AL   0.507602
 US-TN   0.492941
 US-KY   0.471939
 US-SC   0.440635
 US-NC   0.374461
 US-LA   0.345534
 US-IL   0.301153
 US-NE   0.260736
 US-VA   0.247263
 US-KS     0.2433
 US-OH   0.237357
 US-GA   0.232998
 US-WI    0.20209
 US-IN   0.137104
 US-WV   0.127198
 US-PA   0.123235
 US-IA   0.108178
 US-PR   0.097479
 US-NY   0.080043
 US-NH   0.079647
 US-MD   0.043192
 US-VT   0.042399
 US-NJ   0.040814
 US-HI   0.019813
 US-DC   0.011888
 US-GU   0.005151
MX-BCN   0.003963
MX-SON   0.003566
 MX-BN   0.001981
 US-CT   0.001981
 CA-BC   0.0011

In [6]:
# Keep only rows whose POOState is one of the three selected states
in_select_states = wfil_df['POOState'].isin(['US-CA', 'US-OR', 'US-WA'])
wfil_s_df = wfil_df.loc[in_select_states]

# Re-profile the subset, then check the resulting state mix
profile(wfil_s_df)
profile_cat(wfil_s_df, ['POOState'])

Unnamed: 0,Dtype,count,unique,na,na%,mean,std,min,max,skew(>=3),<v0.01,VIF(>=10),examples
ContainmentDateTime,object,35352,33145,53942.0,60.4,,,,,,,,nan__nan__2021/06/26
ControlDateTime,object,32598,30429,56696.0,63.5,,,,,,,,nan__nan__2021/06/29
DiscoveryAcres,float64,72321,348,16973.0,19.0,5.8,499.5,,115997.0,191.9,,,nan__nan__1.0__0.1__
EstimatedCostToDate,float64,2128,1040,87166.0,97.6,8193477.0,34440012.7,,800000000.0,13.1,,,nan__nan__nan__nan__
FinalAcres,float64,2645,345,86649.0,97.0,14.8,317.3,,13440.0,34.3,,,nan__nan__nan__nan__
FireBehaviorGeneral,object,1498,4,87796.0,98.3,,,,,,,,nan__nan__nan__nan__
FireBehaviorGeneral1,object,1223,16,88071.0,98.6,,,,,,,,nan__nan__nan__nan__
FireBehaviorGeneral2,object,1164,16,88130.0,98.7,,,,,,,,nan__nan__nan__nan__
FireBehaviorGeneral3,object,910,16,88384.0,99.0,,,,,,,,nan__nan__nan__nan__
FireCause,object,78093,4,11201.0,12.5,,,,,,,,nan__nan__Natural__U



POOState - 
US-CA  68.068403
US-OR  17.764911
US-WA  14.166685


In [7]:
# Write filtered dataframe (the US-CA / US-OR / US-WA subset) to CSV;
# index=False omits the dataframe's row index from the file
wfil_s_df.to_csv('../data/fires.csv', index=False)

## Weather Data - Filter and Combine for Western States

In [8]:
# Not used - station retrieval
#colw = [11, 9, 10, 7, 3, 32, 3, 10]
#coln = ['ID', 'b', 'c', 'd', 'State', 'f', 'g', 'h']
#wstn_df = pd.read_fwf('../data/ghcnd-stations.txt', header=None, widths=colw, names=coln)
#profile_cat(wstn_df, ['State'])
#
#wstn_s_df = wstn_df[wstn_df['State'].isin(['CA', 'OR', 'WA'])]
#profile(wstn_s_df)
#profile_cat(wstn_s_df, ['State'])
#wstn_s_df

In [9]:
# Set base URL for NOAA weather files
url = 'https://www.ncei.noaa.gov/data/global-summary-of-the-month/access/'

# Fetch HTML content of the web page - all file names
# (an explicit timeout is passed because requests otherwise waits indefinitely
# on an unresponsive server, which would hang the notebook on Run All)
response = requests.get(url, timeout=60)

# Fail fast on any HTTP error status rather than parsing an error page
response.raise_for_status()

# Parse HTML content
soup = BeautifulSoup(response.text, 'html.parser')

In [10]:
# Extract file names from the anchor tags in the HTML FOR SELECT STATES
# - href=True restricts the search to anchors that actually carry an href,
#   so anchors without one no longer raise KeyError on fn['href']
# - selection is by station-ID prefix (str.startswith accepts a tuple of prefixes)
# NOTE(review): the prefix matches station-ID letters, not a state field; the
# weather profile output shows LONGITUDE up to -82.4, so stations outside the
# intended states appear to be included - confirm and filter downstream if so
file_names = [fn['href']
              for fn in soup.find_all('a', href=True)
              if fn['href'].startswith(('USR0000C', 'USR0000O', 'USR0000W'))]

In [11]:
# Iterate select files and create combined dataframe
tfn = len(file_names)
print(f'Retrieving {tfn:d} files: ', end='')

# Collect one dataframe per file and concatenate once at the end; calling
# pd.concat inside the loop re-copies all accumulated rows on every iteration
frames = []
for n, f in enumerate(file_names, start=1):
    # Progress indicator: one dot per file, running count every tenth file
    print('.', end='')
    if n % 10 == 0:
        print(n, end='')
    frames.append(pd.read_csv(url + f))

# pd.concat raises on an empty list, so fall back to an empty dataframe
# (the same result the incremental approach produced when no files matched)
wthr_df = pd.concat(frames) if frames else pd.DataFrame()

# Write combined dataframe
wthr_df.to_csv('../data/weather.csv', index=False)
profile(wthr_df)

Retrieving 610 files: ..........10..........20..........30..........40..........50..........60..........70..........80..........90..........100..........110..........120..........130..........140..........150..........160..........170..........180..........190..........200..........210..........220..........230..........240..........250..........260..........270..........280..........290..........300..........310..........320..........330..........340..........350..........360..........370..........380..........390..........400..........410..........420..........430..........440..........450..........460..........470..........480..........490..........500..........510..........520..........530..........540..........550..........560..........570..........580..........590..........600..........610

Unnamed: 0,Dtype,count,unique,na,na%,mean,std,min,max,skew(>=3),<v0.01,VIF(>=10),examples
STATION,object,175300,610,,,,,,,,,,USR0000CDNK__USR0000
DATE,object,175300,461,,,,,,,,,,2010-04__2019-01__20
LATITUDE,float64,175300,600,,,40.1,3.9,32.6,49.0,,,131.0,37.0664__33.9306__38
LONGITUDE,float64,175300,590,,,-118.0,6.3,-124.4,-82.4,,,266.0,-119.0394__-116.95__
ELEVATION,float64,175300,472,,,1214.0,734.3,,3694.2,,,,1727.6__793.7__1426.
NAME,object,175300,610,,,,,,,,,,"DINKEY CALIFORNIA, C"
CDSD,float64,148117,11831,27183.0,15.5,182.8,315.5,,3218.1,3.1,,,0.0__0.0__141.1__199
CDSD_ATTRIBUTES,object,143784,1,31516.0,18.0,,,,,,,,U__U__U__U__U
CLDD,float64,174374,4080,926.0,0.5,34.0,67.2,,601.0,,,23.0,0.0__0.0__69.2__115.
CLDD_ATTRIBUTES,object,174374,6,926.0,0.5,,,,,,,,",U__,U__,U__,U__,U"


## Forest Condition Data - Filter and Combine for Western States

In [12]:
# Read the per-state PLOT and COND extracts
ca_plot_df = pd.read_csv('../data/CA_PLOT.csv', low_memory=False)
or_plot_df = pd.read_csv('../data/OR_PLOT.csv', low_memory=False)
wa_plot_df = pd.read_csv('../data/WA_PLOT.csv', low_memory=False)
ca_cond_df = pd.read_csv('../data/CA_COND.csv', low_memory=False)
or_cond_df = pd.read_csv('../data/OR_COND.csv', low_memory=False)
wa_cond_df = pd.read_csv('../data/WA_COND.csv', low_memory=False)

# Stack the three states into a single frame per table
plot_df = pd.concat([ca_plot_df, or_plot_df, wa_plot_df])
cond_df = pd.concat([ca_cond_df, or_cond_df, wa_cond_df])

# Condition columns carried forward (STATECD is replaced by STATE below)
cols = [
    'PLT_CN', 'INVYR', 'STATECD', 'COND_STATUS_CD',
    'MAPDEN', 'STDAGE', 'STDSZCD', 'FLDSZCD',
    'SITECLCD', 'SICOND', 'STDORGCD', 'SLOPE',
    'PHYSCLCD', 'GSSTKCD', 'DSTRBCD1', 'TRTCD1',
    'PRESNFCD', 'FLDAGE',
    'CARBON_DOWN_DEAD', 'CARBON_LITTER', 'CARBON_SOIL_ORG',
    'CARBON_STANDING_DEAD', 'CARBON_UNDERSTORY_AG', 'CARBON_UNDERSTORY_BG',
]

# Numeric state code -> state abbreviation
smap = {6: 'CA', 41: 'OR', 53: 'WA'}

# Subset to the relevant columns, add STATE from the mapped code, and drop
# the numeric code once it has been translated
cond_df = (
    cond_df.loc[:, cols]
    .assign(STATE=lambda frame: frame['STATECD'].map(smap))
    .drop(columns=['STATECD'])
)

# Left-join plot attributes (water code, latitude, longitude) onto each
# condition row by matching cond PLT_CN to plot CN; the join key copy from
# the plot side is dropped afterwards
plot_df = plot_df.reset_index()
cond_df = (
    cond_df.merge(plot_df[['CN', 'WATERCD', 'LAT', 'LON']],
                  left_on='PLT_CN', right_on='CN', how='left')
    .drop(columns=['CN'])
)

# Write the combined conditions file, then profile it
cond_df.to_csv('../data/conditions.csv', index=False)
profile(cond_df)

FileNotFoundError: [Errno 2] No such file or directory: '../data/CA_PLOT.csv'