# Wildfire Risk - Data Pre-Prep
__Team 3 - Dave Friesen, John Chen, and Kyle Dalope__<br>
__ADS-508-02-SP23__<br><br>
__GitHub link: https://github.com/davefriesen/wildfire-risk__

In [1]:
# Notebook metadata: authorship, contact addresses (same order as the authors),
# last-revision date, license, and version
__authors__ = ['Dave Friesen', 'John Chen', 'Kyle Dalope']
__contact__ = ['dfriesen@sandiego.edu', 'johnchen@sandiego.edu', 'kdalope@sandiego.edu']
__date__ = '2023-03-20'
__license__ = 'MIT'
__version__ = '1.0.1'

# Setup

In [2]:
# Import basic and data access libraries
import pandas as pd
from profiler import profile, profile_cat

# Import utility libraries
import requests
from bs4 import BeautifulSoup

# Data Load and Validation

## Fire Data - Filter for Select Characteristics across Western States

In [3]:
# Load only the incident attributes needed downstream. The column names are
# listed as whitespace-separated tokens and split into a list for `usecols`.
cols = """
    ContainmentDateTime ControlDateTime
    DiscoveryAcres EstimatedCostToDate FinalAcres
    FireBehaviorGeneral FireBehaviorGeneral1 FireBehaviorGeneral2 FireBehaviorGeneral3
    FireCause FireCauseGeneral FireCauseSpecific
    FireDiscoveryDateTime FireOutDateTime
    GACC
    IncidentName IncidentShortDescription
    InitialLatitude InitialLongitude
    IsFireCauseInvestigated IsTrespass
    POOCity POOState
    PredominantFuelModel PrimaryFuelModel
""".split()

# low_memory=False makes pandas read the file in one pass rather than in
# chunks, which avoids mixed-dtype inference across chunks.
wfil_df = pd.read_csv(
    '../data/Wildland_Fire_Incident_Locations.csv',
    usecols=cols,
    low_memory=False,
)

In [4]:
# Profile selected dataframe
# (`profile` comes from the project-local `profiler` module imported in Setup;
# the per-column summary table below this cell appears to be its output)
profile(wfil_df)

Unnamed: 0,Dtype,count,unique,na,na%,mean,std,min,max,skew(>=3),<v0.01,VIF(>=10),examples
ContainmentDateTime,object,151768,128615,99916.0,39.7,,,,,,,,2020/09/13 18:49:59+
ControlDateTime,object,136925,111573,114759.0,45.6,,,,,,,,2020/09/14 00:21:59+
DiscoveryAcres,float64,187111,1742,64573.0,25.7,20.6,839.6,,250000.0,194.8,,,15.0__0.1__0.01__0.1
EstimatedCostToDate,float64,14771,3907,236913.0,94.1,1664361.4,14027815.1,,800000000.0,30.4,,,nan__nan__nan__nan__
FinalAcres,float64,18206,1682,233478.0,92.8,355.0,5424.2,,318156.0,36.7,,,nan__nan__nan__nan__
FireBehaviorGeneral,object,16042,4,235642.0,93.6,,,,,,,,nan__nan__nan__nan__
FireBehaviorGeneral1,object,7173,16,244511.0,97.1,,,,,,,,nan__nan__nan__nan__
FireBehaviorGeneral2,object,6020,16,245664.0,97.6,,,,,,,,nan__nan__nan__nan__
FireBehaviorGeneral3,object,3865,16,247819.0,98.5,,,,,,,,nan__nan__nan__nan__
FireCause,object,220973,4,30711.0,12.2,,,,,,,,Natural__Natural__Un


In [5]:
# Profile states for further selection
# (`profile_cat` comes from the project-local `profiler` module; the listing
# below this cell appears to be each POOState value's share of rows, in percent)
profile_cat(wfil_df, ['POOState'])


POOState - 
 US-CA   24.1092
 US-AZ  6.866547
 US-MT  6.784698
 US-OR  6.301155
 US-WA  5.015813
 US-UT  4.241827
 US-ID  4.236265
 US-CO  3.692328
 US-NM  3.523863
 US-TX  3.480158
 US-WY  3.478568
 US-ND  3.434863
 US-OK  3.147995
 US-MN  2.950128
 US-SD  2.507509
 US-NV  2.158262
 US-FL  2.099458
 US-AK  1.838019
 US-ME  1.479633
 US-MS  0.946822
 US-AR  0.724718
 US-MI  0.679423
 US-MA  0.566186
 US-MO  0.529235
 US-AL  0.498244
 US-TN  0.481159
 US-KY  0.437453
 US-SC  0.427918
 US-NC  0.370306
 US-LA   0.34448
 US-IL  0.296801
 US-NE  0.261439
 US-VA  0.244751
 US-KS  0.242765
 US-OH  0.240381
 US-GA  0.229256
 US-WI  0.202238
 US-IN   0.13509
 US-WV  0.127938
 US-PA  0.121581
 US-IA  0.108469
 US-PR   0.09655
 US-NY  0.080259
 US-NH  0.079862
 US-MD  0.043308
 US-VT  0.042514
 US-NJ  0.040527
 US-HI  0.019866
 US-DC  0.011522
 US-GU  0.005165
MX-BCN  0.003973
MX-SON  0.003576
 MX-BN  0.001987
 US-CT  0.001987
 CA-BC  0.001192
MX-TAM  0.001192
 US-DE  0.001192
 CA-SK  0.000795
 

In [6]:
# Keep only the rows whose POOState is one of the three selected states,
# then re-run both profiles on the reduced frame
_in_selected_states = wfil_df['POOState'].isin(['US-CA', 'US-OR', 'US-WA'])
wfil_s_df = wfil_df.loc[_in_selected_states]

profile(wfil_s_df)
profile_cat(wfil_s_df, ['POOState'])

Unnamed: 0,Dtype,count,unique,na,na%,mean,std,min,max,skew(>=3),<v0.01,VIF(>=10),examples
ContainmentDateTime,object,35315,33108,53847.0,60.4,,,,,,,,2021/06/11 00:14:59+
ControlDateTime,object,32570,30401,56592.0,63.5,,,,,,,,2021/06/13 16:38:00+
DiscoveryAcres,float64,72195,348,16967.0,19.0,5.8,500.0,,115997.0,191.8,,,0.1__0.1__1.0__0.01_
EstimatedCostToDate,float64,2128,1040,87034.0,97.6,8193477.0,34440012.7,,800000000.0,13.1,,,nan__nan__nan__nan__
FinalAcres,float64,2631,345,86531.0,97.0,14.9,318.1,,13440.0,34.3,,,nan__nan__nan__nan__
FireBehaviorGeneral,object,1498,4,87664.0,98.3,,,,,,,,nan__nan__nan__nan__
FireBehaviorGeneral1,object,1223,16,87939.0,98.6,,,,,,,,nan__nan__nan__nan__
FireBehaviorGeneral2,object,1164,16,87998.0,98.7,,,,,,,,nan__nan__nan__nan__
FireBehaviorGeneral3,object,910,16,88252.0,99.0,,,,,,,,nan__nan__nan__nan__
FireCause,object,77967,4,11195.0,12.6,,,,,,,,Human__Unknown__Huma



POOState - 
US-CA  68.054777
US-OR  17.786725
US-WA  14.158498


In [7]:
# Persist the state-filtered fire records; index=False keeps the row index
# out of the file
wfil_s_df.to_csv(path_or_buf='../data/fires.csv', index=False)

## Weather Data - Filter For and Combine Western States

In [8]:
# Not used - station retrieval
# (every line below is commented out, so this cell performs no work when run;
# the disabled code reads the fixed-width file '../data/ghcnd-stations.txt'
# and subsets it on the 'State' column for CA, OR, and WA)
#colw = [11, 9, 10, 7, 3, 32, 3, 10]
#coln = ['ID', 'b', 'c', 'd', 'State', 'f', 'g', 'h']
#wstn_df = pd.read_fwf('../data/ghcnd-stations.txt', header=None, widths=colw, names=coln)
#profile_cat(wstn_df, ['State'])
#
#wstn_s_df = wstn_df[wstn_df['State'].isin(['CA', 'OR', 'WA'])]
#profile(wstn_s_df)
#profile_cat(wstn_s_df, ['State'])
#wstn_s_df

In [9]:
# Set base URL for NOAA weather files
url = 'https://www.ncei.noaa.gov/data/global-summary-of-the-month/access/'

# Fetch HTML content of the web page - all file names.
# requests applies no timeout by default, so an unresponsive server would
# hang the notebook indefinitely; bound both the connect and read phases.
response = requests.get(url, timeout=(10, 60))
# Fail fast on an HTTP error status rather than parsing an error page
response.raise_for_status()

# Parse HTML content
soup = BeautifulSoup(response.text, 'html.parser')

In [10]:
# Extract file names from the anchor tags in the HTML FOR SELECT STATES
# href=True restricts the search to anchors that actually carry an href
# attribute; indexing fn['href'] on an anchor without one raises KeyError.
# NOTE(review): the match is on the station-ID prefix only. The weather
# profile produced later in this notebook shows a Colorado station name and
# longitudes as far east as about -82, so these prefixes appear to admit
# stations outside the intended states - confirm and filter on station
# metadata if that is not wanted.
file_names = [fn['href']
              for fn in soup.find_all('a', href=True)
              if fn['href'].startswith(('USR0000C', 'USR0000O', 'USR0000W'))]

In [11]:
# Iterate select files and create combined dataframe
tfn = len(file_names)
print(f'Retrieving {tfn:d} files: ', end='')

# Collect each station file in a list and concatenate once at the end.
# Calling pd.concat inside the loop re-copies every row accumulated so far on
# each iteration, which is quadratic in the number of files.
frames = []
for n, f in enumerate(file_names, start=1):
    # Progress indicator: one dot per file, running count every tenth file
    print('.', end='')
    if n % 10 == 0:
        print(n, end='')
    frames.append(pd.read_csv(url + f))

# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame when no file names matched
df = pd.concat(frames) if frames else pd.DataFrame()

# Write combined dataframe
df.to_csv('../data/weather.csv', index=False)
profile(df)

Retrieving 610 files: ..........10..........20..........30..........40..........50..........60..........70..........80..........90..........100..........110..........120..........130..........140..........150..........160..........170..........180..........190..........200..........210..........220..........230..........240..........250..........260..........270..........280..........290..........300..........310..........320..........330..........340..........350..........360..........370..........380..........390..........400..........410..........420..........430..........440..........450..........460..........470..........480..........490..........500..........510..........520..........530..........540..........550..........560..........570..........580..........590..........600..........610

Unnamed: 0,Dtype,count,unique,na,na%,mean,std,min,max,skew(>=3),<v0.01,VIF(>=10),examples
STATION,object,175300,610,,,,,,,,,,USR0000CCRO__USR0000
DATE,object,175300,461,,,,,,,,,,2010-12__2007-10__20
LATITUDE,float64,175300,600,,,40.1,3.9,32.6,49.0,,,131.0,39.3528__39.3858__42
LONGITUDE,float64,175300,590,,,-118.0,6.3,-124.4,-82.4,,,266.0,-107.0931__-82.985__
ELEVATION,float64,175300,472,,,1214.0,734.3,,3694.2,,,,2530.8__192.0__1703.
NAME,object,175300,610,,,,,,,,,,"THE CROWN COLORADO,"
CDSD,float64,148117,11831,27183.0,15.5,182.8,315.5,,3218.1,3.1,,,nan__707.0__0.0__0.0
CDSD_ATTRIBUTES,object,143784,1,31516.0,18.0,,,,,,,,nan__U__U__U__nan
CLDD,float64,174374,4080,926.0,0.5,34.0,67.2,,601.0,,,23.0,0.0__43.3__0.0__0.0_
CLDD_ATTRIBUTES,object,174374,6,926.0,0.5,,,,,,,,",U__,U__,U__,U__,U"
