# Wildfire Risk - Data Pre-Prep - Join Challenge
__Team 3 - Dave Friesen, John Chen, and Kyle Dalope__<br>
__ADS-508-02-SP23__<br><br>
__GitHub link: https://github.com/davefriesen/wildfire-risk__

In [1]:
__authors__ = ['Dave Friesen', 'John Chen', 'Kyle Dalope']
__contact__ = ['dfriesen@sandiego.edu', 'johnchen@sandiego.edu', 'kdalope@sandiego.edu']
__date__ = '2023-03-20'
__license__ = 'MIT'
__version__ = '1.0.2'

# Setup

In [2]:
# Import basic and data access libraries
import pandas as pd
from profiler import profile, profile_cat

# Import utility libraries
import h3

# Data Load

In [3]:
wfil_df = pd.read_csv('../data/fires.csv', low_memory=False)
wthr_df = pd.read_csv('../data/weather.csv', low_memory=False)
cond_df = pd.read_csv('../data/conditions.csv', low_memory=False)

# Date Check and Conversion

In [4]:
# Show date range on fires table
wfil_df.loc[:, 'year'] = wfil_df['FireDiscoveryDateTime'].apply(lambda x: x[:4])
year_counts = wfil_df.groupby('year')['FireDiscoveryDateTime'].count().sort_values(ascending=False)
print(wfil_df['FireDiscoveryDateTime'].head(10))
print(year_counts)

0    2020/02/28 20:45:40+00
1    2019/07/01 19:54:00+00
2    2016/06/20 22:05:59+00
3    2021/11/25 15:17:32+00
4    2022/11/21 11:25:33+00
5    2017/07/07 20:10:00+00
6    2016/06/20 20:03:59+00
7    2017/08/27 14:33:32+00
8    2017/10/09 14:50:17+00
9    2019/11/18 17:36:59+00
Name: FireDiscoveryDateTime, dtype: object
year
2022    17544
2021    16335
2020    14335
2019    10072
2017     7780
2018     7212
2015     5811
2014     5097
2016     3809
2023     1296
2011        2
2004        1
Name: FireDiscoveryDateTime, dtype: int64


In [5]:
# Show date range on weather table
wthr_df['year'] = wthr_df['DATE'].apply(lambda x: x[:4])
year_counts = wthr_df.groupby(wthr_df['DATE'].str[:4]).size().sort_index(ascending=False)
print(wthr_df['DATE'].head(10))
print(year_counts)

0    1995-01
1    1995-02
2    1995-03
3    1995-04
4    1995-05
5    1995-06
6    1995-07
7    1995-08
8    1995-09
9    1995-10
Name: DATE, dtype: object
DATE
2023     959
2022    5792
2021    1463
2020    5877
2019    5838
2018    5891
2017    5846
2016    5830
2015    5908
2014    5937
2013    5775
2012    6035
2011    6077
2010    6171
2009    6178
2008    6046
2007    6227
2006    6189
2005    6255
2004    6273
2003    6237
2002    5728
2001    5420
2000    5131
1999    4808
1998    4295
1997    4180
1996    4021
1995    3807
1994    3564
1993    3122
1992    3292
1991    2833
1990    2295
1989    1715
1988    1396
1987    1303
1986     910
1985     643
1984      29
1983       4
dtype: int64


In [6]:
# Show date range on conditions table
cond_df[['STATE', 'INVYR']].drop_duplicates().\
    sort_values(by=['INVYR', 'STATE'],
                ascending=[False, True]).reset_index(drop=True)

Unnamed: 0,STATE,INVYR
0,CA,2019
1,OR,2019
2,WA,2019
3,CA,2018
4,OR,2018
5,WA,2018
6,CA,2017
7,OR,2017
8,WA,2017
9,CA,2016


In [7]:
# Convert?


# Geohash and Merge

In [8]:
# Define function to encode latitude and longitude into h3 hexagons
def encode_geohash(row, lat, lng):
    try:
        if pd.notnull(row[lat]) and pd.notnull(row[lng]):
            return h3.geo_to_h3(row[lat], row[lng], resolution=7)
    except ValueError as e:
        print(f"Error: {e}")
    return None

# Add h3 hexagon column to dataframe using apply method
wfil_df['geohash'] = wfil_df.apply(encode_geohash, axis=1, args=('InitialLatitude', 'InitialLongitude'))
geohash_counts = wfil_df.groupby('geohash').size().reset_index(name='Count')
print(geohash_counts.sort_values(by='Count', ascending=False).head(10))

wthr_df['geohash'] = wthr_df.apply(encode_geohash, axis=1, args=('LATITUDE', 'LONGITUDE'))
geohash_counts = wthr_df.groupby('geohash').size().reset_index(name='Count')
print(geohash_counts.sort_values(by='Count', ascending=False).head(10))

cond_df['geohash'] = cond_df.apply(encode_geohash, axis=1, args=('LAT', 'LON'))
geohash_counts = wfil_df.groupby('geohash').size().reset_index(name='Count')
print(geohash_counts.sort_values(by='Count', ascending=False).head(10))

# Save updated dataframe to CSV file
wfil_df.to_csv('../data/fires_geohash.csv', index=False)
wthr_df.to_csv('../data/weather_geohash.csv', index=False)
cond_df.to_csv('../data/conditions_geohash.csv', index=False)

               geohash  Count
26364  8729a56f2ffffff    384
26357  8729a56e9ffffff    325
26356  8729a56e8ffffff    268
26301  8729a5689ffffff    254
25093  8729a1441ffffff    234
25128  8729a146affffff    224
26342  8729a56d6ffffff    218
25237  8729a1559ffffff    215
26315  8729a569dffffff    194
25103  8729a144cffffff    183
             geohash  Count
554  8729ab19effffff    549
23   87268276affffff    449
26   8726835a8ffffff    449
297  872885accffffff    444
321  8728a312affffff    442
335  8728a9620ffffff    441
58   872698809ffffff    438
274  872833804ffffff    437
299  872885c6cffffff    436
292  8728818b6ffffff    436
               geohash  Count
26364  8729a56f2ffffff    384
26357  8729a56e9ffffff    325
26356  8729a56e8ffffff    268
26301  8729a5689ffffff    254
25093  8729a1441ffffff    234
25128  8729a146affffff    224
26342  8729a56d6ffffff    218
25237  8729a1559ffffff    215
26315  8729a569dffffff    194
25103  8729a144cffffff    183


In [9]:
# First, merge weather and conditions for 'superset'
merged_df = pd.merge(wthr_df, cond_df, on='geohash', how='inner')

# Then merge superset with fires for full set
merged_df = pd.merge(merged_df, wfil_df, on='geohash', how='left')

In [10]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 256316 entries, 0 to 256315
Data columns (total 89 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   STATION                   256316 non-null  object 
 1   DATE                      256316 non-null  object 
 2   LATITUDE                  256316 non-null  float64
 3   LONGITUDE                 256316 non-null  float64
 4   ELEVATION                 256316 non-null  float64
 5   NAME                      256316 non-null  object 
 6   CDSD                      219457 non-null  float64
 7   CDSD_ATTRIBUTES           212821 non-null  object 
 8   CLDD                      254954 non-null  float64
 9   CLDD_ATTRIBUTES           254954 non-null  object 
 10  DT00                      255794 non-null  float64
 11  DT00_ATTRIBUTES           255794 non-null  object 
 12  DT32                      255794 non-null  float64
 13  DT32_ATTRIBUTES           255794 non-null  o