# Wildfire Risk - Data Prep B - Geohash and Merge
__Team 3 - Dave Friesen, John Chen, and Kyle Dalope__<br>
__ADS-508-02-SP23__<br><br>
__GitHub link: https://github.com/davefriesen/wildfire-risk__

In [2]:
__authors__ = ['Dave Friesen', 'John Chen', 'Kyle Dalope']
__contact__ = ['dfriesen@sandiego.edu', 'johnchen@sandiego.edu', 'kdalope@sandiego.edu']
__date__ = '2023-03-20'
__license__ = 'MIT'
__version__ = '1.0.2'

# Setup

In [3]:
!pip install h3

# Import basic and data access libraries
import pandas as pd
from profiler import profile, profile_cat

# Import utility libraries
import h3

[0m

# Data Load

In [4]:
wfil_df = pd.read_csv('../data/fires.csv', low_memory=False)
wthr_df = pd.read_csv('../data/weather.csv', low_memory=False)
cond_df = pd.read_csv('../data/conditions.csv', low_memory=False)

print(len(wfil_df))
print(len(wthr_df))
print(len(cond_df))

89294
175300
120208


# Geohash and Merge

In [5]:
# Define function to encode latitude and longitude into h3 hexagons
def encode_geohash(row, lat, lng):
    try:
        if pd.notnull(row[lat]) and pd.notnull(row[lng]):
            return h3.geo_to_h3(row[lat], row[lng], resolution=5)
    except ValueError as e:
        print(f"Error: {e}")
    return None

# Add h3 hexagon column to dataframe using apply method
wfil_df['geohash'] = wfil_df.apply(encode_geohash, axis=1, args=('InitialLatitude', 'InitialLongitude'))
geohash_counts = wfil_df.groupby('geohash').size().reset_index(name='Count')
print(geohash_counts.sort_values(by='Count', ascending=False).head(10))

wthr_df['geohash'] = wthr_df.apply(encode_geohash, axis=1, args=('LATITUDE', 'LONGITUDE'))
geohash_counts = wthr_df.groupby('geohash').size().reset_index(name='Count')
print(geohash_counts.sort_values(by='Count', ascending=False).head(10))

cond_df['geohash'] = cond_df.apply(encode_geohash, axis=1, args=('LAT', 'LON'))
geohash_counts = wfil_df.groupby('geohash').size().reset_index(name='Count')
print(geohash_counts.sort_values(by='Count', ascending=False).head(10))

# Save updated dataframe to CSV file
wfil_df.to_csv('../data/fires_geohash.csv', index=False)
wthr_df.to_csv('../data/weather_geohash.csv', index=False)
cond_df.to_csv('../data/conditions_geohash.csv', index=False)

              geohash  Count
2561  8529a56ffffffff   2593
2446  8529a147fffffff   2180
2475  8529a1d3fffffff   2095
2477  8529a1dbfffffff   1116
2450  8529a157fffffff   1016
2560  8529a56bfffffff    997
2423  8529a0affffffff    912
2389  8529a013fffffff    894
2385  8529a003fffffff    695
2575  8529a63bfffffff    653
             geohash  Count
477  8529a1cffffffff    986
443  8529a037fffffff    704
236  8528308bfffffff    657
463  8529a117fffffff    654
556  8529ae4ffffffff    646
172  85281597fffffff    634
152  8528140bfffffff    623
459  8529a0cbfffffff    616
490  8529a44bfffffff    591
420  85298907fffffff    589
              geohash  Count
2561  8529a56ffffffff   2593
2446  8529a147fffffff   2180
2475  8529a1d3fffffff   2095
2477  8529a1dbfffffff   1116
2450  8529a157fffffff   1016
2560  8529a56bfffffff    997
2423  8529a0affffffff    912
2389  8529a013fffffff    894
2385  8529a003fffffff    695
2575  8529a63bfffffff    653


# Date Check and Conversion

## Fires

In [6]:
# Show date range on fires table
wfil_df.loc[:, 'year'] = wfil_df['FireDiscoveryDateTime'].apply(lambda x: x[:4])
year_counts = wfil_df.groupby('year')['FireDiscoveryDateTime'].count().sort_values(ascending=False)
print(wfil_df['FireDiscoveryDateTime'].head(10))
print(year_counts)

0    2020/02/28 20:45:40+00
1    2019/07/01 19:54:00+00
2    2016/06/20 22:05:59+00
3    2021/11/25 15:17:32+00
4    2022/11/21 11:25:33+00
5    2017/07/07 20:10:00+00
6    2016/06/20 20:03:59+00
7    2017/08/27 14:33:32+00
8    2017/10/09 14:50:17+00
9    2019/11/18 17:36:59+00
Name: FireDiscoveryDateTime, dtype: object
year
2022    17544
2021    16335
2020    14335
2019    10072
2017     7780
2018     7212
2015     5811
2014     5097
2016     3809
2023     1296
2011        2
2004        1
Name: FireDiscoveryDateTime, dtype: int64


In [7]:
wfil_df = wfil_df.copy()

def convert_date(date_str):
    try:
        return pd.to_datetime(date_str, format='%Y/%m/%d %H:%M:%S+%f')
    except ValueError:
        return pd.NaT
wfil_df['ym_date'] = wfil_df['FireDiscoveryDateTime'].apply(convert_date)

wfil_df['year'] = wfil_df['ym_date'].dt.year
wfil_df['month'] = wfil_df['ym_date'].dt.month

wfil_df['fire'] = 1

In [8]:
print(len(wfil_df), end='')
wfil_df = wfil_df.loc[(wfil_df['year'] >= 2016) & (wfil_df['year'] <= 2019)]
wfil_df = wfil_df.dropna(subset=['FireCause'])
print('->', len(wfil_df))

89294-> 24067


In [9]:
summary = wfil_df.groupby(['geohash', 'year', 'month']).size().reset_index(name='count')
print(summary)

               geohash  year  month  count
0      85021217fffffff  2018      7      1
1      8512d23bfffffff  2016      7      1
2      8512d803fffffff  2017      7      2
3      8512d803fffffff  2018      5      1
4      8512d803fffffff  2019      6      1
...                ...   ...    ...    ...
12339  85485bc7fffffff  2016      7      1
12340  85754e67fffffff  2017      9      1
12341  85754e67fffffff  2018      8      1
12342  85754e67fffffff  2018      9      1
12343  85754e67fffffff  2019     11      1

[12344 rows x 4 columns]


## Weather

In [10]:
# Show date range on weather table
wthr_df['year'] = wthr_df['DATE'].apply(lambda x: x[:4])
year_counts = wthr_df.groupby(wthr_df['DATE'].str[:4]).size().sort_index(ascending=False)
print(wthr_df['DATE'].head(10))
print(year_counts)

0    1995-01
1    1995-02
2    1995-03
3    1995-04
4    1995-05
5    1995-06
6    1995-07
7    1995-08
8    1995-09
9    1995-10
Name: DATE, dtype: object
DATE
2023     959
2022    5792
2021    1463
2020    5877
2019    5838
2018    5891
2017    5846
2016    5830
2015    5908
2014    5937
2013    5775
2012    6035
2011    6077
2010    6171
2009    6178
2008    6046
2007    6227
2006    6189
2005    6255
2004    6273
2003    6237
2002    5728
2001    5420
2000    5131
1999    4808
1998    4295
1997    4180
1996    4021
1995    3807
1994    3564
1993    3122
1992    3292
1991    2833
1990    2295
1989    1715
1988    1396
1987    1303
1986     910
1985     643
1984      29
1983       4
dtype: int64


In [11]:
wthr_df = wthr_df.copy()

wthr_df[['year', 'month']] = wthr_df['DATE'].str.split('-', expand=True)
wthr_df[['year', 'month']] = wthr_df[['year', 'month']].apply(pd.to_numeric)
wthr_df = wthr_df.drop(columns=['DATE'])

In [12]:
wthr_df = wthr_df.loc[(wthr_df['year'] >= 2016) & (wthr_df['year'] <= 2019)]

In [13]:
summary = wthr_df.groupby(['geohash', 'year', 'month']).size().reset_index(name='count')
print(summary)

               geohash  year  month  count
0      8512d80bfffffff  2016      1      1
1      8512d80bfffffff  2016      4      1
2      8512d80bfffffff  2016      5      1
3      8512d80bfffffff  2016      6      1
4      8512d80bfffffff  2016      7      1
...                ...   ...    ...    ...
22674  8548db2bfffffff  2018     12      1
22675  8548db2bfffffff  2019      1      1
22676  8548db2bfffffff  2019      2      1
22677  8548db2bfffffff  2019      3      1
22678  8548db2bfffffff  2019      4      1

[22679 rows x 4 columns]


## Conditions

In [14]:
# Show date range on conditions table
cond_df[['STATE', 'INVYR']].drop_duplicates().\
    sort_values(by=['INVYR', 'STATE'],
                ascending=[False, True]).reset_index(drop=True)

Unnamed: 0,STATE,INVYR
0,CA,2019
1,OR,2019
2,WA,2019
3,CA,2018
4,OR,2018
5,WA,2018
6,CA,2017
7,OR,2017
8,WA,2017
9,CA,2016


In [15]:
cond_df = cond_df.rename(columns={'INVYR': 'year'})
cond_df = cond_df.loc[(cond_df['year'] >= 2016) & (cond_df['year'] <= 2019)]

In [16]:
summary = cond_df.groupby(['geohash', 'year']).size().reset_index(name='count')
print(summary)

               geohash  year  count
0      8512d803fffffff  2016      4
1      8512d803fffffff  2017      5
2      8512d803fffffff  2018      2
3      8512d803fffffff  2019      4
4      8512d807fffffff  2016      2
...                ...   ...    ...
12089  85485bb7fffffff  2018      1
12090  85485bb7fffffff  2019      1
12091  85485bbbfffffff  2016      1
12092  85485bbbfffffff  2018      1
12093  85485bbbfffffff  2019      1

[12094 rows x 3 columns]


# Merge Final Dataframe

In [17]:
print(len(wfil_df))
print(len(wthr_df))
print(len(cond_df))

24067
23405
22718


In [18]:
# First, merge weather and conditions for 'superset'
merged_df = pd.merge(wthr_df, cond_df, on=['year', 'geohash'], how='inner')
print(len(merged_df))

37540


In [19]:
# Then merge superset with fires for full set
merged_df = pd.merge(merged_df, wfil_df, on=['year', 'month', 'geohash'], how='left')
print(len(merged_df))

42289


In [20]:
merged_df['fire'] = merged_df['fire'].fillna(0)

In [21]:
print(merged_df.groupby(['fire']).size().reset_index(name='count'))

   fire  count
0   0.0  32559
1   1.0   9730


In [22]:
print(merged_df.info())
merged_df.to_csv('../data/merged.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42289 entries, 0 to 42288
Data columns (total 89 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   STATION                   42289 non-null  object        
 1   LATITUDE                  42289 non-null  float64       
 2   LONGITUDE                 42289 non-null  float64       
 3   ELEVATION                 42289 non-null  float64       
 4   NAME                      42289 non-null  object        
 5   CDSD                      39981 non-null  float64       
 6   CDSD_ATTRIBUTES           39577 non-null  object        
 7   CLDD                      42241 non-null  float64       
 8   CLDD_ATTRIBUTES           42241 non-null  object        
 9   DT00                      42279 non-null  float64       
 10  DT00_ATTRIBUTES           42279 non-null  object        
 11  DT32                      42279 non-null  float64       
 12  DT32_ATTRIBUTES   

In [23]:
%store

Stored variables and their in-db values:
autopilot_train_s3_uri                                -> 's3://sagemaker-us-east-1-857283526476/data/amazon
balanced_bias_data_jsonlines_s3_uri                   -> 's3://sagemaker-us-east-1-857283526476/bias-detect
balanced_bias_data_s3_uri                             -> 's3://sagemaker-us-east-1-857283526476/bias-detect
bias_data_s3_uri                                      -> 's3://sagemaker-us-east-1-857283526476/bias-detect
ingest_create_athena_db_passed                        -> True
ingest_create_athena_table_parquet_passed             -> True
ingest_create_athena_table_passed                     -> True
ingest_create_athena_table_tsv_passed                 -> True
s3_private_path                                       -> 's3://sagemaker-us-east-1-857283526476/widfire-ris
s3_private_path_tsv                                   -> 's3://sagemaker-us-east-1-857283526476/amazon-revi
s3_public_path                                        -> 's3://

In [24]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [25]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}

<IPython.core.display.Javascript object>