# Wildfire Risk - Data Pre-Prep - Join Challenge
__Team 3 - Dave Friesen, John Chen, and Kyle Dalope__<br>
__ADS-508-02-SP23__<br><br>
__GitHub link: https://github.com/davefriesen/wildfire-risk__

In [1]:
# Notebook-level metadata, written as module-style dunder attributes.
# __authors__ and __contact__ are parallel lists: entry i of one belongs to
# entry i of the other.
__authors__ = ['Dave Friesen', 'John Chen', 'Kyle Dalope']
__contact__ = ['dfriesen@sandiego.edu', 'johnchen@sandiego.edu', 'kdalope@sandiego.edu']
__date__ = '2023-03-20'  # ISO 8601 (YYYY-MM-DD)
__license__ = 'MIT'
__version__ = '1.0.1'

# Setup

In [2]:
# Import basic and data access libraries
import pandas as pd
from profiler import profile, profile_cat

# Data Load and Validation

In [3]:
# Load the three raw CSV tables (fires, weather, conditions) with one shared
# option set. low_memory=False has pandas parse each file in a single pass
# instead of in internal chunks, which avoids mixed-dtype column inference.
read_opts = dict(low_memory=False)
wfil_df = pd.read_csv('../data/fires.csv', **read_opts)
wthr_df = pd.read_csv('../data/weather.csv', **read_opts)
cond_df = pd.read_csv('../data/conditions.csv', **read_opts)

## Experiment with Logical Joins across Dataframes

In [4]:
# Show date range on fires table
# The discovery timestamp is held as text whose first four characters are the
# year. The vectorised .str accessor slices it without a Python-level lambda
# and leaves missing timestamps as NaN instead of raising TypeError.
wfil_df['year'] = wfil_df['FireDiscoveryDateTime'].str[:4]
# Rows per discovery year, most frequent year first (rows with no year are
# dropped by groupby).
year_counts = wfil_df.groupby('year')['FireDiscoveryDateTime'].count().sort_values(ascending=False)
print(year_counts)

year
2022    17544
2021    16335
2020    14335
2019    10072
2017     7780
2018     7212
2015     5811
2014     5097
2016     3809
2023     1260
2011        2
2004        1
Name: FireDiscoveryDateTime, dtype: int64


In [5]:
# Show date range on weather table
# DATE is held as text whose first four characters are the year; .str[:4] is
# vectorised and tolerates missing values (they stay NaN).
wthr_df['year'] = wthr_df['DATE'].str[:4]
# Group on the column just derived rather than re-slicing DATE a second time;
# newest year first so the covered range reads top-down.
year_counts = wthr_df.groupby('year').size().sort_index(ascending=False)
print(year_counts)

DATE
2023     959
2022    5792
2021    1463
2020    5877
2019    5838
2018    5891
2017    5846
2016    5830
2015    5908
2014    5937
2013    5775
2012    6035
2011    6077
2010    6171
2009    6178
2008    6046
2007    6227
2006    6189
2005    6255
2004    6273
2003    6237
2002    5728
2001    5420
2000    5131
1999    4808
1998    4295
1997    4180
1996    4021
1995    3807
1994    3564
1993    3122
1992    3292
1991    2833
1990    2295
1989    1715
1988    1396
1987    1303
1986     910
1985     643
1984      29
1983       4
dtype: int64


In [6]:
# Show date range on conditions table:
# one row per distinct (STATE, INVYR) pair, newest inventory year first and
# states in alphabetical order within each year.
(
    cond_df[['STATE', 'INVYR']]
    .drop_duplicates()
    .sort_values(by=['INVYR', 'STATE'], ascending=[False, True])
    .reset_index(drop=True)
)

Unnamed: 0,STATE,INVYR
0,CA,2019
1,OR,2019
2,WA,2019
3,CA,2018
4,OR,2018
5,WA,2018
6,CA,2017
7,OR,2017
8,WA,2017
9,CA,2016


In [7]:
import numpy as np
from math import radians, cos, sin, asin, sqrt
from scipy.spatial.distance import cdist
import time
from joblib import Parallel, delayed

def haversine(coords1, coords2, radius=3956.0):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).

    Parameters
    ----------
    coords1, coords2 : sequence of two numbers
        ``(longitude, latitude)`` pairs, in that order, in decimal degrees.
        The conversion to radians happens inside this function, so callers
        must NOT pre-convert.
    radius : float, optional
        Sphere radius; it fixes the unit of the result. Defaults to 3956
        (Earth radius in miles). Pass 6371 for kilometers.

    Returns
    -------
    float
        Great-circle distance, in the same unit as ``radius``.
    """
    # convert decimal degrees to radians
    lon1, lat1 = coords1
    lon2, lat2 = coords2
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    # Floating-point rounding can leave `a` a hair above 1 for (near-)antipodal
    # points, which would make asin() raise ValueError. A plain comparison is
    # used (not min/max) so that NaN inputs still propagate as NaN.
    if a > 1.0:
        a = 1.0
    c = 2 * asin(sqrt(a))
    return c * radius

# Example data frames
# Work on a small sample while experimenting; switch to wfil_df.copy() /
# wthr_df.copy() for the full run.
SAMPLE_ROWS = 3000
CHUNK_ROWS = 1000  # fire rows handed to each parallel cdist call
wfil_x_df = wfil_df[:SAMPLE_ROWS].copy()
wthr_x_df = wthr_df[:SAMPLE_ROWS].reset_index(drop=True).copy()
# NOTE(review): the weather table has a DATE column, so the same coordinates
# presumably repeat across many rows -- consider de-duplicating on
# LATITUDE/LONGITUDE before computing distances; confirm against the data.

# Calculate distance matrix using cdist in parallel
# haversine() unpacks each point as (longitude, latitude) in decimal degrees
# and converts to radians itself, so pass raw degrees in that column order.
wfil_x_coords = wfil_x_df[['InitialLongitude', 'InitialLatitude']].to_numpy()
wthr_x_coords = wthr_x_df[['LONGITUDE', 'LATITUDE']].to_numpy()
start_time = time.time()

num_cores = 4
with Parallel(n_jobs=num_cores) as parallel:
    dist_chunks = parallel(delayed(cdist)(wfil_x_coords[i:i+CHUNK_ROWS], wthr_x_coords, haversine)
                           for i in range(0, len(wfil_x_coords), CHUNK_ROWS))

# Each chunk is (rows_in_chunk, n_weather_rows). Stacking vertically gives one
# (n_fire_rows, n_weather_rows) matrix and also works when the last chunk is
# shorter than CHUNK_ROWS.
dist_matrix = np.vstack(dist_chunks)

end_time = time.time()
elapsed_time = end_time - start_time
print("Processing time: {} seconds".format(elapsed_time))

# Find the position of the nearest neighbor in wthr_x_df for each point in wfil_x_df
# NOTE(review): rows with missing coordinates yield NaN distances, which
# argmin/min do not skip -- confirm whether the inputs contain NaNs.
flat_dist_matrix = dist_matrix  # already 2-D; name kept for later cells
closest_pos = np.argmin(flat_dist_matrix, axis=1)
wfil_x_df['closest_index'] = closest_pos

# Find the closest pair of coordinates in wthr_x_df for each coordinate in wfil_x_df
# Select by position and assign raw arrays so nothing depends on the two
# frames happening to share index labels.
closest_coords = wthr_x_df.iloc[closest_pos][['LATITUDE', 'LONGITUDE']]
wfil_x_df['closest_lat'] = closest_coords['LATITUDE'].to_numpy()
wfil_x_df['closest_lon'] = closest_coords['LONGITUDE'].to_numpy()
wfil_x_df['closest_distance'] = np.min(flat_dist_matrix, axis=1)

# Add index of closest row in wthr_x_df to wfil_x_df
# closest_coords keeps wthr_x_df's own index labels, so this is the matched
# weather row's label rather than a fresh 0..N-1 counter.
wfil_x_df['closest_index_wthr'] = closest_coords.index.to_numpy()

# Show the result
print(wfil_x_df)

Processing time: 7.882153034210205 seconds
     ContainmentDateTime ControlDateTime  DiscoveryAcres  EstimatedCostToDate  \
0                    NaN             NaN            0.10                  NaN   
1                    NaN             NaN             NaN                  NaN   
2                    NaN             NaN             NaN                  NaN   
3                    NaN             NaN            0.10                  NaN   
4                    NaN             NaN            0.01                  NaN   
...                  ...             ...             ...                  ...   
2995                 NaN             NaN            0.01                  NaN   
2996                 NaN             NaN            0.10                  NaN   
2997                 NaN             NaN             NaN                  NaN   
2998                 NaN             NaN            0.10                  NaN   
2999                 NaN             NaN            0.10          