In [1]:
import pandas as pd
import numpy as np
from scipy import spatial

In [2]:
# Input files
sample_weather_data_file = '/mnt/SSD/data/NOAA_weather_data/scratch/air.2m.1992.us_only.parquet'
fire_data_file = '/mnt/SSD/data/USDA_wildfire_data/us_fires.parquet'

# Output file
regridded_fire_data = '/mnt/SSD/data/USDA_wildfire_data/regridded_us_fires.parquet'

In [3]:
weather = pd.read_parquet(sample_weather_data_file)
bins = weather[['lat','lon']]
unique_bins = bins.drop_duplicates()

In [4]:
len(unique_bins)

7277

In [5]:
fires = pd.read_parquet(fire_data_file)
fires.head()

Unnamed: 0,lat,lon,date
0,40.036945,-121.005836,2005-02-02
1,38.933056,-120.404442,2004-05-12
2,38.984165,-120.735558,2004-05-31
3,38.559166,-119.91333,2004-06-28
4,38.559166,-119.933052,2004-06-28


In [6]:
len(fires)

1787253

In [7]:
bin_array = np.column_stack([unique_bins['lon'], unique_bins['lat']])
fire_array = np.column_stack([fires['lon'], fires['lat']])

bin_tree = spatial.cKDTree(bin_array)
dist, indexes = bin_tree.query(fire_array)
indexes = pd.Series(indexes)

In [8]:
# Note: in testing, loop is faster than apply with lambda function by 3 tenths of a second

fire_bins = []

def index_loop():
    for index in indexes:
        fire_bins.append([unique_bins.iloc[index, 0], unique_bins.iloc[index, 1]])
        
%time index_loop()

CPU times: user 47.7 s, sys: 282 ms, total: 48 s
Wall time: 48 s


In [9]:
fires[['lat', 'lon']] = fire_bins
# fires['lat'] = round(fires['lat'], 4)
# fires['lon'] = round(fires['lon'], 4)
fires.head()

Unnamed: 0,lat,lon,date
0,39.934269,-121.159798,2005-02-02
1,38.8577,-120.520798,2004-05-12
2,39.139919,-120.587402,2004-05-31
3,38.62627,-120.094002,2004-06-28
4,38.62627,-120.094002,2004-06-28


In [10]:
len(fires)

1787253

In [11]:
fires.drop_duplicates(keep=False,inplace=True)

In [12]:
len(fires)

1135498

In [13]:
fires.to_parquet(regridded_fire_data, index=False)

In [14]:
# def add_ignition():
#     for index, row in fires.iterrows():
#         date = row['date']
#         lat = row['lat']
#         lon = row['lon']
#         weather.loc[(weather['time'] == date) & (weather['lat'] == lat) & (weather['lon'] == lon), 'ignition'] = 1

# %time add_ignition()

In [15]:
# %time fires.apply(lambda row: weather.loc[(weather[row['date']] == date) & (weather[row['lat']] == lat) & (weather[row['lon']] == lon), 'ignition'] = 1)