In [1]:
import numpy as np
import pandas as pd

# Packages used for filtering based on California boundary
import shapefile
from shapely.geometry import shape, Point
from shapely.ops import unary_union
from shapely.prepared import prep

In [2]:
# Read the three input tables in one pass: daily CIMIS climate observations,
# the SV-C2 fire archive, and the CIMIS station metadata (paths are relative
# to the notebook's working directory).
climate_data, fire_data, station_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        './datasets/CIMIS/processed/daily_CIMIS.csv',
        './datasets/DL_FIRE_SV-C2/fire_archive_SV-C2.csv',
        'cimis_station_info.csv',
    )
)

In [3]:
# Sanity check: (rows, columns) of the climate table as loaded from CSV.
climate_data.shape

(216645, 33)

In [4]:
# Sanity check: (rows, columns) of the fire archive as loaded from CSV.
fire_data.shape

(354319, 15)

In [5]:
# Preview the first rows of the climate table to inspect its columns.
climate_data.head()

Unnamed: 0,Stn Id,Stn Name,CIMIS Region,Date,Jul,ETo (mm),qc,Precip (mm),qc.1,Sol Rad (W/sq.m),...,Avg Rel Hum (%),qc.9,Dew Point (C),qc.10,Avg Wind Speed (m/s),qc.11,Wind Run (km),qc.12,Avg Soil Temp (C),qc.13
0,181,Westmorland North,Imperial/Coachella Valley,1/1/2021,1,1.96,,0.0,,159.0,...,51.0,,2.5,,1.2,,99.9,,11.0,
1,181,Westmorland North,Imperial/Coachella Valley,1/2/2021,2,1.63,,0.0,,152.0,...,65.0,,2.9,,0.5,,40.6,,11.0,
2,181,Westmorland North,Imperial/Coachella Valley,1/3/2021,3,1.57,R,0.0,,149.0,...,67.0,,3.9,,0.5,,40.8,,10.9,
3,181,Westmorland North,Imperial/Coachella Valley,1/4/2021,4,1.69,R,0.0,,154.0,...,68.0,,4.6,,0.5,,39.0,,10.8,
4,181,Westmorland North,Imperial/Coachella Valley,1/5/2021,5,1.74,,0.0,,162.0,...,73.0,,5.5,,0.6,,48.5,,10.9,


In [6]:
# Preview the first rows of the fire archive to inspect its columns.
fire_data.head()

Unnamed: 0,latitude,longitude,brightness,scan,track,acq_date,acq_time,satellite,instrument,confidence,version,bright_t31,frp,daynight,type
0,42.14804,-121.21143,301.73,0.43,0.38,2021-01-01,948,N,VIIRS,n,2,266.88,0.8,N,0
1,42.14388,-121.20705,296.58,0.43,0.38,2021-01-01,948,N,VIIRS,n,2,266.42,1.23,N,0
2,42.14107,-121.2131,311.14,0.43,0.38,2021-01-01,948,N,VIIRS,n,2,266.68,1.23,N,0
3,42.13758,-121.21394,332.16,0.43,0.38,2021-01-01,948,N,VIIRS,n,2,268.83,2.64,N,0
4,42.13826,-121.21915,306.48,0.43,0.38,2021-01-01,948,N,VIIRS,n,2,266.1,1.41,N,0


In [7]:
# Preview the station metadata; note the combined "DMS / decimal" coordinate strings.
station_data.head()

Unnamed: 0,StationNbr,Name,City,RegionalOffice,County,ConnectDate,DisconnectDate,IsActive,IsEtoStation,Elevation,GroundCover,HmsLatitude,HmsLongitude,ZipCodes,SitingDesc
0,1,Fresno/F.S.U. USDA,Fresno,South Central Region Office,Fresno,6/7/1982,9/25/1988,False,True,340,Grass,36º48'52N / 36.814444,-119º43'54W / -119.731670,"['93766', '93762', '93761', '93760', '93759', ...",
1,2,FivePoints,Five Points,South Central Region Office,Fresno,6/7/1982,12/31/2050,True,True,285,Grass,36º20'10N / 36.336222,-120º6'46W / -120.112910,['93624'],
2,3,Beach /Santa Cruz CO,Watsonville,South Central Region Office,Santa Cruz,5/30/1982,8/25/1986,False,True,10,Grass,36º52'50N / 36.880556,-121º47'36W / -121.793330,"['95077', '95076', '95075', '95019', '95018']",
3,4,Webb /Santa Cruz CO,Watsonville,South Central Region Office,Santa Cruz,5/30/1982,4/29/1988,False,True,230,Grass,36º58'21N / 36.9725,-121º43'34W / -121.726110,"['95077', '95076', '95075', '95019', '95018']",
4,5,Shafter,Shafter,South Central Region Office,Kern,6/1/1982,12/31/2050,True,False,360,Grass,35º31'57N / 35.532556,-119º16'54W / -119.281790,"['93263', '93280', '93388']",


In [8]:
# Update the station_data and change the latitude and longitude format to float
def extract_decimal(coord):
    """Return the decimal-degree part of a combined coordinate string.

    Station coordinates are stored as ``"<deg/min/sec> / <decimal>"``, e.g.
    ``"36º48'52N / 36.814444"``; the text after the first ``/`` is parsed.

    Parameters
    ----------
    coord : str
        Combined coordinate string.

    Returns
    -------
    float or None
        The decimal value, or ``None`` when ``coord`` is not a string
        (e.g. NaN/None), has no ``/`` separator, or the text after the
        separator is not a number.
    """
    try:
        return float(coord.split('/')[1].strip())
    # Catch only parsing failures; a bare ``except`` would also swallow
    # KeyboardInterrupt/SystemExit and hide unrelated bugs.
    except (AttributeError, TypeError, IndexError, ValueError):
        return None
    
# Derive float Latitude/Longitude columns from the combined coordinate strings,
# then discard the original string columns.
station_data = (
    station_data
    .assign(
        Latitude=station_data['HmsLatitude'].apply(extract_decimal),
        Longitude=station_data['HmsLongitude'].apply(extract_decimal),
    )
    .drop(columns=['HmsLatitude', 'HmsLongitude'])
)

In [9]:
# Confirm the string coordinate columns were replaced by numeric Latitude/Longitude.
station_data.head()

Unnamed: 0,StationNbr,Name,City,RegionalOffice,County,ConnectDate,DisconnectDate,IsActive,IsEtoStation,Elevation,GroundCover,ZipCodes,SitingDesc,Latitude,Longitude
0,1,Fresno/F.S.U. USDA,Fresno,South Central Region Office,Fresno,6/7/1982,9/25/1988,False,True,340,Grass,"['93766', '93762', '93761', '93760', '93759', ...",,36.814444,-119.73167
1,2,FivePoints,Five Points,South Central Region Office,Fresno,6/7/1982,12/31/2050,True,True,285,Grass,['93624'],,36.336222,-120.11291
2,3,Beach /Santa Cruz CO,Watsonville,South Central Region Office,Santa Cruz,5/30/1982,8/25/1986,False,True,10,Grass,"['95077', '95076', '95075', '95019', '95018']",,36.880556,-121.79333
3,4,Webb /Santa Cruz CO,Watsonville,South Central Region Office,Santa Cruz,5/30/1982,4/29/1988,False,True,230,Grass,"['95077', '95076', '95075', '95019', '95018']",,36.9725,-121.72611
4,5,Shafter,Shafter,South Central Region Office,Kern,6/1/1982,12/31/2050,True,False,360,Grass,"['93263', '93280', '93388']",,35.532556,-119.28179


In [10]:
# Attach each station's latitude/longitude to its climate rows with a left
# join on the station number, so every climate row is kept.
station_coords = station_data[['StationNbr', 'Latitude', 'Longitude']]
climate_data = pd.merge(
    climate_data,
    station_coords,
    how="left",
    left_on="Stn Id",
    right_on="StationNbr",
)
# StationNbr only repeats "Stn Id" after the join, so it is dropped.
climate_data = climate_data.drop(columns=['StationNbr'])

climate_data.head()

Unnamed: 0,Stn Id,Stn Name,CIMIS Region,Date,Jul,ETo (mm),qc,Precip (mm),qc.1,Sol Rad (W/sq.m),...,Dew Point (C),qc.10,Avg Wind Speed (m/s),qc.11,Wind Run (km),qc.12,Avg Soil Temp (C),qc.13,Latitude,Longitude
0,181,Westmorland North,Imperial/Coachella Valley,1/1/2021,1,1.96,,0.0,,159.0,...,2.5,,1.2,,99.9,,11.0,,33.078611,-115.66056
1,181,Westmorland North,Imperial/Coachella Valley,1/2/2021,2,1.63,,0.0,,152.0,...,2.9,,0.5,,40.6,,11.0,,33.078611,-115.66056
2,181,Westmorland North,Imperial/Coachella Valley,1/3/2021,3,1.57,R,0.0,,149.0,...,3.9,,0.5,,40.8,,10.9,,33.078611,-115.66056
3,181,Westmorland North,Imperial/Coachella Valley,1/4/2021,4,1.69,R,0.0,,154.0,...,4.6,,0.5,,39.0,,10.8,,33.078611,-115.66056
4,181,Westmorland North,Imperial/Coachella Valley,1/5/2021,5,1.74,,0.0,,162.0,...,5.5,,0.6,,48.5,,10.9,,33.078611,-115.66056


In [11]:
# Preview the fire archive again before filtering it to California.
fire_data.head()

Unnamed: 0,latitude,longitude,brightness,scan,track,acq_date,acq_time,satellite,instrument,confidence,version,bright_t31,frp,daynight,type
0,42.14804,-121.21143,301.73,0.43,0.38,2021-01-01,948,N,VIIRS,n,2,266.88,0.8,N,0
1,42.14388,-121.20705,296.58,0.43,0.38,2021-01-01,948,N,VIIRS,n,2,266.42,1.23,N,0
2,42.14107,-121.2131,311.14,0.43,0.38,2021-01-01,948,N,VIIRS,n,2,266.68,1.23,N,0
3,42.13758,-121.21394,332.16,0.43,0.38,2021-01-01,948,N,VIIRS,n,2,268.83,2.64,N,0
4,42.13826,-121.21915,306.48,0.43,0.38,2021-01-01,948,N,VIIRS,n,2,266.1,1.41,N,0


In [12]:


print('Shape of unfiltered fire data:', fire_data.shape)

# Load CA shapefile (e.g. county subdivisions) and convert every record to a
# shapely geometry.
sf = shapefile.Reader("./ca/tl_2019_06_cousub.shp")
shapes = [shape(record.shape.__geo_interface__) for record in sf.shapeRecords()]

# Combine all shapes into one (multi)polygon
california_shape = unary_union(shapes)

# Prepare the geometry once: the same polygon is tested against every fire
# point, and a prepared geometry answers repeated contains() calls much faster
# while returning the same results.
california_prepared = prep(california_shape)

# Build the inside/outside mask without adding a temporary column to
# fire_data, so the raw frame is left untouched and re-running this cell
# reports the same shapes. Points are (x=longitude, y=latitude).
inside_california = pd.Series(
    [
        california_prepared.contains(Point(lon, lat))
        for lon, lat in zip(fire_data['longitude'], fire_data['latitude'])
    ],
    index=fire_data.index,
    dtype=bool,
)

# Filter: keep only fire points within the California shape. copy() makes the
# result independent of fire_data so later column assignments are safe.
fire_data_ca = fire_data[inside_california].copy()

print("Shape of filtered fire in CA:", fire_data_ca.shape)


Shape of unfiltered fire data: (354319, 15)
Shape of filtered fire in CA: (301159, 15)


In [13]:
# Count the California detections whose confidence flag is 'h'.
high_confidence_count = fire_data_ca['confidence'].eq('h').sum()
print("Number of fires with high confidence level:", high_confidence_count)

Number of fires with high confidence level: 21209


In [19]:
# Keep only detections flagged high confidence ('h') whose type code is 0.
# NOTE(review): type 0 is presumably the "vegetation fire" class of the source
# dataset -- confirm against the dataset documentation.
is_high_confidence = fire_data_ca['confidence'] == 'h'
is_type_zero = fire_data_ca['type'] == 0

# copy() detaches the result from the frame it was sliced from, so assigning
# columns to it later does not raise SettingWithCopyWarning.
fire_data_ca = fire_data_ca[is_high_confidence & is_type_zero].copy()

fire_data_ca.shape

(21164, 15)

In [41]:
from datetime import datetime
from shapely.geometry import Point
from geopy.distance import geodesic

# Convert dates to datetime
climate_data['Date'] = pd.to_datetime(climate_data['Date'])
# assign() rebinds fire_data_ca to a new frame instead of writing into a
# filtered slice, which avoids SettingWithCopyWarning.
fire_data_ca = fire_data_ca.assign(acq_date=pd.to_datetime(fire_data_ca['acq_date']))

# Set threshold distance in km
threshold_km = 80

# Mean Earth radius (km) used by the spherical pre-filter below.
EARTH_RADIUS_KM = 6371.0088
# Spherical (haversine) distances differ from ellipsoidal geodesic distances
# by roughly half a percent at most; a 2% band around the threshold is
# therefore re-checked with the exact geodesic distance.
PREFILTER_MARGIN = 0.02

# Create list of fires grouped by date
fires_by_date = fire_data_ca.groupby('acq_date')[['latitude', 'longitude']].apply(lambda x: x.values.tolist()).to_dict()
# Same grouping as (n_fires, 2) float arrays of [latitude, longitude] for
# vectorised distance computation.
fire_coords_by_date = {
    day: np.asarray(points, dtype=float) for day, points in fires_by_date.items()
}


def _haversine_km(lat, lon, fire_coords):
    """Great-circle distance in km from one point to every row of fire_coords.

    ``lat``/``lon`` are in degrees; ``fire_coords`` is an (n, 2) array of
    [latitude, longitude] in degrees. Returns an array of n distances.
    """
    lat1, lon1 = np.radians(lat), np.radians(lon)
    lat2 = np.radians(fire_coords[:, 0])
    lon2 = np.radians(fire_coords[:, 1])
    a = (
        np.sin((lat2 - lat1) / 2) ** 2
        + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2
    )
    # clip guards against rounding pushing `a` marginally outside [0, 1]
    return 2 * EARTH_RADIUS_KM * np.arcsin(np.sqrt(np.clip(a, 0.0, 1.0)))


def _fire_within_threshold(day, lat, lon):
    """Return 1 if any fire detected on ``day`` lies within threshold_km of (lat, lon), else 0.

    Days without fires and stations with missing coordinates yield 0.
    """
    fire_coords = fire_coords_by_date.get(day)
    if fire_coords is None or pd.isna(lat) or pd.isna(lon):
        return 0

    approx_km = _haversine_km(lat, lon, fire_coords)

    # Clearly inside the threshold: no exact computation needed.
    if (approx_km <= threshold_km * (1 - PREFILTER_MARGIN)).any():
        return 1

    # Only fires within the uncertainty band need the (slow) exact geodesic
    # distance; everything farther away is clearly outside the threshold.
    borderline = fire_coords[approx_km <= threshold_km * (1 + PREFILTER_MARGIN)]
    station_point = (lat, lon)
    for fire_lat, fire_lon in borderline:
        if geodesic(station_point, (fire_lat, fire_lon)).km <= threshold_km:
            return 1
    return 0


def match_fire(row):
    """Label one climate row: 1 if a fire occurred within threshold_km on the row's date, else 0.

    ``row`` must provide 'Date', 'Latitude' and 'Longitude'.
    """
    return _fire_within_threshold(row['Date'], row['Latitude'], row['Longitude'])


# Add label column. Iterating over the three needed columns avoids the cost
# of DataFrame.apply(axis=1), which builds a full Series for each row.
climate_data['fire_occurred'] = [
    _fire_within_threshold(day, lat, lon)
    for day, lat, lon in zip(
        climate_data['Date'], climate_data['Latitude'], climate_data['Longitude']
    )
]

KeyboardInterrupt: 

In [39]:
# Number of distinct stations present in the climate table.
station_count = climate_data['Stn Id'].nunique()
print(station_count)

142


In [33]:
# Show the station-days that were labelled as having a nearby fire.
climate_data.loc[climate_data['fire_occurred'] == 1]

Unnamed: 0,Stn Id,Stn Name,CIMIS Region,Date,Jul,ETo (mm),qc,Precip (mm),qc.1,Sol Rad (W/sq.m),...,qc.10,Avg Wind Speed (m/s),qc.11,Wind Run (km),qc.12,Avg Soil Temp (C),qc.13,Latitude,Longitude,fire_occurred
1587,182,Delano,San Joaquin Valley,2021-05-07,127,6.18,,0.0,,340.0,...,,1.7,,144.9,,21.7,,35.833000,-119.25596,1
1736,182,Delano,San Joaquin Valley,2021-10-03,276,3.88,,0.0,,208.0,...,,1.1,,95.8,,17.6,,35.833000,-119.25596,1
6087,187,Black Point,North Coast Valleys,2021-09-01,244,4.38,,0.1,,262.0,...,,1.8,,157.5,,20.8,,38.090933,-122.52670,1
11576,192,Lake Arrowhead,San Bernardino,2024-09-11,255,5.06,,0.8,,241.0,...,,2.5,,215.7,,16.5,,34.255942,-117.21814,1
13189,113,King City-Oasis Rd.,Monterey Bay,2021-02-10,41,2.20,,0.0,,150.0,...,,2.2,,186.0,,10.6,,36.121083,-121.08457,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207717,114,Arroyo Seco,Monterey Bay,2024-07-23,205,6.80,,0.0,,317.0,...,,2.7,,232.2,,21.3,,36.347306,-121.29135,1
207858,114,Arroyo Seco,Monterey Bay,2024-12-11,346,1.88,,0.0,,122.0,...,,1.5,,127.4,,10.1,,36.347306,-121.29135,1
209898,117,Victorville,San Bernardino,2022-07-13,194,8.91,,0.0,,357.0,...,,2.8,,241.6,,19.9,,34.475914,-117.26351,1
213162,125,Arvin-Edison,San Joaquin Valley,2023-06-20,171,7.16,,0.0,,361.0,...,,2.4,,205.1,,27.3,,35.205583,-118.77841,1
