# Collection and Cleaning of California Wildfire Data

---

What's the correlation between these wildfires and meteorological data (average temperature, average rainfall, CO2 emissions)? Plot average rainfall and average temperature linearly (x = time), using clusters of dots to show when fires occurred (with dot size representing acres burned).  

Training K-means clustering on latitude and longitude to locate hotspots, and potentially predicting future wildfires based on the preceding meteorological data.
Training K-nearest neighbors on weather conditions to predict conditions that lead to fires.

In [1]:
# County name -> county identifier substituted into the NOAA time-series URL.
# Entries that are not California counties map to None.
county_ids = {
    'Tuolumne': 'CA-109',
    'Los Angeles': 'CA-037',
    'Riverside': 'CA-065',
    'Placer': 'CA-061',
    'Ventura': 'CA-111',
    'Fresno': 'CA-019',
    'Siskiyou': 'CA-093',
    'Humboldt': 'CA-023',
    'Tehama': 'CA-103',
    'Shasta': 'CA-089',
    'San Diego': 'CA-073',
    'Kern': 'CA-029',
    'Sonoma': 'CA-097',
    'Contra Costa': 'CA-013',  # added to SCU
    'Butte': 'CA-007',
    'Tulare': 'CA-107',
    'Santa Barbara': 'CA-083',
    'Mariposa': 'CA-043',
    'Monterey': 'CA-053',
    'El Dorado': 'CA-017',
    'San Bernardino': 'CA-071',
    'Plumas': 'CA-063',  # PNF (Plumas National Forest)
    'Modoc': 'CA-049',
    'San Luis Obispo': 'CA-079',
    'Madera': 'CA-039',
    'Inyo': 'CA-027',  # Inyo National Forest and Death Valley National Park (negligible)
    'Napa': 'CA-055',
    'San Benito': 'CA-069',
    'San Joaquin': 'CA-077',  # added to TCU
    'Lake': 'CA-033',
    'Alameda': 'CA-001',  # added to SCU
    'Glenn': 'CA-021',
    'Yolo': 'CA-113',  # added to LNU
    'Sacramento': 'CA-067',  # added to AEU
    'Stanislaus': 'CA-099',  # STF (Stanislaus National Forest)
    'Solano': 'CA-095',  # added to LNU
    'Merced': 'CA-047',
    'Mendocino': 'CA-045',
    'Lassen': 'CA-035',
    'Amador': 'CA-005',
    'Yuba': 'CA-115',
    'Nevada': 'CA-057',
    'Santa Clara': 'CA-085',
    'Calaveras': 'CA-009',
    'San Mateo': 'CA-081',
    'Orange': 'CA-059',
    'Colusa': 'CA-011',  # added to LNU
    'Trinity': 'CA-105',
    'Del Norte': 'CA-015',
    'Mono': 'CA-051',  # nothing exists here
    'Alpine': 'CA-003',  # added to AEU
    'Sutter': 'CA-101',  # added to NEU
    'Kings': 'CA-031',
    'Sierra': 'CA-091',  # added to NEU
    'Santa Cruz': 'CA-087',
    'Marin': 'CA-041',
    'Mexico': None,
    'State of Oregon': None,
    'State of Nevada': None,
}

In [2]:
# Fire-record UNIT_ID -> list of county names grouped under that unit.
unit_id_to_counties = {
    'AEU': ['Amador', 'El Dorado', 'Sacramento', 'Alpine'],
    'BDU': ['San Bernardino'],
    'BEU': ['San Benito', 'Monterey'],
    'BTU': ['Butte'],
    'CZU': ['San Mateo', 'Santa Cruz'],  # south bay
    'FKU': ['Fresno', 'Kings'],
    'HUU': ['Humboldt', 'Del Norte'],
    'LMU': ['Lassen', 'Modoc'],
    'LNU': ['Sonoma', 'Lake', 'Napa', 'Yolo', 'Solano', 'Colusa'],  # close fires
    'MEU': ['Mendocino'],
    'MMU': ['Madera', 'Mariposa', 'Merced'],
    'MVU': ['San Diego'],
    'NEU': ['Nevada', 'Yuba', 'Placer', 'Sutter', 'Sierra'],
    'RRU': ['Riverside'],
    'SCU': ['Santa Clara', 'Contra Costa', 'Alameda'],  # east bay
    'SHU': ['Shasta', 'Trinity'],
    'SKU': ['Siskiyou'],
    'SLU': ['San Luis Obispo'],
    'TCU': ['Tuolumne', 'Calaveras', 'San Joaquin'],
    'TGU': ['Tehama', 'Glenn'],
    'TUU': ['Tulare'],
    'KRN': ['Kern'],
    'LAC': ['Los Angeles'],
    'MRN': ['Marin'],
    'ORC': ['Orange'],
    'SBC': ['Santa Barbara'],
    'VNC': ['Ventura'],
    'PNF': ['Plumas'],
    'STF': ['Stanislaus'],
}

In [3]:
import pandas as pd
from altair import *
import requests
import time
import numpy as np
import copy

df_fire = pd.read_csv("ca_fires.csv")

# Row filters: California fires only, and only years after 1900 (the same
# cutoff the old `<= 1900` drop applied). A missing YEAR_ compares False, so
# rows without a year are removed here as well.
is_ca = df_fire["STATE"] == "CA"
after_1900 = df_fire["YEAR_"] > 1900
# .copy() so the column assignments below write to an independent frame
# instead of a slice of the raw data (avoids SettingWithCopyWarning).
df_fire = df_fire[is_ca & after_1900].copy()

# The previous `df_fire["YEAR_"] = df_fire["YEAR_"].dropna().astype(int)` did
# not work because assignment re-aligns on the index: the rows removed by
# dropna() came back as NaN and forced the column to stay float. With the
# missing years already filtered out, the cast now sticks.
df_fire["YEAR_"] = df_fire["YEAR_"].astype(int)

# Keep the first 10 characters of the alarm date.
# NOTE: missing values become the string "nan" here.
df_fire["ALARM_DATE"] = df_fire["ALARM_DATE"].map(lambda x: str(x)[:10])
# Create the county column; UNIT_IDs absent from the mapping become NaN.
df_fire["counties"] = df_fire["UNIT_ID"].map(unit_id_to_counties)

# TODO: parse ALARM_DATE as datetimes once the "nan" placeholders are handled:
#df_fire["ALARM_DATE"] = pd.to_datetime(df_fire["ALARM_DATE"])

In [4]:
# NOAA county time-series URL; placeholders are: county id, element code
# ("pcp" or "tavg"), first year, last year.
_NOAA_COUNTY_URL = (
    'https://www.ncdc.noaa.gov/cag/county/time-series/'
    '%s-%s-12-12-%d-%d.json'
    '?base_prd=true&begbaseyear=1901&endbaseyear=2000'
)


def _fetch_county_series(county_id, element, prefix, start_year, end_year):
    """Download one NOAA series; return it with `<prefix>_value`/`<prefix>_anomaly` columns."""
    url = _NOAA_COUNTY_URL % (county_id, element, start_year, end_year)
    # Fail fast instead of hanging forever or parsing an HTTP error page.
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    df_series = pd.DataFrame(resp.json()["data"]).T
    return df_series.rename(columns={
        "value": prefix + "_value",
        "anomaly": prefix + "_anomaly",
    })


def climate_data(county, start_year=1900, end_year=2022):
    """Fetch precipitation and average-temperature series for one county.

    Args:
        county: County name; must be a key of ``county_ids``.
        start_year: First year requested from NOAA and the label given to the
            first returned row.
        end_year: Last year requested from NOAA.

    Returns:
        DataFrame with float columns ``temp_value``, ``temp_anomaly``,
        ``precip_value`` and ``precip_anomaly``, indexed by year.

    Raises:
        KeyError: If ``county`` is not in ``county_ids``.
        ValueError: If ``county`` has no NOAA id (its entry is ``None``).
        requests.HTTPError: If NOAA answers with an error status.
    """
    county_id = county_ids[county]
    if county_id is None:
        raise ValueError("no NOAA county id for %r" % (county,))

    # get precip
    df_county_precip = _fetch_county_series(
        county_id, "pcp", "precip", start_year, end_year)
    time.sleep(0.5)  # pause between the two requests
    # get temp
    df_county_temp = _fetch_county_series(
        county_id, "tavg", "temp", start_year, end_year)

    df_county = pd.concat([df_county_temp, df_county_precip], axis=1)
    # cast values to numeric
    value_cols = ["temp_value", "temp_anomaly", "precip_value", "precip_anomaly"]
    df_county = df_county.astype({col: float for col in value_cols})

    # Relabel rows by year. Years are assigned by row position, so this
    # assumes the response holds one row per year, in order, starting at
    # start_year -- TODO confirm against the NOAA response keys.
    df_county = df_county.reset_index(drop=True)
    df_county.index = df_county.index + start_year
    return df_county

In [5]:
def combine_counties(unit_id):
    """Average the climate data of every county belonging to ``unit_id``.

    Args:
        unit_id: A key of ``unit_id_to_counties``.

    Returns:
        DataFrame shaped like a single ``climate_data`` result, where each
        cell is the mean of that cell across the unit's counties.

    Raises:
        KeyError: If ``unit_id`` has no counties registered (previously this
            surfaced as an opaque ``TypeError`` from ``len(None)``).
    """
    counties = unit_id_to_counties.get(unit_id)
    if not counties:
        raise KeyError("no counties registered for unit %r" % (unit_id,))

    # Sum the per-county frames, then divide by the number of counties.
    # Only reads from the shared list, so no defensive copy is needed.
    df_climate = climate_data(counties[0])
    for county in counties[1:]:
        time.sleep(0.5)  # pause between counties
        df_climate = df_climate + climate_data(county)
    return df_climate / len(counties)

In [6]:
def combine_data(df_fire, unit_id):
    """Join yearly fire statistics for one unit with its averaged climate data.

    Args:
        df_fire: Fire records with ``UNIT_ID``, ``YEAR_`` and ``GIS_ACRES``
            columns.
        unit_id: Unit whose fires and counties are summarised.

    Returns:
        DataFrame indexed by year containing the columns returned by
        ``combine_counties`` plus ``frequency`` (number of fire records),
        ``acres_sum`` and ``acres_mean``. Years present on only one side of
        the join are kept, and every missing cell is filled with 0.
    """
    # get fire data
    unit_fires = df_fire[df_fire["UNIT_ID"] == unit_id]
    # Select GIS_ACRES *before* aggregating: summing/averaging the whole frame
    # also aggregates the string and list columns, which raises TypeError on
    # recent pandas versions.
    acres_by_year = unit_fires.groupby("YEAR_")["GIS_ACRES"]
    fire_data = pd.DataFrame({
        'frequency': acres_by_year.size(),
        'acres_sum': acres_by_year.sum(),
        'acres_mean': acres_by_year.mean(),
    })
    # combine with county data
    df_county = combine_counties(unit_id)
    # NOTE(review): fillna(0) also zero-fills climate columns for any year
    # that has fires but no climate row -- confirm that is intended.
    return df_county.merge(
        fire_data, left_index=True, right_index=True, how='outer'
    ).fillna(0)

In [7]:
# Example: yearly climate and fire summary for unit "HUU" (Humboldt, Del Norte).
combine_data(df_fire, "HUU")

Unnamed: 0,temp_value,temp_anomaly,precip_value,precip_anomaly,frequency,acres_sum,acres_mean
1900,51.70,-0.20,67.750,-3.940,0.0,0.000000,0.000000
1901,50.50,-1.40,66.690,-5.000,0.0,0.000000,0.000000
1902,50.90,-1.00,103.350,31.660,0.0,0.000000,0.000000
1903,51.10,-0.80,82.015,10.325,0.0,0.000000,0.000000
1904,51.80,-0.10,107.270,35.580,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...
2017,53.60,1.70,84.640,12.950,6.0,154.809919,25.801653
2018,53.85,1.95,51.540,-20.150,3.0,36.224920,12.074973
2019,53.30,1.40,75.565,3.875,0.0,0.000000,0.000000
2020,54.55,2.65,46.815,-24.875,3.0,122.948170,40.982723
