In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
tqdm.pandas()

### Observe the dataset: This is the dataset after combining NIFC & Openweather data, with no further cleaning steps.

In [2]:
raw_data = pd.read_csv('data/raw_data.csv').drop(columns=['Unnamed: 0'])
raw_data.head()

Unnamed: 0,ID,lat,lon,start_date,incident name,fully_contained,geographic_area,new,size,temp,...,pressure,humidity,dew_point,clouds,visibility,wind_speed,wind_deg,rain,weather,weather_description
0,19337,34.871,-94.666,12/31/2022 12:00:00 AM,Pine_Mt._Fire,c,SAC,y,436.0,285.28,...,1015,58,277.27,0,10000.0,0.0,0,,Clear,clear sky
1,19339,29.362,-82.6,1/11/2023 12:00:00 AM,NE_CR_337,c,SAC,y,182.0,286.23,...,1020,74,281.71,0,,3.18,272,,Clear,clear sky
2,19340,33.203,-99.255,1/11/2023 12:00:00 AM,Cellers_Hill,c,SAC,y,695.0,298.12,...,1009,10,265.39,100,,4.63,249,,Clouds,overcast clouds
3,19341,27.686,-81.934,1/10/2023 12:00:00 AM,STREAMSONG,c,SAC,,475.0,292.92,...,1020,78,288.98,1,,1.34,313,,Clear,clear sky
4,19343,35.296,-95.924,1/14/2023 12:00:00 AM,RATTLESNAKE_HOLLOW,c,SAC,y,888.0,278.55,...,1029,62,272.01,89,,1.84,45,,Clouds,overcast clouds


In [3]:
# Dropping columns - these are columns that we determined were unfit for regression before feature selection, due to missing values or irrelevancy. 
raw_data = raw_data.drop(columns=['visibility', 'new', 'wind_deg'])

In [4]:
# Type conversion: Hold on to dates as pandas datetimes
raw_data['start_date'] = pd.to_datetime(raw_data['start_date'])
# Messy data - fully contained should be boolean-valued, but has strings instead
print(raw_data['fully_contained'].value_counts())
raw_data['fully_contained'] = raw_data['fully_contained'].apply(lambda x: str(x).strip().lower() == 'c')
# Missing values - usually NaN values instead of 0 to indicate no rainfall, due to the nature of the OpenWeather API
raw_data['rain'] = raw_data['rain'].fillna(0)

fully_contained
c     898
       48
c      12
C       1
Name: count, dtype: int64


  raw_data['start_date'] = pd.to_datetime(raw_data['start_date'])


In [5]:
# Observe: Several mislabeled observations due to human error
# Codes are  abbreviations for Geographic Area Coordination Centers (GACC).
# We manually relabeled them by learning what the proper classifications are supposed to be
print(raw_data['geographic_area'].value_counts())
def area_mapper(x):
    area_map = {'SACC':'SAC', 'SAC3_':'SAC', 'NWC':'NWCC', 'RMCC':'RMC', 'EAC':'EACC', 'OSCC':'ONCC'}
    if x in area_map.keys(): # Map mislabeled centers to the proper ones
        return area_map[x]
    elif (type(x) != str): # This is only one observation, we found the proper code manually by checking Lat/Lon coordinates
        return 'SAC'
    return x
raw_data['geographic_area'] = raw_data['geographic_area'].apply(area_mapper)
raw_data['geographic_area'].value_counts()

geographic_area
SAC      640
SWC      132
EACC      98
NWCC      85
RMC       76
NRC       41
ACC       35
GBC       31
OSCC      23
ONCC      18
SACC       4
RMCC       3
SAC3_      1
NWC        1
EAC        1
Name: count, dtype: int64


geographic_area
SAC     646
SWC     132
EACC     99
NWCC     86
RMC      79
ONCC     41
NRC      41
ACC      35
GBC      31
Name: count, dtype: int64

In [6]:
# Unbalanced categories in the weather column cause errors in regression. To prevent this, we choose to map them to "Other"
weather_counts = raw_data['weather'].value_counts()
print(weather_counts)
other_weather = weather_counts.index[weather_counts < 10]
weather_map =  lambda x: 'Other' if x in other_weather else x
raw_data['weather'] = raw_data['weather'].map(weather_map)

weather
Clear     564
Clouds    545
Rain       69
Smoke       6
Haze        3
Dust        1
Snow        1
Mist        1
Name: count, dtype: int64


In [7]:
# The same is true for weather descriptions
desc_counts = raw_data['weather_description'].value_counts()
print(desc_counts)
other_desc = desc_counts.index[desc_counts < 10]
desc_map =  lambda x: 'other' if x in other_desc else x
raw_data['weather_description'] = raw_data['weather_description'].map(desc_map)

weather_description
clear sky           564
overcast clouds     183
broken clouds       148
scattered clouds    123
few clouds           91
light rain           55
moderate rain        14
smoke                 6
haze                  3
dust                  1
light snow            1
mist                  1
Name: count, dtype: int64


In [8]:
raw_data.to_csv('data/clean_fire_data.csv')

In [10]:
raw_data.shape

(1190, 18)

In [13]:
raw_data.iloc[0]

ID                                   19337
lat                                 34.871
lon                                -94.666
start_date             2022-12-31 00:00:00
incident name                Pine_Mt._Fire
fully_contained                       True
geographic_area                        SAC
size                                 436.0
temp                                285.28
feels_like                          284.06
pressure                              1015
humidity                                58
dew_point                           277.27
clouds                                   0
wind_speed                             0.0
rain                                   0.0
weather                              Clear
weather_description              clear sky
Name: 0, dtype: object

In [12]:
raw_data.dtypes

ID                              int64
lat                           float64
lon                           float64
start_date             datetime64[ns]
incident name                  object
fully_contained                  bool
geographic_area                object
size                          float64
temp                          float64
feels_like                    float64
pressure                        int64
humidity                        int64
dew_point                     float64
clouds                          int64
wind_speed                    float64
rain                          float64
weather                        object
weather_description            object
dtype: object