In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer

In [3]:
# Load the raw wildfire records. low_memory=False makes pandas infer each
# column's dtype from the whole file rather than chunk by chunk.
raw_csv_path = "../data/unprocessed/wildfire-dataset.csv"
df = pd.read_csv(raw_csv_path, low_memory=False)

In [4]:
# Column names and dtypes of the frame as loaded from the CSV.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2166753 entries, 0 to 2166752
Data columns (total 37 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   FOD_ID                         int64  
 1   FPA_ID                         object 
 2   SOURCE_SYSTEM_TYPE             object 
 3   SOURCE_SYSTEM                  object 
 4   NWCG_REPORTING_AGENCY          object 
 5   NWCG_REPORTING_UNIT_ID         object 
 6   NWCG_REPORTING_UNIT_NAME       object 
 7   SOURCE_REPORTING_UNIT          object 
 8   SOURCE_REPORTING_UNIT_NAME     object 
 9   LOCAL_FIRE_REPORT_ID           object 
 10  LOCAL_INCIDENT_ID              object 
 11  FIRE_CODE                      object 
 12  FIRE_NAME                      object 
 13  ICS_209_PLUS_INCIDENT_JOIN_ID  object 
 14  ICS_209_PLUS_COMPLEX_JOIN_ID   object 
 15  MTBS_ID                        object 
 16  MTBS_FIRE_NAME                 object 
 17  COMPLEX_NAME                   object 
 18  FI

In [5]:
# Number of missing values in each column of the raw frame.
df.isna().sum()

FOD_ID                                 0
FPA_ID                                 0
SOURCE_SYSTEM_TYPE                     0
SOURCE_SYSTEM                          0
NWCG_REPORTING_AGENCY                  0
NWCG_REPORTING_UNIT_ID                 0
NWCG_REPORTING_UNIT_NAME               0
SOURCE_REPORTING_UNIT                  0
SOURCE_REPORTING_UNIT_NAME             0
LOCAL_FIRE_REPORT_ID             1701854
LOCAL_INCIDENT_ID                 734948
FIRE_CODE                        1797127
FIRE_NAME                         960596
ICS_209_PLUS_INCIDENT_JOIN_ID    2135993
ICS_209_PLUS_COMPLEX_JOIN_ID     2165833
MTBS_ID                          2153848
MTBS_FIRE_NAME                   2153848
COMPLEX_NAME                     2161081
FIRE_YEAR                              0
DISCOVERY_DATE                         0
DISCOVERY_DOY                          0
DISCOVERY_TIME                    754468
NWCG_CAUSE_CLASSIFICATION              1
NWCG_GENERAL_CAUSE                     0
NWCG_CAUSE_AGE_C

In [6]:
# Columns removed before any further processing: cross-reference/join IDs,
# local identifiers and names, discovery time, cause age category, the
# containment date/time fields and the FIPS fields.
unused_raw_columns = [
    "ICS_209_PLUS_INCIDENT_JOIN_ID",
    "ICS_209_PLUS_COMPLEX_JOIN_ID",
    "MTBS_ID",
    "MTBS_FIRE_NAME",
    "COMPLEX_NAME",
    "LOCAL_FIRE_REPORT_ID",
    "FIRE_CODE",
    "FIRE_NAME",
    "LOCAL_INCIDENT_ID",
    "DISCOVERY_TIME",
    "NWCG_CAUSE_AGE_CATEGORY",
    "CONT_DATE",
    "CONT_DOY",
    "CONT_TIME",
    "FIPS_CODE",
    "FIPS_NAME",
]
df = df.drop(columns=unused_raw_columns)

In [7]:
# Remaining columns and their dtypes after the column drop.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2166753 entries, 0 to 2166752
Data columns (total 21 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   FOD_ID                      int64  
 1   FPA_ID                      object 
 2   SOURCE_SYSTEM_TYPE          object 
 3   SOURCE_SYSTEM               object 
 4   NWCG_REPORTING_AGENCY       object 
 5   NWCG_REPORTING_UNIT_ID      object 
 6   NWCG_REPORTING_UNIT_NAME    object 
 7   SOURCE_REPORTING_UNIT       object 
 8   SOURCE_REPORTING_UNIT_NAME  object 
 9   FIRE_YEAR                   int64  
 10  DISCOVERY_DATE              object 
 11  DISCOVERY_DOY               int64  
 12  NWCG_CAUSE_CLASSIFICATION   object 
 13  NWCG_GENERAL_CAUSE          object 
 14  FIRE_SIZE                   float64
 15  FIRE_SIZE_CLASS             object 
 16  LATITUDE                    float64
 17  LONGITUDE                   float64
 18  OWNER_DESCR                 object 
 19  STATE                

In [8]:
# Re-count missing values on the reduced frame.
df.isna().sum()

FOD_ID                             0
FPA_ID                             0
SOURCE_SYSTEM_TYPE                 0
SOURCE_SYSTEM                      0
NWCG_REPORTING_AGENCY              0
NWCG_REPORTING_UNIT_ID             0
NWCG_REPORTING_UNIT_NAME           0
SOURCE_REPORTING_UNIT              0
SOURCE_REPORTING_UNIT_NAME         0
FIRE_YEAR                          0
DISCOVERY_DATE                     0
DISCOVERY_DOY                      0
NWCG_CAUSE_CLASSIFICATION          1
NWCG_GENERAL_CAUSE                 0
FIRE_SIZE                          0
FIRE_SIZE_CLASS                    0
LATITUDE                           0
LONGITUDE                          0
OWNER_DESCR                        0
STATE                              0
COUNTY                        657235
dtype: int64

In [9]:
import reverse_geocoder as rg

# Fill missing COUNTY values by reverse-geocoding each row's coordinates.
# The null mask is computed once and reused for both the lookup and the
# write-back, so both steps address exactly the same rows in the same order.
county_is_missing = df['COUNTY'].isna()
missing_county = df[county_is_missing]

# Guard so the cell can be re-run after the gaps have been filled:
# rg.search is not a valid call with an empty coordinate list.
if not missing_county.empty:
    coordinates = list(zip(missing_county.LATITUDE, missing_county.LONGITUDE))
    results = rg.search(coordinates)

    # One result per coordinate pair, in input order; 'admin2' is the
    # second-level administrative region reported for the matched place.
    # NOTE(review): 'admin2' may be an empty string for some places, which
    # isna() will not report as missing -- confirm whether that matters.
    df.loc[county_is_missing, 'COUNTY'] = [r['admin2'] for r in results]

Loading formatted geocoded file...


In [10]:
# Check that no COUNTY values are still missing after the reverse-geocoding fill.
df.isna().sum()

FOD_ID                        0
FPA_ID                        0
SOURCE_SYSTEM_TYPE            0
SOURCE_SYSTEM                 0
NWCG_REPORTING_AGENCY         0
NWCG_REPORTING_UNIT_ID        0
NWCG_REPORTING_UNIT_NAME      0
SOURCE_REPORTING_UNIT         0
SOURCE_REPORTING_UNIT_NAME    0
FIRE_YEAR                     0
DISCOVERY_DATE                0
DISCOVERY_DOY                 0
NWCG_CAUSE_CLASSIFICATION     1
NWCG_GENERAL_CAUSE            0
FIRE_SIZE                     0
FIRE_SIZE_CLASS               0
LATITUDE                      0
LONGITUDE                     0
OWNER_DESCR                   0
STATE                         0
COUNTY                        0
dtype: int64

In [11]:
# Columns and dtypes after the COUNTY fill.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2166753 entries, 0 to 2166752
Data columns (total 21 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   FOD_ID                      int64  
 1   FPA_ID                      object 
 2   SOURCE_SYSTEM_TYPE          object 
 3   SOURCE_SYSTEM               object 
 4   NWCG_REPORTING_AGENCY       object 
 5   NWCG_REPORTING_UNIT_ID      object 
 6   NWCG_REPORTING_UNIT_NAME    object 
 7   SOURCE_REPORTING_UNIT       object 
 8   SOURCE_REPORTING_UNIT_NAME  object 
 9   FIRE_YEAR                   int64  
 10  DISCOVERY_DATE              object 
 11  DISCOVERY_DOY               int64  
 12  NWCG_CAUSE_CLASSIFICATION   object 
 13  NWCG_GENERAL_CAUSE          object 
 14  FIRE_SIZE                   float64
 15  FIRE_SIZE_CLASS             object 
 16  LATITUDE                    float64
 17  LONGITUDE                   float64
 18  OWNER_DESCR                 object 
 19  STATE                

In [12]:
# Normalise every column label to lower case.
df.columns = [label.lower() for label in df.columns]

In [13]:
# Lower-case the values of the text columns so that differently-cased
# spellings of the same value compare equal.
for text_column in ['county', 'nwcg_cause_classification', 'nwcg_general_cause', 'fire_size_class', 'state']:
    df[text_column] = df[text_column].str.lower()

In [14]:
# Next, drop the source/reporting metadata columns: 'source_system_type', 'source_system', 'nwcg_reporting_agency', 'nwcg_reporting_unit_id', 'nwcg_reporting_unit_name', 'source_reporting_unit', 'source_reporting_unit_name', and 'owner_descr'.

In [15]:
# Remove the source-system, reporting-unit and owner description columns.
source_reporting_columns = [
    'source_system_type',
    'source_system',
    'nwcg_reporting_agency',
    'nwcg_reporting_unit_id',
    'nwcg_reporting_unit_name',
    'source_reporting_unit',
    'source_reporting_unit_name',
    'owner_descr',
]
df = df.drop(columns=source_reporting_columns)

In [16]:
# Columns and dtypes after removing the source/reporting metadata.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2166753 entries, 0 to 2166752
Data columns (total 13 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   fod_id                     int64  
 1   fpa_id                     object 
 2   fire_year                  int64  
 3   discovery_date             object 
 4   discovery_doy              int64  
 5   nwcg_cause_classification  object 
 6   nwcg_general_cause         object 
 7   fire_size                  float64
 8   fire_size_class            object 
 9   latitude                   float64
 10  longitude                  float64
 11  state                      object 
 12  county                     object 
dtypes: float64(3), int64(3), object(7)
memory usage: 214.9+ MB


In [17]:
# Remove the fpa_id column.
df = df.drop(columns='fpa_id')

In [18]:
# Convert discovery_date to datetime and split it into calendar components.
discovery = pd.to_datetime(df['discovery_date'])
df['discovery_date'] = discovery

# Extract year, month and day as new columns (appended in that order)
df = df.assign(
    year=discovery.dt.year,
    month=discovery.dt.month,
    day=discovery.dt.day,
)
# Day of week (discovery.dt.dayofweek) is not extracted at present.

In [19]:
# Confirm the datetime conversion and the new year/month/day columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2166753 entries, 0 to 2166752
Data columns (total 15 columns):
 #   Column                     Dtype         
---  ------                     -----         
 0   fod_id                     int64         
 1   fire_year                  int64         
 2   discovery_date             datetime64[ns]
 3   discovery_doy              int64         
 4   nwcg_cause_classification  object        
 5   nwcg_general_cause         object        
 6   fire_size                  float64       
 7   fire_size_class            object        
 8   latitude                   float64       
 9   longitude                  float64       
 10  state                      object        
 11  county                     object        
 12  year                       int32         
 13  month                      int32         
 14  day                        int32         
dtypes: datetime64[ns](1), float64(3), int32(3), int64(3), object(5)
memory usage: 223.2

In [20]:
# The raw timestamp is dropped now that year/month/day have been extracted from it.
df = df.drop(columns='discovery_date')

In [21]:
# Dependency: category_encoders. If it is not installed, run `%pip install category_encoders` in its own cell.

In [22]:
import category_encoders as ce

# One-hot encoding of the low-cardinality categorical columns.
# `df` is rebound to the encoded frame, so this cell expects the un-encoded
# frame as input and is not meant to be run twice in a row.
one_hot_encoder = ce.OneHotEncoder(cols=['nwcg_cause_classification', 'nwcg_general_cause', 'fire_size_class'])
df = one_hot_encoder.fit_transform(df)

# Target encoding for Location Model: county/state are encoded against latitude.
# NOTE(review): both target encoders are fitted on the full frame, before any
# train/test split -- confirm this is acceptable for how the encoded columns
# are used downstream.
target_encoder_location = ce.TargetEncoder(cols=['county', 'state'])
df_location = target_encoder_location.fit_transform(df, df['latitude'])

# Define targets and features for Location Model
location_target = df_location[['latitude', 'longitude']]
location_features = df_location.drop(columns=['fire_size', 'latitude', 'longitude'])

# Target encoding for Size Model: county/state are encoded against fire_size.
target_encoder_size = ce.TargetEncoder(cols=['county', 'state'])
df_size = target_encoder_size.fit_transform(df, df['fire_size'])

# Define targets and features for Size Model
size_target = df_size['fire_size']  # or 'fire_size_class'

# fire_size_class is the categorical counterpart of the fire_size target (it
# is named above as an alternative target), so its one-hot columns would leak
# the target into the features. They are removed together with fire_size.
size_class_columns = [
    column for column in df_size.columns
    if column.startswith('fire_size_class_')
]
size_features = df_size.drop(columns=['fire_size'] + size_class_columns)

In [23]:
# Columns and dtypes of the one-hot encoded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2166753 entries, 0 to 2166752
Data columns (total 35 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   fod_id                       int64  
 1   fire_year                    int64  
 2   discovery_doy                int64  
 3   nwcg_cause_classification_1  int64  
 4   nwcg_cause_classification_2  int64  
 5   nwcg_cause_classification_3  int64  
 6   nwcg_cause_classification_4  int64  
 7   nwcg_general_cause_1         int64  
 8   nwcg_general_cause_2         int64  
 9   nwcg_general_cause_3         int64  
 10  nwcg_general_cause_4         int64  
 11  nwcg_general_cause_5         int64  
 12  nwcg_general_cause_6         int64  
 13  nwcg_general_cause_7         int64  
 14  nwcg_general_cause_8         int64  
 15  nwcg_general_cause_9         int64  
 16  nwcg_general_cause_10        int64  
 17  nwcg_general_cause_11        int64  
 18  nwcg_general_cause_12        int64  
 19  

In [24]:
# Leave the state and county columns out of both feature tables.
geo_columns = ['state', 'county']
location_features = location_features.drop(columns=geo_columns)
size_features = size_features.drop(columns=geo_columns)

In [25]:
# Write the feature and target tables to CSV, without the index column.
processed_outputs = [
    (location_features, '../data/processed/location_features.csv'),
    (location_target, '../data/processed/location_target.csv'),
    (size_features, '../data/processed/size_features.csv'),
    (size_target, '../data/processed/size_target.csv'),
]
for table, csv_path in processed_outputs:
    table.to_csv(csv_path, index=False)

In [26]:
import pickle

# Save encoders: each fitted encoder is serialised to its own pickle file.
encoder_artifacts = [
    ('../models/one_hot_encoder.pkl', one_hot_encoder),
    ('../models/target_encoder_location.pkl', target_encoder_location),
    ('../models/target_encoder_size.pkl', target_encoder_size),
]
for pickle_path, encoder in encoder_artifacts:
    with open(pickle_path, 'wb') as f:
        pickle.dump(encoder, f)