In [1]:
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer

In [2]:
# Read the raw wildfire CSV into `df`.
# low_memory=False makes pandas parse the file in a single pass instead of
# in chunks, which avoids mixed-dtype inference within a column.
df = pd.read_csv(
    "../data/unprocessed/wildfire-dataset.csv",
    low_memory=False,
)

In [3]:
# df.info()

In [4]:
# df.isna().sum()

In [5]:
# Discard the raw columns that the rest of this notebook does not use.
df = df.drop(
    columns=[
        "ICS_209_PLUS_INCIDENT_JOIN_ID",
        "ICS_209_PLUS_COMPLEX_JOIN_ID",
        "MTBS_ID",
        "MTBS_FIRE_NAME",
        "COMPLEX_NAME",
        "LOCAL_FIRE_REPORT_ID",
        "FIRE_CODE",
        "FIRE_NAME",
        "LOCAL_INCIDENT_ID",
        "DISCOVERY_TIME",
        "NWCG_CAUSE_AGE_CATEGORY",
        "CONT_DATE",
        "CONT_DOY",
        "CONT_TIME",
        "FIPS_CODE",
        "FIPS_NAME",
    ]
)

In [6]:
# df.info()

In [7]:
# df.isna().sum()

In [8]:
import reverse_geocoder as rg

# Backfill missing COUNTY values by reverse-geocoding each row's
# (LATITUDE, LONGITUDE) pair and taking the 'admin2' field of the match.

# Compute the null mask once so the rows that are looked up and the rows
# that are written back are guaranteed to be the same set.
county_is_missing = df['COUNTY'].isna()
missing_county = df.loc[county_is_missing]

coordinates = list(zip(missing_county.LATITUDE, missing_county.LONGITUDE))

# Skip the lookup when nothing is missing (e.g. when this cell is re-run
# after the counties were already filled): the geocoder is not expected to
# accept an empty coordinate list, and there would be nothing to assign.
results = rg.search(coordinates) if coordinates else []

if results:
    # rg.search returns one result per input coordinate, in input order,
    # so the list lines up with the masked rows.
    df.loc[county_is_missing, 'COUNTY'] = [r['admin2'] for r in results]

Loading formatted geocoded file...


In [9]:
# df.isna().sum()

In [10]:
# df.info()

In [11]:
# Normalise every column label to lower case (labels are all strings here).
df.columns = df.columns.map(str.lower)

In [12]:
# Lower-case the values of the text columns so that differently-cased
# spellings of the same category collapse into one value.
for text_column in (
    'county',
    'nwcg_cause_classification',
    'nwcg_general_cause',
    'fire_size_class',
    'state',
):
    df[text_column] = df[text_column].str.lower()

In [13]:
# drop 'source_system_type', 'source_system', 'nwcg_reporting_agency', 'nwcg_reporting_unit_id', 'nwcg_reporting_unit_name', 'source_reporting_unit', 'source_reporting_unit_name', 'owner_descr',

In [14]:
# Remove the source-system, reporting-unit and owner columns.
df = df.drop(
    columns=[
        'source_system_type',
        'source_system',
        'nwcg_reporting_agency',
        'nwcg_reporting_unit_id',
        'nwcg_reporting_unit_name',
        'source_reporting_unit',
        'source_reporting_unit_name',
        'owner_descr',
    ]
)

In [15]:
# df.info()

In [16]:
# Remove the 'fpa_id' column.
df = df.drop(columns='fpa_id')

In [17]:
# Parse the discovery date once, store it back as a datetime column, and
# derive calendar components from it.
discovery = pd.to_datetime(df['discovery_date'])
df['discovery_date'] = discovery

df['year'] = discovery.dt.year
df['month'] = discovery.dt.month
df['day'] = discovery.dt.day
# Day-of-week is intentionally left out for now:
# df['day_of_week'] = df['discovery_date'].dt.dayofweek

In [18]:
# df.info()

In [19]:
# The raw datetime column is no longer needed once year/month/day exist.
df = df.drop(columns='discovery_date')

In [20]:
# !pip install category_encoders

In [21]:
import category_encoders as ce

# ---- One-hot encoding ------------------------------------------------------
# Expand the cause and size-class columns into indicator columns; `df` is
# replaced by the encoded frame.
one_hot_encoder = ce.OneHotEncoder(
    cols=['nwcg_cause_classification', 'nwcg_general_cause', 'fire_size_class']
)
df = one_hot_encoder.fit_transform(df)

# NOTE(review): both target encoders below are fitted on the full frame; if a
# train/test split happens later, the encodings have already seen the test
# rows - confirm this is acceptable.

# ---- Location model --------------------------------------------------------
# Target-encode county/state against latitude.
target_encoder_location = ce.TargetEncoder(cols=['county', 'state'])
df_location = target_encoder_location.fit_transform(df, df['latitude'])

# Targets are the coordinates; features are everything except the
# coordinates and the raw fire size.
# NOTE(review): the one-hot 'fire_size_class_*' columns are still part of
# these features even though 'fire_size' is removed - confirm intended.
location_target = df_location[['latitude', 'longitude']]
location_features = df_location.drop(
    columns=['fire_size', 'latitude', 'longitude']
)

# ---- Size model ------------------------------------------------------------
# Target-encode county/state against fire size.
target_encoder_size = ce.TargetEncoder(cols=['county', 'state'])
df_size = target_encoder_size.fit_transform(df, df['fire_size'])

# Target is the numeric fire size ('fire_size_class' would be the alternative).
# NOTE(review): the one-hot 'fire_size_class_*' columns remain in the features;
# if the class is derived from fire_size this leaks the target - verify.
size_target = df_size['fire_size']
size_features = df_size.drop(columns=['fire_size'])

In [22]:
# Print the column names and dtypes of `df` to check the frame's schema.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2166753 entries, 0 to 2166752
Data columns (total 35 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   fod_id                       int64  
 1   fire_year                    int64  
 2   discovery_doy                int64  
 3   nwcg_cause_classification_1  int64  
 4   nwcg_cause_classification_2  int64  
 5   nwcg_cause_classification_3  int64  
 6   nwcg_cause_classification_4  int64  
 7   nwcg_general_cause_1         int64  
 8   nwcg_general_cause_2         int64  
 9   nwcg_general_cause_3         int64  
 10  nwcg_general_cause_4         int64  
 11  nwcg_general_cause_5         int64  
 12  nwcg_general_cause_6         int64  
 13  nwcg_general_cause_7         int64  
 14  nwcg_general_cause_8         int64  
 15  nwcg_general_cause_9         int64  
 16  nwcg_general_cause_10        int64  
 17  nwcg_general_cause_11        int64  
 18  nwcg_general_cause_12        int64  
 19  

In [24]:
# Remove the 'state' and 'county' columns from both feature tables.
# NOTE(review): these are the columns the target encoders were applied to,
# so the encoded values never reach the exported features - confirm intended.
location_features = location_features.drop(columns=['state', 'county'])
size_features = size_features.drop(columns=['state', 'county'])

In [25]:
# Write the cleaned frame and the per-model feature/target tables as CSV
# (without the index column).
processed_dir = Path("../data/processed")
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
processed_dir.mkdir(parents=True, exist_ok=True)

df.to_csv(processed_dir / "cleaned-dataframe.csv", index=False)
location_features.to_csv(processed_dir / "location_features.csv", index=False)
location_target.to_csv(processed_dir / "location_target.csv", index=False)
size_features.to_csv(processed_dir / "size_features.csv", index=False)
size_target.to_csv(processed_dir / "size_target.csv", index=False)

In [26]:
import pickle

# Persist the fitted encoders so the same transformations can be re-applied
# later without refitting.
models_dir = Path('../models')
# open(..., 'wb') fails if the folder is missing, so create it when needed.
models_dir.mkdir(parents=True, exist_ok=True)

encoders_to_save = {
    'one_hot_encoder.pkl': one_hot_encoder,
    'target_encoder_location.pkl': target_encoder_location,
    'target_encoder_size.pkl': target_encoder_size,
}

for filename, encoder in encoders_to_save.items():
    with open(models_dir / filename, 'wb') as f:
        pickle.dump(encoder, f)