# Wildfire Dataset Cleaning Steps

In [2]:
import pandas as pd

wildfire = pd.read_csv("../data/raw/wildfire/wildfire.csv")
wildfire.head()


Unnamed: 0,YEAR,FIRE_NUMBER,FIRE_NAME,CURRENT_SIZE,SIZE_CLASS,LATITUDE,LONGITUDE,FIRE_ORIGIN,GENERAL_CAUSE,INDUSTRY_IDENTIFIER,...,DISTANCE_FROM_WATER_SOURCE,FIRST_BUCKET_DROP_DATE,FIRST_BH_DATE,FIRST_BH_SIZE,FIRST_UC_DATE,FIRST_UC_SIZE,FIRST_TO_DATE,FIRST_TO_SIZE,FIRST_EX_DATE,FIRST_EX_SIZE_PERIMETER
0,2006,PWF001,,0.1,A,56.249956,-117.18196,Private Land,Resident,,...,,,2006-04-02 22:00:00,0.01,2006-04-02 22:00:00,0.01,,,2006-04-03 10:20:00,0.1
1,2006,EWF002,,0.2,B,53.606367,-115.915733,Provincial Land,Incendiary,,...,,,2006-04-03 13:20:00,0.2,2006-04-03 13:20:00,0.2,,,2006-04-03 14:00:00,0.2
2,2006,EWF001,,0.5,B,53.610933,-115.594267,Provincial Land,Incendiary,,...,,,2006-04-03 13:23:00,0.5,2006-04-03 13:23:00,0.5,,,2006-04-03 15:00:00,0.5
3,2006,EWF003,,0.01,A,53.608867,-115.609467,Provincial Land,Incendiary,,...,,,2006-04-03 14:08:00,0.01,2006-04-03 14:08:00,0.01,,,2006-04-03 15:05:00,0.01
4,2006,PWF002,,0.1,A,56.249956,-117.050249,Provincial Land,Other Industry,Waste Disposal,...,,,2006-04-03 19:57:00,0.1,2006-04-03 20:19:00,0.1,2006-04-03 20:20:00,0.1,2006-04-05 10:18:00,0.1


# Handle missing values

In [3]:
## First I checked how many rows do we have : 
num_rows = wildfire.shape[0]
print("Number of rows:", num_rows)

Number of rows: 26551


In [4]:
wildfire.isna().sum()

YEAR                                0
FIRE_NUMBER                         0
FIRE_NAME                       25756
CURRENT_SIZE                        0
SIZE_CLASS                          0
LATITUDE                            0
LONGITUDE                           0
FIRE_ORIGIN                        12
GENERAL_CAUSE                       0
INDUSTRY_IDENTIFIER             26071
RESPONSIBLE_GROUP               17029
ACTIVITY_CLASS                  10416
TRUE_CAUSE                      12529
FIRE_START_DATE                   689
DETECTION_AGENT_TYPE                0
DETECTION_AGENT                     0
DISCOVERED_DATE                  5409
DISCOVERED_SIZE                 26402
REPORTED_DATE                       0
DISPATCHED_RESOURCE                12
DISPATCH_DATE                      12
START_FOR_FIRE_DATE                17
ASSESSMENT_RESOURCE                 0
ASSESSMENT_DATETIME                 0
ASSESSMENT_HECTARES                 0
FIRE_SPREAD_RATE                 2806
FIRE_TYPE   

## Drop uninformative or very sparse columns
These columns have more than 50% missing values and we can remove them. 

In [5]:
## These columns are very sparse
# FIRE_NAME                       25756
# INDUSTRY_IDENTIFIER             26071
# DISCOVERED_SIZE                 26402
# DISTANCE_FROM_WATER_SOURCE      18958
# FIRST_BUCKET_DROP_DATE          18957
# FIRST_TO_DATE                   23809
# FIRST_TO_SIZE                   23809



wildfire = wildfire.drop(columns=[
    "FIRE_NAME", "INDUSTRY_IDENTIFIER", "DISCOVERED_SIZE", 
    "DISTANCE_FROM_WATER_SOURCE", "FIRST_BUCKET_DROP_DATE", "FIRST_TO_DATE", "FIRST_TO_SIZE"
], errors="ignore")


## Handle moderate missing values

These are useful variables but have some missing data. We can fill with "unknown", 0, or mean.

In [6]:
# few missing
wildfire["DISPATCHED_RESOURCE"] = wildfire["DISPATCHED_RESOURCE"].fillna("Unknown")
wildfire["DISPATCH_DATE"] = pd.to_datetime(wildfire["DISPATCH_DATE"], errors="coerce") # I converted this column to date, and If a value can't be converted to a date (e.g., it's missing or malformed), it will be replaced with NaT
wildfire["START_FOR_FIRE_DATE"] = pd.to_datetime(wildfire["START_FOR_FIRE_DATE"], errors="coerce")
wildfire["INITIAL_ACTION_BY"] = wildfire["INITIAL_ACTION_BY"].fillna("Unknown")
wildfire["FIRST_EX_DATE"] = pd.to_datetime(wildfire["FIRST_EX_DATE"], errors="coerce")
wildfire["FIRST_EX_SIZE_PERIMETER"] = wildfire["FIRST_EX_SIZE_PERIMETER"].fillna(-1) # missing or unknown numeric data

# Moderate missing
wildfire["FIRE_START_DATE"] = pd.to_datetime(wildfire["FIRE_START_DATE"], errors="coerce")
wildfire["DISCOVERED_DATE"] = pd.to_datetime(wildfire["DISCOVERED_DATE"], errors="coerce")
wildfire["FIRE_SPREAD_RATE"] = wildfire["FIRE_SPREAD_RATE"].fillna(wildfire["FIRE_SPREAD_RATE"].mean())  ## mean value
wildfire["FIRE_POSITION_ON_SLOPE"] = wildfire["FIRE_POSITION_ON_SLOPE"].fillna("Unknown")
wildfire["WEATHER_CONDITIONS_OVER_FIRE"] = wildfire["WEATHER_CONDITIONS_OVER_FIRE"].fillna("Unknown")
wildfire["WIND_DIRECTION"] = wildfire["WIND_DIRECTION"].fillna("Unknown")
wildfire["FUEL_TYPE"] = wildfire["FUEL_TYPE"].fillna("Unknown")
wildfire["TRUE_CAUSE"] = wildfire["TRUE_CAUSE"].fillna("Unknown") 
wildfire["FIRE_TYPE"] = wildfire["FIRE_TYPE"].fillna("Unknown") 

# Higher missing 
wildfire["RESPONSIBLE_GROUP"] = wildfire["RESPONSIBLE_GROUP"].fillna("Unknown")
wildfire["ACTIVITY_CLASS"] = wildfire["ACTIVITY_CLASS"].fillna("Unknown")
wildfire["IA_ARRIVAL_AT_FIRE_DATE"] = pd.to_datetime(wildfire["IA_ARRIVAL_AT_FIRE_DATE"], errors="coerce")
wildfire["IA_ACCESS"] = wildfire["IA_ACCESS"].fillna("Unknown")
wildfire["FIRE_FIGHTING_START_DATE"] = pd.to_datetime(wildfire["FIRE_FIGHTING_START_DATE"], errors="coerce")
wildfire["FIRE_FIGHTING_START_SIZE"] = wildfire["FIRE_FIGHTING_START_SIZE"].fillna(wildfire["FIRE_FIGHTING_START_SIZE"].median()) ## median value
wildfire["BUCKETING_ON_FIRE"] = wildfire["BUCKETING_ON_FIRE"].fillna("Unknown")




In [7]:
## Double check everything one more time  --> alll of the date columns now have NaT 
wildfire.isna().sum()
## Please let me know about the other columns, like humidity or temperature and wind_s, what should we do 

YEAR                               0
FIRE_NUMBER                        0
CURRENT_SIZE                       0
SIZE_CLASS                         0
LATITUDE                           0
LONGITUDE                          0
FIRE_ORIGIN                       12
GENERAL_CAUSE                      0
RESPONSIBLE_GROUP                  0
ACTIVITY_CLASS                     0
TRUE_CAUSE                         0
FIRE_START_DATE                  693
DETECTION_AGENT_TYPE               0
DETECTION_AGENT                    0
DISCOVERED_DATE                 5409
REPORTED_DATE                      0
DISPATCHED_RESOURCE                0
DISPATCH_DATE                     12
START_FOR_FIRE_DATE               17
ASSESSMENT_RESOURCE                0
ASSESSMENT_DATETIME                0
ASSESSMENT_HECTARES                0
FIRE_SPREAD_RATE                   0
FIRE_TYPE                          0
FIRE_POSITION_ON_SLOPE             0
WEATHER_CONDITIONS_OVER_FIRE       0
TEMPERATURE                     2872
R

# Fix the Data types 

In [10]:
## First I checked the data types
print(wildfire.dtypes)

YEAR                                     int64
FIRE_NUMBER                             object
CURRENT_SIZE                           float64
SIZE_CLASS                              object
LATITUDE                               float64
LONGITUDE                              float64
FIRE_ORIGIN                             object
GENERAL_CAUSE                           object
RESPONSIBLE_GROUP                       object
ACTIVITY_CLASS                          object
TRUE_CAUSE                              object
FIRE_START_DATE                 datetime64[ns]
DETECTION_AGENT_TYPE                    object
DETECTION_AGENT                         object
DISCOVERED_DATE                 datetime64[ns]
REPORTED_DATE                   datetime64[ns]
DISPATCHED_RESOURCE                     object
DISPATCH_DATE                   datetime64[ns]
START_FOR_FIRE_DATE             datetime64[ns]
ASSESSMENT_RESOURCE                     object
ASSESSMENT_DATETIME             datetime64[ns]
ASSESSMENT_HE

##  Date/Time

In [11]:
date_cols = [
    "FIRE_START_DATE", "DISCOVERED_DATE", "REPORTED_DATE", "DISPATCH_DATE",
    "START_FOR_FIRE_DATE", "IA_ARRIVAL_AT_FIRE_DATE", "FIRE_FIGHTING_START_DATE", "FIRST_BH_DATE", "FIRST_UC_DATE",
    "FIRST_EX_DATE", "ASSESSMENT_DATETIME"
]

for col in date_cols:
    wildfire[col] = pd.to_datetime(wildfire[col], errors="coerce")


 ## Format all float columns to 2 decimal places

In [12]:
float_cols = wildfire.select_dtypes(include=["float64"]).columns
wildfire[float_cols] = wildfire[float_cols].round(2)

# Column Names:

In [13]:
wildfire.columns.tolist()

['YEAR',
 'FIRE_NUMBER',
 'CURRENT_SIZE',
 'SIZE_CLASS',
 'LATITUDE',
 'LONGITUDE',
 'FIRE_ORIGIN',
 'GENERAL_CAUSE',
 'RESPONSIBLE_GROUP',
 'ACTIVITY_CLASS',
 'TRUE_CAUSE',
 'FIRE_START_DATE',
 'DETECTION_AGENT_TYPE',
 'DETECTION_AGENT',
 'DISCOVERED_DATE',
 'REPORTED_DATE',
 'DISPATCHED_RESOURCE',
 'DISPATCH_DATE',
 'START_FOR_FIRE_DATE',
 'ASSESSMENT_RESOURCE',
 'ASSESSMENT_DATETIME',
 'ASSESSMENT_HECTARES',
 'FIRE_SPREAD_RATE',
 'FIRE_TYPE',
 'FIRE_POSITION_ON_SLOPE',
 'WEATHER_CONDITIONS_OVER_FIRE',
 'TEMPERATURE',
 'RELATIVE_HUMIDITY',
 'WIND_DIRECTION',
 'WIND_SPEED',
 'FUEL_TYPE',
 'INITIAL_ACTION_BY',
 'IA_ARRIVAL_AT_FIRE_DATE',
 'IA_ACCESS',
 'FIRE_FIGHTING_START_DATE',
 'FIRE_FIGHTING_START_SIZE',
 'BUCKETING_ON_FIRE',
 'FIRST_BH_DATE',
 'FIRST_BH_SIZE',
 'FIRST_UC_DATE',
 'FIRST_UC_SIZE',
 'FIRST_EX_DATE',
 'FIRST_EX_SIZE_PERIMETER']

In [14]:
wildfire.columns = wildfire.columns.str.lower()


# Export cleaned dataset

In [16]:
wildfire.to_csv("../data/clean/wildfire_clean.csv", index=False)

##  Validate cleaned file

In [17]:
clean = pd.read_csv("../data/clean/wildfire_clean.csv")
clean.shape
clean.head()

FileNotFoundError: [Errno 2] No such file or directory: 'wildfire_clean.csv'