In [44]:
import pandas as pd
import numpy as np
from matplotlib.pyplot import subplots

In [45]:
# load in dataset
main = pd.read_csv("../data/unprocessed/wildfire-dataset_copy.csv", low_memory=False)

In [46]:
# All columns
all_columns = list(main.columns)
print(all_columns)

['FOD_ID', 'FPA_ID', 'SOURCE_SYSTEM_TYPE', 'SOURCE_SYSTEM', 'NWCG_REPORTING_AGENCY', 'NWCG_REPORTING_UNIT_ID', 'NWCG_REPORTING_UNIT_NAME', 'SOURCE_REPORTING_UNIT', 'SOURCE_REPORTING_UNIT_NAME', 'LOCAL_FIRE_REPORT_ID', 'LOCAL_INCIDENT_ID', 'FIRE_CODE', 'FIRE_NAME', 'ICS_209_PLUS_INCIDENT_JOIN_ID', 'ICS_209_PLUS_COMPLEX_JOIN_ID', 'MTBS_ID', 'MTBS_FIRE_NAME', 'COMPLEX_NAME', 'FIRE_YEAR', 'DISCOVERY_DATE', 'DISCOVERY_DOY', 'DISCOVERY_TIME', 'NWCG_CAUSE_CLASSIFICATION', 'NWCG_GENERAL_CAUSE', 'NWCG_CAUSE_AGE_CATEGORY', 'CONT_DATE', 'CONT_DOY', 'CONT_TIME', 'FIRE_SIZE', 'FIRE_SIZE_CLASS', 'LATITUDE', 'LONGITUDE', 'OWNER_DESCR', 'STATE', 'COUNTY', 'FIPS_CODE', 'FIPS_NAME']


In [47]:
# plan to drop columns
dropped_columns = [
    "ICS_209_PLUS_INCIDENT_JOIN_ID", 
    "ICS_209_PLUS_COMPLEX_JOIN_ID", 
    "MTBS_ID", 
    "MTBS_FIRE_NAME", 
    "COMPLEX_NAME", 
    "LOCAL_FIRE_REPORT_ID",
    "FIRE_CODE", 
    "FIRE_NAME", 
    "LOCAL_INCIDENT_ID", 
    "NWCG_CAUSE_AGE_CATEGORY", 
    "CONT_DATE", 
    "CONT_DOY", 
    "CONT_TIME", 
    "FIPS_CODE", 
    "FIPS_NAME", 
    "SOURCE_SYSTEM_TYPE", 
    "SOURCE_SYSTEM", 
    "NWCG_REPORTING_AGENCY",
    "NWCG_REPORTING_UNIT_ID",
    "NWCG_REPORTING_UNIT_NAME",
    "SOURCE_REPORTING_UNIT",
    "SOURCE_REPORTING_UNIT_NAME",
    "OWNER_DESCR",
    "FPA_ID",
    ]

In [48]:
# Creating a new list of columns
# Columns in all_columns but not in dropped_columns

columns = list(set(all_columns) - set(dropped_columns))
print(columns)

['NWCG_CAUSE_CLASSIFICATION', 'DISCOVERY_TIME', 'LATITUDE', 'FIRE_SIZE_CLASS', 'NWCG_GENERAL_CAUSE', 'STATE', 'DISCOVERY_DOY', 'FIRE_YEAR', 'COUNTY', 'DISCOVERY_DATE', 'LONGITUDE', 'FIRE_SIZE', 'FOD_ID']


In [49]:
# using the subset of columns to make a new dataframe to use
main_dataframe = main[columns]
main_dataframe.head()

Unnamed: 0,NWCG_CAUSE_CLASSIFICATION,DISCOVERY_TIME,LATITUDE,FIRE_SIZE_CLASS,NWCG_GENERAL_CAUSE,STATE,DISCOVERY_DOY,FIRE_YEAR,COUNTY,DISCOVERY_DATE,LONGITUDE,FIRE_SIZE,FOD_ID
0,Human,1300.0,40.036944,A,Power generation/transmission/distribution,CA,33,2005,63,2/2/2005 0:00,-121.005833,0.1,1
1,Natural,845.0,38.933056,A,Natural,CA,133,2004,61,5/12/2004 0:00,-120.404444,0.25,2
2,Human,1921.0,38.984167,A,Debris and open burning,CA,152,2004,17,5/31/2004 0:00,-120.735556,0.1,3
3,Natural,1600.0,38.559167,A,Natural,CA,180,2004,3,6/28/2004 0:00,-119.913333,0.1,4
4,Natural,1600.0,38.559167,A,Natural,CA,180,2004,3,6/28/2004 0:00,-119.933056,0.1,5


In [50]:
# Checking for missing values

main_dataframe.isna().sum()

NWCG_CAUSE_CLASSIFICATION         1
DISCOVERY_TIME               754468
LATITUDE                          0
FIRE_SIZE_CLASS                   0
NWCG_GENERAL_CAUSE                0
STATE                             0
DISCOVERY_DOY                     0
FIRE_YEAR                         0
COUNTY                       657235
DISCOVERY_DATE                    0
LONGITUDE                         0
FIRE_SIZE                         0
FOD_ID                            0
dtype: int64

In [51]:
# Using the coordinates to LATITUDE and LONGITUDE to find the county

import reverse_geocoder as rg

df = main_dataframe.copy()

# empty out all data in "county" field
df["COUNTY"] = None

# list all missing counties
missing_counties = df[df["COUNTY"].isna()].copy()

# get coordinates for all missing counties
coordinates = list(zip(missing_counties.LATITUDE, missing_counties.LONGITUDE))

# get data for missing counties
counties = rg.search(coordinates)

# get county names from counties dictionary
county_names =  [county["admin2"] for county in counties]

# check first 3 missing counties 
print(f"County Name: {county_names[:3]}")

# replace all missing counties with their corresponding counties
df.loc[df["COUNTY"].isna(), "COUNTY"] = county_names

# I have decided to make all current county codes into None values then fill them with their respective county names.

County Name: ['Plumas County', 'El Dorado County', 'Placer County']


In [52]:
# Have decided to scrap this for the time being, will be converting all counties in to their respective name then encoding them.
# will convert county name to county code
# https://www2.census.gov/geo/docs/reference/codes2020/national_cousub2020.txt

# state_data = pd.read_csv("../data/unprocessed/national_cousub2020.txt", delimiter="|", low_memory=False)

# # Create a mapping DataFrame
# county_code_map = state_data[['COUNTYNAME', 'COUNTYFP']].drop_duplicates()

# # Merge main_dataframe with county_code_map
# main_dataframe = pd.merge(main_dataframe, county_code_map, left_on='COUNTY', right_on='COUNTYNAME', how='left')

# # Rename the column as needed and drop the extra column
# main_dataframe.rename(columns={'COUNTYFP': 'COUNTY_CODE'}, inplace=True)
# main_dataframe.drop('COUNTYNAME', axis=1, inplace=True)

# Now main_dataframe should have a new column 'COUNTY_CODE' with the corresponding county codes

In [53]:
df.head()

Unnamed: 0,NWCG_CAUSE_CLASSIFICATION,DISCOVERY_TIME,LATITUDE,FIRE_SIZE_CLASS,NWCG_GENERAL_CAUSE,STATE,DISCOVERY_DOY,FIRE_YEAR,COUNTY,DISCOVERY_DATE,LONGITUDE,FIRE_SIZE,FOD_ID
0,Human,1300.0,40.036944,A,Power generation/transmission/distribution,CA,33,2005,Plumas County,2/2/2005 0:00,-121.005833,0.1,1
1,Natural,845.0,38.933056,A,Natural,CA,133,2004,El Dorado County,5/12/2004 0:00,-120.404444,0.25,2
2,Human,1921.0,38.984167,A,Debris and open burning,CA,152,2004,Placer County,5/31/2004 0:00,-120.735556,0.1,3
3,Natural,1600.0,38.559167,A,Natural,CA,180,2004,Douglas County,6/28/2004 0:00,-119.913333,0.1,4
4,Natural,1600.0,38.559167,A,Natural,CA,180,2004,El Dorado County,6/28/2004 0:00,-119.933056,0.1,5


In [54]:
# splitting up Discovery date
# We already have the year, therefore we only need the month and day from this field.

# First we convert this column into datetime datatype
df["DISCOVERY_DATE"] = pd.to_datetime(df["DISCOVERY_DATE"])

# Now we extract the Month and Day
df["FIRE_MONTH"] = df["DISCOVERY_DATE"].dt.month
df["FIRE_DAY"] = df["DISCOVERY_DATE"].dt.day

# Now we verify change
df.head()

Unnamed: 0,NWCG_CAUSE_CLASSIFICATION,DISCOVERY_TIME,LATITUDE,FIRE_SIZE_CLASS,NWCG_GENERAL_CAUSE,STATE,DISCOVERY_DOY,FIRE_YEAR,COUNTY,DISCOVERY_DATE,LONGITUDE,FIRE_SIZE,FOD_ID,FIRE_MONTH,FIRE_DAY
0,Human,1300.0,40.036944,A,Power generation/transmission/distribution,CA,33,2005,Plumas County,2005-02-02,-121.005833,0.1,1,2,2
1,Natural,845.0,38.933056,A,Natural,CA,133,2004,El Dorado County,2004-05-12,-120.404444,0.25,2,5,12
2,Human,1921.0,38.984167,A,Debris and open burning,CA,152,2004,Placer County,2004-05-31,-120.735556,0.1,3,5,31
3,Natural,1600.0,38.559167,A,Natural,CA,180,2004,Douglas County,2004-06-28,-119.913333,0.1,4,6,28
4,Natural,1600.0,38.559167,A,Natural,CA,180,2004,El Dorado County,2004-06-28,-119.933056,0.1,5,6,28


In [55]:
# Since we split up discovery date we no longer need the field, so we drop it.
df = df.drop(columns=["DISCOVERY_DATE"], axis=1)

# verify change
df.head()

Unnamed: 0,NWCG_CAUSE_CLASSIFICATION,DISCOVERY_TIME,LATITUDE,FIRE_SIZE_CLASS,NWCG_GENERAL_CAUSE,STATE,DISCOVERY_DOY,FIRE_YEAR,COUNTY,LONGITUDE,FIRE_SIZE,FOD_ID,FIRE_MONTH,FIRE_DAY
0,Human,1300.0,40.036944,A,Power generation/transmission/distribution,CA,33,2005,Plumas County,-121.005833,0.1,1,2,2
1,Natural,845.0,38.933056,A,Natural,CA,133,2004,El Dorado County,-120.404444,0.25,2,5,12
2,Human,1921.0,38.984167,A,Debris and open burning,CA,152,2004,Placer County,-120.735556,0.1,3,5,31
3,Natural,1600.0,38.559167,A,Natural,CA,180,2004,Douglas County,-119.913333,0.1,4,6,28
4,Natural,1600.0,38.559167,A,Natural,CA,180,2004,El Dorado County,-119.933056,0.1,5,6,28


In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2166753 entries, 0 to 2166752
Data columns (total 14 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   NWCG_CAUSE_CLASSIFICATION  object 
 1   DISCOVERY_TIME             float64
 2   LATITUDE                   float64
 3   FIRE_SIZE_CLASS            object 
 4   NWCG_GENERAL_CAUSE         object 
 5   STATE                      object 
 6   DISCOVERY_DOY              int64  
 7   FIRE_YEAR                  int64  
 8   COUNTY                     object 
 9   LONGITUDE                  float64
 10  FIRE_SIZE                  float64
 11  FOD_ID                     int64  
 12  FIRE_MONTH                 int32  
 13  FIRE_DAY                   int32  
dtypes: float64(4), int32(2), int64(3), object(5)
memory usage: 214.9+ MB


In [57]:
df.isna().sum()

NWCG_CAUSE_CLASSIFICATION         1
DISCOVERY_TIME               754468
LATITUDE                          0
FIRE_SIZE_CLASS                   0
NWCG_GENERAL_CAUSE                0
STATE                             0
DISCOVERY_DOY                     0
FIRE_YEAR                         0
COUNTY                            0
LONGITUDE                         0
FIRE_SIZE                         0
FOD_ID                            0
FIRE_MONTH                        0
FIRE_DAY                          0
dtype: int64

In [58]:
# We will convert discovery time into time as cyclical showing closeness between 23:59 and 0:00
# Issue: Deal with NaN floats, aka missing values

# get time mean
time_mean = df["DISCOVERY_TIME"].mean()

# fill in missing values with the mean 
df.fillna({"DISCOVERY_TIME": time_mean}, inplace=True)

# A function to extract the hour and minutes from discovery time.
def extract_time(time):
    hour = int(time // 100)
    minute = int(time % 100)
    return hour, minute

df["FIRE_HOUR"], df["FIRE_MINUTE"] = zip(*df["DISCOVERY_TIME"].apply(extract_time))

df["FIRE_HOUR_SIN"] = np.sin(2 * np.pi * df["FIRE_HOUR"]/24)
df["FIRE_HOUR_COS"] = np.cos(2 * np.pi * df["FIRE_HOUR"]/24)
df["FIRE_MINUTE_SIN"] = np.sin(2 * np.pi * df["FIRE_MINUTE"]/60)
df["FIRE_MINUTE_COS"] = np.cos(2 * np.pi * df["FIRE_MINUTE"]/60)


In [59]:
# verify time change
df.head()

Unnamed: 0,NWCG_CAUSE_CLASSIFICATION,DISCOVERY_TIME,LATITUDE,FIRE_SIZE_CLASS,NWCG_GENERAL_CAUSE,STATE,DISCOVERY_DOY,FIRE_YEAR,COUNTY,LONGITUDE,FIRE_SIZE,FOD_ID,FIRE_MONTH,FIRE_DAY,FIRE_HOUR,FIRE_MINUTE,FIRE_HOUR_SIN,FIRE_HOUR_COS,FIRE_MINUTE_SIN,FIRE_MINUTE_COS
0,Human,1300.0,40.036944,A,Power generation/transmission/distribution,CA,33,2005,Plumas County,-121.005833,0.1,1,2,2,13,0,-0.258819,-0.965926,0.0,1.0
1,Natural,845.0,38.933056,A,Natural,CA,133,2004,El Dorado County,-120.404444,0.25,2,5,12,8,45,0.866025,-0.5,-1.0,-1.83697e-16
2,Human,1921.0,38.984167,A,Debris and open burning,CA,152,2004,Placer County,-120.735556,0.1,3,5,31,19,21,-0.965926,0.258819,0.809017,-0.5877853
3,Natural,1600.0,38.559167,A,Natural,CA,180,2004,Douglas County,-119.913333,0.1,4,6,28,16,0,-0.866025,-0.5,0.0,1.0
4,Natural,1600.0,38.559167,A,Natural,CA,180,2004,El Dorado County,-119.933056,0.1,5,6,28,16,0,-0.866025,-0.5,0.0,1.0


In [60]:
# Now we can drop the field discovery_time

df = df.drop(columns=["DISCOVERY_TIME"], axis=1)

In [61]:
df.isna().sum()

NWCG_CAUSE_CLASSIFICATION    1
LATITUDE                     0
FIRE_SIZE_CLASS              0
NWCG_GENERAL_CAUSE           0
STATE                        0
DISCOVERY_DOY                0
FIRE_YEAR                    0
COUNTY                       0
LONGITUDE                    0
FIRE_SIZE                    0
FOD_ID                       0
FIRE_MONTH                   0
FIRE_DAY                     0
FIRE_HOUR                    0
FIRE_MINUTE                  0
FIRE_HOUR_SIN                0
FIRE_HOUR_COS                0
FIRE_MINUTE_SIN              0
FIRE_MINUTE_COS              0
dtype: int64

In [62]:
# We will drop the record where their is a missing value in the following field, then check for any more missing values.
df = df.dropna(subset=["NWCG_CAUSE_CLASSIFICATION"])
df.isna().sum()

NWCG_CAUSE_CLASSIFICATION    0
LATITUDE                     0
FIRE_SIZE_CLASS              0
NWCG_GENERAL_CAUSE           0
STATE                        0
DISCOVERY_DOY                0
FIRE_YEAR                    0
COUNTY                       0
LONGITUDE                    0
FIRE_SIZE                    0
FOD_ID                       0
FIRE_MONTH                   0
FIRE_DAY                     0
FIRE_HOUR                    0
FIRE_MINUTE                  0
FIRE_HOUR_SIN                0
FIRE_HOUR_COS                0
FIRE_MINUTE_SIN              0
FIRE_MINUTE_COS              0
dtype: int64

In [63]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2166752 entries, 0 to 2166752
Data columns (total 19 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   NWCG_CAUSE_CLASSIFICATION  object 
 1   LATITUDE                   float64
 2   FIRE_SIZE_CLASS            object 
 3   NWCG_GENERAL_CAUSE         object 
 4   STATE                      object 
 5   DISCOVERY_DOY              int64  
 6   FIRE_YEAR                  int64  
 7   COUNTY                     object 
 8   LONGITUDE                  float64
 9   FIRE_SIZE                  float64
 10  FOD_ID                     int64  
 11  FIRE_MONTH                 int32  
 12  FIRE_DAY                   int32  
 13  FIRE_HOUR                  int64  
 14  FIRE_MINUTE                int64  
 15  FIRE_HOUR_SIN              float64
 16  FIRE_HOUR_COS              float64
 17  FIRE_MINUTE_SIN            float64
 18  FIRE_MINUTE_COS            float64
dtypes: float64(7), int32(2), int64(5), object(5)
me

In [64]:
# TODO: Convert all objects into numerical values.

First we will begin by looking at how the data looks like for each column that has object data type.

In [65]:
df["NWCG_CAUSE_CLASSIFICATION"].unique()

array(['Human', 'Natural', 'Missing data/not specified/undetermined'],
      dtype=object)

In [66]:
df["FIRE_SIZE_CLASS"].unique()

array(['A', 'B', 'G', 'C', 'D', 'F', 'E'], dtype=object)

In [67]:
df["STATE"].unique()

array(['CA', 'NM', 'OR', 'NC', 'WY', 'CO', 'WA', 'MT', 'UT', 'AZ', 'SD',
       'AR', 'NV', 'ID', 'MN', 'TX', 'FL', 'SC', 'LA', 'OK', 'KS', 'MO',
       'NE', 'MI', 'KY', 'OH', 'IN', 'VA', 'IL', 'TN', 'GA', 'AK', 'ND',
       'WV', 'WI', 'AL', 'NH', 'PA', 'MS', 'ME', 'VT', 'NY', 'IA', 'DC',
       'MD', 'CT', 'MA', 'NJ', 'HI', 'DE', 'PR', 'RI'], dtype=object)

In [68]:
df["NWCG_GENERAL_CAUSE"].unique()

array(['Power generation/transmission/distribution', 'Natural',
       'Debris and open burning',
       'Missing data/not specified/undetermined',
       'Recreation and ceremony', 'Equipment and vehicle use',
       'Arson/incendiarism', 'Fireworks', 'Other causes',
       'Railroad operations and maintenance', 'Smoking',
       'Misuse of fire by a minor', 'Firearms and explosives use'],
      dtype=object)

In [69]:
df["COUNTY"].unique()

array(['Plumas County', 'El Dorado County', 'Placer County', ...,
       'Kenosha County', 'Pepin County', 'Moultrie County'], dtype=object)

Let the encoding begin, I will be merging the column "STATE" and "COUNTY" because their may be some counties that have the same name across different states. 

After that I will use One hot encoder on the following columns, "NWCG_CAUSE_CLASSIFICATION", "NWCG_GENERAL_CAUSE", and "STATE_COUNTY".

Since their is no hierarchy in these three columns one hot encoding will be a good choice. 

Then I will apply ordinal encoding to the column "FIRE_SIZE_CLASS" because I would like to show an order between A - G

In [70]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 2166752 entries, 0 to 2166752
Data columns (total 19 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   NWCG_CAUSE_CLASSIFICATION  object 
 1   LATITUDE                   float64
 2   FIRE_SIZE_CLASS            object 
 3   NWCG_GENERAL_CAUSE         object 
 4   STATE                      object 
 5   DISCOVERY_DOY              int64  
 6   FIRE_YEAR                  int64  
 7   COUNTY                     object 
 8   LONGITUDE                  float64
 9   FIRE_SIZE                  float64
 10  FOD_ID                     int64  
 11  FIRE_MONTH                 int32  
 12  FIRE_DAY                   int32  
 13  FIRE_HOUR                  int64  
 14  FIRE_MINUTE                int64  
 15  FIRE_HOUR_SIN              float64
 16  FIRE_HOUR_COS              float64
 17  FIRE_MINUTE_SIN            float64
 18  FIRE_MINUTE_COS            float64
dtypes: float64(7), int32(2), int64(5), object(5)
me

In [71]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# concatenate state and county
df["STATE_COUNTY"] = df["STATE"] + "-" + df["COUNTY"]

# selecting columns that we will encode for onehotencoding
columns_to_encode_oh = ["NWCG_CAUSE_CLASSIFICATION", "NWCG_GENERAL_CAUSE", "STATE_COUNTY"]

# Instatiate OneHotEncoder 
oh_encoder = OneHotEncoder()

# Fit and transform 
oh_encoded = oh_encoder.fit_transform(df[columns_to_encode_oh])


In [72]:
# now encoding fire_size_class with ordinal
ordinal_encoder = OrdinalEncoder(categories=[["A", "B", "C", "D", "E", "F", "G"]])


In [73]:

df["FIRE_SIZE_CLASS_ENCODED"] = ordinal_encoder.fit_transform(df[["FIRE_SIZE_CLASS"]]) 

In [74]:
# creating a df with the encoded data 
oh_encoded_df = pd.DataFrame.sparse.from_spmatrix(oh_encoded, columns=oh_encoder.get_feature_names_out(columns_to_encode_oh))

In [75]:
# verifying that the columns after the one hot encoding are or are not being duplicated

duplicates_oh_df = oh_encoded_df.columns[oh_encoded_df.columns.duplicated()]
print(f"one hot encoding duplicate columns: {duplicates_oh_df}")

one hot encoding duplicate columns: Index([], dtype='object')


In [76]:
# Later on we can revert the dataset into a dense format for further data analysis here is the code:
# df_dense = df_encoded.sparse.to_dense()

In [77]:
df = df.drop(columns=columns_to_encode_oh + ["FIRE_SIZE_CLASS"])

final_df = pd.concat([df, oh_encoded_df, df[["FIRE_SIZE_CLASS_ENCODED"]]], axis=1)

In [78]:
final_df = final_df.drop(columns=["STATE", "COUNTY"])

will save file as a parquet, make sure to have pyarrow and fastparquet as your dependencies already

In [79]:
# TODO: save as csv's 

troubleshooting duplicates when attempting to save dataframe as csv/parquet

In [81]:
# check for duplicates in the final dataframe
duplicates_final = final_df.columns[final_df.columns.duplicated()]
print(f"final dataframe duplicate columns: {duplicates_final}")

# ensure unique column names by renaming duplicates
if len(duplicates_final) > 0:
    for col in duplicates_final.unique():
        col_indices = [i for i, x in enumerate(final_df.columns) if x == col]
        for j, idx in enumerate(col_indices):
            if j > 0: # skip the first occurrence 
                final_df.columns.values[idx] = f"{col}_{j+1}"


# verify that there are no more duplicates after renaming
duplicates_final_after = final_df.columns[final_df.columns.duplicated()]
print(f"final dataframe duplicate columns after renaming: {duplicates_final_after}")

final dataframe duplicate columns: Index(['FIRE_SIZE_CLASS_ENCODED'], dtype='object')
final dataframe duplicate columns after renaming: Index([], dtype='object')


In [None]:
# # Renaming or dropping duplicate columns
# for col, count in duplicates.items():
#     if count > 1:
#         for i in range(count):
#             duplicate_col_name = f"{col}_{i+1}"
#             final_df.columns = [duplicate_col_name if col == name else name for name in final_df.columns]

# # Verify that there are no duplicates
# column_counts = Counter(final_df.columns)
# duplicates = {col: count for col, count in column_counts.items() if count > 1}
# print(f"Duplicate column names and counts after renaming: {duplicates}")

In [None]:
# # verifying that the initial dataframe does not have any duplicate column names
# initial_columns = main_dataframe.columns
# duplicates_initial = initial_columns[initial_columns.duplicated()]
# print(f"Initial duplicate columns: {duplicates_initial}") 

In [None]:
# # ensure that the encoded columns are unique before merging 

# oh_encoded_df = pd.DataFrame.sparse.from_spmatrix(oh_encoded, oh_encoded_columns) 

# duplicates_oh_df = oh_encoded_df.columns[oh_encoded_df.columns.duplicated()]
# print(f"one hot encoding dataframe duplicate columns: {duplicates_oh_df}")

In [None]:
# # Drop original columns that were encoded
# df = df.drop(columns=columns_to_encode_oh + ["FIRE_SIZE_CLASS"])

# # Combine DataFrames
# final_df = pd.concat([df, oh_encoded_df, df[["FIRE_SIZE_CLASS_ENCODED"]]], axis=1)

# # Check for duplicates in the final DataFrame
# duplicates_final = final_df.columns[final_df.columns.duplicated()]
# print(f"Final DataFrame duplicate columns: {duplicates_final}")

# # Ensure unique column names by renaming duplicates
# if len(duplicates_final) > 0:
#     for col in duplicates_final.unique():
#         col_indices = [i for i, x in enumerate(final_df.columns) if x == col]
#         for j, idx in enumerate(col_indices):
#             if j > 0:  # Skip the first occurrence
#                 final_df.columns.values[idx] = f"{col}_{j+1}"

# # Verify that there are no duplicates after renaming
# duplicates_final_after = final_df.columns[final_df.columns.duplicated()]
# print(f"Final DataFrame duplicate columns after renaming: {duplicates_final_after}")


In [None]:
# Save the final DataFrame to a CSV file
final_df.to_parquet('../data/processed/processed_wildfire_data.parquet')


In [84]:
final_df.to_csv("../data/processed/processed_wildfire_data.csv")

KeyboardInterrupt: 

In [None]:
# TODO: Save Encoders


from joblib import dump

# Save the onehotencoder
dump(oh_encoder, 'onehotencoder.joblib')

# save the ordinalencoder 
dump(ordinal_encoder, 'ordinalencoder.joblib')