In [40]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
import pickle
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.utils import resample

# Load in all the Data

In [41]:
# Read the JSON file that maps logical names to the file paths used by this notebook.
# The encoding is given explicitly so decoding does not depend on the platform default.
with open('ei_intermediate_file_paths.json', encoding='utf-8') as file_path_file:
    file_paths = json.load(file_path_file)

# Direct indexing (rather than dict.get) makes a missing key fail right here with a
# KeyError naming the key, instead of silently producing None and failing later in
# an unrelated call.

# Path passed to pd.read_csv for the historical smoke-impact table.
SI_HISTORICAL_INPUT_PATH = file_paths["historical_si_per_year_csv"]

# Path passed to pd.read_csv for the merged NOAA/BLS table.
# NOTE(review): this key ends in ".csv" while its siblings do not - confirm it matches
# the key actually written to ei_intermediate_file_paths.json.
NOAA_BLS_MERGED_INPUT_PATH = file_paths["stage2_fs_merged_output.csv"]

# Presumably where the fitted model is written; it is not used in the cells shown here.
EI_MODEL_OUTPUT_PATH = file_paths["ei_model_output"]

In [42]:
# Read the two input CSV files into DataFrames: the historical smoke-impact
# table first, then the merged NOAA/BLS table.
si_historical_df = pd.read_csv(filepath_or_buffer=SI_HISTORICAL_INPUT_PATH)
noaa_bls_df = pd.read_csv(filepath_or_buffer=NOAA_BLS_MERGED_INPUT_PATH)

## Review the Smoke Impact Data

In [43]:
# Preview the first five rows of the smoke-impact table.
print(si_historical_df.head(n=5))

   year  fire_year  total_amortized_smoke_impact  total_fire_duration  \
0  1961        NaN                           NaN                  NaN   
1  1962        NaN                           NaN                  NaN   
2  1963        NaN                           NaN                  NaN   
3  1964     1964.0                  7.432192e-07            10.002545   
4  1965     1965.0                  1.181915e-06            30.003913   

   total_acres_burned  avg_daily_smoke_impact  scaled_avg_daily_smoke_impact  
0                 NaN                     NaN                            NaN  
1                 NaN                     NaN                            NaN  
2                 NaN                     NaN                            NaN  
3           28.468121            7.430301e-08                        0.74303  
4           43.781494            3.939202e-08                        0.39392  


In [44]:
# Gather the structural facts about the smoke-impact table up front:
# column labels, per-column dtypes, and the row count.
columns = si_historical_df.columns
data_types = si_historical_df.dtypes
num_rows = si_historical_df.shape[0]

# Report them in a fixed order: names, dtypes, then size.
print("Column names:\n", columns)
print("Data types of columns:\n", data_types)
print("Number of rows:", num_rows)

Column names:
 Index(['year', 'fire_year', 'total_amortized_smoke_impact',
       'total_fire_duration', 'total_acres_burned', 'avg_daily_smoke_impact',
       'scaled_avg_daily_smoke_impact'],
      dtype='object')
Data types of columns:
 year                               int64
fire_year                        float64
total_amortized_smoke_impact     float64
total_fire_duration              float64
total_acres_burned               float64
avg_daily_smoke_impact           float64
scaled_avg_daily_smoke_impact    float64
dtype: object
Number of rows: 61


## Review the BLS and NOAA Data

In [45]:
# Preview the first five rows of the merged NOAA/BLS table.
print(noaa_bls_df.head(n=5))

         date    bls_date  bls_curr_sae_hrs  bls_curr_laus_labor_force  \
0  2009-09-01  2009-09-01              36.8                   421301.0   
1  2009-10-01  2009-10-01              36.7                   422913.0   
2  2010-05-01  2010-05-01              38.0                   415895.0   
3  2010-06-01  2010-06-01              37.9                   421034.0   
4  2010-07-01  2010-07-01              37.4                   423239.0   

   bls_curr_laus_unemployment_rate  bls_prev_yr_sae_hrs  \
0                              9.1                 37.4   
1                              8.8                 37.0   
2                              8.2                 37.4   
3                              8.7                 37.9   
4                              8.4                 37.1   

   bls_prev_yr_laus_labor_force  bls_prev_yr_laus_unemployment_rate  \
0                      427171.0                                 5.8   
1                      431740.0                           

In [46]:
# Gather the structural facts about the NOAA/BLS table up front:
# column labels, per-column dtypes, and the row count.
noaa_bls_columns = noaa_bls_df.columns
noaa_bls_data_types = noaa_bls_df.dtypes
noaa_bls_num_rows = noaa_bls_df.shape[0]

# Report them in a fixed order: names, dtypes, then size.
print("Column names:\n", noaa_bls_columns)
print("Data types of columns:\n", noaa_bls_data_types)
print("Number of rows:", noaa_bls_num_rows)

Column names:
 Index(['date', 'bls_date', 'bls_curr_sae_hrs', 'bls_curr_laus_labor_force',
       'bls_curr_laus_unemployment_rate', 'bls_prev_yr_sae_hrs',
       'bls_prev_yr_laus_labor_force', 'bls_prev_yr_laus_unemployment_rate',
       'bls_pct_diff_sae_hrs', 'bls_pct_diff_laus_labor_force',
       'bls_pct_diff_laus_unemployment_rate', 'noaa_name', 'noaa_date',
       'noaa_tavg', 'noaa_tmax', 'noaa_tmin', 'noaa_prcp'],
      dtype='object')
Data types of columns:
 date                                    object
bls_date                                object
bls_curr_sae_hrs                       float64
bls_curr_laus_labor_force              float64
bls_curr_laus_unemployment_rate        float64
bls_prev_yr_sae_hrs                    float64
bls_prev_yr_laus_labor_force           float64
bls_prev_yr_laus_unemployment_rate     float64
bls_pct_diff_sae_hrs                   float64
bls_pct_diff_laus_labor_force          float64
bls_pct_diff_laus_unemployment_rate    float64
noaa_nam

## Merge the Data

In [47]:
# Convert the NOAA/BLS 'date' column to datetime values so the .dt accessor
# can be used on it.
noaa_bls_df['date'] = noaa_bls_df['date'].pipe(pd.to_datetime)

# Derive a calendar-year column from each date. It acts as the join key against
# the smoke-impact table, whose rows are keyed by 'year'; joining on it is what
# repeats a year's 'scaled_avg_daily_smoke_impact' value across that year's rows.
noaa_bls_df['year'] = noaa_bls_df['date'].dt.year

In [48]:
# Attach each year's scaled smoke-impact value to every NOAA/BLS row of that year.
# how='left' keeps every NOAA/BLS row; rows whose year has no match in the
# smoke-impact table get NaN for 'scaled_avg_daily_smoke_impact'.
# validate='many_to_one' makes pandas raise MergeError if 'year' is duplicated in
# the smoke-impact table, which would otherwise silently duplicate NOAA/BLS rows.
merged_df = pd.merge(
    noaa_bls_df[[
        'date',
        'year',
        'noaa_tavg',
        'noaa_prcp',
        'bls_pct_diff_sae_hrs',
        'bls_pct_diff_laus_labor_force',
        'bls_pct_diff_laus_unemployment_rate',
    ]],
    si_historical_df[['year', 'scaled_avg_daily_smoke_impact']],
    on='year',
    how='left',
    validate='many_to_one',
)

# 'year' was only needed as the join key; 'date' already carries that information.
merged_df = merged_df.drop(columns=['year'])



In [49]:
# Preview the first 18 rows of the merged table.
print(merged_df.head(n=18))

         date  noaa_tavg  noaa_prcp  bls_pct_diff_sae_hrs  \
0  2009-09-01       75.7       8.60             -1.604278   
1  2009-10-01       60.7      10.55             -0.810811   
2  2010-05-01       74.2       9.26              1.604278   
3  2010-06-01       84.6       0.31              0.000000   
4  2010-07-01       84.9       6.30              0.808625   
5  2010-08-01       86.6       2.32             -0.534759   
6  2010-09-01       79.2       0.15              0.000000   
7  2010-10-01       66.1       2.04              0.544959   
8  2011-05-01       70.2       7.12             -1.578947   
9  2011-06-01       83.7       3.56             -1.583113   
10 2011-07-01       86.0       2.95             -1.604278   
11 2011-08-01       83.9       3.08             -0.268817   
12 2011-09-01       73.3       2.56              0.543478   
13 2011-10-01       62.7       1.28              0.813008   
14 2012-05-01       76.3       3.18             -2.673797   
15 2012-06-01       80.2