In [1]:
import numpy as np
import matplotlib.pyplot as plt 
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd 

def read_data(file_path:str) -> pd.DataFrame:
    data: pd.DataFrame = pd.read_csv(file_path)
    return data
  
def format_feature_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    # turn the Year-Month column into the index
    df = df.set_index('Year-Month')
    # convert the index to a datetime object
    df.index = pd.to_datetime(df.index)
    return df

def create_feature_and_target_arrays(df: pd.DataFrame, target_col: str) -> tuple[pd.DataFrame, np.ndarray]:
    feature_cols : list[str] = [col for col in df.columns if col != target_col]
    X : pd.DataFrame = df[feature_cols]
    y : np.ndarray = df[target_col].to_numpy()
    return X, y

def generate_padded_data(feature_df: pd.DataFrame, df_window_sizes: pd.DataFrame) -> np.ndarray:
    data_cols : list[str] = feature_df.columns.tolist()
    num_features : int = len(data_cols)
    window_sizes : dict[str, int] = df_window_sizes.set_index('feature')['window_size'].to_dict()
    max_window_size : int = max(window_sizes.values())
    num_samples : int = feature_df.shape[0] - max_window_size + 1; # how many feature vectors I will have as an np.ndarray
    padded_data : np.ndarray = np.zeros((max_window_size, num_features, num_samples))
    
    # Now I need to fill in padded data... For each row in the df_data dataframe, I need to 
    # fill the padded_data array with the appropriate feature vector size. For example, some of the 
    # features have a window size of 12, so I will take all 12 data points. In other features, the
    # window size is 1, so I will have 11 zeros and 1 datapoint at the end. 
    
    for i in range(num_samples):
        for j, feature in enumerate(data_cols):
            window_size = window_sizes[feature]
            # Fill in the last 'window_size' entries with actual data
            padded_data[max_window_size - window_size:, j, i] = feature_df[feature].iloc[i:i + window_size].to_numpy()
            # The rest are already zeros due to initialization
    
    
    return padded_data
    
  

    
df_fresno_agg = read_data("../../data/Fresno_Aggregate.csv")
best_window_vals = read_data("../../data/fresno_lstm_best_feature_window_results.csv")
df_fresno = format_feature_dataframe(df_fresno_agg)
print(" ---- Fresno Aggregate Data ---- ")
print(df_fresno)
print(" ---- Best Window Values ---- ")
print(best_window_vals)

[df_features, target_vec] = create_feature_and_target_arrays(df_fresno, target_col='VFRate')
print(" ---- Feature DataFrame ---- ")
print(df_features)
print(" ---- Target Vector ---- ")
print(target_vec)

padded_data = generate_padded_data(df_features, best_window_vals)
print(" ---- Padded Data Shape ---- ")
print(padded_data.shape)

print(" --- Padded Data Example --- ")
print(pd.DataFrame(padded_data[:,:,1]))  # Print the first sample's padded data
print(" --- Corresponding Target Value --- ")
print(target_vec[12])  # Print the target value corresponding to the first sample


 ---- Fresno Aggregate Data ---- 
              VFRate  FIRE_Acres_Burned  PRECIP  WIND_EventCount  WIND_AvgMPH  \
Year-Month                                                                      
2008-10-01  6.156349             163.91    0.18              0.0     3.667742   
2008-11-01  3.407979              17.30    1.49              0.0     3.106667   
2008-12-01  6.486154               0.00    1.19              0.0     3.306452   
2009-01-01  6.619800               1.00    1.25              0.0     2.587097   
2009-02-01  5.751629              28.00    2.33              0.0     3.928571   
...              ...                ...     ...              ...          ...   
2015-08-01  2.569621              28.06    0.00              0.0     4.900000   
2015-09-01  2.775191             210.76    0.06              0.0     4.046667   
2015-10-01  5.344812              16.99    0.41              0.0     3.503226   
2015-11-01  2.569621              30.46    2.13              0.0     3.1166

In [2]:
df_features.head(15)

Unnamed: 0_level_0,FIRE_Acres_Burned,PRECIP,WIND_EventCount,WIND_AvgMPH,WIND_RunMiles,AQI_PM25,AQI_PM10,EARTHQUAKE_Total,PESTICIDE_Total
Year-Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2008-10-01,163.91,0.18,0.0,3.667742,87.825806,70.0,53.0,0,23.056051
2008-11-01,17.3,1.49,0.0,3.106667,74.49,95.5,38.5,0,0.519323
2008-12-01,0.0,1.19,0.0,3.306452,79.312903,94.0,18.5,0,0.0
2009-01-01,1.0,1.25,0.0,2.587097,62.019355,102.0,33.0,0,24.6304
2009-02-01,28.0,2.33,0.0,3.928571,94.096429,51.5,13.0,0,71.542885
2009-03-01,15.51,0.32,1.0,4.680645,112.274193,42.0,22.0,0,698.577858
2009-04-01,38.05,0.59,2.0,5.55,133.32,46.5,19.0,0,3647.746472
2009-05-01,60.36,0.41,0.0,5.880645,141.096774,58.0,26.5,1,7683.718158
2009-06-01,123.7,0.48,0.0,5.753333,138.173333,49.0,28.5,0,1114.16389
2009-07-01,278.33,0.0,0.0,5.267742,126.554839,57.0,26.0,0,609.835147


In [3]:
pd.DataFrame(padded_data[:,:,1])  # Print the second sample's padded data

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,17.3,1.49,0.0,3.106667,0.0,0.0,38.5,0.0,0.519323
1,0.0,1.19,0.0,3.306452,0.0,0.0,18.5,0.0,0.0
2,1.0,1.25,0.0,2.587097,0.0,0.0,33.0,0.0,24.6304
3,28.0,2.33,0.0,3.928571,0.0,0.0,13.0,0.0,71.542885
4,15.51,0.32,1.0,4.680645,74.49,0.0,22.0,0.0,698.577858
5,38.05,0.59,2.0,5.55,79.312903,0.0,19.0,0.0,3647.746472
6,60.36,0.41,0.0,5.880645,62.019355,0.0,26.5,1.0,7683.718158
7,123.7,0.48,0.0,5.753333,94.096429,0.0,28.5,0.0,1114.16389
8,278.33,0.0,0.0,5.267742,112.274193,0.0,26.0,0.0,609.835147
9,35.02,0.0,0.0,4.574194,133.32,0.0,50.0,0.0,659.81802


In [4]:
best_window_vals

Unnamed: 0,feature,window_size,rmse
0,AQI_PM10,12,1.399237
1,AQI_PM25,1,2.715623
2,All Features,12,0.883979
3,EARTHQUAKE_Total,12,2.231402
4,FIRE_Acres_Burned,12,1.393601
5,PESTICIDE_Total,12,2.192513
6,PRECIP,12,0.986096
7,WIND_AvgMPH,12,2.188072
8,WIND_EventCount,12,2.692434
9,WIND_RunMiles,8,2.494878


In [5]:
df_features.tail(15)

Unnamed: 0_level_0,FIRE_Acres_Burned,PRECIP,WIND_EventCount,WIND_AvgMPH,WIND_RunMiles,AQI_PM25,AQI_PM10,EARTHQUAKE_Total,PESTICIDE_Total
Year-Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2014-10-01,13.55,0.42,0.0,3.432258,82.503226,60.0,50.0,0,245.054086
2014-11-01,0.2,0.92,0.0,2.813333,67.693333,79.5,34.0,0,350.874392
2014-12-01,2.1,2.93,0.0,3.335484,80.209677,66.0,19.0,0,72.433633
2015-01-01,2.0,0.31,0.0,2.641935,63.367742,99.0,35.0,0,128.900946
2015-02-01,0.11,1.34,0.0,3.389286,81.257143,76.5,29.0,1,1217.116397
2015-03-01,6.51,0.11,0.0,3.8,91.309677,53.0,23.0,0,3882.368247
2015-04-01,16.91,1.28,0.0,4.943333,118.633333,52.0,24.0,0,5476.335944
2015-05-01,148.38,0.83,0.0,5.341935,128.106452,52.0,22.0,0,2969.363052
2015-06-01,228.38,0.0,0.0,4.91,117.893333,57.0,31.0,0,2572.316711
2015-07-01,421.69,0.19,0.0,5.441935,130.783871,57.0,29.0,0,1820.939465


In [6]:
pd.DataFrame(padded_data[:,:,-1])  # Print the first sample's padded data

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,2.0,0.31,0.0,2.641935,0.0,0.0,35.0,0.0,128.900946
1,0.11,1.34,0.0,3.389286,0.0,0.0,29.0,1.0,1217.116397
2,6.51,0.11,0.0,3.8,0.0,0.0,23.0,0.0,3882.368247
3,16.91,1.28,0.0,4.943333,0.0,0.0,24.0,0.0,5476.335944
4,148.38,0.83,0.0,5.341935,63.367742,0.0,22.0,0.0,2969.363052
5,228.38,0.0,0.0,4.91,81.257143,0.0,31.0,0.0,2572.316711
6,421.69,0.19,0.0,5.441935,91.309677,0.0,29.0,0.0,1820.939465
7,28.06,0.0,0.0,4.9,118.633333,0.0,44.0,0.0,737.718551
8,210.76,0.06,0.0,4.046667,128.106452,0.0,52.0,0.0,544.970947
9,16.99,0.41,0.0,3.503226,117.893333,0.0,38.0,0.0,181.244457


In [7]:
len(target_vec)

87