<a href="https://colab.research.google.com/github/davidsherman96/mids_capstone_spring24_chang_li_sherman/blob/main/Hypertuned_State_wide_CA_drought_random_forest_multivariate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder,MinMaxScaler, StandardScaler
import csv
import tensorflow as tf

from xgboost import XGBRegressor
import datetime
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor

In [3]:
# Set random seed for TensorFlow
tf.random.set_seed(123)

# Set random seed for Python
np.random.seed(123)

In [4]:
tf.keras.utils.set_random_seed(1)

tf.config.experimental.enable_op_determinism()

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
data_all_county = pd.read_csv('/content/drive/MyDrive/CA_data_lat_log_weekly.csv')

In [7]:
data_all_county['date'] = pd.to_datetime(data_all_county['date'])
data_all_county['month'] = data_all_county['date'].dt.month
data_all_county['month'] = data_all_county['month'].astype('category')

Define helper functions

In [8]:
def ts_multi_data_prep(dataset, target, start, end, window, step_out):
    X = []
    y = []
    start = start + window
    if end is None:
        end = len(dataset) - step_out
        #end = len(dataset)
    for i in range(start, end):
        indices = range(i-window, i)
        X.append(dataset[indices])

        indicey = range(i, i+step_out) #revise the window definition
        y.append(target[indicey])
    return np.array(X), np.array(y)

In [9]:
def timeseries_evaluation_metrics_func(y_true, y_pred):
    def mean_absolute_percentage_error(y_true, y_pred):
        y_true, y_pred = np.array(y_true), np.array(y_pred)
        return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

    print('Evaluation metric results:-')
    mse = metrics.mean_squared_error(y_true.flatten(), y_pred.flatten())
    mae = metrics.mean_absolute_error(y_true.flatten(), y_pred.flatten())
    rmse = np.sqrt(mse)
    r2 = metrics.r2_score(y_true.flatten(), y_pred.flatten())
    return mse, mae

In [10]:
def timeseries_evaluation_metrics_binary(y_true, y_pred):
    print('Evaluation metric results:-')
    accuracy = accuracy_score(y_true.flatten(), y_pred.flatten())
    precision = precision_score(y_true.flatten(), y_pred.flatten(), average='macro')
    recall = recall_score(y_true.flatten(), y_pred.flatten(), average='macro')
    f1 = f1_score(y_true.flatten(), y_pred.flatten(), average='macro')
    return f1

In [11]:
def transform_county_data(x_data_array, y_data_array):
    # Lists to store x_train_c and y_train_c arrays
    x_c_list = []
    y_c_list = []
    
    # Divide the arrays into 'unique_fips_count' number of subarrays
    x_subarrays = np.array_split(x_data_array, unique_fips_count, axis=0)
    y_subarrays = np.array_split(y_data_array, unique_fips_count, axis=0)

    # Combine x_subarrays and y_subarrays into tuples
    data_tuples = [(x_subarray, y_subarray) for x_subarray, y_subarray in zip(x_subarrays, y_subarrays)]

    # Print or use the data tuples as needed
    for idx, data_tuple in enumerate(data_tuples):
        x_window_c, y_window_c = ts_multi_data_prep(data_tuple[0],data_tuple[1], 0, None, hist_window, step_out)
        # Append x_window_c and y_window_c arrays to lists
        x_c_list.append(x_window_c)
        y_c_list.append(y_window_c)

    # Stack arrays in lists to create x_train_c and y_train_c
    x_all_county = np.vstack(x_c_list)
    y_all_county = np.vstack(y_c_list)

    return x_all_county, y_all_county

In [12]:
x_train_c, y_train_c, x_vali_c, y_vali_c, x_test_c, y_test_c = [], [], [], [], [], []
unique_fips = data_all_county['fips'].unique()
unique_fips_count = data_all_county['fips'].nunique()

for fips in unique_fips:
    # Extract dataframe for the current FIPS value
    data_county = data_all_county[data_all_county['fips'] == fips]

    X_data = data_county[['lat','lon','PRECTOT', 'PS', 'QV2M', 'T2M', 'T2MDEW', 'T2MWET',
       'T2M_MAX', 'T2M_MIN', 'T2M_RANGE', 'TS', 'WS10M', 'WS10M_MAX',
       'WS10M_MIN', 'WS10M_RANGE', 'WS50M', 'WS50M_MAX', 'WS50M_MIN',
       'WS50M_RANGE', 'score', 'month']]
   
    Y_data = data_county[['score']]
    #train_val_test split 70%-10%-20%
    n = len(X_data)

    x_train_county = X_data[0:int(n*0.7)]
    y_train_county = Y_data[0:int(n*0.7)]
    x_vali_county = X_data[int(n*0.7):int(n*0.8)]
    y_vali_county = Y_data[int(n*0.7):int(n*0.8)]
    x_test_county = X_data[int(n*0.8):]
    y_test_county = Y_data[int(n*0.8):]


    if fips == 6001:
        x_train_c, y_train_c, x_vali_c, y_vali_c, x_test_c, y_test_c = x_train_county, y_train_county, x_vali_county, y_vali_county, x_test_county, y_test_county

    else:
        x_train_c = np.concatenate((x_train_c, x_train_county), axis=0)
        y_train_c = np.concatenate((y_train_c, y_train_county), axis=0)
        x_vali_c = np.concatenate((x_vali_c, x_vali_county), axis=0)
        y_vali_c = np.concatenate((y_vali_c, y_vali_county), axis=0)
        x_test_c = np.concatenate((x_test_c, x_test_county), axis=0)
        y_test_c = np.concatenate((y_test_c, y_test_county), axis=0)

In [13]:
X_scaler_train = MinMaxScaler()
Y_scaler_train = MinMaxScaler()
X_scaler_test = MinMaxScaler()
Y_scaler_test = MinMaxScaler()
X_scaler_vali = MinMaxScaler()
Y_scaler_vali = MinMaxScaler()
x_train_data = X_scaler_train.fit_transform(x_train_c)
y_train_data = Y_scaler_train.fit_transform(y_train_c)
x_vali_data = X_scaler_vali.fit_transform(x_vali_c)
y_vali_data = Y_scaler_vali.fit_transform(y_vali_c)
x_test_data = X_scaler_test.fit_transform(x_test_c)
y_test_data = Y_scaler_test.fit_transform(y_test_c)

In [14]:
def modeling1(hist_window,
              step_out,
              num_estimators,
              max_depth,
              random_state,
              flag_report):

  x_train, y_train = transform_county_data(x_train_data, y_train_data)
  x_vali, y_vali = transform_county_data(x_vali_data, y_vali_data)
  x_test, y_test = transform_county_data(x_test_data, y_test_data)

  # Convert all the 3D arrays to 2D for Random Forest
  train_len = len(x_train)
  num_features = X_data.shape[1]
  vali_len = len(x_vali)
  test_len = len(x_test)

  # Reshape the labels into a simple 2D array
  y_train = y_train.reshape(train_len, step_out)
  y_vali = y_vali.reshape(vali_len, step_out)
  y_test = y_test.reshape(test_len, step_out)

  # Reshape the x data into a 2D array of (num windows, window size x features size)
  x_train = x_train.reshape(train_len, hist_window * num_features)
  x_vali = x_vali.reshape(vali_len, hist_window * num_features)
  x_test = x_test.reshape(test_len, hist_window * num_features)

  # fit the model
  model = RandomForestRegressor(n_estimators=num_estimators,
                                max_depth = max_depth,
                                random_state = random_state)
  model.fit(x_train, y_train)

  # Get predictions
  y_test_pred = model.predict(x_test)

  # Convert predictions back onto real scale
  y_test_pred_Inverse = Y_scaler_test.inverse_transform(y_test_pred)
  y_test_pred_Inverse_ordinal = np.round(y_test_pred_Inverse).astype(int)
  y_test_Inverse = Y_scaler_test.inverse_transform(y_test)
  y_test_Inverse_ordinal = np.round(y_test_Inverse).astype(int)

  # Get evaluation mtrics
  mse, mae = timeseries_evaluation_metrics_func(y_test_Inverse,y_test_pred_Inverse)

  # Get the binary classification evaluation metrics
  threshold = 2.5
  y_test_Inverse_binary = np.where(y_test_Inverse >= threshold, 1, 0)
  y_test_pred_Inverse_binary = np.where(y_test_pred_Inverse >= threshold, 1, 0)
  f1 = timeseries_evaluation_metrics_binary(y_test_Inverse_binary,y_test_pred_Inverse_binary)

  # Print the classification report
  if flag_report:
    classification_metrics = classification_report(y_test_Inverse_binary.flatten(),y_test_pred_Inverse_binary.flatten())
    print(classification_metrics)

  print(f'Number of estimators: {num_estimators}, Max depth: {max_depth}, Random state: {random_state}, F1: {f1}, MSE: {mse}, MAE: {mae}')

  return f1, mse, mae

In [15]:
parameter_result_list1 = []
hist_window = 30
step_out = 12

for num_estimators in [100]:
    for max_depth in [1,2,3]:
        for random_state in [123, 0, 42]:
              f1, mse, mae = modeling1(hist_window,
                             step_out,
                             num_estimators,
                             max_depth,
                             random_state,
                             flag_report = 1)
              parameter_result_list1.append((hist_window,
                                   step_out,
                                   num_estimators,
                                   max_depth,
                                   random_state,
                                   f1, mse, mae))

# Printing the list with comment lines indicating parameter titles
print("# Hist_Window   Step_Out   Num Estimators  Max Depth   Random State    F1    MSE       MAE")
for params in parameter_result_list1:
    print("{:<13} {:<10} {:<7} {:<7} {:<10} {:<10.4f} {:<10.4f} {:<10.4f}".format(*params))

Evaluation metric results:-
Evaluation metric results:-
              precision    recall  f1-score   support

           0       0.94      0.91      0.93    110778
           1       0.42      0.53      0.47     13110

    accuracy                           0.87    123888
   macro avg       0.68      0.72      0.70    123888
weighted avg       0.89      0.87      0.88    123888

Number of estimators: 100, Max depth: 1, Random state: 123, F1: 0.69737861711576, MSE: 0.5000430613975385, MAE: 0.5570079307478641
Evaluation metric results:-
Evaluation metric results:-
              precision    recall  f1-score   support

           0       0.94      0.91      0.93    110778
           1       0.42      0.53      0.47     13110

    accuracy                           0.87    123888
   macro avg       0.68      0.72      0.70    123888
weighted avg       0.89      0.87      0.88    123888

Number of estimators: 100, Max depth: 1, Random state: 0, F1: 0.6967895133968602, MSE: 0.50133095577341

In [16]:
# Saving the parameter_result_list to a CSV file
with open('/content/drive/MyDrive/random_forest_parameter_result_list1.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(parameter_result_list1)

In [18]:
# Loading the parameter_result_list from the CSV file
parameter_result_list = []

with open('/content/drive/MyDrive/random_forest_parameter_result_list1.csv', 'r', newline='') as file:
    reader = csv.reader(file)
    for row in reader:
        parameter_result_list.append(row)

In [19]:
# Convert all elements in the list to float
my_list_float = [[float(val) if '.' in val else int(val) for val in sublist] for sublist in parameter_result_list]

# Sort the list based on the f1 values
sorted_list = sorted(my_list_float, key=lambda x: x[6], reverse=True)

# Print the sorted list
for sublist in sorted_list:
    print(sublist)

[30, 12, 100, 1, 0, 0.6967895133968602, 0.5013309557734139, 0.5574087713504433]
[30, 12, 100, 1, 42, 0.6979901398661895, 0.5009611808618245, 0.5573822320798181]
[30, 12, 100, 1, 123, 0.69737861711576, 0.5000430613975385, 0.5570079307478641]
[30, 12, 100, 2, 123, 0.8216815960139914, 0.35097786998834224, 0.39086939248287017]
[30, 12, 100, 2, 0, 0.8266774355448667, 0.35095299855523365, 0.3905100660130562]
[30, 12, 100, 2, 42, 0.8265636454616385, 0.35016811900199174, 0.3897036304786059]
[30, 12, 100, 3, 123, 0.8207513058812219, 0.33436819596557843, 0.38944364671414955]
[30, 12, 100, 3, 42, 0.8236663202363537, 0.33364480854979706, 0.38848487308692514]
[30, 12, 100, 3, 0, 0.8238038183239547, 0.3334061969633288, 0.3881192998174195]
