In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder,MinMaxScaler, StandardScaler
import csv
import tensorflow as tf

In [2]:
# Set random seed for TensorFlow
tf.random.set_seed(123)

# Set random seed for Python
np.random.seed(123)


In [3]:
tf.keras.utils.set_random_seed(1)

tf.config.experimental.enable_op_determinism()

In [4]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [5]:

data_all_county = pd.read_csv('/content/drive/My Drive/time_series/CA_data_lat_log_weekly.csv')


In [6]:
data_all_county['date'] = pd.to_datetime(data_all_county['date'])
data_all_county['month'] = data_all_county['date'].dt.month
data_all_county['month'] = data_all_county['month'].astype('category')

Define helper functions

In [7]:
def ts_multi_data_prep(dataset, target, start, end, window, step_out):
    X = []
    y = []
    start = start + window
    if end is None:
        end = len(dataset) - step_out
        #end = len(dataset)
    for i in range(start, end):
        indices = range(i-window, i)
        X.append(dataset[indices])

        indicey = range(i, i+step_out) #revise the window definition
        y.append(target[indicey])
    return np.array(X), np.array(y)

In [8]:
def timeseries_evaluation_metrics_func(y_true, y_pred):
    def mean_absolute_percentage_error(y_true, y_pred):
        y_true, y_pred = np.array(y_true), np.array(y_pred)
        return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

    print('Evaluation metric results:-')
    mse = metrics.mean_squared_error(y_true.flatten(), y_pred.flatten())
    mae = metrics.mean_absolute_error(y_true.flatten(), y_pred.flatten())
    rmse = np.sqrt(mse)
    #mape = mean_absolute_percentage_error(y_true.flatten(), y_pred.flatten())
    r2 = metrics.r2_score(y_true.flatten(), y_pred.flatten())
    #print(f'MSE is : {mse}')
    #print(f'MAE is : {mae}')
    #print(f'RMSE is : {rmse}')
    #print(f'MAPE is : {mape}')
    #print(f'R2 is : {r2}\n')
    return mse, mae

In [9]:
def timeseries_evaluation_metrics_binary(y_true, y_pred):
    print('Evaluation metric results:-')
    accuracy = accuracy_score(y_true.flatten(), y_pred.flatten())
    precision = precision_score(y_true.flatten(), y_pred.flatten(), average='macro')
    recall = recall_score(y_true.flatten(), y_pred.flatten(), average='macro')
    f1 = f1_score(y_true.flatten(), y_pred.flatten(), average='macro')

    #print(f'Accuracy: {accuracy}')
    #print(f'Precision: {precision}')
    #print(f'Recall: {recall}')
    #print(f'F1-score: {f1}\n')
    return f1

In [10]:
def transform_county_data(x_data_array, y_data_array):
    # Lists to store x_train_c and y_train_c arrays
    x_c_list = []
    y_c_list = []
    # Divide the arrays into 'unique_fips_count' number of subarrays
    x_subarrays = np.array_split(x_data_array, unique_fips_count, axis=0)
    y_subarrays = np.array_split(y_data_array, unique_fips_count, axis=0)

    # Combine x_subarrays and y_subarrays into tuples
    data_tuples = [(x_subarray, y_subarray) for x_subarray, y_subarray in zip(x_subarrays, y_subarrays)]

    # Print or use the data tuples as needed
    for idx, data_tuple in enumerate(data_tuples):
        x_window_c, y_window_c = ts_multi_data_prep(data_tuple[0],data_tuple[1], 0, None, hist_window, step_out)
        # Append x_window_c and y_window_c arrays to lists
        x_c_list.append(x_window_c)
        y_c_list.append(y_window_c)

    # Stack arrays in lists to create x_train_c and y_train_c
    x_all_county = np.vstack(x_c_list)
    y_all_county = np.vstack(y_c_list)

    return x_all_county, y_all_county

In [11]:

x_train_c, y_train_c, x_vali_c, y_vali_c, x_test_c, y_test_c = [], [], [], [], [], []
unique_fips = data_all_county['fips'].unique()
unique_fips_count = data_all_county['fips'].nunique()
for fips in unique_fips:
    # Extract dataframe for the current FIPS value
    data_county = data_all_county[data_all_county['fips'] == fips]

    X_data = data_county[['lat','lon','PRECTOT', 'PS', 'QV2M', 'T2M', 'T2MDEW', 'T2MWET',
       'T2M_MAX', 'T2M_MIN', 'T2M_RANGE', 'TS', 'WS10M', 'WS10M_MAX',
       'WS10M_MIN', 'WS10M_RANGE', 'WS50M', 'WS50M_MAX', 'WS50M_MIN',
       'WS50M_RANGE', 'score','month']]
    Y_data = data_county[['score']]
    #train_val_test split 70%-10%-20%
    n = len(X_data)

    x_train_county = X_data[0:int(n*0.7)]
    y_train_county = Y_data[0:int(n*0.7)]
    x_vali_county = X_data[int(n*0.7):int(n*0.8)]
    y_vali_county = Y_data[int(n*0.7):int(n*0.8)]
    x_test_county = X_data[int(n*0.8):]
    y_test_county = Y_data[int(n*0.8):]


    if fips == 6001:
        x_train_c, y_train_c, x_vali_c, y_vali_c, x_test_c, y_test_c = x_train_county, y_train_county, x_vali_county, y_vali_county, x_test_county, y_test_county

    else:
        x_train_c = np.concatenate((x_train_c, x_train_county), axis=0)
        y_train_c = np.concatenate((y_train_c, y_train_county), axis=0)
        x_vali_c = np.concatenate((x_vali_c, x_vali_county), axis=0)
        y_vali_c = np.concatenate((y_vali_c, y_vali_county), axis=0)
        x_test_c = np.concatenate((x_test_c, x_test_county), axis=0)
        y_test_c = np.concatenate((y_test_c, y_test_county), axis=0)


In [12]:
X_scaler_train = MinMaxScaler()
Y_scaler_train = MinMaxScaler()
X_scaler_test = MinMaxScaler()
Y_scaler_test = MinMaxScaler()
X_scaler_vali = MinMaxScaler()
Y_scaler_vali = MinMaxScaler()
x_train_data = X_scaler_train.fit_transform(x_train_c)
y_train_data = Y_scaler_train.fit_transform(y_train_c)
x_vali_data = X_scaler_vali.fit_transform(x_vali_c)
y_vali_data = Y_scaler_vali.fit_transform(y_vali_c)
x_test_data = X_scaler_test.fit_transform(x_test_c)
y_test_data = Y_scaler_test.fit_transform(y_test_c)


In [13]:
def modeling1(hist_window, step_out, filtersize, kernelsize, drop1, drop2, flag_report):

  x_train, y_train = transform_county_data(x_train_data, y_train_data)
  x_vali, y_vali = transform_county_data(x_vali_data, y_vali_data)
  x_test, y_test = transform_county_data(x_test_data, y_test_data)
  batch_size = 128
  buffer_size = 256

  train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train))
  train_data = train_data.cache().shuffle(buffer_size).batch(batch_size).repeat()

  val_data = tf.data.Dataset.from_tensor_slices((x_vali, y_vali))
  val_data = val_data.batch(batch_size).repeat()

  cnn_model = tf.keras.models.Sequential()
  cnn_model.add(tf.keras.layers.Conv1D(filters=filtersize, kernel_size=kernelsize, activation='relu', input_shape=(x_train.shape[1], x_train.shape[2])))
  cnn_model.add(tf.keras.layers.MaxPool1D(pool_size=2))
  cnn_model.add(tf.keras.layers.Dropout(drop1))
  cnn_model.add(tf.keras.layers.Flatten())
  cnn_model.add(tf.keras.layers.Dense(30, activation='relu'))
  cnn_model.add(tf.keras.layers.Dropout(drop2))
  cnn_model.add(tf.keras.layers.Dense(units=step_out))
  cnn_model.compile(optimizer='adam', loss='mae')

  early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=1, mode='min')
  history = cnn_model.fit(train_data, epochs=150, steps_per_epoch=50, validation_data=val_data, validation_steps=150, verbose=0, callbacks=[early_stopping])
  y_test_pred = cnn_model.predict(x_test)
  y_test_pred_Inverse = Y_scaler_test.inverse_transform(y_test_pred)
  y_test_pred_Inverse_ordinal = np.round(y_test_pred_Inverse).astype(int)
  y_test_reshaped = np.squeeze(y_test, axis=-1)
  y_test_Inverse = Y_scaler_test.inverse_transform(y_test_reshaped)
  y_test_Inverse_ordinal = np.round(y_test_Inverse).astype(int)
  mse, mae = timeseries_evaluation_metrics_func(y_test_Inverse,y_test_pred_Inverse)
  threshold = 2.5
  y_test_Inverse_binary = np.where(y_test_Inverse >= threshold, 1, 0)
  y_test_pred_Inverse_binary = np.where(y_test_pred_Inverse >= threshold, 1, 0)
  f1 = timeseries_evaluation_metrics_binary(y_test_Inverse_binary,y_test_pred_Inverse_binary)
  if flag_report:
    classification_metrics = classification_report(y_test_Inverse_binary.flatten(),y_test_pred_Inverse_binary.flatten())
    print(classification_metrics)

  print(f'filtersize: {filtersize}, kernelsize: {kernelsize}, Dropout1: {drop1}, Dropout2: {drop2}, F1: {f1}, MSE: {mse}, MAE: {mae}')
  return f1, mse, mae

In [14]:
parameter_result_list1 = []
hist_window = 30
step_out = 12

for filtersize in [10, 32, 64]:
    for kernelsize in [3, 4, 5]:
        for drop1 in [0.1]:
            for drop2 in [0.1]:
                f1, mse, mae = modeling1(hist_window, step_out, filtersize, kernelsize, drop1, drop2, 1)
                parameter_result_list1.append((hist_window, step_out, filtersize, kernelsize, drop1, drop2, f1, mse, mae))

# Printing the list with comment lines indicating parameter titles
print("# Hist_Window   Step_Out   filtersize, kernelsize,   Dropout1   Dropout2   F1    MSE       MAE")
for params in parameter_result_list1:
    print("{:<13} {:<10} {:<7} {:<7} {:<10} {:<10} {:<10.4f} {:<10.4f} {:<10.4f}".format(*params))


Epoch 32: early stopping
Evaluation metric results:-
Evaluation metric results:-
              precision    recall  f1-score   support

           0       0.95      1.00      0.97    110778
           1       0.94      0.56      0.70     13110

    accuracy                           0.95    123888
   macro avg       0.94      0.78      0.83    123888
weighted avg       0.95      0.95      0.94    123888

filtersize: 10, kernelsize: 3, Dropout1: 0.1, Dropout2: 0.1, F1: 0.834965685600569, MSE: 0.40222482923754554, MAE: 0.3790649861196106
Epoch 22: early stopping
Evaluation metric results:-
Evaluation metric results:-
              precision    recall  f1-score   support

           0       0.96      0.99      0.97    110778
           1       0.90      0.61      0.73     13110

    accuracy                           0.95    123888
   macro avg       0.93      0.80      0.85    123888
weighted avg       0.95      0.95      0.95    123888

filtersize: 10, kernelsize: 4, Dropout1: 0.1, Drop

In [15]:
# Saving the parameter_result_list to a CSV file
with open('/content/drive/My Drive/time_series/parameter_result_list1_cnn.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(parameter_result_list1)

In [16]:
# Loading the parameter_result_list from the CSV file
parameter_result_list = []

with open('/content/drive/My Drive/time_series/parameter_result_list1_cnn.csv', 'r', newline='') as file:
    reader = csv.reader(file)
    for row in reader:
        parameter_result_list.append(row)
#parameter_result_list

In [17]:
# Convert all elements in the list to float
my_list_float = [[float(val) if '.' in val else int(val) for val in sublist] for sublist in parameter_result_list]

# Sort the list based on the f1 values
sorted_list = sorted(my_list_float, key=lambda x: x[6], reverse=True)

# Print the sorted list
for sublist in sorted_list:
    print(sublist)

[30, 12, 10, 5, 0.1, 0.1, 0.8749162573997669, 0.36537235143499813, 0.38188375515207823]
[30, 12, 32, 5, 0.1, 0.1, 0.8694547014994938, 0.40949392276568347, 0.38840285834819727]
[30, 12, 64, 3, 0.1, 0.1, 0.8603357757596977, 0.3668929803686541, 0.36890938483406566]
[30, 12, 10, 4, 0.1, 0.1, 0.84911978094232, 0.4290468515046707, 0.4281373958403994]
[30, 12, 32, 4, 0.1, 0.1, 0.8426570759112698, 0.4241608177090393, 0.3972481453966835]
[30, 12, 10, 3, 0.1, 0.1, 0.834965685600569, 0.40222482923754554, 0.3790649861196106]
[30, 12, 64, 5, 0.1, 0.1, 0.8146858692406492, 0.39148586537831415, 0.3849315434792824]
[30, 12, 32, 3, 0.1, 0.1, 0.7940485800960773, 0.4493505441320324, 0.4168063310122966]
[30, 12, 64, 4, 0.1, 0.1, 0.7236089173551418, 0.46806839322742405, 0.4305305244081411]
