In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder,MinMaxScaler, StandardScaler

import tensorflow as tf

In [None]:
# One seed shared by every random generator seeded in this notebook.
SEED = 123

# Seed TensorFlow's global random generator
tf.random.set_seed(SEED)

# Seed NumPy's legacy global random generator
# (this does not seed Python's built-in `random` module)
np.random.seed(SEED)

In [None]:
# Mount Google Drive at /content/drive; the dataset is read from there below.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:

# Location of the all-county CSV on the mounted Drive.
data_csv_path = '/content/drive/My Drive/time_series/CA_data_lat_log_weekly.csv'
# Later cells read the 'fips' and 'score' columns of this frame.
data_all_county = pd.read_csv(data_csv_path)


Define helper functions

In [None]:
def ts_multi_data_prep(dataset, target, start, end, window, step_out):
    """Cut a series into (history window, future window) training pairs.

    For every anchor index ``i`` in ``range(start + window, end)`` the input
    sample is ``dataset[i - window : i]`` and the label is
    ``target[i : i + step_out]``.

    Args:
        dataset: Indexable array of inputs (rows are time steps).
        target: Indexable array of targets aligned with ``dataset``.
        start: Offset of the first row that may appear in a history window.
        end: Exclusive upper bound for the anchor index; ``None`` means
            ``len(dataset) - step_out``.
        window: Number of past rows in each input sample.
        step_out: Number of future rows in each label.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays built from the collected samples.
    """
    first_anchor = start + window
    # NOTE(review): with this default the last anchor is
    # len(dataset) - step_out - 1, so the final complete target window is
    # never emitted - confirm whether that is intended.
    last_anchor = len(dataset) - step_out if end is None else end
    anchors = range(first_anchor, last_anchor)

    history = [dataset[range(i - window, i)] for i in anchors]
    future = [target[range(i, i + step_out)] for i in anchors]
    return np.array(history), np.array(future)

In [None]:
def timeseries_evaluation_metrics_func(y_true, y_pred):
    """Print MSE, MAE, RMSE and R2 for multi-step forecasts.

    Both inputs are flattened first, so every (sample, horizon step) pair is
    scored as one observation.

    Args:
        y_true: Array-like of observed values, any shape.
        y_pred: Array-like of predicted values with the same number of
            elements as ``y_true``.

    Returns:
        None. The metrics are printed to stdout.
    """
    print('Evaluation metric results:-')
    # np.asarray accepts lists / nested lists as well as ndarrays; flatten
    # once instead of once per metric.
    y_true_flat = np.asarray(y_true).ravel()
    y_pred_flat = np.asarray(y_pred).ravel()

    mse = metrics.mean_squared_error(y_true_flat, y_pred_flat)
    mae = metrics.mean_absolute_error(y_true_flat, y_pred_flat)
    rmse = np.sqrt(mse)
    # MAPE is not reported.
    # NOTE(review): presumably because y_true can contain zeros, which makes
    # the percentage error undefined - confirm.
    r2 = metrics.r2_score(y_true_flat, y_pred_flat)
    print(f'MSE is : {mse}')
    print(f'MAE is : {mae}')
    print(f'RMSE is : {rmse}')
    print(f'R2 is : {r2}\n')

In [None]:
def timeseries_evaluation_metrics_binary(y_true, y_pred):
    """Print accuracy and macro-averaged precision / recall / F1.

    Args:
        y_true: ndarray of true class labels (flattened before scoring).
        y_pred: ndarray of predicted class labels (flattened before scoring).

    Returns:
        None. The metrics are printed to stdout.
    """
    print('Evaluation metric results:-')
    labels_true = y_true.flatten()
    labels_pred = y_pred.flatten()

    accuracy = accuracy_score(labels_true, labels_pred)
    # Precision, recall and F1 are averaged over the classes with equal weight.
    precision, recall, f1 = [
        scorer(labels_true, labels_pred, average='macro')
        for scorer in (precision_score, recall_score, f1_score)
    ]

    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1-score: {f1}\n')

In [None]:
def single_evaluation_metrics_func(y_true, y_pred):
    """Print MSE, MAE, RMSE and R2 for a pair of target / prediction arrays.

    Unlike ``timeseries_evaluation_metrics_func`` the inputs are passed to
    scikit-learn as given, without flattening.

    Args:
        y_true: Observed values, in a form accepted by ``sklearn.metrics``.
        y_pred: Predicted values, same shape as ``y_true``.

    Returns:
        None. The metrics are printed to stdout.
    """
    print('Evaluation metric results:-')
    # Compute MSE once and derive RMSE from it.
    mse = metrics.mean_squared_error(y_true, y_pred)
    print(f'MSE is : {mse}')
    print(f'MAE is : {metrics.mean_absolute_error(y_true, y_pred)}')
    print(f'RMSE is : {np.sqrt(mse)}')
    # MAPE is not reported.
    # NOTE(review): presumably because y_true can contain zeros - confirm.
    print(f'R2 is : {metrics.r2_score(y_true, y_pred)}',end='\n\n')

In [None]:
def process_county_level_func(data_county, step_out, window=None):
    """Split, scale and window the 'score' series of a single county.

    The rows are split sequentially 70% / 10% / 20% into train / validation /
    test. Scaling parameters are learned from the training rows only and then
    applied unchanged to the validation and test rows, so all three splits
    share one scale and no validation/test statistics enter the preprocessing.

    Args:
        data_county: DataFrame with a 'score' column, rows in time order.
        step_out: Number of future steps in each target window.
        window: Number of past steps in each input window. Defaults to the
            notebook-level ``hist_window``.

    Returns:
        Tuple ``(x_train_c, y_train_c, x_vali_c, y_vali_c, x_test_c,
        y_test_c)`` of windowed arrays produced by ``ts_multi_data_prep``.
    """
    if window is None:
        window = hist_window

    # Input and target are the same 'score' column.
    X_data = data_county[['score']]
    Y_data = data_county[['score']]

    # train / validation / test split: 70% - 10% - 20%
    n = len(X_data)
    train_end = int(n * 0.7)
    vali_end = int(n * 0.8)

    # Fit on the training rows only ...
    X_scaler = MinMaxScaler()
    Y_scaler = MinMaxScaler()
    x_train_data = X_scaler.fit_transform(X_data[0:train_end])
    y_train_data = Y_scaler.fit_transform(Y_data[0:train_end])
    # ... and reuse the fitted parameters for the held-out rows.
    x_vali_data = X_scaler.transform(X_data[train_end:vali_end])
    y_vali_data = Y_scaler.transform(Y_data[train_end:vali_end])
    x_test_data = X_scaler.transform(X_data[vali_end:])
    y_test_data = Y_scaler.transform(Y_data[vali_end:])

    x_train_c, y_train_c = ts_multi_data_prep(x_train_data, y_train_data, 0, None, window, step_out)
    x_vali_c, y_vali_c = ts_multi_data_prep(x_vali_data, y_vali_data, 0, None, window, step_out)
    x_test_c, y_test_c = ts_multi_data_prep(x_test_data, y_test_data, 0, None, window, step_out)
    return x_train_c, y_train_c, x_vali_c, y_vali_c, x_test_c, y_test_c

In [None]:
hist_window = 30  # number of past steps in each input window
step_out = 12     # number of future steps in each target window
unique_fips = data_all_county['fips'].unique()
unique_fips_count = data_all_county['fips'].nunique()

# Each county appends its chunk to the per-split lists; everything is
# concatenated once after the loop. This works for any county order (no
# special-cased first FIPS code) and avoids re-copying the growing arrays on
# every iteration.
x_train_parts, y_train_parts = [], []
x_vali_parts, y_vali_parts = [], []
x_test_parts, y_test_parts = [], []

for fips in unique_fips:
    # Extract dataframe for the current FIPS value
    data_county = data_all_county[data_all_county['fips'] == fips]

    # Input and target are the same 'score' column.
    X_data = data_county[['score']]
    Y_data = data_county[['score']]

    # train / validation / test split: 70% - 10% - 20%, in row order
    n = len(X_data)
    train_end = int(n * 0.7)
    vali_end = int(n * 0.8)

    x_train_county = X_data[0:train_end]
    y_train_county = Y_data[0:train_end]
    x_vali_county = X_data[train_end:vali_end]
    y_vali_county = Y_data[train_end:vali_end]
    x_test_county = X_data[vali_end:]
    y_test_county = Y_data[vali_end:]

    x_train_parts.append(x_train_county)
    y_train_parts.append(y_train_county)
    x_vali_parts.append(x_vali_county)
    y_vali_parts.append(y_vali_county)
    x_test_parts.append(x_test_county)
    y_test_parts.append(y_test_county)

# Stack the counties one after another along the row axis; the results are
# 2-D NumPy arrays with a single 'score' column.
x_train_c = np.concatenate(x_train_parts, axis=0)
y_train_c = np.concatenate(y_train_parts, axis=0)
x_vali_c = np.concatenate(x_vali_parts, axis=0)
y_vali_c = np.concatenate(y_vali_parts, axis=0)
x_test_c = np.concatenate(x_test_parts, axis=0)
y_test_c = np.concatenate(y_test_parts, axis=0)


In [None]:
# Sanity check: (total training rows stacked over all counties, 1 column)
x_train_c.shape

(44486, 1)

In [None]:
# Sanity check: (total validation rows stacked over all counties, 1 column)
x_vali_c.shape

(6322, 1)

In [None]:
# Learn the min-max scaling parameters from the training split only.
X_scaler_train = MinMaxScaler()
Y_scaler_train = MinMaxScaler()
x_train_data = X_scaler_train.fit_transform(x_train_c)
y_train_data = Y_scaler_train.fit_transform(y_train_c)

# Validation and test reuse the training scalers so that all three splits
# share one scale and no held-out statistics enter the preprocessing. The
# per-split names are kept as aliases so later cells that call e.g.
# Y_scaler_test.inverse_transform(...) keep working and map back to the
# original units.
X_scaler_vali = X_scaler_test = X_scaler_train
Y_scaler_vali = Y_scaler_test = Y_scaler_train

x_vali_data = X_scaler_vali.transform(x_vali_c)
y_vali_data = Y_scaler_vali.transform(y_vali_c)
x_test_data = X_scaler_test.transform(x_test_c)
y_test_data = Y_scaler_test.transform(y_test_c)

In [None]:
# Scaling must not change the shape of the training array
x_train_data.shape

(44486, 1)

In [None]:
def transform_county_data(x_data_array, y_data_array, n_counties=None):
    """Window a stacked multi-county array county by county.

    The inputs are expected to hold the counties one after another along
    axis 0, each county contributing the same number of rows. They are cut
    into ``n_counties`` equal chunks and every chunk is windowed separately
    with ``ts_multi_data_prep`` (using the notebook-level ``hist_window`` and
    ``step_out``), so no window straddles two counties.

    Args:
        x_data_array: Stacked input rows for all counties.
        y_data_array: Stacked target rows, aligned with ``x_data_array``.
        n_counties: Number of equally sized county chunks. Defaults to the
            notebook-level ``unique_fips_count``.

    Returns:
        Tuple ``(x_all_county, y_all_county)`` with the windows of all
        counties stacked along axis 0.

    Raises:
        ValueError: If the inputs differ in length, ``n_counties`` is not
            positive, or the rows cannot be divided evenly between counties.
    """
    if n_counties is None:
        n_counties = unique_fips_count

    n_rows = len(x_data_array)
    if len(y_data_array) != n_rows:
        raise ValueError(
            f'x and y must have the same number of rows, got {n_rows} and {len(y_data_array)}'
        )
    if n_counties <= 0:
        raise ValueError(f'n_counties must be positive, got {n_counties}')
    # np.array_split would silently produce uneven chunks here, which would
    # misplace the county boundaries and mix counties inside one window.
    if n_rows % n_counties != 0:
        raise ValueError(
            f'{n_rows} rows cannot be split evenly into {n_counties} counties'
        )

    # Divide the arrays into one subarray per county
    x_subarrays = np.array_split(x_data_array, n_counties, axis=0)
    y_subarrays = np.array_split(y_data_array, n_counties, axis=0)

    x_c_list = []
    y_c_list = []
    for x_subarray, y_subarray in zip(x_subarrays, y_subarrays):
        x_window_c, y_window_c = ts_multi_data_prep(x_subarray, y_subarray, 0, None, hist_window, step_out)
        x_c_list.append(x_window_c)
        y_c_list.append(y_window_c)

    # Stack the per-county windows into single arrays
    x_all_county = np.vstack(x_c_list)
    y_all_county = np.vstack(y_c_list)

    return x_all_county, y_all_county

In [None]:
# Turn each scaled split into (history window, target window) samples,
# windowing every county separately.
x_train, y_train = transform_county_data(x_train_data, y_train_data)
x_vali, y_vali = transform_county_data(x_vali_data, y_vali_data)
x_test, y_test = transform_county_data(x_test_data, y_test_data)

In [None]:
# Inspect the first training sample: its history window and its target window.
print ('Multiple window of past history\n')
print(x_train[0])
print ('\n Target\n')
print (y_train[0])

Multiple window of past history

[[0.12158]
 [0.10072]
 [0.     ]
 [0.     ]
 [0.     ]
 [0.     ]
 [0.     ]
 [0.     ]
 [0.     ]
 [0.     ]
 [0.     ]
 [0.     ]
 [0.     ]
 [0.     ]
 [0.     ]
 [0.     ]
 [0.     ]
 [0.     ]
 [0.     ]
 [0.     ]
 [0.     ]
 [0.     ]
 [0.     ]
 [0.     ]
 [0.     ]
 [0.     ]
 [0.     ]
 [0.     ]
 [0.     ]
 [0.     ]]

 Target

[[0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]]


In [None]:
batch_size = 32
buffer_size = 256

# Training pipeline: cache the samples, shuffle within a buffer of
# `buffer_size` elements, batch, and repeat indefinitely.
train_data = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .cache()
    .shuffle(buffer_size)
    .batch(batch_size)
    .repeat()
)

# Validation pipeline: batched in the original order and repeated indefinitely.
val_data = (
    tf.data.Dataset.from_tensor_slices((x_vali, y_vali))
    .batch(batch_size)
    .repeat()
)

In [None]:
class MultiStepLastBaseline(tf.keras.Model):
    """Naive multi-step baseline without trainable weights.

    Every future step is predicted as the mean of the last ``step_out`` time
    steps (axis 1) of the input window, repeated ``step_out`` times.
    """

    def __init__(self, step_out):
        """Store the forecast horizon ``step_out``."""
        super().__init__()
        self.step_out = step_out

    def call(self, inputs):
        """Return the mean of the trailing steps, tiled along axis 1."""
        horizon = self.step_out
        recent_steps = inputs[:, -horizon:, :]
        recent_mean = tf.reduce_mean(recent_steps, axis=1, keepdims=True)
        return tf.tile(recent_mean, [1, horizon, 1])


persistence_baseline = MultiStepLastBaseline(step_out)
# Compiled with MSE loss and an MAE metric.
persistence_baseline.compile(
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.metrics.MeanAbsoluteError()],
)

In [None]:
# Sanity check: (number of test windows, hist_window, 1 feature)
x_test.shape

(10324, 30, 1)

In [None]:
y_test_pred = persistence_baseline.predict(x_test)
# Drop the trailing feature axis: (samples, step_out, 1) -> (samples, step_out)
predicted_reshaped = np.squeeze(y_test_pred, axis=2)
# The scaler was fitted on a single column, so hand it a single column and
# restore the (samples, step_out) layout afterwards instead of relying on
# broadcasting across step_out columns.
y_test_pred_Inverse = Y_scaler_test.inverse_transform(
    predicted_reshaped.reshape(-1, 1)
).reshape(predicted_reshaped.shape)



In [None]:
# Round the de-scaled predictions to whole numbers (np.round rounds halves to even)
y_test_pred_Inverse_ordinal = np.round(y_test_pred_Inverse).astype(int)

In [None]:
# Drop the trailing size-1 axis of the test targets
y_test_reshaped = np.squeeze(y_test, axis=-1)

In [None]:
# The scaler was fitted on a single column, so hand it a single column and
# restore the original layout afterwards instead of relying on broadcasting
# across the step_out columns.
y_test_Inverse = Y_scaler_test.inverse_transform(
    y_test_reshaped.reshape(-1, 1)
).reshape(y_test_reshaped.shape)
# Whole-number version of the de-scaled targets (np.round rounds halves to even)
y_test_Inverse_ordinal = np.round(y_test_Inverse).astype(int)

In [None]:
# Binarise the de-scaled scores: 1 when the value is >= threshold, else 0.
# NOTE(review): the meaning of the 2.5 cut-off is not documented here - confirm.
threshold = 2.5
y_test_Inverse_binary = np.where(y_test_Inverse >= threshold, 1, 0)
y_test_pred_Inverse_binary = np.where(y_test_pred_Inverse >= threshold, 1, 0)

In [None]:
# Regression metrics on the de-scaled (continuous) targets and predictions
timeseries_evaluation_metrics_func(y_test_Inverse,y_test_pred_Inverse)

Evaluation metric results:-
MSE is : 0.4997515431507381
MAE is : 0.4344043537573488
RMSE is : 0.7069310738330422
R2 is : 0.5368332606976898



In [None]:
# Same regression metrics after rounding both sides to whole numbers
timeseries_evaluation_metrics_func(y_test_Inverse_ordinal,y_test_pred_Inverse_ordinal)

Evaluation metric results:-
MSE is : 0.632288841534289
MAE is : 0.4456040940204055
RMSE is : 0.7951659207576045
R2 is : 0.44989716049744677



In [None]:
# Classification metrics (macro-averaged) on the thresholded labels
timeseries_evaluation_metrics_binary(y_test_Inverse_binary,y_test_pred_Inverse_binary)

Evaluation metric results:-
Accuracy: 0.9413018209996126
Precision: 0.9019797306627104
Recall: 0.7620969318189846
F1-score: 0.8131934194341599



In [None]:
# Compute the per-class classification report on the thresholded labels
classification_metrics = classification_report(y_test_Inverse_binary.flatten(),y_test_pred_Inverse_binary.flatten())
# Alternative (disabled): report on the rounded ordinal labels instead
#classification_metrics = classification_report(y_test_Inverse_ordinal.flatten(),y_test_pred_Inverse_ordinal.flatten())
# Print classification report
print(classification_metrics)

              precision    recall  f1-score   support

           0       0.95      0.99      0.97    110778
           1       0.86      0.53      0.66     13110

    accuracy                           0.94    123888
   macro avg       0.90      0.76      0.81    123888
weighted avg       0.94      0.94      0.94    123888

