In [None]:
import os
# Runtime switches for TensorFlow, Ray and Ray Tune. They are set here, before the
# cells below import those libraries.
os.environ.update({
    'TF_CPP_MIN_LOG_LEVEL': '2',
    "RAY_IGNORE_UNHANDLED_ERRORS": "1",
    "TUNE_DISABLE_STRICT_METRIC_CHECKING": "1",
    "RAY_memory_monitor_refresh_ms": "0",  # do not kill raylet if low on memory
    "RAY_TASK_MAX_RETRIES": "2",
    "TUNE_PLACEMENT_GROUP_AUTO_DISABLED": "1",  # needed only when running locally
})

In [None]:
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Dropout, Flatten, Input, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from tensorflow.keras.regularizers import l1, l2, l1_l2

In [None]:
import ray
from ray import train, tune, air
from ray.tune.schedulers import AsyncHyperBandScheduler

In [None]:
import matplotlib.pyplot as plt
#%matplotlib notebook
%matplotlib inline

In [None]:
import mlflow
from mlflow.models import infer_signature
from ray.air.integrations.mlflow import setup_mlflow

In [None]:
import uuid

In [None]:
# Model family label: used as the Ray Tune run name and as the prefix of the
# MLflow experiment names and saved-model file names built in train_model().
modelTypeName = "LSTM_Bi_valueStdScaled"

In [None]:
# Column of `dataFrame` that is handed to prepare_dataset() as the series to model.
dataColumnName = 'valueStdScaled'

In [None]:
# MLflow tracking and registry endpoint used by train_model() and run_n_step_evaluation().
# NOTE(review): tasks executed on remote Ray workers resolve `localhost` to their own
# host -- confirm the MLflow server is reachable from there when not running locally.
MLFLOW_URI='http://localhost:8080/'

In [None]:
# Hyper-parameter search space for Ray Tune. Every entry is a categorical choice
# over the listed candidate values.
param_space = {
    name: tune.choice(candidates)
    for name, candidates in {
        "n_epochs": [200],                     # `epochs` passed to fit(); early stopping may end sooner
        "n_layers": [1, 3, 5, 10],             # number of stacked bidirectional LSTM layers
        "n_dense_layers": [1, 3, 5],           # dense layers, counting the single-unit output layer
        "learning_rate": [0.003, 0.001],       # Adam learning rate
        "activation": ['tanh', 'relu'],        # activation of the LSTM layers
        "n_neurons": [32, 64, 96, 128, 256],   # units per LSTM layer and per hidden dense layer
        "T": [5, 10, 15, 20],                  # length of the input window
        "dense_dp": [0.2, 0.3, 0.4],           # dropout rate after each hidden dense layer
    }.items()
}

In [None]:
%run '../lib/utils_anomaly_detection.ipynb'

In [None]:
import pandas as pd

In [None]:
# Load the two recorded CSV series, both indexed by the parsed `EventDateTime` column.
# `data` is the load-generator message series (with spikes), `dataLatency` the
# average-latency series for the same recording window (per the file names).
# NOTE(review): neither frame is referenced directly in this notebook; presumably
# prepareDataSet.ipynb (run in the next cell) derives `dataFrame` from them -- confirm.
data = pd.read_csv('../../data/rq_1-3_train_test/2024-01-15_10-51-45__2024-01-15_14-26-45_load-gen-msg-w-spikes-10s-rate.csv', 
                   index_col=['EventDateTime'], parse_dates=['EventDateTime'])
dataLatency = pd.read_csv('../../data/rq_1-3_train_test/2024-01-15_10-51-45__2024-01-15_14-26-45_load-gen-avg-latency-10s-rate.csv', 
                          index_col='EventDateTime', parse_dates=['EventDateTime'])

In [None]:
%run '../lib/prepareDataSet.ipynb'

In [None]:
from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error, mean_absolute_error
from keras.models import load_model, save_model

In [None]:
"""
 This function uses one tensor-formatted input window from the X_test dataset to predict ahead by a given
 number of steps. It then re-anchors on a true window from X_test (the one at the next index) before starting
 a new prediction cycle.
"""
def generate_nsteps_forecast(x_test, nn_model, pred_ahead):
    """Forecast recursively, re-anchoring on true data every ``pred_ahead`` steps.

    Each cycle starts from the true window ``x_test[index]``, where ``index`` is the
    number of forecasts produced so far. The model predicts one value, the window is
    shifted left by one position and the prediction is written into its last slot;
    this repeats up to ``pred_ahead`` times before the next cycle begins.

    Parameters
    ----------
    x_test : numpy.ndarray
        Input windows; the first axis enumerates the windows. Each window is
        reshaped to ``(1, -1, 1)`` before it is passed to the model. The array
        itself is never modified.
    nn_model : object
        Model exposing ``predict(x, verbose=0)``; the result is read at ``[0, 0]``.
    pred_ahead : int
        Number of consecutive self-fed predictions per cycle. Must be positive.

    Returns
    -------
    numpy.ndarray
        1-D array holding exactly ``x_test.shape[0]`` forecasts (empty when
        ``x_test`` has no windows).

    Raises
    ------
    ValueError
        If ``pred_ahead`` is not positive, because no forecast could ever be produced.
    """
    if pred_ahead <= 0:
        raise ValueError(f'pred_ahead must be a positive number of steps, got {pred_ahead}')

    max_len = x_test.shape[0]
    y_pred = []
    index = 0
    while index < max_len:
        # Re-anchor on the true window for the current position.
        last_x = x_test[index]
        sequence = 0
        # Stop as soon as max_len forecasts exist: further model calls would only
        # produce values that get discarded.
        while sequence < pred_ahead and index + sequence < max_len:
            x_crt_input = None
            try:
                x_crt_input = last_x.reshape(1, -1, 1)
                p_vector = nn_model.predict(x_crt_input, verbose=0)
                p = p_vector[0, 0]  # 1x1 array -> scalar
            except Exception as err:
                # Best effort: report the failure and keep the forecast aligned
                # with x_test by substituting 0 for this step.
                print(f'Prediction error for x={x_crt_input} at sequence={sequence} for start index={index} when pred_ahead={pred_ahead}')
                print(f'Model config was:{nn_model.get_config()}')
                print(f'Cause: {err!r}')
                p = 0

            # update the predictions list
            y_pred.append(p)

            # Build the next input: np.roll returns a new array, so writing the
            # prediction into it does not touch x_test.
            last_x = np.roll(last_x, -1)
            last_x[-1] = p

            sequence += 1

        index += sequence

    return np.array(y_pred)

In [None]:
def create_model(config):
    """Build the (uncompiled) stacked bidirectional-LSTM regressor for one trial.

    Keys read from ``config``: ``T`` (input window length), ``n_layers``,
    ``n_neurons``, ``activation``, ``n_dense_layers`` and ``dense_dp``.

    Returns the ``Sequential`` model, which ends in a single-unit ``Dense`` layer.
    """
    net = Sequential()
    net.add(Input(shape=(config["T"], 1)))

    # Every recurrent layer except the last returns the full sequence so the next
    # LSTM can consume it; at least one recurrent layer is always built.
    n_stacked = max(config["n_layers"] - 1, 0)
    for depth in range(n_stacked + 1):
        recurrent = LSTM(config["n_neurons"],
                         activation=config['activation'],
                         return_sequences=depth < n_stacked)
        net.add(Bidirectional(recurrent))

    # n_dense_layers counts the output layer, hence one fewer hidden Dense/Dropout pair.
    for _ in range(config["n_dense_layers"] - 1):
        net.add(Dense(config["n_neurons"], activation="relu"))
        net.add(Dropout(rate=config["dense_dp"]))

    net.add(Dense(1))
    return net

In [None]:
def _metric_or_fallback(compute, fallback):
    """Return ``compute()``, or ``fallback`` when it raises or evaluates to NaN."""
    try:
        value = compute()
    except Exception as err:
        print(f'Metric computation failed: {err!r}; using fallback value {fallback}')
        return fallback
    return fallback if np.isnan(value) else value


def train_model(config):
    """Ray Tune trainable: fit one model, score it, log it to MLflow and report to Tune.

    Steps:
      1. Build and compile the network from ``config`` (see ``create_model``).
      2. Fit on the windows produced by ``prepare_dataset`` with early stopping and
         best-only checkpointing on ``val_loss``.
      3. Reload the best checkpoint, compute 1-step predictions, error-based
         anomaly masks and regression metrics.
      4. Create a dedicated MLflow experiment/run; log two anomaly figures, the
         model (also registered under ``model_name``), the parameters and metrics.
      5. Report metrics plus the MLflow identifiers to Ray Tune.

    Parameters
    ----------
    config : dict
        Trial configuration. Keys read here: ``n_neurons``, ``n_epochs``,
        ``learning_rate``, ``T``, ``n_layers``, ``n_dense_layers``,
        ``activation`` and ``dense_dp``.

    Notes
    -----
    Relies on notebook globals: ``modelTypeName``, ``dataColumnName``,
    ``MLFLOW_URI``, ``dataFrame``, ``prepare_dataset`` and the ``calculate_*``
    anomaly helpers (presumably defined by the ``%run`` notebooks -- confirm).
    The test split doubles as the validation set for early stopping/checkpointing.
    """
    n_neurons = config['n_neurons']
    n_epochs = config['n_epochs']
    learning_rate = config['learning_rate']
    T = config['T']

    model_exp = f'{modelTypeName}'
    UUID = uuid.uuid4().hex

    mlflow_exp_name = f'{modelTypeName}-{UUID}-T_{T}-LY_{config["n_layers"]}-DLY_{config["n_dense_layers"]}-NN_{n_neurons}'

    # Function-local import, presumably so the module is resolved inside the Ray
    # worker process that executes the trial.
    import mlflow

    mlflow.set_tracking_uri(MLFLOW_URI)
    mlflow.set_registry_uri(MLFLOW_URI)

    nn_model = create_model(config)
    # `metrics` is given as a list: Keras 3 rejects a bare string here.
    nn_model.compile(loss='mse', metrics=['mse'], optimizer=Adam(learning_rate=learning_rate))

    X_train, Y_train, X_test, Y_test = prepare_dataset(dataFrame[dataColumnName], T)

    # The UUID keeps checkpoint files and registered model names unique per trial.
    model_name = modelTypeName + "-T_" + str(T) + "-LY_" + str(config['n_layers']) + "-DLY_" + str(config['n_dense_layers']) +\
                 "-NN_" + str(n_neurons) + "-LR_" + str(learning_rate) + "-epochs_" + str(n_epochs) +"-" + UUID + ".keras"
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=5)
    mc = ModelCheckpoint(model_name, monitor='val_loss', mode='min', save_best_only=True)

    nn_model.fit(X_train, Y_train,
                 epochs=n_epochs,
                 validation_data=(X_test, Y_test),
                 callbacks=[es, mc],
                 verbose=0)

    # Evaluate the best checkpoint; fall back to the in-memory (last-epoch) model
    # when the checkpoint cannot be read, and persist that one instead.
    try:
        saved_model = load_model(model_name)
    except Exception as err:
        print(f'Could not load checkpoint {model_name}: {err!r}; using the in-memory model')
        saved_model = nn_model
        save_model(nn_model, model_name)

    y_predict = saved_model.predict(X_test, verbose=0)

    # Anomaly masks on the prediction errors (absolute and squared) and on Y_test.
    errors_ae = calculate_absolute_prediction_errors(Y_test, y_predict)
    errors_se = calculate_squared_prediction_errors(Y_test, y_predict)
    anomalies_ae = calculate_3sigma_anomalies(errors_ae)
    anomalies_se = calculate_3sigma_anomalies(errors_se)
    anomalies_3sigma_Y_test = calculate_3sigma_anomalies(Y_test)

    anomalies_Y_test, _ = calculate_zscore_anomalies(Y_test)
    anomalies_errors_ae, _ = calculate_zscore_anomalies(errors_ae)
    anomalies_errors_se, _ = calculate_zscore_anomalies(errors_se)

    anomalies_Y_test_mod, _ = calculate_modified_zscore_anomalies(Y_test)
    anomalies_errors_ae_mod, _ = calculate_modified_zscore_anomalies(errors_ae)
    anomalies_errors_se_mod, _ = calculate_modified_zscore_anomalies(errors_se)

    # Sentinel values (110 for r2, 100 otherwise) mark a failed or NaN metric so
    # the trial still reports numbers to Tune.
    r2 = _metric_or_fallback(lambda: r2_score(Y_test, y_predict), 110)
    mae = _metric_or_fallback(lambda: mean_absolute_error(Y_test, y_predict), 100)
    mape = _metric_or_fallback(lambda: mean_absolute_percentage_error(Y_test, y_predict), 100)
    mse = _metric_or_fallback(lambda: mean_squared_error(Y_test, y_predict), 100)
    pcc = _metric_or_fallback(lambda: np.corrcoef(Y_test, y_predict.flatten())[0, 1], 100)

    experiment_id = mlflow.create_experiment(mlflow_exp_name)
    with mlflow.start_run(run_name=mlflow_exp_name, experiment_id=experiment_id) as mlflowrun:
        run_id = mlflowrun.info.run_id

        # Figure 1: anomalies detected directly on the ground-truth series.
        fig = plt.figure(figsize=(20,15))
        plt.title("Anomalies Y_test")
        plt.plot(Y_test,label="Original Data", alpha=0.6, c='gray')
        for mask, color, size, label in (
            (anomalies_Y_test, 'green', 250, "Z-Score Anomalies"),
            (anomalies_3sigma_Y_test, 'red', 150, "3-Sigma Anomalies"),
            (anomalies_Y_test_mod, 'blue', 100, "Modified Z-Score Anomalies"),
        ):
            # `== True` is kept on purpose: the masks come from external helpers
            # and are not guaranteed here to be boolean arrays.
            hits = np.where(mask == True)
            plt.scatter(hits[0], Y_test[hits], alpha=0.8, color=color, s=size, label=label)
        plt.legend()
        figName = f"Y_test_anomalies-T_{T}.png"
        mlflow.log_figure(fig, figName)
        fig.clf()
        plt.close()

        # Figure 2: 1-step forecast with the error-based anomalies marked on it.
        fig = plt.figure(figsize=(20,15))
        plt.title("Predict Anomalies T=" + str(T) + " with predict 1 on "+ str(model_exp) +": NN=" + str(n_neurons) + " epochs=" + str(n_epochs) +
                  " lr=" + str(learning_rate))
        plt.plot(y_predict,label="Predict 1-step Forecast", alpha=0.6, c='red', linewidth=3)
        plt.plot(Y_test,label="Original Data", alpha=0.6, c='black')
        for mask, color, size, label in (
            (anomalies_ae, 'green', 350, "3-Sigma Anomalies AE"),
            (anomalies_se, 'magenta', 300, "3-Sigma Anomalies SE"),
            (anomalies_errors_ae, 'blue', 250, "Z-score Anomalies AE"),
            (anomalies_errors_se, 'cyan', 200, "Z-score Anomalies SE"),
            (anomalies_errors_ae_mod, 'lightgreen', 150, "Modified Z-score Anomalies AE"),
            (anomalies_errors_se_mod, 'orange', 50, "Modified Z-score Anomalies SE"),
        ):
            hits = np.where(mask == True)
            plt.scatter(hits, y_predict[hits], alpha=0.8, color=color, s=size, label=label)
        plt.legend()
        figName = f"Y_predict-1-step-anomalies-T_{T}.png"
        mlflow.log_figure(fig, figName)
        fig.clf()
        plt.close()

        try:
            signature = infer_signature(X_test, y_predict)

            # Register the model that produced the metrics and figures above (the
            # best checkpoint), so later evaluations load the model that was scored.
            mlflow.tensorflow.log_model(saved_model, model_name,
                                        signature=signature,
                                        registered_model_name=model_name)
        except Exception as err:
            print(f'Ray-MLFlow: Could not save model {model_name}')
            print(f'Cause: {err!r}')

        try:
            mlflow.log_param("n_layer_size", n_neurons)
            mlflow.log_param("n_layers", config['n_layers'])
            mlflow.log_param("n_dense_layers", config['n_dense_layers'])
            mlflow.log_param("activation_fn", config['activation'])
            mlflow.log_param("epochs", n_epochs)
            mlflow.log_param("learning_rate", learning_rate)
            mlflow.log_param("optimizer", "adam")
            mlflow.log_param("time_window", config['T'])
            mlflow.log_param("dense_dp", config['dense_dp'])
            mlflow.log_param("model_exp", model_exp)

            mlflow.log_metric("mae", mae)
            mlflow.log_metric("mse", mse)
            mlflow.log_metric("mape", mape)
            mlflow.log_metric("r2_score", r2)
            mlflow.log_metric("pearson_corr_coef", pcc)

        except Exception as err:
            # Fallback: keep the values as a JSON artifact when parameter/metric
            # logging fails.
            print(f'Ray-MLFlow: parameter/metric logging failed: {err!r}')
            exception_param_metric_dict = {}
            log_metric_dict = {
                'r2_score': r2,
                'mae': mae,
                'mape': mape,
                'mse': mse,
                'pcc': pcc
            }
            log_param_dict = {
                "n_layer_size": n_neurons,
                "n_layers": config['n_layers'],
                "n_dense_layers": config['n_dense_layers'],
                "activation_fn": config['activation'],
                "epochs": n_epochs,
                "learning_rate": learning_rate,
                "optimizer": "adam",
                "time_window": config['T'],
                "dense_dp": config['dense_dp'],
                "model_exp": model_exp
            }
            exception_param_metric_dict['log_param_dict'] = log_param_dict
            exception_param_metric_dict['log_metric_dict'] = log_metric_dict
            mlflow.log_dict(exception_param_metric_dict, "exception_param_metric_dict.json")

    # With Ray >= 2.7 the equivalent call is train.report({...}).
    air.session.report({"mse":mse, "mae":mae, "mape":mape, "r2":r2, "mlflow_exp":mlflow_exp_name, "model_name":model_name, "T":T, "run_id":run_id})

In [None]:
def tune_model(num_training_iterations, num_samples):
    """Run the Ray Tune search over ``param_space`` and return the tuner's results.

    Parameters
    ----------
    num_training_iterations : int
        Value of the ``training_iteration`` stop criterion applied to every trial.
    num_samples : int
        Number of configurations sampled from ``param_space``.

    Trials minimise the reported ``mse``; at most 10 run concurrently, each inside
    a placement group of two 1-CPU bundles.
    """
    search_config = tune.TuneConfig(
        metric="mse",
        mode="min",
        scheduler=AsyncHyperBandScheduler(
            time_attr="training_iteration", max_t=10, grace_period=5
        ),
        num_samples=num_samples,
        max_concurrent_trials=10,
    )
    run_config = air.RunConfig(
        name=modelTypeName,
        verbose=1,
        stop={"training_iteration": num_training_iterations},
    )

    # we have a cluster of 10 worker nodes with requests 1 CPU and limits 2 CPU settings for the pod
    trainable = tune.with_resources(
        train_model,
        resources=tune.PlacementGroupFactory([{'CPU': 1.0}] * 2),
    )

    return tune.Tuner(
        trainable,
        tune_config=search_config,
        run_config=run_config,
        param_space=param_space,
    ).fit()

In [None]:
@ray.remote
def run_n_step_evaluation(model_name, run_id, T, predict_ahead):
    """Ray task: score a registered model on multi-step forecasts and log the result.

    Loads version 1 of the MLflow-registered model ``model_name``, produces the
    1-step and the ``predict_ahead``-step forecasts for the test windows, computes
    regression metrics for the multi-step forecast and logs two figures plus a
    JSON metrics file into the existing MLflow run ``run_id``.

    Parameters
    ----------
    model_name : str
        Name under which the model was registered in MLflow.
    run_id : str
        MLflow run that holds the training parameters and receives the artifacts.
    T : int
        Window length passed to ``prepare_dataset``.
    predict_ahead : int
        Number of self-fed steps per forecast cycle (see ``generate_nsteps_forecast``).

    Returns
    -------
    dict
        ``{'predict_ahead_<n>': {'r2_nStep', 'mae_nStep', 'mape_nStep',
        'mse_nStep', 'pcc_nStep'}}``. A metric that fails or evaluates to NaN is
        reported as the sentinel 100.
    """
    import mlflow

    def _metric_or_fallback(compute, fallback=100):
        # Cast to a built-in float so the value is JSON-serialisable for log_dict.
        try:
            value = float(compute())
        except Exception as err:
            print(f'Metric computation failed: {err!r}; using fallback value {fallback}')
            return fallback
        return fallback if np.isnan(value) else value

    mlflow.set_tracking_uri(MLFLOW_URI)
    mlflow.set_registry_uri(MLFLOW_URI)

    # Version 1 is the only version expected: train_model registers each model
    # under a name that embeds a fresh UUID.
    model = mlflow.tensorflow.load_model(f'models:/{model_name}/1')

    # Training parameters of the run; used below for plot titles only.
    params = mlflow.get_run(run_id).to_dictionary()['data']['params']
    n_neurons = params['n_layer_size']
    learning_rate = params['learning_rate']
    n_epochs = params['epochs']
    model_exp = params['model_exp']

    _, _, X_test, Y_test = prepare_dataset(dataFrame[dataColumnName], T)

    y_predict = model.predict(X_test, verbose=0)
    y_pred_nsteps = generate_nsteps_forecast(X_test, model, predict_ahead)

    # Anomaly masks on the multi-step prediction errors (absolute and squared).
    errors_ae2 = calculate_absolute_prediction_errors(Y_test, y_pred_nsteps)
    errors_se2 = calculate_squared_prediction_errors(Y_test, y_pred_nsteps)
    anomalies_ae2 = calculate_3sigma_anomalies(errors_ae2)
    anomalies_se2 = calculate_3sigma_anomalies(errors_se2)
    anomalies_errors_ae2, _ = calculate_zscore_anomalies(errors_ae2)
    anomalies_errors_se2, _ = calculate_zscore_anomalies(errors_se2)
    anomalies_errors_ae2_mod, _ = calculate_modified_zscore_anomalies(errors_ae2)
    anomalies_errors_se2_mod, _ = calculate_modified_zscore_anomalies(errors_se2)

    crt_step = f'predict_ahead_{predict_ahead}'
    n_step_metrics = {
        crt_step: {
            'r2_nStep': _metric_or_fallback(lambda: r2_score(Y_test, y_pred_nsteps)),
            'mae_nStep': _metric_or_fallback(lambda: mean_absolute_error(Y_test, y_pred_nsteps)),
            'mape_nStep': _metric_or_fallback(lambda: mean_absolute_percentage_error(Y_test, y_pred_nsteps)),
            'mse_nStep': _metric_or_fallback(lambda: mean_squared_error(Y_test, y_pred_nsteps)),
            'pcc_nStep': _metric_or_fallback(lambda: np.corrcoef(Y_test, y_pred_nsteps.flatten())[0, 1]),
        }
    }

    # Re-open the training run so the artifacts land next to the 1-step results.
    with mlflow.start_run(run_id=run_id, nested=True):

        # Figure 1: ground truth vs. 1-step vs. n-step forecast.
        fig = plt.figure(figsize=(20,15))
        plt.title("Compare forecasts T=" + str(T) + " predict_ahead=" + str(predict_ahead) + " with predict 1" +
                 " for LSTM "+ modelTypeName +": NN=" + str(n_neurons) + " LR= " + str(learning_rate) + " epochs=" + str(n_epochs))
        plt.plot(Y_test,label="Original Data", alpha=0.6, c='red',linewidth=2)
        plt.plot(y_predict,label="Predicted Data 1-step", alpha=0.6, c='black', linewidth=2)
        plt.plot(y_pred_nsteps,label="Predicted Data " + str(predict_ahead) + "-steps", alpha=0.6, c='blue', linewidth=2)
        plt.legend()
        figName = f"compare-forecasts-1_{predict_ahead}.png"
        mlflow.log_figure(fig, figName)
        fig.clf()
        plt.close()

        # Figure 2: n-step forecast with the error-based anomalies marked on it.
        fig = plt.figure(figsize=(20,15))
        plt.title("Predict Anomalies T=" + str(T) + " with predict " + str(predict_ahead) + " on " + str(model_exp) + ": NN="
                  + str(n_neurons) + " epochs=" + str(n_epochs) + " lr=" + str(learning_rate))
        plt.plot(y_pred_nsteps,label="Predict " + str(predict_ahead) + "-step Forecast", alpha=0.6, c='red', linewidth=3)
        plt.plot(Y_test,label="Original Data", alpha=0.6, c='black')
        for mask, color, size, label in (
            (anomalies_ae2, 'green', 350, "Anomalies AE"),
            (anomalies_se2, 'magenta', 300, "Anomalies SE"),
            (anomalies_errors_ae2, 'blue', 250, "Z-score Anomalies AE"),
            (anomalies_errors_se2, 'cyan', 200, "Z-score Anomalies SE"),
            (anomalies_errors_ae2_mod, 'lime', 150, "Modified Z-score Anomalies AE"),
            (anomalies_errors_se2_mod, 'orange', 50, "Modified Z-score Anomalies SE"),
        ):
            # `== True` is kept on purpose: the masks come from external helpers
            # and are not guaranteed here to be boolean arrays.
            hits = np.where(mask == True)
            plt.scatter(hits, y_pred_nsteps[hits], alpha=0.8, color=color, s=size, label=label)
        plt.legend()
        figName = f"Y-predict-anomalies-step-{predict_ahead}-with-T_{T}.png"
        mlflow.log_figure(fig, figName)
        fig.clf()
        plt.close()

        fname = f'{predict_ahead}-step-metric.json'
        mlflow.log_dict(n_step_metrics, fname)

    return n_step_metrics
    

In [None]:
# Shut down any Ray runtime left over from a previous execution of this cell,
# then initialise Ray again so the run starts from a fresh connection.
ray.shutdown()
ray.init()

In [None]:
# Run the search: 10 sampled configurations, each stopped once `training_iteration` reaches 1.
results_lstm = tune_model(1, 10)

In [None]:
# Show the best trial under the metric/mode configured in tune_model (minimum "mse").
results_lstm.get_best_result()

In [None]:
# Fan out one remote n-step evaluation per (tuned model, forecast horizon).
# Maps a tag to the Ray object reference of the scheduled task.
results_n_step_predict = {}
for result in results_lstm:
    try:
        model_name = result.metrics['model_name']
        T = result.metrics['T']
        run_id = result.metrics['run_id']
    except Exception as err:
        # Trials that never reported (e.g. they failed) expose no usable metrics.
        print(f'ERROR reading trial metrics, skipping trial: {err!r}')
        continue

    for predict_ahead in [5, 10, 15, 30, 60, 90, 120]:
        # The horizon is part of the key; without it the tasks of one model would
        # overwrite each other and only the last horizon could be retrieved.
        tag = f'runID_{run_id}-model_{model_name}-predict_ahead_{predict_ahead}'
        try:
            results_n_step_predict[tag] = run_n_step_evaluation.remote(model_name, run_id, T, predict_ahead)
            print(f'Scheduled job for T:{T}, predict_ahead: {predict_ahead}, model_name:{model_name}, run_id:{run_id}')
        except Exception as err:
            # Keep scheduling the remaining horizons even if one submission fails.
            print(f'ERROR scheduling job for T:{T}, predict_ahead: {predict_ahead}, model_name:{model_name}, run_id:{run_id}')
            print(f'Cause: {err!r}')

In [None]:
# Block until every scheduled evaluation has finished and collect its metrics.
# A task that failed is recorded as None under its tag.
ex_n_step = {}
for item, pending in results_n_step_predict.items():
    try:
        res = ray.get(pending)
    except Exception as err:
        # `Exception` (not a bare except) so that interrupting the kernel still
        # stops this blocking loop.
        print(f'Error getting results for key:{item}')
        print(f'Cause: {err!r}')
        res = None

    ex_n_step[item] = res

In [None]:
# Number of collected evaluation entries.
len(ex_n_step)

In [None]:
# Display the collected n-step metrics, keyed by tag (None marks a failed task).
ex_n_step

In [None]:
# Release the Ray runtime now that all results have been collected.
ray.shutdown()