In [None]:
import os
# Expose only the GPU with index 1 to CUDA for this process; this is set
# before torch is imported in the next cell.
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [None]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import math

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

# Module-wide compute device: the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class E_L(nn.Module):
    """LSTM encoder: a stacked, unidirectional, batch-first ``nn.LSTM``.

    Args:
        input_dim: number of features per time step.
        hidden_dim: size of the LSTM hidden state.
        layer_dim: number of stacked LSTM layers (dropout 0.4 between layers).
    """

    def __init__(self, input_dim=1, hidden_dim=256, layer_dim=8):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.input_dim = input_dim

        self.lstm = nn.LSTM(
            input_size=self.input_dim,
            hidden_size=self.hidden_dim,
            num_layers=self.layer_dim,
            batch_first=True,
            dropout=0.4,
            bidirectional=False,
        )

    def forward(self, x, s_h, s_c):
        """Run the LSTM over ``x`` starting from the supplied states.

        Args:
            x: input sequence, batch first.
            s_h: initial hidden state handed to the LSTM.
            s_c: initial cell state handed to the LSTM.

        Returns:
            tuple: ``(out, hn, cn)`` exactly as produced by ``nn.LSTM``.
        """
        initial_state = (s_h, s_c)
        out, final_state = self.lstm(x, initial_state)
        hn, cn = final_state
        return out, hn, cn

import torch.nn as nn
import torch
import torch.nn.functional as F

    
class E_F(nn.Module):
    """Fully connected output head applied to one time step of the encoder output.

    Three BatchNorm1d -> Linear stages (hidden_dim -> 512 -> 256 -> output_dim)
    are chained with no activation function in between.

    Args:
        hidden_dim: feature size of the incoming encoder output.
        output_dim: number of values produced per sample.
    """

    def __init__(self, hidden_dim=64, output_dim=24):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim

        # Registration order is kept as bn1, L_out1, bn2, L_out2, bn3, L_out3.
        self.bn1 = nn.BatchNorm1d(num_features=self.hidden_dim)
        self.L_out1 = nn.Linear(self.hidden_dim, 512)
        self.bn2 = nn.BatchNorm1d(num_features=512)
        self.L_out2 = nn.Linear(512, 256)
        self.bn3 = nn.BatchNorm1d(num_features=256)
        self.L_out3 = nn.Linear(256, self.output_dim)

    def forward(self, x, id=-1):
        """Project the features found at position ``id`` of dimension 1.

        Args:
            x: 3-D tensor; ``x[:, id, :]`` is fed to the head.
            id: index along dimension 1 to read (default: the last one).

        Returns:
            Tensor with ``output_dim`` values per sample.
        """
        selected_step = x[:, id, :]
        stage1 = self.L_out1(self.bn1(selected_step))
        stage2 = self.L_out2(self.bn2(stage1))
        return self.L_out3(self.bn3(stage2))


# prepare train and val set
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
class RnnDataset(TensorDataset):
    """Dataset that pairs ``data[i]`` with ``label[i]``.

    ``TensorDataset.__init__`` is not invoked; the two sequences are simply
    stored as attributes and indexed in parallel.
    """

    def __init__(self, data, label):
        self.data, self.label = data, label

    def __getitem__(self, index):
        """Return the ``(data, label)`` pair stored at ``index``."""
        sample = self.data[index]
        target = self.label[index]
        return sample, target

    def __len__(self):
        """Number of samples, taken from the length of ``data``."""
        return len(self.data)

In [None]:
import random
import pandas as pd
import numpy as np

def read_dataset(sensor_id, start_point, train_point):
    """Load one sensor's series and locate the train, validation and test offsets.

    The sensor file is chosen from the first character of ``sensor_id``
    ('4' -> reservoir file, '6' -> rain-gauge file).  Every returned offset is
    a row index relative to the row whose ``TSC_TSTAMP_UTC`` equals
    ``start_point``.

    Args:
        sensor_id: sensor identifier; its first character must be '4' or '6'.
        start_point: timestamp string of the first row to keep.
        train_point: timestamp string of the last training row.

    Returns:
        tuple: ``(sensor_data, train_end, val_skips)``.  ``sensor_data`` is the
        ``TSC_VALUE_F`` column from ``start_point`` onwards as a NumPy array,
        ``train_end`` is the offset of ``train_point`` and ``val_skips`` holds
        one offset per "Hold Out Start" timestamp followed by the offsets of
        the test start and the test end.

    Raises:
        SystemExit: if the sensor id does not start with '4' or '6'.
        IndexError: if a looked-up timestamp is absent from the sensor file.
    """
    # Imported locally because the notebook only imports ``sys`` in a later
    # cell; without it the error branch below fails with NameError.
    import sys

    sensor_kind = str(sensor_id)[0]
    if sensor_kind == '4':
        trainX = pd.read_csv('./data'+'/reservoir_stor_'+str(sensor_id)+'_sof24.tsv', sep='\t')
    elif sensor_kind == '6':
        trainX = pd.read_csv('./data'+'/raingauge_byhour_'+str(sensor_id)+'_sof.tsv', sep='\t')
    else:
        print("Error: the support sensor type is 4***or 6****.")
        sys.exit()

    def row_of(timestamp):
        # Index label of the first row carrying this timestamp.
        return trainX[trainX["TSC_TSTAMP_UTC"]==timestamp].index.values[0]

    start_num = row_of(start_point)
    print("for sensor ", sensor_id,"start_num is: ", start_num)
    print(len(trainX))
    print(trainX[:3])

    # Offsets of every validation hold-out start.
    val_set = pd.read_csv('./data/'+str(sensor_id)+'_validation_timestamps_24avg.tsv',sep='\t')
    val_skips = [row_of(val_i) - start_num for val_i in val_set["Hold Out Start"]]

    # The test range is fixed; its start and end offsets are stored as the
    # last two entries of val_skips.
    val_skips.append(row_of('2017-01-01 14:30:00') - start_num)
    val_skips.append(row_of('2018-12-31 14:30:00') - start_num)

    print("val_skips is: ", val_skips)

    # Offset of the last training row.
    train_end = row_of(train_point) - start_num
    print("train_end is : ", train_end)

    # Everything from start_point onwards is handed to preprocessing.
    train_x = trainX[start_num:]
    print(len(train_x))
    sensor_data = np.array(train_x["TSC_VALUE_F"])

    return sensor_data, train_end, val_skips
 
def diff_norm_dataset(sensor_data0):
    """First-difference a series, then z-score the differences.

    A leading 0 is prepended so the result has the same length as the input.
    The number of normalised values whose magnitude exceeds the module-level
    ``alpha_extreams`` is printed for information only.

    Args:
        sensor_data0: 1-D NumPy array of raw sensor values.

    Returns:
        tuple: ``(normalised_differences, mean, std)`` where mean and std are
        those of the un-normalised differences.
    """
    steps = sensor_data0[1:] - sensor_data0[:-1]
    diffs = np.array([0] + steps.tolist())

    mean = diffs.mean()
    print("mean is: ",mean)
    std = diffs.std()
    print("std is ",std)
    normalised = (diffs - mean) / std

    # Report how many points lie beyond the extreme-value boundary.
    print("extream boundary is: ", std*alpha_extreams)
    total_num = len(normalised)
    extream_num = sum(1 for value in normalised if abs(value) > alpha_extreams)
    print(" the total number is: ", total_num)
    print(" the number of extreams is: ", extream_num, " the ratio is: ", extream_num/total_num)

    return normalised, mean, std

def std_norm_dataset(sensor_data0):
    """Z-score a series with its own mean and standard deviation.

    Args:
        sensor_data0: NumPy array of raw sensor values.

    Returns:
        tuple: ``(normalised_values, mean, std)``.
    """
    mean = sensor_data0.mean()
    print("mean is: ", mean)
    std = sensor_data0.std()
    print("std is ", std)

    normalised = (sensor_data0 - mean) / std
    return normalised, mean, std


import random
def split_dataset(norm_data,local_data, train_end, input_dim, output_size, train_days, predict_days, train_volum):
    """Randomly sample (history, target) training windows and wrap them in a DataLoader.

    A window starting at ``i`` uses ``train_days`` steps of history and the
    following ``output_size * predict_days`` steps of ``norm_data`` as target.
    Windows whose target starts inside a validation hold-out range, or inside
    the test range, are never sampled.

    Reads the module-level names ``val_skips`` (validation offsets followed by
    the test start and test end offsets), ``is_over_sampling``,
    ``norm_percent``, ``is_watersheds``, ``is_prob_feature``, ``class_label``
    and ``batch_size``.

    Args:
        norm_data: normalised target series.
        local_data: per-step feature rows used as history when
            ``is_watersheds`` or ``is_prob_feature`` is 1.
        train_end: offset of the last training step.
        input_dim: unused; kept for interface compatibility.
        output_size: multiplier applied to ``predict_days`` for the target length.
        train_days: history length in steps.
        predict_days: forecast length in steps.
        train_volum: sampling budget (see the two branches below).

    Returns:
        DataLoader yielding lists of ``(history, target)`` pairs (identity collate).

    Raises:
        ValueError: if no window start lies outside the held-out ranges.
    """
    horizon = output_size * predict_days

    DATA = []
    Label = []
    print("norm data size:",len(norm_data))

    train_size = train_end + 1 - (train_days*1 + output_size * predict_days)
    print("train size is: ",train_size)

    # Every step covered by a validation hold-out; a set keeps the
    # per-draw membership test O(1).
    l = len(val_skips)
    held_out = set()
    for k in range(l-2):
        for j in range(horizon):
            held_out.add(val_skips[k]+j)

    test_start = val_skips[l-2]
    print("test set start ID is: ", test_start)
    test_end = val_skips[l-1]
    print("test set end ID is: ", test_end)

    use_local = (is_watersheds==1 or is_prob_feature==1)

    def _usable(start):
        # True when the target of the window starting at ``start`` begins
        # outside every validation hold-out and outside the test range.
        target_start = start + train_days
        if target_start in held_out:
            return False
        return target_start < test_start or target_start > test_end

    def _window(start):
        # Build the (history, target) arrays for the window starting at ``start``.
        if use_local:
            history = np.array(local_data[start:(start+train_days)])
        else:
            history = np.array(norm_data[start:(start+train_days)]).reshape(train_days,-1)
        target = np.array(norm_data[(start+train_days):(start+train_days+horizon)])
        return history, target

    # The oversampling branch draws starts from [0, train_size]; the plain
    # branch draws from [0, train_size - 1].  Both ranges are kept as they were.
    max_start = train_size if is_over_sampling==1 else train_size - 1
    if not any(_usable(s) for s in range(max_start + 1)):
        # Without this guard the rejection loops below would never terminate.
        raise ValueError("no training window lies outside the validation/test ranges")

    if(is_over_sampling==1):
        # Fill two quotas: windows containing at least one extreme point and
        # windows containing none.
        # NOTE(review): this still loops forever if one quota can never be
        # met (e.g. no usable window contains an extreme point).
        norm_num = train_volum * norm_percent
        extreme_num = train_volum * (1 - norm_percent)
        extreme_total = 0
        norm_total = 0

        while(extreme_total<extreme_num or norm_total<norm_num):
            i = random.randint(0, max_start)
            if not _usable(i):
                # Held-out window: draw again.  Previously execution fell
                # through and re-appended the sample from the previous draw.
                continue
            data0, label0 = _window(i)
            label1 = np.array(class_label[(i+train_days):(i+train_days+horizon)])

            if(label1.sum()>=1 and extreme_total<extreme_num):
                extreme_total += 1
                DATA.append(data0)
                Label.append(label0)
            if(label1.sum()==0 and norm_total<norm_num):
                norm_total += 1
                DATA.append(data0)
                Label.append(label0)
    else:
        # The original bound is kept: this collects train_volum - train_days samples.
        ii=0
        while (ii+train_days) < train_volum:
            i=random.randint(0, max_start)
            if not _usable(i):
                # Held-out window: draw again.  Previously a stale history was
                # paired with a target taken from the held-out range.
                continue
            data0, label0 = _window(i)
            DATA.append(data0)
            Label.append(label0)
            ii=ii+1

    dataset1=RnnDataset(DATA,Label)
    data_loader = DataLoader(dataset1,
                         batch_size,
                         shuffle=True,
                         num_workers=2,
                         pin_memory=True,
                         collate_fn=lambda x: x)
    return data_loader

import numpy as np
import torch.nn.functional as F

def create_model(input_dim, hidden_dim, layer_dim, output_dim):
    """Build the encoder, the output head, their optimizers and the loss.

    The head's output width comes from the module-level ``predict_days``;
    the ``output_dim`` argument is accepted but not used.

    Args:
        input_dim: features per time step for the LSTM encoder.
        hidden_dim: LSTM hidden size, also the head's input width.
        layer_dim: number of LSTM layers.
        output_dim: unused.

    Returns:
        tuple: ``(encoder, outlinear, encoder_optimizer, outlinear_optimizer,
        criterion)`` with both modules already moved to ``device``.
    """
    # Construct both modules first (encoder before head), then move them.
    encoder = E_L(input_dim, hidden_dim, layer_dim)
    outlinear = E_F(hidden_dim, predict_days)
    encoder, outlinear = encoder.to(device), outlinear.to(device)

    # Plain SGD for the encoder, Adam for the head, summed squared error loss.
    encoder_optimizer = torch.optim.SGD(encoder.parameters(), lr=0.001)
    outlinear_optimizer = torch.optim.Adam(outlinear.parameters(), lr=0.0005)
    criterion = nn.MSELoss(reduction='sum')

    return encoder, outlinear, encoder_optimizer, outlinear_optimizer, criterion


In [None]:
def train_loop( ):
    """Train the encoder and output head for up to 100 epochs with early stopping.

    Works entirely on module-level state: ``encoder``, ``outlinear``, their
    optimizers, ``criterion``, ``dataloader``, ``layer_dim``, ``hidden_dim``,
    ``device``, ``is_normal``, ``is_extreme`` and ``alpha_extreams``.

    After every epoch ``generate_val_rmse`` is called, which scores the
    validation set and tracks the best total RMSE.  Training stops once the
    validation loss has been higher than in the preceding epoch for four
    consecutive epochs.
    """
    num_epochs = 100
    early_stop = 0
    old_val_loss = 1000
    min_RMSE = 500000

    for epoch in range(num_epochs):
        print_loss_total = 0
        encoder.train()
        outlinear.train()

        for batch in dataloader:
            # The loader uses an identity collate, so a batch is a list of
            # (history, target) pairs that is stacked here.
            x_train = [TrainData for TrainData, _ in batch]
            y_train = [TrainLabel for _, TrainLabel in batch]
            x_train = torch.from_numpy(np.array(x_train, np.float32)).to(device)
            y_train = torch.from_numpy(np.array(y_train, np.float32)).to(device)

            # Clear gradients w.r.t. parameters
            encoder_optimizer.zero_grad()
            outlinear_optimizer.zero_grad()

            # Forward pass from zero initial hidden and cell states
            s_h = torch.zeros(layer_dim, x_train.size(0), hidden_dim).to(device)
            s_c = torch.zeros(layer_dim, x_train.size(0), hidden_dim).to(device)

            encoder_out, _, _ = encoder(x_train, s_h, s_c)
            e_out = outlinear(encoder_out)

            # Selective backpropagation: positions the model should not learn
            # from have their prediction replaced by the target, so they add
            # zero loss and zero gradient.  torch.where does this in one
            # vectorised step instead of one Python-level comparison and
            # in-place write per element.
            if(is_normal==1):
                # normal model: ignore extreme targets
                e_out = torch.where(y_train.abs() > alpha_extreams, y_train, e_out)

            if(is_extreme==1):
                # extreme model: ignore normal targets
                e_out = torch.where(y_train.abs() <= alpha_extreams, y_train, e_out)

            loss = criterion(e_out, y_train)

            # Backward pass
            loss.backward()
            encoder_optimizer.step()
            outlinear_optimizer.step()
            print_loss_total += loss.item()

        encoder.eval()
        outlinear.eval()
        val_loss, min_RMSE = generate_val_rmse(min_RMSE)

        print('-----------Epoch: {}. train_Loss>: {:.6f}. --------------------'.format(epoch, print_loss_total))
        print('-----------Epoch: {}. val_Loss>: {:.6f}. --------------------'.format(epoch, val_loss))

        # early stop: count consecutive epochs whose validation loss rose
        # relative to the previous epoch
        if(val_loss>old_val_loss):
            early_stop+=1
        else:
            early_stop=0
        if(early_stop>=4):
            break
        old_val_loss=val_loss

In [None]:
def inference_test(encoder, outlinear,  x_test):
    """Run one forward pass and return the prediction for the first sample.

    Reads the module-level ``layer_dim``, ``hidden_dim`` and ``device``.

    Args:
        encoder: module called as ``encoder(x, s_h, s_c)`` returning a 3-tuple
            whose first element is passed to ``outlinear``.
        outlinear: module mapping the encoder output to the forecast.
        x_test: array-like batch of input windows, converted to float32.

    Returns:
        numpy.ndarray: float64 array of shape ``(1, n)`` holding the forecast
        of the first sample in the batch.
    """
    encoder.eval()
    outlinear.eval()

    with torch.no_grad():
        x_test = torch.from_numpy(np.array(x_test, np.float32)).to(device)
        # Forward pass from zero initial hidden and cell states
        s_h = torch.zeros(layer_dim, x_test.size(0), hidden_dim).to(device)
        s_c = torch.zeros(layer_dim, x_test.size(0), hidden_dim).to(device)
        encoder_out, _, _ = encoder(x_test, s_h, s_c)
        d_out = outlinear(encoder_out)

    # Only the first sample is returned; convert it in one step instead of
    # element by element.
    first = d_out[0].detach().cpu().numpy().astype(np.float64)
    return first.reshape(1, -1)


def diff_denorm_dataset(predict_y0, y0, pre_gt, y_test):
    """Turn normalised difference forecasts back into absolute values.

    Reads the module-level ``std``, ``mean``, ``predict_days``,
    ``alpha_extreams``, ``is_single``, ``is_normal`` and ``is_extreme``.

    Args:
        predict_y0: normalised difference forecasts.
        y0: ground-truth values (an object exposing ``.values``).
        pre_gt: ground-truth value preceding each forecast step.
        y_test: normalised ground-truth differences used to decide, per step,
            whether the step counts as normal or extreme.

    Returns:
        tuple: ``(absolute_values, denormalised_differences)``.
    """
    diffs = [(p*std+mean) for p in predict_y0]
    truth = y0.values
    restored = np.zeros(len(truth))

    if is_single == 1:
        # Single model: accumulate the differences from the first previous value.
        restored[0] = diffs[0] + pre_gt[0]
        for step in range(1, predict_days):
            restored[step] = restored[step-1] + diffs[step]

    if is_single == 0 and is_normal == 1:
        # Normal model: forecast the normal steps, copy the truth elsewhere.
        for step in range(predict_days):
            if abs(y_test[step]) <= alpha_extreams:
                restored[step] = diffs[step] + pre_gt[step]
            else:
                restored[step] = truth[step]

    if is_single == 0 and is_extreme == 1:
        # Extreme model: forecast the extreme steps, copy the truth elsewhere.
        for step in range(predict_days):
            if abs(y_test[step]) > alpha_extreams:
                restored[step] = diffs[step] + pre_gt[step]
            else:
                restored[step] = truth[step]

    return restored, diffs

def std_denorm_dataset(predict_y0):
    """Undo z-score normalisation using the module-level ``std`` and ``mean``.

    Args:
        predict_y0: iterable of normalised values.

    Returns:
        list: ``value * std + mean`` for every input value, in order.
    """
    restored = []
    for value in predict_y0:
        restored.append(value*std+mean)
    return restored

In [None]:
def test(test_point):
    """Forecast ``predict_days`` steps starting at the timestamp ``test_point``.

    Reads the module-level ``encoder``, ``outlinear``, ``sensor_id``,
    ``start_point``, ``sensor_data_norm``, ``sensor_data_norm_1``,
    ``train_days``, ``predict_days``, ``is_watersheds``, ``is_prob_feature``
    and ``is_diff``.

    Args:
        test_point: timestamp string of the first step to forecast.

    Returns:
        tuple: ``(test_predict, y, diff_predict)`` -- the de-normalised
        forecast, the ground-truth ``TSC_VALUE_F`` values for the same steps,
        and the de-normalised differences (an empty list when ``is_diff`` is 0).

    Raises:
        SystemExit: if the sensor id does not start with '4' or '6'.
        IndexError: if a looked-up timestamp is absent from the sensor file.
    """
    # Imported locally because the notebook only imports ``sys`` in a later cell.
    import sys

    encoder.eval()
    outlinear.eval()

    if(str(sensor_id)[0]=='4'):
        trainX = pd.read_csv('./data'+'/reservoir_stor_'+str(sensor_id)+'_sof24.tsv', sep='\t')
    elif(str(sensor_id)[0]=='6'):
        trainX = pd.read_csv('./data'+'/raingauge_byhour_'+str(sensor_id)+'_sof.tsv', sep='\t')
    else:
        print("Error: the support sensor type is 4*** or 6****.")
        sys.exit()

    # Row of the forecast start in the file, and its offset from start_point.
    point = trainX[trainX["TSC_TSTAMP_UTC"]==test_point].index.values[0]
    start_num = trainX[trainX["TSC_TSTAMP_UTC"]==start_point].index.values[0]
    test_point = point - start_num

    # The window length follows predict_days (previously hard-coded as 72).
    horizon = predict_days
    # Ground truth shifted back by one step: the value preceding each forecast step.
    pre_gt = trainX[point-1 : point-1+horizon]["TSC_VALUE_F"].values.tolist()
    y = trainX[point : point+horizon]["TSC_VALUE_F"]

    # inference
    norm_data = sensor_data_norm
    if(is_watersheds==1 or is_prob_feature==1):
        history = sensor_data_norm_1[test_point-train_days*1:test_point]
    else:
        history = norm_data[test_point-train_days*1:test_point]
    x_test = [np.array(history, np.float32).reshape(train_days,-1)]
    y_test = np.array(norm_data[test_point:test_point+horizon], np.float32).reshape(horizon,-1)

    # inference_test returns a (1, n) array; keep the single row as Python floats.
    y_predict = inference_test(encoder=encoder, outlinear=outlinear, x_test=x_test)
    y_predict = y_predict[0].tolist()

    if(is_diff==0):
        test_predict = std_denorm_dataset(y_predict)
        diff_predict = []
    else:
        test_predict, diff_predict = diff_denorm_dataset(y_predict, y, pre_gt, y_test)

    return test_predict, y, diff_predict

In [None]:
def test_single(test_point):
    """Forecast ``predict_days`` steps starting at the timestamp ``test_point``.

    Reads the module-level ``encoder``, ``outlinear``, ``sensor_id``,
    ``start_point``, ``sensor_data_norm``, ``sensor_data_norm_1``,
    ``train_days``, ``predict_days``, ``is_watersheds``, ``is_prob_feature``
    and ``is_diff``.

    Args:
        test_point: timestamp string of the first step to forecast.

    Returns:
        tuple: ``(test_predict, y, diff_predict)`` -- the de-normalised
        forecast, the ground-truth ``TSC_VALUE_F`` values for the same steps,
        and the de-normalised differences (an empty list when ``is_diff`` is 0).

    Raises:
        SystemExit: if the sensor id does not start with '4' or '6'.
        IndexError: if a looked-up timestamp is absent from the sensor file.
    """
    # Imported locally because the notebook only imports ``sys`` in a later cell.
    import sys

    encoder.eval()
    outlinear.eval()

    if(str(sensor_id)[0]=='4'):
        trainX = pd.read_csv('./data'+'/reservoir_stor_'+str(sensor_id)+'_sof24.tsv', sep='\t')
    elif(str(sensor_id)[0]=='6'):
        trainX = pd.read_csv('./data'+'/raingauge_byhour_'+str(sensor_id)+'_sof.tsv', sep='\t')
    else:
        print("Error: the support sensor type is 4*** or 6****.")
        sys.exit()

    # Row of the forecast start in the file, and its offset from start_point.
    point = trainX[trainX["TSC_TSTAMP_UTC"]==test_point].index.values[0]
    start_num = trainX[trainX["TSC_TSTAMP_UTC"]==start_point].index.values[0]
    test_point = point - start_num

    # The window length follows predict_days (previously hard-coded as 72).
    horizon = predict_days
    # Ground truth shifted back by one step: the value preceding each forecast step.
    pre_gt = trainX[point-1 : point-1+horizon]["TSC_VALUE_F"].values.tolist()
    y = trainX[point : point+horizon]["TSC_VALUE_F"]

    # inference
    norm_data = sensor_data_norm
    if(is_watersheds==1 or is_prob_feature==1):
        history = sensor_data_norm_1[test_point-train_days*1:test_point]
    else:
        history = norm_data[test_point-train_days*1:test_point]
    x_test = [np.array(history, np.float32).reshape(train_days, -1)]
    y_test = np.array(norm_data[test_point:test_point+horizon], np.float32).reshape(horizon, -1)

    # inference_test returns a (1, n) array; keep the single row as Python floats.
    y_predict = inference_test(encoder=encoder, outlinear=outlinear, x_test=x_test)
    y_predict = y_predict[0].tolist()

    if(is_diff==0):
        test_predict = std_denorm_dataset(y_predict)
        diff_predict = []
    else:
        test_predict, diff_predict = diff_denorm_dataset(y_predict, y, pre_gt, y_test)

    return test_predict, y, diff_predict

In [None]:
def generate_val_rmse(min_RMSE):
    """Score the model on every validation window and checkpoint on improvement.

    For each "Hold Out Start" timestamp of the current sensor a forecast is
    produced with ``test_single``, negative values are replaced by 0 and the
    RMSE against the ground truth is accumulated.  When the summed RMSE is
    below ``min_RMSE`` the predictions are written to a TSV file and both
    modules are saved with ``torch.save``.

    Reads the module-level configuration flags, ``sensor_id``, ``basic_path``,
    the two model path prefixes, ``encoder`` and ``outlinear``.

    Args:
        min_RMSE: best (lowest) summed RMSE seen so far.

    Returns:
        tuple: ``(total, new_min_RMSE)`` -- this call's summed RMSE and the
        updated best value.
    """
    test_predict = np.zeros(predict_days*output_dim)
    val_set = pd.read_csv('./data/'+str(sensor_id)+'_validation_timestamps_24avg.tsv',sep='\t')
    val_points = val_set["Hold Out Start"]

    total = 0
    val_rmse_list = []
    val_diff_predicts = []
    val_std_predicts = []
    for val_point in val_points:
        test_predict, ground_truth, diff_predict = test_single(test_point=val_point)
        # Convert to plain Python numbers and clamp negatives to 0.
        test_predict = [k if k>0 else 0 for k in (p.item() for p in test_predict)]

        val_diff_predicts.append(diff_predict)
        val_std_predicts.append(test_predict)

        squared_error = np.square(np.subtract(ground_truth, test_predict))
        val_RMSE = math.sqrt(squared_error.mean())
        val_rmse_list.append(val_RMSE)
        total += val_RMSE
        print(val_point, " val Root Mean Square Error: ", val_RMSE)

    # One row per validation window; the placeholder first row only fixes
    # the column count and is overwritten by the first window.
    pd__temp = pd.DataFrame([[ k+0.1 for k in range(72)]])
    rows = val_diff_predicts if is_diff==1 else val_std_predicts
    for row_id, row in enumerate(rows):
        pd__temp.loc[row_id] = row

    # File-name fragments derived from the configuration flags.
    norm = "std" if is_diff==0 else "diff"

    if is_single==1:
        single = "single"
    elif is_normal==1:
        single = "normal"
    else:
        single = "extreme"

    if is_watersheds==1:
        watersheds = "shed" if is_prob_feature==0 else "Shed-ProbFeature"
    else:
        watersheds = "solo" if is_prob_feature==0 else "ProbFeature"

    new_min_RMSE = min_RMSE
    if total < min_RMSE:
        # New best: store the predictions and both modules.
        pd__temp.to_csv(basic_path+'_'+norm+'_'+single+'.tsv',sep='\t')
        new_min_RMSE = total
        model_suffix = '_' + norm + '_' + watersheds + ".pt"
        torch.save(outlinear, basic_decoder_model_path + model_suffix)
        torch.save(encoder, basic_encoder_model_path + model_suffix)

    print("val total RMSE: ", total)
    print("val min RMSE: ", new_min_RMSE)

    return total, new_min_RMSE

In [None]:
def generate_test(path, filename):
    """Forecast every test window of the current sensor and save the result as TSV.

    The start timestamps come from the "Start" column of the sensor's test
    timestamp file; each one is forecast with ``test_single``.  With
    ``is_diff == 1`` the de-normalised differences are written, otherwise the
    de-normalised forecasts.

    Args:
        path: output directory prefix (concatenated as-is with ``filename``).
        filename: output file name without the ``.tsv`` extension.
    """
    test_predict = np.zeros(predict_days*output_dim)
    # test_data
    val_set = pd.read_csv('./data/'+str(sensor_id)+'_test_timestamps_24avg.tsv',sep='\t')
    val_points = val_set["Start"]

    val_diff_predicts = []
    val_std_predicts = []
    for val_point in val_points:
        test_predict, ground_truth, diff_predict = test_single(test_point=val_point)
        test_predict = [p.item() for p in test_predict]

        val_diff_predicts.append(diff_predict)
        val_std_predicts.append(test_predict)

    # One row per test window; the placeholder first row only fixes the
    # column count and is overwritten by the first window.
    pd__temp = pd.DataFrame([[ k+0.1 for k in range(72)]])
    rows = val_diff_predicts if is_diff==1 else val_std_predicts
    for row_id, row in enumerate(rows):
        pd__temp.loc[row_id] = row

    target = path+filename+'.tsv'
    pd__temp.to_csv(target, sep='\t')

    print("test prediction is saved at:  " + path + filename + '.tsv')

In [None]:
from matplotlib import pyplot as plt
import sys
import sklearn
from sklearn.mixture import GaussianMixture

# Driver script: for every sensor id, configure the run, load and normalise the
# data, build the training loader and the model, train, then reload the best
# checkpoint and write the test-set predictions.
#
# The names assigned at module level inside this loop (sensor_id, predict_days,
# alpha_extreams, the is_* flags, val_skips, class_label, mean, std, encoder,
# dataloader, ...) are read as globals by the functions defined in the cells
# above, so both the names and the order of the statements matter.
sensor_ids = [4001]
local_sets = [[0]] # if watershed mode is used, the rain sensors involved should be set here.

for i in range(len(sensor_ids)):
    # sensor's data
    sensor_id = sensor_ids[i]
    local_set = local_sets[i]

    # dataset parameters
    start_point = '1980-12-31 14:30:00'
    train_end = 0 
    train_point = '2019-12-31 14:30:00'
    print("Begin!! Sensor id is :---------------------", sensor_id)

    #training hyperparameters    
    input_dim = 1      # univari forecasting, set 1
    output_dim = 1       # univari forecasting, set 1
    train_days = 15*24   # history length, h=360
    predict_days = 24*3  # forecasting length,f=72
    alpha_extreams = 1.5 # normal/extreme boundary, \epsilon=1.5
    is_diff = 1          # first order difference preprocessing, set1
    is_single = 0        # N, E, or C, so not single
    is_normal = 0        
    is_extreme = 1       # E model, set 1
    is_watersheds = 0    # no exogenous variables for this sensor, set 0
    is_prob_feature = 1  # use GMM Indicator, set 1
    is_over_sampling = 1 # E model use oversampling, set 1
    norm_percent = 0.0   # oversampling ratio, OS% = 1

    # model hyperparameters
    hidden_dim = 512   # E hidden
    layer_dim = 4      # E layers
    batch_size = 32     # E batch_size
    train_volum = 50000  # E volume
    print("hidden dim is: ", hidden_dim)
    print("layer num is: ", layer_dim)
    print("train_volum is: ", train_volum)

    print(" is_over_sampling: ", is_over_sampling, " is_prob_feature: ", is_prob_feature, " is_diff: ", is_diff," is_single: ", is_single, " is_normal: ", is_normal, " is_extreme: ", is_extreme, " is_watersheds: ", is_watersheds)
    # Each optional feature source widens the encoder input by its column count.
    if(is_watersheds==1):
        input_dim = len(local_set) + 1
        print("the input_dim is: ", input_dim)
    
    if(is_prob_feature==1):
        input_dim += 1
        print("the input_dim with GM is: ", input_dim)
    
    # Exactly one of the three model modes must be selected.
    if(is_extreme+is_single+is_normal!=1):
        print("Error: is_extreme, is_single,is_normal can and only can be chosen one.")
        sys.exit()
    
    val_skips = []

    if(is_watersheds==1):
        # set according to the rain sensor's available data scope
        start_point = '1991-12-18 14:30:00'
        train_point = '2017-12-18 14:30:00'
        
    sensor_data, train_end, val_skips = read_dataset(sensor_id=sensor_id, start_point=start_point, train_point=train_point)
  
    # Normalise the target series; mean and std are kept for de-normalisation.
    if(is_diff==0):
        sensor_data_norm, mean, std = std_norm_dataset(sensor_data)
        print("I am using std_norm.")
    else:
        sensor_data_norm, mean, std = diff_norm_dataset(sensor_data)
        print("I am using diff_std_norm.")


    print("If I am using over_sampling, the norm_percent is: ", norm_percent)
    
    # Name fragments used in the output and model file paths below.
    if(is_diff==0):
        norm = "std"
    else:
        norm = "diff"
        
    if(is_over_sampling==1):
        OS = "_OS" + str(norm_percent)
    else:
        OS = "_OS-null"
        
    if(is_single==1):
        single = "single"
    else:
        if(is_normal==1):
            single = "normal"
        else:
            single = "extreme"
            
    if(is_watersheds==1):
        if(is_prob_feature==0):
            watersheds = "shed"
        else:
            watersheds = "Shed-ProbFeature"
    else:
        if(is_prob_feature==0):
            watersheds = "solo"
        else:
            watersheds = "ProbFeature"
 
    # Feature matrix with one row per step; starts as the single target column.
    sensor_data_norm_1 = [[ff] for ff in sensor_data_norm] 

    if(is_watersheds==1):
        # Truncate to the training range and append one column per local sensor.
        sensor_data_norm = sensor_data_norm[:train_end]
        sensor_data_norm_1 = [[ff] for ff in sensor_data_norm] 
        for k in range(len(local_set)):
            sensor_data_local, _ , _ = read_dataset(sensor_id=local_set[k], start_point=start_point, train_point=train_point)
            # NOTE(review): with is_diff==1 the local sensor gets std_norm_dataset and
            # with is_diff==0 it gets diff_norm_dataset, the opposite of the choice made
            # for the main sensor above -- confirm this is intended.
            if(is_diff==1):
                sensor_data_norm_local, mean_local, std_local = std_norm_dataset(sensor_data_local)
            else:
                sensor_data_norm_local, mean_local, std_local = diff_norm_dataset(sensor_data_local)
            print("local sensor id is: ", local_set[k], " mean is: ", mean_local, " std is: ", std_local)
            sensor_data_norm_local = sensor_data_norm_local[:train_end]
            sensor_data_norm_local = [[ff] for ff in sensor_data_norm_local]
            sensor_data_norm_1 = np.concatenate((sensor_data_norm_1, sensor_data_norm_local), 1)

    if(is_prob_feature==1):
        # Fit a 3-component Gaussian mixture to the normalised series and append a
        # derived per-step column.  No random_state is passed, so the fit is not seeded.
        gm = GaussianMixture(n_components=3, )  # using GMM, M=3
        sensor_data_norm_prob = sensor_data_norm.reshape(-1, 1)
        gm.fit(sensor_data_norm_prob)
        print("gm.means are: ", gm.means_)
        print("gm.covariances are: ", gm.covariances_)
        print("gm.weights are: ", gm.weights_)
        weights = gm.weights_
        data_prob = gm.predict_proba(sensor_data_norm_prob)
        # Sum of each component's predict_proba value times that component's weight;
        # the appended feature is one minus that sum.
        prob_in_distribution = data_prob[:, 0] * weights[0] + data_prob[:, 1] * weights[1] + data_prob[:, 2] * weights[2]
        prob_like_outlier = 1- prob_in_distribution
        prob_like_outlier = prob_like_outlier.reshape((len(sensor_data_norm), 1))
        sensor_data_norm_1 = np.concatenate((sensor_data_norm_1,prob_like_outlier), 1)
          
    # Per-step label: 1 when the normalised value exceeds the extreme boundary.
    # Note: this loop reuses the name `i` of the enclosing sensor loop.
    class_label = []
    for i in range(len(sensor_data_norm)):
        if abs(sensor_data_norm[i])>alpha_extreams:
            class_label.append(1)
        else:
            class_label.append(0)
    
    if(is_single==0):
        if(is_normal==1):
            print("I am training only normal data.")        
        else: 
            if(is_extreme==1):
                print("I am training only extremem data.")
    else:
        print("I am training on the whole data set.")


    #paths settings
    basic_path = './val/' + str(sensor_id) + '_' + watersheds       #path used to save validation set prediction
    basic_encoder_model_path = "./model/" + str(sensor_id) + single + '_encoder'   # E_LSTM model path
    basic_decoder_model_path = "./model/" + str(sensor_id) + single + '_outlinear' # E_FC model path
    
    #get train data_loader 
    dataloader = split_dataset(norm_data=sensor_data_norm, local_data=sensor_data_norm_1,train_end=train_end, input_dim=input_dim, output_size=output_dim, train_days=train_days, predict_days=predict_days,train_volum=train_volum)
        
    #create_model
    # output_dim=input_dim has no effect: create_model ignores its output_dim
    # argument and sizes the head from the global predict_days.
    encoder, outlinear, encoder_optimizer, outlinear_optimizer, criterion = create_model(input_dim=input_dim, hidden_dim=hidden_dim, layer_dim=layer_dim, output_dim=input_dim)

    #train the model
    train_loop()    #if only inferencing, just hide this line.
    
    print("Finish!! Sensor id is :---------------------", sensor_id)    

    #generate test prediction
    # Reload the checkpoints written by generate_val_rmse, which saves the whole
    # module objects with torch.save.
    # NOTE(review): recent PyTorch releases are reported to default torch.load to
    # weights_only=True, which would reject whole-module files -- confirm against
    # the installed version.
    encoder = torch.load(basic_encoder_model_path+'_'+norm+'_'+watersheds+".pt")
    outlinear = torch.load(basic_decoder_model_path+'_'+norm+'_'+watersheds+".pt" )
    encoder.eval()
    outlinear.eval()
    generate_test("./test/", str(sensor_id)+single) # the path of generated test prediction
