In [2]:
import os
models=["mlp","cnn","lstm"]
columns=["readings","cycle","trend"]
files_train=os.listdir("dataset/train")
files_test=os.listdir("dataset/test")

In [3]:
from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm

def prepare_dataset(df):
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['filled']=df['meter_reading'].apply(lambda x: 0)
    existing_hours = df['timestamp'].dt.floor('H').unique()
    
    start_date = df['timestamp'].min().replace(minute=0, second=0)
    end_date = df['timestamp'].max().replace(minute=0, second=0)
    date_range = pd.date_range(start=start_date, end=end_date, freq='H')
    all_hours_present = all(hour in existing_hours for hour in date_range)
    if not(all_hours_present):
        complete_df = pd.DataFrame({'timestamp': date_range})
        df = complete_df.merge(df, on='timestamp', how='left')
        df['filled']=df['filled'].fillna(1)
        df['meter_reading'] = df['meter_reading'].interpolate(method='linear', limit_direction='both')
        df.reset_index(inplace=True, drop=True)
    #apply minmax scaler
    scaler = MinMaxScaler(feature_range=(0, 1))
    df['meter_reading'] = scaler.fit_transform(df['meter_reading'].values.reshape(-1,1))
    grouped_df = df.groupby(df['timestamp'].dt.date)
    
    # Aggregate 'meter_reading' values into a list for each day
    aggregated_df = grouped_df.agg({'meter_reading': list, 'anomaly': list, 'filled':list}).reset_index()

    # Rename columns and sort by date
    aggregated_df.columns = ['date', 'readings', 'anomalies','filled']
    aggregated_df = aggregated_df.sort_values(by='date')

    # Display the aggregated dataframe
    aggregated_df["length"] = aggregated_df["readings"].apply(lambda lst: len([x for x in lst if not pd.isna(x)]))
    aggregated_df["no_anomalies"] = aggregated_df["anomalies"].apply(lambda x: True if all(val == 0 for val in x) else False)
    aggregated_df["filled"] = aggregated_df["filled"].apply(lambda x: False if all(val == 0 for val in x) else True)


    df=aggregated_df[aggregated_df["length"]==24]
    df['cycle'] = df["readings"].apply(lambda x: sm.tsa.filters.hpfilter(x, 2)[0])
    df['trend'] = df["readings"].apply(lambda x: sm.tsa.filters.hpfilter(x, 2)[1])
    df["months"] = df["date"].apply(lambda x: str(x.month))
    df["weekday"] = df["date"].apply(lambda x: str(x.weekday()))
    df["weekend"] = df["weekday"].apply(lambda x: 1 if x in ["5","6"] else 0)
    return df

In [4]:
import pandas as pd
train_datasets=[]
test_datasets=[]

for file in files_train:
    dataset=pd.read_csv("dataset/train/"+file)
    train_dataset=prepare_dataset(dataset)
    train_datasets.append(train_dataset)
for file in files_test:
    dataset=pd.read_csv("dataset/test/"+file)
    test_dataset=prepare_dataset(dataset)
    test_datasets.append(test_dataset)
# concatenate all the csv files into one
train_dataset= pd.concat(train_datasets)
test_dataset= pd.concat(test_datasets)

train_dataset=train_dataset[train_dataset["filled"]==0]
test_dataset=test_dataset[test_dataset["filled"]==0]
print(train_dataset.shape)

(1461, 11)


In [5]:
train_dataset

Unnamed: 0,date,readings,anomalies,filled,length,no_anomalies,cycle,trend,months,weekday,weekend
0,2016-01-01,"[0.4949748743718593, 0.5678391959798995, 0.522...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,24,True,"[-0.03123344778510584, 0.03941239826478582, 0....","[0.5262083221569651, 0.5284267977151137, 0.515...",1,4,0
1,2016-01-02,"[0.4396984924623116, 0.4271356783919598, 0.487...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,24,True,"[0.0018269408516842778, -0.021690685703229518,...","[0.4378715516106273, 0.44882636409518933, 0.46...",1,5,1
2,2016-01-03,"[0.41959798994974873, 0.4396984924623116, 0.52...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,24,True,"[-0.009053880414004123, -0.01316737908346971, ...","[0.42865187036375285, 0.4528658715457813, 0.47...",1,6,1
3,2016-01-04,"[0.42462311557788945, 0.4296482412060301, 0.42...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,24,True,"[-0.0003693508051154648, 0.0022657521224360155...","[0.4249924663830049, 0.4273824890835941, 0.429...",1,0,0
4,2016-01-05,"[0.5477386934673367, 0.5402010050251257, 0.545...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,24,True,"[-0.0027492076873769644, -0.004783365280123131...","[0.5504879011547137, 0.5449843703052488, 0.538...",1,1,0
...,...,...,...,...,...,...,...,...,...,...,...
361,2016-12-27,"[0.4570536179073399, 0.43987506507027585, 0.44...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,24,True,"[0.005837027211260448, -0.006189871875762865, ...","[0.45121659069607944, 0.4460649369460387, 0.44...",12,1,0
362,2016-12-28,"[0.43102550754815194, 0.43987506507027585, 0.4...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,24,True,"[-0.0017604949444661733, 0.00811677003408795, ...","[0.4327860024926181, 0.4317582950361879, 0.429...",12,2,0
363,2016-12-29,"[0.44351900052056215, 0.4242581988547631, 0.42...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,24,True,"[0.005938749745207805, -0.00670914850011195, 0...","[0.43758025077535434, 0.4309673473548751, 0.42...",12,3,0
364,2016-12-30,"[0.4263404476834982, 0.4299843831337845, 0.436...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,24,True,"[-0.0009624783530313152, 0.0003953337661932177...","[0.42730292603652953, 0.4295890493675913, 0.43...",12,4,0


In [6]:
import numpy as np
def make_numpy(column):
    dataset=np.array([np.array(i) for i in column])

    return dataset

In [7]:
X_train=make_numpy(train_dataset["readings"])
y_train=train_dataset["no_anomalies"].astype(float).values
X_test=make_numpy(test_dataset["readings"])
y_test=test_dataset["no_anomalies"].astype(float).values

In [8]:

import torch
 
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32).reshape(-1, 1)

  from .autonotebook import tqdm as notebook_tqdm


In [9]:

import torch.nn as nn
 

class Deep(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(24, 24)
        self.act1 = nn.ReLU()
        self.layer2 = nn.Linear(24, 24)
        self.act2 = nn.ReLU()
        self.layer3 = nn.Linear(24, 24)
        self.act3 = nn.ReLU()
        self.output = nn.Linear(24, 1)
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(0.2)
 
    def forward(self, x):
        x = self.act1(self.layer1(x))
        #x = self.dropout(x)
        x = self.act2(self.layer2(x))
        #x = self.dropout(x)
        x = self.act3(self.layer3(x))
        x = self.sigmoid(self.output(x))
        return x

In [10]:

targets = y_batch
# Calculate the class weights based on the class distribution
total_samples = len(targets)
epsilon = 1e-5  # a small value to avoid division by zero
weight_positive = total_samples / (weight2 * (torch.sum(targets == 1) + epsilon))
weight_negative = total_samples / (weight1 * (torch.sum(targets == 0) + epsilon))

NameError: name 'y_batch' is not defined

In [11]:
class FocalLoss(torch.nn.Module):
    def __init__(self, gamma=5, alpha=0.5):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha

    def forward(self, output, target):
        bce_loss = torch.nn.functional.binary_cross_entropy_with_logits(output, target, reduction='none')
        pt = torch.exp(-bce_loss)
        focal_loss = (self.alpha * (1 - pt) ** self.gamma * bce_loss).mean()
        return focal_loss

In [19]:
import copy
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import tqdm
 
def model_train(model, X_train, y_train, X_val, y_val, alpha,gamma,weight1=1, weight2=1):
    # loss function and optimizer
    # Assuming you have your predictions and targets


    # Create a weight tensor
    # Generate random indices


    optimizer = optim.Adam(model.parameters(), lr=0.0001)
 
    n_epochs = 1000   # number of epochs to run
    batch_size = 5  # size of each batch :20
    batch_start = torch.arange(0, len(X_train), batch_size)
 
    # Hold the best model
    best_acc = - np.inf   # init to negative infinity
    best_weights = None
    best_kappa=-1
    best_spec=-1
 
    for epoch in range(n_epochs):
        indices = torch.randperm(len(y_train))

        # Shuffle both tensors using the same indices
        X_train = X_train[indices]
        y_train = y_train[indices]
        model.train()
        with tqdm.tqdm(batch_start, unit="batch", mininterval=0, disable=True) as bar:
            bar.set_description(f"Epoch {epoch}")
            for start in bar:
                # take a batch
                X_batch = X_train[start:start+batch_size]
                y_batch = y_train[start:start+batch_size]
                # forward pass
                y_pred = model(X_batch)
                targets = y_batch

                # Calculate the class weights based on the class distribution
                total_samples = len(targets)
                epsilon = 1e-5  # a small value to avoid division by zero
                #weight_positive = total_samples / (weight2 * (torch.sum(targets == 1) + epsilon))
                #weight_negative = total_samples / (weight1 * (torch.sum(targets == 0) + epsilon))
                #weights = torch.where(targets == 1, torch.tensor(weight_positive), torch.tensor(weight_negative))

                
                #return(weights,y_batch,y_pred)
                #loss = torch.nn.functional.binary_cross_entropy_with_logits(y_pred, y_batch, weight=weights )
                loss = FocalLoss(gamma=gamma,alpha=alpha)(y_pred, y_batch)
                # backward pass
                optimizer.zero_grad()
                loss.backward()
                # update weights
                optimizer.step()
                # print progress
                acc = (y_pred.round() == y_batch).float().mean()
                bar.set_postfix(
                    loss=float(loss),
                    acc=float(acc)
                )
        # evaluate accuracy at end of each epoch
        model.eval()
        y_pred = model(X_val)
        acc = (y_pred.round() == y_val).float().mean()
        acc = float(acc)
        kappa=cohen_kappa_score(y_pred.round().detach().numpy()[:,0], y_test.detach().numpy()[:,0])
        true_negatives = np.sum((y_test.detach().numpy()[:,0] == 0) & (y_pred.round().detach().numpy()[:,0] == 0))
        false_positives = np.sum((y_test.detach().numpy()[:,0] == 0) & (y_pred.round().detach().numpy()[:,0] == 1))
        specificity = true_negatives / (true_negatives + false_positives)
        if kappa > best_kappa:
            best_acc = acc
            best_kappa=kappa
            best_spec=specificity
            best_weights = copy.deepcopy(model.state_dict())
    # restore model and return best accuracy
    model.load_state_dict(best_weights)
    return best_kappa,best_spec,best_acc,model

In [None]:
alpha=[i/10 for i in range(1,10)]
gamma=[i/100 for i in range(500,700,5)]
pbar = tqdm.tqdm(total=len(alpha)*len(gamma))
output=pd.DataFrame(columns=["alpha","gamma","kappa","specificity","accuracy"])
for a in alpha:
    for g in gamma:
        model = Deep()
        best_kappa,best_spec,best_acc,model = model_train(model, X_train, y_train, X_test, y_test,alpha=a,gamma=g)
        print("Accuracy (deep): %.2f" % best_acc)
        print("Kappa (deep): %.2f" % best_kappa)
        print("Specificity (deep): %.2f" % best_spec)
        print("alpha: %.2f" % a)
        print("gamma: %.2f" % g)
        print("--------------------------------------------------")
        output=output.append({"alpha":a,"gamma":g,"kappa":best_kappa,"specificity":best_spec,"accuracy":best_acc},ignore_index=True)
        #save csv
        output.to_csv("output6.csv",index=False)
        pbar.update(1)
pbar.close()

In [16]:
from sklearn.metrics import cohen_kappa_score


In [20]:
model = Deep()
best_kappa,best_spec,best_acc,model = model_train(model, X_train, y_train, X_test, y_test,alpha=0.2,gamma=5.6)

In [22]:
best_kappa,best_spec,best_acc

(0.11202231674031227, 0.6842105263157895, 0.6639118194580078)

In [None]:
#save model to file
torch.save(model.state_dict(), "model.pt")

In [None]:
pbar.close()

In [None]:
from sklearn.metrics import cohen_kappa_score
import tqdm

weight2_range=[i/2000 for i in range(400,300,-1)]
weight1_range=[i/10 for i in range(1,20,1)]
print(len(weight1_range),len(weight2_range))
pbar = tqdm.tqdm(total=len(weight1_range)*len(weight2_range))
for i in weight1_range:
    stop=0
    for j in weight2_range:
        
        model = Deep()
        best_kappa,best_acc,model = model_train(model, X_train, y_train, X_test, y_test,weight1=i,weight2=j)
        print("Accuracy (deep): %.2f" % best_acc)
        print("Kappa (deep): %.2f" % best_kappa)
        model.eval()
        outputs = model(X_test)
        y_pred = outputs.round().detach().numpy()
        

        kappa=cohen_kappa_score(y_pred[:,0], y_test.detach().numpy()[:,0])
        if kappa > best_kappa:
            best_kappa=kappa
            best_weights = copy.deepcopy(model.state_dict())
            best_weight1=i
            best_weight2=j
            print("best kappa:",best_kappa,"best weight1:",best_weight1,"best weight2:",best_weight2)
            stop=0
        else:
            stop+=1
            if stop>3:
                break
                
        pbar.update(1)
pbar.close()


In [None]:
model = Deep()
best_kappa,best_acc,model = model_train(model, X_train, y_train, X_test, y_test,weight1=i,weight2=j)

In [None]:
pbar.close()

In [None]:
best_kappa,best_weight1,best_weight2 #with dropout 0.2

In [None]:
best_kappa,best_weight1,best_weight2 #without dropout 0.2

In [None]:
best_kappa,best_weight1,best_weight2 #without dropout 0.2, larger model

In [None]:
best_kappa,best_weight1,best_weight2 #without dropout 0.2, larger model new weight range

In [None]:
threshold = 0.5
predicted_classes = (outputs > threshold)

In [None]:
from collections import Counter

# Your list of values
my_list = predicted_classes[:,0].tolist()

# Count the occurrences of each value
value_counts = Counter(my_list)

# Print the result
for value, count in value_counts.items():
    print(f"Value {value} appears {count} times.")