In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
models=["mlp","cnn","lstm"]
columns=["readings","cycle","trend"]
files_train=os.listdir("dataset/train")
files_test=os.listdir("dataset/test")


In [3]:
from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm

def prepare_dataset(df):
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['filled']=df['meter_reading'].apply(lambda x: 0)
    existing_hours = df['timestamp'].dt.floor('H').unique()
    
    start_date = df['timestamp'].min().replace(minute=0, second=0)
    end_date = df['timestamp'].max().replace(minute=0, second=0)
    date_range = pd.date_range(start=start_date, end=end_date, freq='H')
    all_hours_present = all(hour in existing_hours for hour in date_range)
    if not(all_hours_present):
        complete_df = pd.DataFrame({'timestamp': date_range})
        df = complete_df.merge(df, on='timestamp', how='left')
        df['filled']=df['filled'].fillna(1)
        df['meter_reading'] = df['meter_reading'].interpolate(method='linear', limit_direction='both')
        df.reset_index(inplace=True, drop=True)
    #apply minmax scaler
    scaler = MinMaxScaler(feature_range=(0, 1))
    df['meter_reading'] = scaler.fit_transform(df['meter_reading'].values.reshape(-1,1))
    grouped_df = df.groupby(df['timestamp'].dt.date)
    
    # Aggregate 'meter_reading' values into a list for each day
    aggregated_df = grouped_df.agg({'meter_reading': list, 'anomaly': list, 'filled':list}).reset_index()

    # Rename columns and sort by date
    aggregated_df.columns = ['date', 'readings', 'anomalies','filled']
    aggregated_df = aggregated_df.sort_values(by='date')

    # Display the aggregated dataframe
    aggregated_df["length"] = aggregated_df["readings"].apply(lambda lst: len([x for x in lst if not pd.isna(x)]))
    aggregated_df["no_anomalies"] = aggregated_df["anomalies"].apply(lambda x: True if all(val == 0 for val in x) else False)
    aggregated_df["filled"] = aggregated_df["filled"].apply(lambda x: False if all(val == 0 for val in x) else True)


    df=aggregated_df[aggregated_df["length"]==24]
    df['cycle'] = df["readings"].apply(lambda x: sm.tsa.filters.hpfilter(x, 2)[0])
    df['trend'] = df["readings"].apply(lambda x: sm.tsa.filters.hpfilter(x, 2)[1])
    df["months"] = df["date"].apply(lambda x: str(x.month))
    df["weekday"] = df["date"].apply(lambda x: str(x.weekday()))
    df["weekend"] = df["weekday"].apply(lambda x: 1 if x in ["5","6"] else 0)
    return df

In [4]:
train_datasets=[]
test_datasets=[]

for file in files_train:
    dataset=pd.read_csv("dataset/train/"+file)
    train_dataset=prepare_dataset(dataset)
    train_datasets.append(train_dataset)
for file in files_test:
    dataset=pd.read_csv("dataset/test/"+file)
    test_dataset=prepare_dataset(dataset)
    test_datasets.append(test_dataset)
# concatenate all the csv files into one
train_dataset= pd.concat(train_datasets)
test_dataset= pd.concat(test_datasets)

train_dataset=train_dataset[train_dataset["filled"]==0]
test_dataset=test_dataset[test_dataset["filled"]==0]
print(train_dataset.shape)

(1461, 11)


In [5]:
train_dataset.head()

Unnamed: 0,date,readings,anomalies,filled,length,no_anomalies,cycle,trend,months,weekday,weekend
0,2016-01-01,"[0.4949748743718593, 0.5678391959798995, 0.522...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,24,True,"[-0.03123344778510584, 0.03941239826478582, 0....","[0.5262083221569651, 0.5284267977151137, 0.515...",1,4,0
1,2016-01-02,"[0.4396984924623116, 0.4271356783919598, 0.487...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,24,True,"[0.0018269408516842778, -0.021690685703229518,...","[0.4378715516106273, 0.44882636409518933, 0.46...",1,5,1
2,2016-01-03,"[0.41959798994974873, 0.4396984924623116, 0.52...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,24,True,"[-0.009053880414004123, -0.01316737908346971, ...","[0.42865187036375285, 0.4528658715457813, 0.47...",1,6,1
3,2016-01-04,"[0.42462311557788945, 0.4296482412060301, 0.42...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,24,True,"[-0.0003693508051154648, 0.0022657521224360155...","[0.4249924663830049, 0.4273824890835941, 0.429...",1,0,0
4,2016-01-05,"[0.5477386934673367, 0.5402010050251257, 0.545...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",False,24,True,"[-0.0027492076873769644, -0.004783365280123131...","[0.5504879011547137, 0.5449843703052488, 0.538...",1,1,0


In [6]:
def make_numpy(column):
    dataset=np.array([np.array(i) for i in column])

    return dataset

In [188]:
# Define the binary classifier model
class BinaryClassifier(nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2,class_weights = [1, 5]):
        super(BinaryClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden_size2, 1)
        self.sigmoid = nn.Sigmoid()
        self.dropout1 = nn.Dropout(p=0.3)
        if class_weights is not None:
            self.class_weights = torch.tensor(class_weights, dtype=torch.float32)
        else:
            self.class_weights = None


    def forward(self, x):
        x = x.permute(1, 0) 
        print(x.shape)
        x = self.relu1(self.fc1(x))
        x = self.dropout1(x)
        x = self.relu2(self.fc2(x))
        x = self.dropout1(x)
        x = self.fc3(x)
        return x
    def calculate_loss(self, outputs, targets):
        # Use weighted binary cross-entropy loss
        criterion = nn.BCEWithLogitsLoss(pos_weight=self.class_weights)
        loss = criterion(outputs, targets)
        return loss
    
class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size=1):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # Set initial hidden and cell states to zero
        h0 = torch.zeros(self.lstm.num_layers, x.size(0), self.lstm.hidden_size).to(x.device)
        c0 = torch.zeros(self.lstm.num_layers, x.size(0), self.lstm.hidden_size).to(x.device)

        # Forward propagate LSTM
        out, _ = self.lstm(x, (h0, c0))

        # Extract the output from the last time step
        out = out[:, -1, :]

        # Fully connected layer
        out = self.fc(out)

        # Apply sigmoid activation function
        out = self.sigmoid(out)

        return out

In [189]:
def labelize(arr):
    arr2=[]
    for x in arr:
        if x=[1,0]:
            arr2.append([1.,0.])
        else:
            arr2.append([0.,1.])
    return np.array(arr2)

SyntaxError: invalid syntax. Maybe you meant '==' or ':=' instead of '='? (1571649736.py, line 4)

In [190]:

from collections import Counter

# Your list of values
my_list = y_train.detach().numpy()[0].tolist()

# Count the occurrences of each value
value_counts = Counter(my_list)

# Print the result
for value, count in value_counts.items():
    print(f"Value {value} appears {count} times.")

Value 1.0 appears 1 times.
Value 0.0 appears 1 times.


In [191]:
1367/94

14.542553191489361

In [192]:
def get_labels(x):
    
    if x==1:
        return(np.array([1,0]))
    else:
        return(np.array([0,1]))

In [193]:
y_train=train_dataset["no_anomalies"].apply(lambda x: np.array([1,0]) if x==True else np.array([0,1]))
labels_y=[]
for x in y_train:
    labels_y.append(x)
labels_y=np.array(labels_y)
labels_y=torch.from_numpy(labels_y).float()
labels_y

tensor([[1., 0.],
        [1., 0.],
        [1., 0.],
        ...,
        [1., 0.],
        [1., 0.],
        [1., 0.]])

In [194]:
y_train.squeeze(0)

0      [1, 0]
1      [1, 0]
2      [1, 0]
3      [1, 0]
4      [1, 0]
        ...  
361    [1, 0]
362    [1, 0]
363    [1, 0]
364    [1, 0]
365    [1, 0]
Name: no_anomalies, Length: 1461, dtype: object

In [195]:
from sklearn.metrics import cohen_kappa_score
for model_n in models:
    for column in columns:
        X=train_dataset[column]
        X_train=torch.Tensor(make_numpy(X)).transpose(0,1)
        X_test=torch.Tensor(make_numpy(test_dataset[column])).transpose(0,1)
        y_train=train_dataset["no_anomalies"].apply(lambda x: np.array([1,0]) if x==True else np.array([0,1]))
        labels_y=[]
        for x in y_train:
            labels_y.append(x)
        labels_y=np.array(labels_y)
        labels_y=torch.from_numpy(labels_y).float()
        y_train=labels_y

        y_test=test_dataset["no_anomalies"].apply(lambda x: np.array([1,0]) if x==True else np.array([0,1]))
        labels_y=[]
        for x in y_test:
            labels_y.append(x)
        labels_y=np.array(labels_y)
        labels_y=torch.from_numpy(labels_y).float()
        y_test=labels_y

        num_epochs = 10000
        print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)
        if model_n=="mlp":

            input_size = 24
            hidden_size1 = 18
            hidden_size2 = 8
            model = BinaryClassifier(input_size, hidden_size1, hidden_size2)
            
            optimizer = optim.Adam(model.parameters(), lr=0.001)
            for epoch in range(num_epochs):
                # Forward pass
                outputs = model(X_train)
                loss = model.calculate_loss(outputs, y_train)
                print(outputs)
                # Backward pass and optimization
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                if epoch == 0:
                    best_val_loss = loss
                    best_model = model
                # Print loss every 100 epochs
                if (epoch + 1) % 100 == 0:
                    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
                if (epoch + 1) % 20 == 0:
                    outputs = model(X_test)
                    loss = model.calculate_loss(outputs, y_test)
                    #print(f'Test Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
                    best_val_loss = min(loss,best_val_loss)
                    if loss==best_val_loss:
                        best_model = model
                        #print("best model updated at epoch ",epoch+1)
                        #print(outputs.squeeze(1).detach().numpy())
                        threshold = 0.5
                        predicted_classes = (outputs > threshold).float()
                        best_kappa=cohen_kappa_score(predicted_classes.squeeze(1).detach().numpy(), y_test.squeeze(0).detach().numpy())
            #save model
            torch.save(best_model.state_dict(), "models/"+model_n+"_"+column+"_kappa_"+str(best_kappa)+".pt")
            print("best model saved, kappa = ",best_kappa)




torch.Size([24, 1461]) torch.Size([24, 1452]) torch.Size([1461, 2]) torch.Size([1452, 2])
torch.Size([1461, 24])


ValueError: Target size (torch.Size([1461, 2])) must be the same as input size (torch.Size([1461, 1]))

In [None]:
y_test

tensor([[1., 0.],
        [1., 0.],
        [1., 0.],
        ...,
        [1., 0.],
        [1., 0.],
        [1., 0.]])

In [187]:


predicted_classes

tensor([[1., 0.],
        [1., 1.],
        [1., 0.],
        ...,
        [1., 0.],
        [1., 0.],
        [0., 0.]])

In [145]:
outputs.type()

'torch.FloatTensor'

In [69]:
labelize(outputs.squeeze(1).detach().numpy()), y_test.squeeze(0).detach().numpy()


(array([1., 1., 1., ..., 1., 1., 1.]),
 array([1., 1., 1., ..., 1., 1., 1.], dtype=float32))

In [46]:
y_test

tensor([[1.],
        [1.],
        [1.],
        ...,
        [1.],
        [1.],
        [1.]])

In [43]:
torch.Tensor(y_train.squeeze(1).detach().numpy())

tensor([1., 1., 1.,  ..., 1., 1., 1.])

In [21]:
outputs

tensor([[0.5036],
        [0.5023],
        [0.5012],
        ...,
        [0.5202],
        [0.5203],
        [0.5203]], grad_fn=<SigmoidBackward0>)

In [31]:
labelize(outputs.squeeze(1).detach().numpy())

1