In [150]:
import numpy as np
import pandas as pd
from datetime import datetime
from sktime.transformations.panel.rocket import Rocket
from datetime import datetime, timedelta
from sklearn.linear_model import RidgeCV
import plotly.express as px
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) # only for pandas version < 1.9
from sklearn.model_selection import train_test_split
from scipy.fft import fft


## Create Dataframe with Label

In [151]:
features=['x',"y","z"]
df_features = pd.read_parquet("df_valve.parquet")


In [152]:
df_features

Unnamed: 0,x,y,z,time,label
0,1932,-134,134,2023-01-10 11:11:11,479.577271
1,2018,10,146,2023-01-10 11:11:11,479.577271
2,2263,25,101,2023-01-10 11:11:11,479.577271
3,2275,-85,87,2023-01-10 11:11:11,479.577271
4,2092,-63,52,2023-01-10 11:11:11,479.577271
...,...,...,...,...,...
3409915,2190,-445,234,2023-01-10 14:22:00,644.114685
3409916,2019,-495,309,2023-01-10 14:22:00,644.114685
3409917,1889,-410,346,2023-01-10 14:22:00,644.114685
3409918,1987,-404,406,2023-01-10 14:22:00,644.114685


## Preprocess data for prediction

In [153]:
# put data into a dataframe as expected by ROCKET
def transform_to_rocket_1feature(df:pd.DataFrame,id):
    df_Rocket=pd.DataFrame(columns=[id])
    df_Y=pd.DataFrame(columns=[id])
    df_Time=pd.DataFrame(columns=['time'])
   
    for g in df.groupby(by=['time']):
        indxC=df_Rocket.index.max()
        if indxC!=indxC:
            indxC=0
        else:
            indxC=indxC+1
        df_Rocket.loc[indxC]= [g[1][id]]
        df_Y=df_Y.append({id: g[1]['label'].iloc[0]}, ignore_index=True)
        df_Time=df_Time.append({'time': g[1]['time'].iloc[0]}, ignore_index=True)
    return df_Rocket,df_Y,df_Time

def transform_to_rocket(df,features):
    dfX=None
    dfY=None
    dfTime=None
    for id in features:
        dfX_,dfY_,dfTime_=transform_to_rocket_1feature(df,id)
        if dfX is None:
            dfX=dfX_
            dfY=dfY_
            dfTime=dfTime_
        else:
            dfX[id]=dfX_[id]
            
    return dfX, dfY, dfTime

In [154]:
# transform training data to ROCKET df
X_train,dfY_train,dfTime_train=transform_to_rocket(df_features,features)

X_train, X_test, y_train, y_test, time_train, time_test = train_test_split(X_train, dfY_train, dfTime_train, test_size=0.35, random_state=42, shuffle=False)

In [155]:
def convert_to_fft(df, N=512):
    """
    FFT transformation
    """
    pd_list = []
    for row in df.iterrows():
        var_splits = []
        for array in row[1]:
            yf = fft(array)

            var_splits.append(np.abs(yf)[1:int(N/2)])
            
        pd_list.append(np.concatenate(var_splits))
        
    transform = pd.DataFrame(pd_list)
    return transform

In [156]:
X_train_transform = convert_to_fft(X_train)
X_test_transform = convert_to_fft(X_test)

## Train the model

In [358]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score

In [158]:
# Fit ridgecv regression model
rm=RidgeCV(alphas=np.logspace(-3, 3, 10))
rm.fit(X_train_transform, y_train)

score_train=rm.score(X_train_transform, y_train)
predictions_train=rm.predict(X_train_transform)

In [159]:
# Fit random forest regression model
regr = RandomForestRegressor() 
regr.fit(X_train_transform, y_train.to_numpy().ravel())

score_train_regr=regr.score(X_train_transform, y_train.to_numpy().ravel())
predictions_train_regr=regr.predict(X_train_transform)

In [160]:
score_test_regr=regr.score(X_test_transform, y_test)
predictions_test_regr=regr.predict(X_test_transform)

In [161]:
score_test=rm.score(X_test_transform, y_test)
predictions_test=rm.predict(X_test_transform)

## Messure modell performance

In [162]:
from sklearn.metrics import mean_squared_error, r2_score
print("Linear regression:")
print("RMSE for trainset:", mean_squared_error(y_train, predictions_train, squared=False))
print("RMSE for testset:", mean_squared_error(y_test, predictions_test, squared=False))
print(" ")
# Random forest regressor
print("Random forest regressor:")
print("RMSE for trainset:", mean_squared_error(y_train, predictions_train_regr, squared=False))
print("RMSE for testset:", mean_squared_error(y_test, predictions_test_regr, squared=False))

Linear regression:
RMSE for trainset: 2646.46102805802
RMSE for testset: 5502.151998725147
 
Random forest regressor:
RMSE for trainset: 689.5711564798806
RMSE for testset: 3337.8961472934375


In [163]:
# Linear regression
print("Linear regression:")
print('Score for training data, features {}: {}'.format(features, score_train))
print('Score for test data, features {}: {}'.format(features, score_test))
print(" ")
# Random forest regressor
print("Random forest regressor:")
print('Score for training data, features {}: {}'.format(features, score_train_regr))
print('Score for test data, features {}: {}'.format(features, score_test_regr))

Linear regression:
Score for training data, features ['x', 'y', 'z']: 0.8352715541904303
Score for test data, features ['x', 'y', 'z']: -0.28674480504681243
 
Random forest regressor:
Score for training data, features ['x', 'y', 'z']: 0.9888160357551599
Score for test data, features ['x', 'y', 'z']: 0.5264423065619023


## Plotting model results

In [164]:
df_pred = pd.DataFrame(predictions_train, columns=['Prediction'])
label = y_train.reset_index()
label.drop(columns="index", inplace=True)
label.rename(columns={"x":"Label"}, inplace=True)
time_train.sort_values(by="time", inplace=True)
df_train = pd.merge(df_pred, label, left_index=True, right_index=True)

In [357]:
fig = px.line(df_train, y=["Prediction", "Label"], x=time_train["time"], title="Regression model prediction vs true value on train data",
labels={"variable": "Variable"})
fig.update_layout(
    xaxis_title="Time",
    yaxis_title = "Value"

    )


fig.show()

In [166]:
df_predt = pd.DataFrame(predictions_test, columns=['Prediction'])
labelt = y_test.reset_index()
labelt.drop(columns="index", inplace=True)
labelt.rename(columns={"x":"Label"}, inplace=True)
time_test.sort_values(by="time", inplace=True)
df_test = pd.merge(df_predt, labelt, left_index=True, right_index=True)

In [356]:
fig = px.line(df_test, y=["Prediction", "Label"], x=time_test["time"], title="Regression model prediction vs true value on test data",
             labels={"variable": "Variable"})
fig.update_layout(
    xaxis_title="Time",
    yaxis_title = "Value"

    )


fig.show()

In [352]:
df_predrf = pd.DataFrame(predictions_train_regr, columns=['Prediction'])
label = y_train.reset_index()
label.drop(columns="index", inplace=True)
label.rename(columns={"x":"Label"}, inplace=True)
time_train.sort_values(by="time", inplace=True)
df_trainrf = pd.merge(df_predrf, label, left_index=True, right_index=True)

In [353]:
df_predrft = pd.DataFrame(predictions_test_regr, columns=['Prediction'])
label = y_test.reset_index()
label.drop(columns="index", inplace=True)
label.rename(columns={"x":"Label"}, inplace=True)
time_train.sort_values(by="time", inplace=True)
df_testrf = pd.merge(df_predrft, label, left_index=True, right_index=True)

In [354]:
fig = px.line(df_trainrf, y=["Prediction", "Label"], x=time_train["time"], title="Random forest model prediction vs true value on train data",
             labels={"variable": "Variable"})
fig.update_layout(
    xaxis_title="Time",
    yaxis_title = "Value"

    )


fig.show()

In [355]:
fig = px.line(df_testrf, y=["Prediction", "Label"], x=time_test["time"], title="Random forest model prediction vs true value on test data",
             labels={"variable": "Variable"})
fig.update_layout(
    xaxis_title="Time",
    yaxis_title = "Value"

    )


fig.show()

## Deep neuroal network

In [170]:
from torch import nn
import torch
from sklearn.preprocessing import StandardScaler

In [259]:
class NN(nn.Module):
  '''
    Deep neural Network for Regression
  '''
  def __init__(self):
    super().__init__()
    self.layers = nn.Sequential(
      nn.Linear(765, 64),
      nn.ReLU(),
      nn.Linear(64, 32),
      nn.ReLU(),
      nn.Linear(32, 1),
    )


  def forward(self, x):
    '''
      Forward pass
    '''
    return self.layers(x)

In [260]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

Using cpu device


In [261]:
train_x = torch.Tensor(X_train_transform.to_numpy()).float().to(device)
test_x = torch.Tensor(X_test_transform.to_numpy()).float().to(device)
train_y = torch.Tensor(y_train.to_numpy()).float().to(device)
test_y = torch.Tensor(y_test.to_numpy()).float().to(device)

In [316]:
import time

dnn = NN()
dnn.to(device)    
dnn.train()      


learning_rate = 0.0000000001

loss_func = nn.MSELoss()  # no softmax activation in the network, it needs raw data for the loss 

optimizer = torch.optim.SGD(dnn.parameters(), lr=learning_rate, momentum = 0.9) # execute GD, gets (parameters aka weights, and learning rate, with momentum)

start_time = time.time()
losses = []
test_losses = []

In [317]:
no_epochs = 4000

for iteration in range(no_epochs):
    
    optimizer.zero_grad() # set gradients of all tensor to 0 
    y_hat = dnn(train_x) # we predict on all data points (= batch gradient descent) 
    
    loss = loss_func(y_hat, train_y) # calculate the loss
    loss.backward() # backpropagate the loss to calculate gradients
    optimizer.step() # update the weights using these gradients 
    
    losses.append(loss.item())
    
    if iteration % 20 == 0:
        test_loss = loss_func(dnn(test_x), test_y)
        test_losses.append(test_loss.item())
        print(f"Loss in epoch {iteration} is {loss.item()} and test loss is {test_loss.item()}")

Loss in epoch 0 is 186151728.0 and test loss is 28800814.0
Loss in epoch 20 is 25523184.0 and test loss is 42521488.0
Loss in epoch 40 is 14425206.0 and test loss is 27563104.0
Loss in epoch 60 is 10868683.0 and test loss is 18168634.0
Loss in epoch 80 is 8329524.0 and test loss is 14081560.0
Loss in epoch 100 is 6871328.5 and test loss is 12248671.0
Loss in epoch 120 is 5955430.0 and test loss is 11445415.0
Loss in epoch 140 is 5153142.5 and test loss is 10599885.0
Loss in epoch 160 is 4483145.5 and test loss is 10122042.0
Loss in epoch 180 is 3922793.0 and test loss is 9930985.0
Loss in epoch 200 is 3476405.75 and test loss is 9665673.0
Loss in epoch 220 is 3125899.25 and test loss is 9448323.0
Loss in epoch 240 is 2846424.25 and test loss is 9191011.0
Loss in epoch 260 is 2619344.75 and test loss is 9010010.0
Loss in epoch 280 is 2432362.25 and test loss is 8916006.0
Loss in epoch 300 is 2276120.0 and test loss is 8826677.0
Loss in epoch 320 is 2143156.25 and test loss is 8682026.0


In [360]:
NN_r2tr = r2_score(dnn(train_x).detach().numpy(), 
         train_y.numpy()
         )
NN_r2te = r2_score(dnn(test_x).detach().numpy(), 
         test_y.numpy()
         )
print("Neural network:")
print('Score for training data, features {}: {}'.format(features, NN_r2tr))
print('Score for training data, features {}: {}'.format(features, NN_r2te))

Neural network:
Score for training data, features ['x', 'y', 'z']: 0.9935216542662132
Score for training data, features ['x', 'y', 'z']: 0.6821018068636575


## Plotting the modell results

In [319]:
fig = px.line(x=range(0, no_epochs), y=losses, title="Network loss over number of epochs")
fig.update_layout(
    xaxis_title="Number of epchs",
    yaxis_title = "Loss"
)
fig.show()

In [320]:
fig = px.line(x=range(0, 200), y=test_losses, title="Network loss over number of epochs unseen data")
fig.update_layout(
    xaxis_title="Number of epchs",
    yaxis_title = "Loss"
)
fig.show()

In [321]:
dnn.eval()
y_train_pred = dnn(train_x)


In [322]:
y_train_pred_list = [line.item() for line in y_train_pred]
train_y_list = [line.item() for line in train_y]

In [334]:
fig = px.line(x=time_train["time"], y=[y_train_pred_list, train_y_list], title="Model prediction vs true value on test data",
              labels={"variable": "Variable"})
newnames = {'wide_variable_0':'Prediction', 'wide_variable_1': 'Label'}
fig.for_each_trace(lambda t: t.update(name = newnames[t.name],
                                      legendgroup = newnames[t.name],
                                      hovertemplate = t.hovertemplate.replace(t.name, newnames[t.name])
                                     )
                  )
fig.update_layout(
    xaxis_title="Time",
    yaxis_title = "Value"

    )

fig.show()


In [324]:
dnn.eval()
y_test_pred = dnn(test_x)

In [325]:
y_test_pred_list = [line.item() for line in y_test_pred]
test_y_list = [line.item() for line in test_y]

In [332]:
fig = px.line(x=time_test["time"], y=[y_test_pred_list, test_y_list], title="Model prediction vs true value on test data",
              labels={"variable": "Variable"})
newnames = {'wide_variable_0':'Prediction', 'wide_variable_1': 'Label'}
fig.for_each_trace(lambda t: t.update(name = newnames[t.name],
                                      legendgroup = newnames[t.name],
                                      hovertemplate = t.hovertemplate.replace(t.name, newnames[t.name])
                                     )
                  )
fig.update_layout(
    xaxis_title="Time",
    yaxis_title = "Value"

    )

fig.show()
