In [34]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.signal import find_peaks

import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import yaml

from typing import Dict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import TensorDataset, DataLoader

%load_ext kedro.ipython
device = torch.device('mps')

The kedro.ipython extension is already loaded. To reload it, use:
  %reload_ext kedro.ipython


Write functions to check if each node produces valid data that are suitable for next step

# Functions to combine dataset

In [35]:
# get data between exp_no 104 to 113
# append data of 105 at the end of 104 etc

def get_data(exp_no) -> pd.DataFrame:
    file_name = f"{exp_no}_SHT_SMD.txt"
    file_path = f"../data/01_raw/{file_name}"
    df = pd.read_csv(file_path, sep=',', usecols=['timestamp', 'SHT40_temp', 'SHT40_Humidity', 'A1_Sensor', 'A1_Resistance'])
    return df

def concat_data(start:int,end:int) -> pd.DataFrame:
    df = pd.concat([get_data(exp_no) for exp_no in range(start, end)])
    df = df.reset_index(drop=True)
    return df

s_file = 108
e_file = 113

concat_data = concat_data(s_file,e_file)
# concat_data(s_file,e_file).to_parquet(f'../data/02_intermediate/{s_file}_{e_file}.pq')

# Data processing node

In [36]:
def _hi_lo_peak(x: pd.DataFrame) -> pd.DataFrame:
    peaks, properties = find_peaks(x['A1_Sensor'], width=50, height=1)
    peak_heights = properties['peak_heights']
# Determine smaller and larger peaks
    smaller_peaks, larger_peaks = [], []
    for i in range(len(peaks) - 1):
        if peak_heights[i] > peak_heights[i + 1]:
            larger_peaks.append(peaks[i])
            smaller_peaks.append(peaks[i + 1])
    # smaller_peaks_df = x.iloc[smaller_peaks]
    return smaller_peaks

def data_stack(sp: pd.DataFrame, df: pd.DataFrame) -> pd.DataFrame:
    """
    After finding the peaks, stack the data according to exp_no
    """
    df_stacked_list = []
    for i in range(len(sp) - 1):
        df_subset = df.iloc[sp[i]:sp[i + 1]].copy()
        df_subset['exp_no'] = i
        df_subset['timestamp'] -= df_subset['timestamp'].iloc[0]
        df_stacked_list.append(df_subset)
        df_stacked = pd.concat(df_stacked_list, ignore_index=True)
    return df_stacked


def _group_by_bin(df_stacked: pd.DataFrame, num_bins: int) -> pd.DataFrame:
    """
    Use PD.CUT to group data into specified bins in parameters
    """
    df_list = []
    grouped = df_stacked.groupby('exp_no')
    for name, group in grouped:
        group['bin'] = pd.cut(group['timestamp'], bins=num_bins, labels=False)
        df_list.append(group)
    return pd.concat(df_list)

def _average_bin(bin_df: pd.DataFrame) -> pd.DataFrame:
    """
    average values within each bin to return only one data point
    """
    bin_df = bin_df.drop(columns=['timestamp'])
    grouped = bin_df.groupby(['exp_no', 'bin']).mean()
    return grouped.reset_index()

def preprocess_data_bin(mox: pd.DataFrame, num_bins: int) -> pd.DataFrame:
    """
    Return data that is sorted by experiment number according to lo_peak interval
    data is stacked and labeled by exp_no
    data is grouped by bin and averaged
    """
    df_stacked = data_stack(_hi_lo_peak(mox), mox)
    bin_df = _group_by_bin(df_stacked, num_bins)
    mean_bin = _average_bin(bin_df)
    return mean_bin

def get_percentile_data(df, percentile):
    """
    Returns the data up to the specified percentile based on the 'bin' column.

    :param df: DataFrame containing the data
    :param percentile: A float value between 0 and 1 representing the percentile
    :return: DataFrame containing the data up to the specified percentile
    """
    # Calculate the bin index corresponding to the percentile
    max_bin = int(percentile * df['bin'].max())

    # Return data up to that bin
    return df[df['bin'] <= max_bin]

def _group_percentile (averaged: pd.DataFrame, percentile_bins: float) -> pd.DataFrame:
    """
    Returns the full specified percentile dataset
    """
    df_list = []
    grouped = averaged.groupby('exp_no')
    for name, group in grouped:
        percentile_data = get_percentile_data(group, percentile_bins)
        df_list.append(percentile_data)
    return pd.concat(df_list)

def _transpose_(df_set: pd.DataFrame) -> pd.DataFrame:
    transposed = df_set.pivot(index='exp_no', columns='bin', values='A1_Resistance')
    transposed.columns = ['bin_' + str(col) for col in transposed.columns]
    transposed.reset_index(inplace=True)
    return transposed


def _res_ratio(averaged: pd.DataFrame) -> pd.DataFrame:
    def calculate_res_ratio(group):
        return group['A1_Resistance'].max() / group['A1_Resistance'].min()

    res_ratio = averaged.groupby('exp_no').apply(calculate_res_ratio).reset_index()
    res_ratio.columns = ['exp_no', 'res_ratio']
    return res_ratio

def _combine_feature_matrix(res_ratio: pd.DataFrame, transposed: pd.DataFrame) -> pd.DataFrame:
    combined = pd.merge(res_ratio, transposed, on='exp_no')
    return combined

def create_model_input_table(mox_bin: pd.DataFrame, percentile_bins: float) -> pd.DataFrame:
    selected_range = _group_percentile(mox_bin, percentile_bins)
    # the ratio is from the entire dataset not filtered to be ground truth
    res_ratio = _res_ratio(mox_bin) 
    transpose_col = _transpose_(selected_range)
    # drop exp_no to avoid training on exp_no
    mox_table = _combine_feature_matrix(transpose_col, res_ratio).drop(columns=['exp_no'])
    return mox_table

# Parameters

In [37]:
with open('nb_parameters.yml') as file:
    parameters = yaml.load(file, Loader=yaml.FullLoader)

test_size = parameters['model_options']['test_size']

print(test_size)


# Hyper-parameters 

num_classes = parameters['model_options']['num_classes']
num_epochs = parameters['model_options']['num_epochs']
batch_size = parameters['model_options']['batch_size']
learning_rate = parameters['model_options']['learning_rate']

"""

Each feature as a time step in your sequence, you could set sequence_length to 150 and input_size to 1.
This would mean you are feeding in sequences of length 150, with each time step in the sequence having 1 feature.

"""

input_size = parameters['model_options']['input_size']
sequence_length = parameters['model_options']['sequence_length'] # the window it trains with can be selected
hidden_size = parameters['model_options']['hidden_size']
num_layers = parameters['model_options']['num_layers']
random_state = parameters['model_options']['random_state']

0.2


---
# Process and examine each file

In [38]:
exp_no = 107
percentile_bins = parameters['percentile_bins']
bin_size = parameters['num_bins']

df_exp = get_data(exp_no)
smaller_peaks = _hi_lo_peak(df_exp)
df_stacked = data_stack(smaller_peaks, df_exp)
bin_df = _group_by_bin(df_stacked, bin_size)
mean_bin = _average_bin(bin_df)
mox_bin = preprocess_data_bin(df_exp, bin_size)
selected_range = _group_percentile(mox_bin, percentile_bins)
res_ratio = _res_ratio(mox_bin)
transpose_col = _transpose_(selected_range)
mox_table = _combine_feature_matrix(transpose_col, res_ratio).drop(columns=['exp_no'])


In [39]:
print(bin_df)

        timestamp  SHT40_temp  SHT40_Humidity  A1_Sensor  A1_Resistance  \
0               0       28.69           41.54     1334.0     2798725.50   
1              50       28.66           41.61     1333.0     2800937.50   
2             100       28.66           41.66     1334.0     2798725.50   
3             150       28.68           41.74     1331.0     2805372.00   
4             200       28.67           41.78     1332.0     2803153.00   
...           ...         ...             ...        ...            ...   
881871     181650       27.07           44.59      970.0     3905257.75   
881872     181701       27.07           44.63      970.0     3905257.75   
881873     181751       27.09           44.66      970.0     3905257.75   
881874     181801       27.09           44.68      968.0     3913636.25   
881875     181851       27.10           44.74      969.0     3909442.75   

        exp_no   bin  
0            0     0  
1            0     0  
2            0     0  
3      

---
# LSTM Code

In [40]:
# Implement LSTM functions below
# there is no validation set in this example
# load mox_table as input

def split_data(model_input_table: pd.DataFrame) -> torch.tensor:
    # print(f"Test size: {parameters['test_size']}, type: {type(parameters['test_size'])}")
    # print(f"Random state: {parameters['random_state']}, type: {type(parameters['random_state'])}")

    # Split data into features and target
    X = model_input_table[model_input_table.columns[:-1]].values  # Assuming last column is the target
    y = model_input_table[model_input_table.columns[-1]].values
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    # Initialize StandardScaler
    scaler = StandardScaler()
    # Fit on training data
    scaler.fit(X_train)
    # Transform both training and testing data
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Ensure y_train and y_test are in the correct format
    if isinstance(y_train, pd.Series):
        y_train = y_train.values
    if isinstance(y_test, pd.Series):
        y_test = y_test.values
    # Convert to PyTorch tensors
    X_train_tensor = torch.tensor(X_train_scaled.astype(np.float32))
    y_train_tensor = torch.tensor(y_train.astype(np.float32))
    X_test_tensor = torch.tensor(X_test_scaled.astype(np.float32))
    y_test_tensor = torch.tensor(y_test.astype(np.float32))

    return X_train_tensor, X_test_tensor, y_train_tensor, y_test_tensor

# create X_train_tensor, X_test_tensor, y_train_tensor, y_test_tensor from split_data(df)
X_train_tensor, X_test_tensor, y_train_tensor, y_test_tensor = split_data(mox_table)
# Create TensorDatasets
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
# Initialize DataLoaders
batch_size = parameters['model_options']['batch_size']  # You can adjust the batch size according to your needs
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# Now, train_loader and test_loader can be used in your training loop



In [41]:
# Fully connected neural network with one hidden layer
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        # -> x needs to be: (batch_size, seq, input_size)
        
        # or:
        #self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        #self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
        
    def forward(self, x):
        # Set initial hidden states (and cell states for LSTM)
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device) 
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device) 
        
        # x: (n, 28, 28), h0: (2, n, 128)
        
        # Forward propagate RNN
        # out, _ = self.rnn(x, h0)  
        # or:
        out, _ = self.lstm(x, (h0,c0))  
        
        # out: tensor of shape (batch_size, seq_length, hidden_size)
        # out: (n, 28, 128)
        
        # Decode the hidden state of the last time step
        out = out[:, -1, :]
        # out: (n, 128)
         
        out = self.fc(out)
        # out: (n, 10)
        return out

model = RNN(input_size, hidden_size, num_layers, num_classes).to(device)


In [42]:
# turn the block below into a function
def train_model (data: DataLoader)->():
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  

# Train the model
    n_total_steps = len(train_loader)
    for epoch in range(num_epochs):
        for i, (bins, target) in enumerate(train_loader):  
            bins = bins.reshape(-1, sequence_length, input_size).to(device)
            target = target.to(device)
        
        # Forward pass
        outputs = model(bins)
        # Example of reshaping/squeezing if applicable
        outputs = outputs.squeeze()  # Removes dimensions of size 1
        outputs = outputs[:64]  # Adjust if you need to slice the outputs

        target = target.unsqueeze(1).to(device)  # Add an extra dimension to match outputs
        loss = criterion(outputs, target)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1) % 100 == 0:
            print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')

    # Calculate RMSE at the end of each epoch
        model.eval()  # Set the model to evaluation mode
        with torch.no_grad():  # Don't calculate gradients
            total_loss = 0
            count = 0
            for bins, target in test_loader:  # Replace with your validation loader
                bins = bins.reshape(-1, sequence_length, input_size).to(device)
                target = target.unsqueeze(1).to(device)  # Add an extra dimension to match outputs
                outputs = model(bins)
                loss = criterion(outputs, target)
                total_loss += loss.item()
                count += 1
            rmse = np.sqrt(total_loss / count)
            print(f'Epoch [{epoch+1}/{num_epochs}], RMSE on validation data: {rmse}')
        model.train()  # Set the model back to training mode
    # Save the model after training
    # lstm_model = torch.save(model.state_dict())
    lstm_model = model.state_dict()
    return lstm_model

In [43]:
train_model(train_loader)

Epoch [1/100], RMSE on validation data: 1.3545067201259686
Epoch [2/100], RMSE on validation data: 1.3204957394724373
Epoch [3/100], RMSE on validation data: 1.2841561202871044
Epoch [4/100], RMSE on validation data: 1.243695189835406
Epoch [5/100], RMSE on validation data: 1.1975637690104044
Epoch [6/100], RMSE on validation data: 1.1410319895551913
Epoch [7/100], RMSE on validation data: 1.0684854966501616
Epoch [8/100], RMSE on validation data: 0.9692969623933668
Epoch [9/100], RMSE on validation data: 0.8238492743217882
Epoch [10/100], RMSE on validation data: 0.5953366887942981
Epoch [11/100], RMSE on validation data: 0.20842073651058673
Epoch [12/100], RMSE on validation data: 0.5266625914825946
Epoch [13/100], RMSE on validation data: 0.5831717192056292
Epoch [14/100], RMSE on validation data: 0.3939146520724038
Epoch [15/100], RMSE on validation data: 0.1568509822674554
Epoch [16/100], RMSE on validation data: 0.13019104800628117
Epoch [17/100], RMSE on validation data: 0.23983


[1;35mOrderedDict[0m[1m([0m[1m[[0m[1m([0m[32m'lstm.weight_ih_l0'[0m, [1;35mtensor[0m[1m([0m[1m[[0m[1m[[0mnan[1m][0m,
        [1m[[0mnan[1m][0m,
        [1m[[0mnan[1m][0m,
        [1m[[0mnan[1m][0m,
        [1m[[0mnan[1m][0m,
        [1m[[0mnan[1m][0m,
        [1m[[0mnan[1m][0m,
        [1m[[0mnan[1m][0m,
        [1m[[0mnan[1m][0m,
        [1m[[0mnan[1m][0m,
        [1m[[0mnan[1m][0m,
        [1m[[0mnan[1m][0m,
        [1m[[0mnan[1m][0m,
        [1m[[0mnan[1m][0m,
        [1m[[0mnan[1m][0m,
        [1m[[0mnan[1m][0m,
        [1m[[0mnan[1m][0m,
        [1m[[0mnan[1m][0m,
        [1m[[0mnan[1m][0m,
        [1m[[0mnan[1m][0m,
        [1m[[0mnan[1m][0m,
        [1m[[0mnan[1m][0m,
        [1m[[0mnan[1m][0m,
        [1m[[0mnan[1m][0m,
        [1m[[0mnan[1m][0m,
        [1m[[0mnan[1m][0m,
        [1m[[0mnan[1m][0m,
        [1m[[0mnan[1m][0m,
        [1m[[0mnan[1m]