In [5]:
# Load the Kedro IPython extension, then read the "s3_conc_aligned_df" dataset
# from the catalog into `df`.
# NOTE(review): `catalog` is not defined in this cell — presumably it is injected
# into the namespace by the kedro.ipython extension; confirm.
# NOTE: the recorded output below shows the extension was already loaded when
# this cell ran, so `%load_ext` only printed a reminder and did nothing else.
%load_ext kedro.ipython
df = catalog.load("s3_conc_aligned_df")

The kedro.ipython extension is already loaded. To reload it, use:
  %reload_ext kedro.ipython


In [6]:
# Preview the first rows of the loaded frame to sanity-check its columns.
df.head()

Unnamed: 0,exp_no,timestamp_bin,A1_Resistance,A1_Resistance_diff,A1_Resistance_norm,A1_Sensor,A1_Sensor_diff,A1_Sensor_norm,SHT40_Humidity,SHT40_temp,index,resistance_ratio,ace_conc,expo_time
0,0,0.0,720650.75,0.0,1.0,4518.0,0.0,1.0,42.835,29.43,4481.5,1.768473,3.033e-07,3.0
1,0,1.0,720361.875,0.0,0.999599,4519.5,0.0,1.000332,42.84,29.435,4483.5,1.768473,3.033e-07,3.0
2,0,2.0,719688.47,-673.405,0.998665,4523.0,3.5,1.001107,42.845,29.445,4485.5,1.768473,3.033e-07,3.0
3,0,3.0,719015.94,-1634.81,0.997731,4526.5,8.5,1.001881,42.83,29.435,4487.5,1.768473,3.033e-07,3.0
4,0,4.0,718727.905,-1345.095,0.997332,4528.0,7.0,1.002213,42.845,29.44,4489.5,1.768473,3.033e-07,3.0


In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import StandardScaler
from torch.utils.data import TensorDataset, DataLoader
import torch


# Data preparation: group-aware train/val/test split, leakage-free imputation,
# feature scaling, and PyTorch DataLoaders.
#
# `df` (loaded from the catalog in an earlier cell) is deliberately NOT modified
# here, so re-running this cell always starts from the same input.

# Single seed shared by both group splits and the training-batch shuffle so that
# Restart & Run All reproduces the same partitions and the same batch order.
SEED = 42

# Define the features and target
features = ['timestamp_bin', 'A1_Resistance']
target = 'resistance_ratio'
model_cols = features + [target]

# Extract the feature matrix, target vector and grouping key.
# Rows are grouped by experiment so no experiment is shared between splits.
X = df[features]
y = df[target]
groups = df['exp_no']

# A missing group id cannot be assigned to a split meaningfully; fail loudly
# instead of silently imputing a fractional "mean experiment number".
if groups.isna().any():
    raise ValueError("'exp_no' contains missing values; cannot build a group-wise split.")

# Initialize the GroupShuffleSplit object for the train / temporary split
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=SEED)

# Split into training and temporary (validation + testing) sets
train_idx, temp_idx = next(gss.split(X, y, groups=groups))

# Create DataFrames for training and temporary sets
train_df = df.iloc[train_idx]
temp_df = df.iloc[temp_idx]

# Split the temporary set in half (by group) into validation and testing sets
gss_temp = GroupShuffleSplit(n_splits=1, test_size=0.5, random_state=SEED)
val_idx, test_idx = next(
    gss_temp.split(temp_df[features], temp_df[target], groups=temp_df['exp_no'])
)

# Create DataFrames for validation and testing sets
val_df = temp_df.iloc[val_idx]
test_df = temp_df.iloc[test_idx]

# Handle missing values AFTER splitting, using statistics from the training
# rows only. Filling with the mean of the full frame before the split would let
# validation/test rows influence the imputed values (data leakage).
# `numeric_only=True` keeps this working if a non-numeric column is present.
train_means = train_df.mean(numeric_only=True)
train_df = train_df.fillna(train_means)
val_df = val_df.fillna(train_means)
test_df = test_df.fillna(train_means)

# Sanity checks: the model columns must be complete (a column that is entirely
# NaN in the training rows has a NaN mean and would slip through), and no
# experiment may appear in more than one split.
for split_name, split_df in (('train', train_df), ('val', val_df), ('test', test_df)):
    assert not split_df[model_cols].isna().any().any(), (
        f"{split_name} split still has missing values in {model_cols}"
    )
assert set(train_df['exp_no']).isdisjoint(val_df['exp_no'])
assert set(train_df['exp_no']).isdisjoint(test_df['exp_no'])
assert set(val_df['exp_no']).isdisjoint(test_df['exp_no'])

# Extract features and targets for each set
X_train = train_df[features]
y_train = train_df[target]
X_val = val_df[features]
y_val = val_df[target]
X_test = test_df[features]
y_test = test_df[target]

# Normalize the features; the scaler is fitted on the training set only and
# then applied unchanged to validation and test.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

# Batch size definition
batch_size = 32

# Create Tensor datasets; targets are reshaped to (n, 1) column vectors
train_dataset = TensorDataset(X_train_tensor, y_train_tensor.view(-1, 1))
val_dataset = TensorDataset(X_val_tensor, y_val_tensor.view(-1, 1))
test_dataset = TensorDataset(X_test_tensor, y_test_tensor.view(-1, 1))

# Create DataLoaders. Only the training loader shuffles; it gets its own seeded
# generator so the batch order does not depend on the global torch RNG state.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    generator=torch.Generator().manual_seed(SEED),
)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)


In [10]:
import torch.nn as nn
import torch.optim as optim

class LinearNN(nn.Module):
    """Single-layer linear regressor: one `nn.Linear` mapping `input_dim` inputs to 1 output."""

    def __init__(self, input_dim: int) -> None:
        """Create the model.

        Args:
            input_dim: Number of input features per sample.
        """
        super().__init__()
        # Attribute name is kept as `linear` so state_dict keys stay the same.
        self.linear = nn.Linear(in_features=input_dim, out_features=1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the linear layer to `x` and return the result."""
        prediction = self.linear(x)
        return prediction

def _mean_loss(model, loader, criterion):
    """Return the per-sample mean loss of `model` over all batches in `loader`.

    Each batch loss is weighted by its batch size before averaging, so a
    smaller final batch is not over-weighted. This assumes `criterion` returns
    the mean over the batch (the default reduction of `nn.MSELoss`).

    Args:
        model: Module to evaluate; it is switched to eval mode.
        loader: Iterable yielding `(inputs, targets)` batches.
        criterion: Loss callable taking `(outputs, targets)`.

    Returns:
        The sample-weighted mean loss as a float, or NaN if `loader` yields no samples.
    """
    model.eval()
    loss_sum = 0.0
    n_samples = 0
    with torch.no_grad():
        for inputs, targets in loader:
            batch_n = inputs.size(0)
            loss_sum += criterion(model(inputs), targets).item() * batch_n
            n_samples += batch_n
    return loss_sum / n_samples if n_samples else float('nan')


# Seed the global torch RNG before the model is built so the initial weights
# are reproducible across kernel restarts (same value as the split seed).
torch.manual_seed(42)

# Define model, loss function, and optimizer
input_dim = X_train.shape[1]
model = LinearNN(input_dim)
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Training loop with batching
num_epochs = 200
for epoch in range(num_epochs):
    model.train()
    train_loss_sum = 0.0
    train_samples = 0
    for inputs, targets in train_loader:
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight the batch-mean loss by batch size so the epoch figure is a
        # true per-sample average even when the last batch is smaller.
        batch_n = inputs.size(0)
        train_loss_sum += loss.item() * batch_n
        train_samples += batch_n

    avg_train_loss = train_loss_sum / train_samples if train_samples else float('nan')

    # Validation (helper switches to eval mode; model.train() is restored at
    # the top of the next epoch)
    avg_val_loss = _mean_loss(model, val_loader, criterion)

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}')

# Test the model
avg_test_loss = _mean_loss(model, test_loader, criterion)
print(f'Test Loss: {avg_test_loss:.4f}')

Epoch [10/200], Train Loss: 0.0479, Validation Loss: 0.0531
Epoch [20/200], Train Loss: 0.0479, Validation Loss: 0.0532
Epoch [30/200], Train Loss: 0.0479, Validation Loss: 0.0529
