In [6]:
# Load the Kedro IPython extension. `catalog` (used below) is not defined anywhere
# in this notebook; presumably it is injected by this extension -- TODO confirm.
%load_ext kedro.ipython
# Load the catalog entry named "s3_conc_aligned_df" into `df`; every later cell reads from `df`.
df = catalog.load("s3_conc_aligned_df")
# Alternative catalog entry, currently disabled.
# df = catalog.load("s3_cluster_df")

The kedro.ipython extension is already loaded. To reload it, use:
  %reload_ext kedro.ipython


In [7]:
# Preview the first rows of the loaded frame to check its columns before modelling.
df.head()

Unnamed: 0,exp_no,timestamp_bin,A1_Resistance,A1_Resistance_diff,A1_Resistance_norm,A1_Sensor,A1_Sensor_diff,A1_Sensor_norm,SHT40_Humidity,SHT40_temp,index,resistance_ratio,ace_conc,expo_time
0,0,0.0,720650.75,0.0,1.0,4518.0,0.0,1.0,42.835,29.43,4481.5,1.768473,3.033e-07,3.0
1,0,1.0,720361.875,0.0,0.999599,4519.5,0.0,1.000332,42.84,29.435,4483.5,1.768473,3.033e-07,3.0
2,0,2.0,719688.47,-673.405,0.998665,4523.0,3.5,1.001107,42.845,29.445,4485.5,1.768473,3.033e-07,3.0
3,0,3.0,719015.94,-1634.81,0.997731,4526.5,8.5,1.001881,42.83,29.435,4487.5,1.768473,3.033e-07,3.0
4,0,4.0,718727.905,-1345.095,0.997332,4528.0,7.0,1.002213,42.845,29.44,4489.5,1.768473,3.033e-07,3.0


# Without Clustering

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import StandardScaler
import torch


# Baseline data preparation (no cluster features).
#
# Produces group-aware train/validation/test splits (80/10/10 by experiment),
# standardised feature matrices and the matching float32 PyTorch tensors.
# `df` itself is left untouched so this cell can be re-run safely.

# Define the features and target
# features = ['timestamp_bin', 'A1_Resistance', 'cluster','distance_to_centroid_0','distance_to_centroid_1','distance_to_centroid_2']
features = ['timestamp_bin', 'A1_Resistance']
target = 'resistance_ratio'

# Feature matrix, target vector and grouping key used to drive the split.
# Only the number of rows and the group labels influence the split itself.
# NOTE(review): assumes 'exp_no' has no missing values -- confirm.
X = df[features]
y = df[target]
groups = df['exp_no']

# Hold out 20% of the experiments; a whole experiment always lands on one side.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, temp_idx = next(gss.split(X, y, groups=groups))

# Impute missing values with column means computed on the TRAINING rows only,
# so that no statistic of the validation/test rows leaks into training.
# fillna returns new frames, so the shared `df` is not mutated.
fill_values = df.iloc[train_idx].mean(numeric_only=True)
train_df = df.iloc[train_idx].fillna(fill_values)
temp_df = df.iloc[temp_idx].fillna(fill_values)

# Split the held-out experiments evenly into validation and test sets.
gss_temp = GroupShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
val_idx, test_idx = next(
    gss_temp.split(temp_df[features], temp_df[target], groups=temp_df['exp_no'])
)
val_df = temp_df.iloc[val_idx]
test_df = temp_df.iloc[test_idx]

# Extract features and targets for each set
X_train, y_train = train_df[features], train_df[target]
X_val, y_val = val_df[features], val_df[target]
X_test, y_test = test_df[features], test_df[target]

# Standardise the features; the scaler is fitted on the training set only and
# then applied unchanged to the validation and test sets.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)


# With Clustering

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import torch

# Data preparation with K-Means derived features.
#
# Same group-aware 80/10/10 split as the baseline, then each split is extended
# with a cluster label and the distance to every K-Means centroid before the
# final standardisation. `df` itself is left untouched.


def add_cluster_features(X_raw, X_scaled, kmeans_model, labels=None):
    """Return a copy of `X_raw` extended with K-Means derived columns.

    Adds a 'cluster' column and one 'distance_to_centroid_<i>' column per
    centroid of `kmeans_model`.

    Parameters
    ----------
    X_raw : DataFrame with the original (unscaled) feature columns.
    X_scaled : array of the same rows in the space `kmeans_model` was fitted on.
    kmeans_model : fitted KMeans estimator.
    labels : optional precomputed cluster labels for these rows; when omitted
        they are obtained with `kmeans_model.predict(X_scaled)`.
    """
    augmented = X_raw.copy()
    augmented['cluster'] = kmeans_model.predict(X_scaled) if labels is None else labels
    distances = kmeans_model.transform(X_scaled)
    for i in range(distances.shape[1]):
        augmented[f'distance_to_centroid_{i}'] = distances[:, i]
    return augmented


# Define the features and target
initial_features = ['timestamp_bin', 'A1_Resistance']
target = 'resistance_ratio'

# Feature matrix, target vector and grouping key used to drive the split.
# NOTE(review): assumes 'exp_no' has no missing values -- confirm.
X = df[initial_features]
y = df[target]
groups = df['exp_no']

# Hold out 20% of the experiments; a whole experiment always lands on one side.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, temp_idx = next(gss.split(X, y, groups=groups))

# Impute missing values with column means computed on the TRAINING rows only,
# so that no statistic of the validation/test rows leaks into training.
# fillna returns new frames, so the shared `df` is not mutated.
fill_values = df.iloc[train_idx].mean(numeric_only=True)
train_df = df.iloc[train_idx].fillna(fill_values)
temp_df = df.iloc[temp_idx].fillna(fill_values)

# Split the held-out experiments evenly into validation and test sets.
gss_temp = GroupShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
val_idx, test_idx = next(
    gss_temp.split(temp_df[initial_features], temp_df[target], groups=temp_df['exp_no'])
)
val_df = temp_df.iloc[val_idx]
test_df = temp_df.iloc[test_idx]

# Extract features and targets for each set
X_train, y_train = train_df[initial_features], train_df[target]
X_val, y_val = val_df[initial_features], val_df[target]
X_test, y_test = test_df[initial_features], test_df[target]

# Scaler for the K-Means inputs. It is kept separate from the final `scaler`
# below so that new data can later be sent through the exact same
# base-scaling -> clustering -> final-scaling chain.
base_scaler = StandardScaler()
X_train_scaled = base_scaler.fit_transform(X_train)
X_val_scaled = base_scaler.transform(X_val)
X_test_scaled = base_scaler.transform(X_test)

# Fit K-Means on the training data only, then derive the cluster features for
# every split from that single fitted model.
kmeans = KMeans(n_clusters=3, random_state=42)
train_labels = kmeans.fit_predict(X_train_scaled)
X_train = add_cluster_features(X_train, X_train_scaled, kmeans, labels=train_labels)
X_val = add_cluster_features(X_val, X_val_scaled, kmeans)
X_test = add_cluster_features(X_test, X_test_scaled, kmeans)

# Update the features list to include the new cluster features
features = (
    initial_features
    + ['cluster']
    + [f'distance_to_centroid_{i}' for i in range(kmeans.n_clusters)]
)

# Fix the column order for every split
X_train = X_train[features]
X_val = X_val[features]
X_test = X_test[features]

# Final standardisation of the augmented feature set (fitted on training only).
# NOTE(review): 'cluster' is a categorical index that is scaled like a numeric
# column here; consider one-hot encoding it instead.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)


In [10]:
import torch
from torch.utils.data import Dataset, DataLoader, Sampler

class GroupDataset(Dataset):
    """Map-style dataset serving (features, target, group label) triples.

    The three containers passed to the constructor are stored as given and are
    all indexed with the same position when an item is requested.
    """

    def __init__(self, X, y, groups):
        # Keep references to the parallel containers; nothing is copied.
        self.X, self.y, self.groups = X, y, groups

    def __len__(self):
        # The feature container defines how many samples the dataset has.
        n_samples = len(self.X)
        return n_samples

    def __getitem__(self, idx):
        # Look up the same position in each container.
        sample_features = self.X[idx]
        sample_target = self.y[idx]
        sample_group = self.groups[idx]
        return sample_features, sample_target, sample_group

class GroupBatchSampler(Sampler):
    """Batch sampler whose batches never mix samples from different groups.

    Indices are bucketed by group label (in order of first appearance), and
    each bucket is cut into consecutive chunks of at most `batch_size`
    indices. The chunks are fixed at construction time; only the order in
    which they are yielded can change between iterations.

    Parameters
    ----------
    groups : iterable of hashable group labels, one per sample.
    batch_size : maximum number of indices per batch; must be positive.
    shuffle : when True (default), batches are yielded in a fresh random order
        on every iteration (drawn from numpy's global RNG); when False they
        are yielded in construction order, which suits validation and testing.

    Raises
    ------
    ValueError : if `batch_size` is not positive.
    """

    def __init__(self, groups, batch_size, shuffle=True):
        # A non-positive size would either fail inside range() or silently
        # produce no batches at all, so reject it up front.
        if batch_size <= 0:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

        self.groups = groups
        self.batch_size = batch_size
        self.shuffle = shuffle

        # Map each group label to the positions of its samples.
        self.group_to_indices = {}
        for idx, group in enumerate(groups):
            self.group_to_indices.setdefault(group, []).append(idx)

        # Cut every group's index list into chunks of at most `batch_size`.
        self.group_batches = [
            indices[start:start + batch_size]
            for indices in self.group_to_indices.values()
            for start in range(0, len(indices), batch_size)
        ]

    def __iter__(self):
        n_batches = len(self.group_batches)
        # Draw a visiting order instead of shuffling the stored list, so that
        # iterating never mutates the sampler's state.
        order = np.random.permutation(n_batches) if self.shuffle else range(n_batches)
        for batch_pos in order:
            yield self.group_batches[batch_pos]

    def __len__(self):
        # Number of batches produced per pass.
        return len(self.group_batches)

# Upper bound on the number of samples per batch.
batch_size = 64

# One dataset per split: features, targets and the 'exp_no' value of each row.
train_dataset, val_dataset, test_dataset = (
    GroupDataset(split_X, split_y, split_df['exp_no'].values)
    for split_X, split_y, split_df in (
        (X_train_tensor, y_train_tensor, train_df),
        (X_val_tensor, y_val_tensor, val_df),
        (X_test_tensor, y_test_tensor, test_df),
    )
)

# One batch sampler per split, built from that split's group labels.
train_batch_sampler, val_batch_sampler, test_batch_sampler = (
    GroupBatchSampler(split_dataset.groups, batch_size)
    for split_dataset in (train_dataset, val_dataset, test_dataset)
)

# Loaders delegate batching to the samplers above via `batch_sampler`.
train_loader, val_loader, test_loader = (
    DataLoader(split_dataset, batch_sampler=split_sampler)
    for split_dataset, split_sampler in (
        (train_dataset, train_batch_sampler),
        (val_dataset, val_batch_sampler),
        (test_dataset, test_batch_sampler),
    )
)


In [11]:
import torch.nn as nn
import torch.optim as optim

class LinearNN(nn.Module):
    """Linear regression model: a single affine layer with one output unit."""

    def __init__(self, input_dim):
        super().__init__()
        # Maps `input_dim` input features to a single predicted value.
        self.linear = nn.Linear(in_features=input_dim, out_features=1)

    def forward(self, x):
        # Apply the affine layer to the last dimension of `x`.
        prediction = self.linear(x)
        return prediction

def evaluate(model, loader, criterion):
    """Return the per-sample mean loss of `model` over every batch in `loader`.

    The model is switched to eval mode and gradients are disabled. Each batch
    loss is weighted by its batch size before dividing by the dataset size, so
    a smaller final batch does not skew the average.
    """
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for X_batch, y_batch, _ in loader:
            outputs = model(X_batch)
            # Targets are 1-D; reshape to (batch, 1) to match the model output.
            loss = criterion(outputs, y_batch.view(-1, 1))
            total_loss += loss.item() * X_batch.size(0)
    return total_loss / len(loader.dataset)


# Seed both RNGs so a fresh run reproduces the same numbers: torch drives the
# weight initialisation, numpy drives the batch order of the samplers.
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

# Define model, loss function, and optimizer
input_dim = X_train.shape[1]
model = LinearNN(input_dim)
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Training loop with DataLoader
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0

    for X_batch, y_batch, _ in train_loader:
        # Forward pass
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch.view(-1, 1))

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate the summed (not averaged) loss for this batch.
        train_loss += loss.item() * X_batch.size(0)

    train_loss /= len(train_loader.dataset)

    # Validation
    val_loss = evaluate(model, val_loader, criterion)

    print(f'Epoch [{epoch + 1}/{num_epochs}], Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')

# Testing the model
test_loss = evaluate(model, test_loader, criterion)
print(f'Test Loss: {test_loss:.4f}')


Epoch [1/20], Train Loss: 0.0428, Validation Loss: 0.0335
Epoch [2/20], Train Loss: 0.0340, Validation Loss: 0.0333
Epoch [3/20], Train Loss: 0.0339, Validation Loss: 0.0335
Epoch [4/20], Train Loss: 0.0342, Validation Loss: 0.0339
Epoch [5/20], Train Loss: 0.0337, Validation Loss: 0.0343
Epoch [6/20], Train Loss: 0.0338, Validation Loss: 0.0339
Epoch [7/20], Train Loss: 0.0338, Validation Loss: 0.0393
Epoch [8/20], Train Loss: 0.0341, Validation Loss: 0.0319
Epoch [9/20], Train Loss: 0.0336, Validation Loss: 0.0453
Epoch [10/20], Train Loss: 0.0341, Validation Loss: 0.0333
Epoch [11/20], Train Loss: 0.0339, Validation Loss: 0.0345
Epoch [12/20], Train Loss: 0.0340, Validation Loss: 0.0328
Epoch [13/20], Train Loss: 0.0339, Validation Loss: 0.0329
Epoch [14/20], Train Loss: 0.0338, Validation Loss: 0.0325
Epoch [15/20], Train Loss: 0.0338, Validation Loss: 0.0338
Epoch [16/20], Train Loss: 0.0336, Validation Loss: 0.0339
Epoch [17/20], Train Loss: 0.0337, Validation Loss: 0.0370
Epoch 