In [1]:
%load_ext kedro.ipython
df = catalog.load("s3_conc_aligned_df")
# df = catalog.load("s3_cluster_df")

24/05/16 16:10:27 WARN Utils: Your hostname, Gavins-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.1.241 instead (on interface en0)
24/05/16 16:10:27 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/16 16:10:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [34]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd

class GroupDataset(Dataset):
    def __init__(self, X, y, groups):
        self.X = X
        self.y = y
        self.groups = groups
        self.group_to_indices = self._group_indices()

    def _group_indices(self):
        group_to_indices = {}
        for idx, group in enumerate(self.groups):
            if group not in group_to_indices:
                group_to_indices[group] = []
            group_to_indices[group].append(idx)
        return group_to_indices

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx], self.groups[idx]

class ExperimentSampler:
    def __init__(self, group_to_indices, batch_size):
        self.group_to_indices = group_to_indices
        self.batch_size = batch_size
        self.group_order = list(group_to_indices.keys())
        np.random.shuffle(self.group_order)

    def __iter__(self):
        for group in self.group_order:
            indices = self.group_to_indices[group]
            for i in range(0, len(indices), self.batch_size):
                yield indices[i:i + self.batch_size]

    def __len__(self):
        # Total number of batches across all groups
        total_batches = 0
        for indices in self.group_to_indices.values():
            total_batches += (len(indices) + self.batch_size - 1) // self.batch_size
        return total_batches

batch_size = 4

# Define the features and target
features = ['timestamp_bin', 'A1_Resistance']
target = 'resistance_ratio'

# Extract the feature matrix and target vector
X_tensor = df[features]
y_tensor = df[target]
groups = df['exp_no']

# Convert to PyTorch tensors
X_tensor = torch.tensor(X_tensor.values, dtype=torch.float32)
y_tensor = torch.tensor(y_tensor.values, dtype=torch.float32)

# Convert groups to NumPy array
groups = np.array(groups)

# Create dataset
dataset = GroupDataset(X_tensor, y_tensor, groups)

# Create experiment sampler
experiment_sampler = ExperimentSampler(dataset.group_to_indices, batch_size)

# Create data loader
data_loader = DataLoader(dataset, batch_sampler=experiment_sampler)

# Example: iterating through the data loader
for batch_indices in data_loader:
    X_batch = dataset.X[batch_indices]
    y_batch = dataset.y[batch_indices]
    print(X_batch.shape, y_batch.shape, batch_indices)


# Use of GSS library

In [45]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.model_selection import GroupShuffleSplit

class GroupDataset(Dataset):
    def __init__(self, X, y, groups):
        self.X = X
        self.y = y
        self.groups = groups
        self.group_to_indices = self._group_indices()

    def _group_indices(self):
        group_to_indices = {}
        for idx, group in enumerate(self.groups):
            if group not in group_to_indices:
                group_to_indices[group] = []
            group_to_indices[group].append(idx)
        return group_to_indices

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        print("Getting item at index:", idx)
        return self.X[idx], self.y[idx], self.groups[idx]

class ExperimentSampler:
    def __init__(self, group_to_indices, batch_size):
        self.group_to_indices = group_to_indices
        self.batch_size = batch_size
        self.group_order = list(group_to_indices.keys())
        np.random.shuffle(self.group_order)  # Shuffle the order of groups

    def __iter__(self):
        for group in self.group_order:
            indices = self.group_to_indices[group]
            for i in range(0, len(indices), self.batch_size):
                batch = indices[i:i + self.batch_size]  # Yield a batch of indices
                print("Yielding batch indices:", batch)
                yield batch
                # yield indices[i:i + self.batch_size]

    def __len__(self):
        total_batches = 0
        for indices in self.group_to_indices.values():
            total_batches += (len(indices) + self.batch_size - 1) // self.batch_size
        return total_batches

# Define the features and target
features = ['timestamp_bin', 'A1_Resistance']
target = 'resistance_ratio'
X_tensor = df[features]
y_tensor = df[target]
groups = df['exp_no']

# Convert to PyTorch tensors
X_tensor = torch.tensor(X_tensor.values, dtype=torch.float32)
y_tensor = torch.tensor(y_tensor.values, dtype=torch.float32)
# Convert groups to NumPy array
groups = np.array(groups)

# Create dataset
dataset = GroupDataset(X_tensor, y_tensor, groups)

# Split the data into training and testing sets using GroupShuffleSplit
gss = GroupShuffleSplit(test_size=0.2, n_splits=1, random_state=42)
train_idx, test_idx = next(gss.split(X_tensor, y_tensor, groups=groups))

train_dataset = GroupDataset(X_tensor[train_idx], y_tensor[train_idx], groups[train_idx])
test_dataset = GroupDataset(X_tensor[test_idx], y_tensor[test_idx], groups[test_idx])

# Create experiment samplers
train_sampler = ExperimentSampler(train_dataset.group_to_indices, batch_size)
test_sampler = ExperimentSampler(test_dataset.group_to_indices, batch_size)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_sampler=train_sampler)
test_loader = DataLoader(test_dataset, batch_sampler=test_sampler)

# Example: iterating through the train data loader
for batch_indices in train_loader:
    # Flatten the list of indices
    batch_indices = [item for sublist in batch_indices for item in sublist]
    
    # Diagnostic print to check the contents of batch_indices
    print("Batch indices:", batch_indices)

    # Convert batch_indices to a tensor
    batch_indices_tensor = torch.tensor(batch_indices, dtype=torch.long)

    # Diagnostic print to check the tensor
    print("Batch indices tensor:", batch_indices_tensor)

    # Index using the tensor of indices
    X_batch = train_dataset.X[batch_indices_tensor]
    y_batch = train_dataset.y[batch_indices_tensor]

    # Print shapes to verify the correct batches are formed
    print(X_batch.shape, y_batch.shape)


Batch indices: [tensor([      0.0000, 1436449.3750]), tensor([1.0000e+00, 1.4342e+06]), tensor([2.0000e+00, 1.4345e+06]), tensor([3.0000e+00, 1.4329e+06]), tensor([4.0000e+00, 1.4313e+06]), tensor([5.0000e+00, 1.4310e+06]), tensor([6.0000e+00, 1.4256e+06]), tensor([7.0000e+00, 1.4256e+06]), tensor([8.0000e+00, 1.4190e+06]), tensor([9.0000e+00, 1.4153e+06]), tensor([1.0000e+01, 1.4131e+06]), tensor([1.1000e+01, 1.4079e+06]), tensor([1.2000e+01, 1.4026e+06]), tensor([1.3000e+01, 1.3984e+06]), tensor([1.4000e+01, 1.3926e+06]), tensor([1.5000e+01, 1.3893e+06]), tensor(0.7553), tensor(0.7553), tensor(0.7553), tensor(0.7553), tensor(0.7553), tensor(0.7553), tensor(0.7553), tensor(0.7553), tensor(0.7553), tensor(0.7553), tensor(0.7553), tensor(0.7553), tensor(0.7553), tensor(0.7553), tensor(0.7553), tensor(0.7553), tensor(153), tensor(153), tensor(153), tensor(153), tensor(153), tensor(153), tensor(153), tensor(153), tensor(153), tensor(153), tensor(153), tensor(153), tensor(153), tensor(153)

# GSS rebuild block by block

In [18]:
# Create dataset
dataset = GroupDataset(X_tensor, y_tensor, groups)
# Split the data into training and testing sets using GroupShuffleSplit
gss = GroupShuffleSplit(test_size=0.2, n_splits=1, random_state=42)
train_idx, test_idx = next(gss.split(X_tensor, y_tensor, groups=groups))

In [31]:
import pandas as pd
# Ensure train_idx and test_idx are NumPy arrays
train_idx = np.array(train_idx)
test_idx = np.array(test_idx)

# Convert X_tensor and y_tensor from DataFrame to Tensor
X_tensor = torch.tensor(X_tensor.values) if isinstance(X_tensor, pd.DataFrame) else X_tensor
y_tensor = torch.tensor(y_tensor.values) if isinstance(y_tensor, pd.DataFrame) else y_tensor

# Ensure groups is a NumPy array
groups = np.array(groups) if not isinstance(groups, np.ndarray) else groups

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.model_selection import GroupShuffleSplit

class GroupDataset(Dataset):
    def __init__(self, X, y, groups):
        self.X = X
        self.y = y
        self.groups = groups
        self.group_to_indices = self._group_indices()

    def _group_indices(self):
        group_to_indices = {}
        for idx, group in enumerate(self.groups):
            if group not in group_to_indices:
                group_to_indices[group] = []
            group_to_indices[group].append(idx)
        return group_to_indices

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx], self.groups[idx]

class ExperimentSampler:
    def __init__(self, group_to_indices, batch_size):
        self.group_to_indices = group_to_indices
        self.batch_size = batch_size
        self.group_order = list(group_to_indices.keys())
        np.random.shuffle(self.group_order)  # Shuffle the order of groups

    def __iter__(self):
        for group in self.group_order:
            indices = self.group_to_indices[group]
            for i in range(0, len(indices), self.batch_size):
                yield indices[i:i + self.batch_size]

    def __len__(self):
        total_batches = 0
        for indices in self.group_to_indices.values():
            total_batches += (len(indices) + self.batch_size - 1) // self.batch_size
        return total_batches

# Define the features and target
features = ['timestamp_bin', 'A1_Resistance']
target = 'resistance_ratio'
X_tensor = df[features]
y_tensor = df[target]
groups = df['exp_no']

# Convert to PyTorch tensors
X_tensor = torch.tensor(X_tensor.values, dtype=torch.float32)
y_tensor = torch.tensor(y_tensor.values, dtype=torch.float32)
# Convert groups to NumPy array
groups = np.array(groups)

# Create dataset
dataset = GroupDataset(X_tensor, y_tensor, groups)

# Split the data into training and testing sets using GroupShuffleSplit
gss = GroupShuffleSplit(test_size=0.2, n_splits=1, random_state=42)
train_idx, test_idx = next(gss.split(X_tensor, y_tensor, groups=groups))

train_dataset = GroupDataset(X_tensor[train_idx], y_tensor[train_idx], groups[train_idx])
test_dataset = GroupDataset(X_tensor[test_idx], y_tensor[test_idx], groups[test_idx])

# Set the batch size
batch_size = 16

# Create experiment samplers
train_sampler = ExperimentSampler(train_dataset.group_to_indices, batch_size)
test_sampler = ExperimentSampler(test_dataset.group_to_indices, batch_size)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_sampler=train_sampler)
test_loader = DataLoader(test_dataset, batch_sampler=test_sampler)

# # Example: iterating through the train data loader
# print("Train Batches:")
# for batch_indices in train_loader:
#     # Convert batch_indices to a list if it's a tensor
#     if isinstance(batch_indices, torch.Tensor):
#         batch_indices = batch_indices.tolist()

#     X_batch = train_dataset.X[batch_indices]
#     y_batch = train_dataset.y[batch_indices]
#     print(X_batch.shape, y_batch.shape, batch_indices)
