In [2]:
import torch
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from torch.utils.data import Dataset
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

In [33]:
class CustomDataset(Dataset):
    """In-memory tensor dataset that can rebalance the training split with SMOTE.

    No feature scaling happens in this class: ``X`` is stored as received, so
    any normalization has to be applied by the caller before construction.
    """

    def __init__(self, X, y, is_train=True, random_state=None):
        """
        Store features and labels as tensors, oversampling first when training.
        :param X: Features (NumPy array, pandas DataFrame or nested sequence)
        :param y: Labels (NumPy array, pandas Series or sequence of ints)
        :param is_train: Flag to indicate if it is training data; when True the
            data is resampled with SMOTE before being converted to tensors
        :param random_state: Seed forwarded to SMOTE; the default ``None`` keeps
            the previous unseeded behaviour
        """
        if is_train:
            # Oversample minority classes on the training split only.
            X, y = SMOTE(random_state=random_state).fit_resample(X, y)
        # Both branches share one conversion path so the accepted input types
        # no longer depend on the value of ``is_train``.
        self.X = self._to_tensor(X, torch.float32)
        self.y = self._to_tensor(y, torch.long)

    @staticmethod
    def _to_tensor(values, dtype):
        """Copy ``values`` into a new tensor of ``dtype``, unwrapping pandas objects."""
        if hasattr(values, "to_numpy"):
            # pandas containers: use the underlying array so a non-default
            # index cannot influence the conversion.
            values = values.to_numpy()
        return torch.tensor(values, dtype=dtype)

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.X)

    def __getitem__(self, idx, is_train=True):
        """
        Return the ``(features, label)`` pair at position ``idx``.
        :param idx: Index of the sample
        :param is_train: Ignored; accepted only for backward compatibility
        """
        return self.X[idx], self.y[idx]
        

def create_datasets(datapath, test_size=0.2):
    """
    Loads the pickled data, splits it into train and validation datasets and
    standardizes the features.
    :param datapath: Path to a pickle file holding 'X_train' (features) and 'Y' (labels)
    :param test_size: Size of the validation set
    :return: Tuple ``(train_dataset, val_dataset)`` of CustomDataset instances
    """
    # NOTE(security): unpickling can execute arbitrary code; only load files
    # from a trusted source.
    with open(datapath, 'rb') as handle:
        data = pd.read_pickle(handle)
    features = data['X_train']
    labels = data['Y']
    # Split BEFORE fitting the scaler: fitting on the full data would leak
    # validation-set statistics into the training features.
    X_train, X_val, y_train, y_val = train_test_split(
        features, labels, test_size=test_size, stratify=labels, random_state=42)
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_val = scaler.transform(X_val)
    train_dataset = CustomDataset(X_train, y_train, is_train=True)
    val_dataset = CustomDataset(X_val, y_val, is_train=False)
    return train_dataset, val_dataset

In [34]:
# Build the datasets and wrap them in loaders (only the training loader shuffles).
train_dataset, val_dataset = create_datasets('../data/data-challenge-student.pickle')
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Collect every summary figure first, then print them in one pass.
summary_rows = [
    ('Train dataset size:', len(train_dataset)),
    ('Validation dataset size:', len(val_dataset)),
    ('Number of features:', train_dataset.X.shape[1]),
    ('Number of classes:', len(train_dataset.y.unique())),
    ('Number of batches in train loader:', len(train_loader)),
    ('Number of batches in validation loader:', len(val_loader)),
]
for caption, count in summary_rows:
    print(caption, count)

# Shape of the first training batch
for batch in train_loader:
    for caption, tensor in zip(('Features shape:', 'Labels shape:'), batch[:2]):
        print(caption, tensor.shape)
    break

Train dataset size: 185584
Validation dataset size: 5550
Number of features: 768
Number of classes: 28
Number of batches in train loader: 2900
Number of batches in validation loader: 87
Features shape: torch.Size([64, 768])
Labels shape: torch.Size([64])


In [6]:
# NOTE(review): ChallengeDataset is not defined in any cell visible here; it
# must be defined or imported before this cell for a fresh-kernel run to work.
full_dataset = ChallengeDataset()
# Example split sizes (70% / 20% / remainder), adjust according to your dataset
train_size = int(0.7 * len(full_dataset))
val_size = int(0.2 * len(full_dataset))
test_size = len(full_dataset) - train_size - val_size
# Split the dataset with a seeded generator so the partition is identical on
# every run of the notebook.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    full_dataset, [train_size, val_size, test_size], generator=split_generator)

# Only the training loader shuffles; evaluation loaders keep a fixed order so
# validation/test batches are the same from run to run.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

print(len(train_loader))
print(len(val_loader))
print(len(test_loader))

# print one batch
for batch in test_loader:
    print(batch)
    break

304
87
44


KeyboardInterrupt: 

In [7]:
from torch.utils.data import Dataset
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler


class CustomDataset(Dataset):
    """In-memory tensor dataset with an optional sensitive attribute.

    Training mode rebalances the data with SMOTE and yields ``(X, y)`` pairs;
    validation mode keeps the data as given and yields ``(X, y, S)`` triples.
    No feature scaling happens in this class: ``X`` is stored as received.
    """

    def __init__(self, X, y, S, is_train=True, random_state=None):
        """
        Store the inputs as tensors, oversampling first when training.
        :param X: Features (NumPy array, pandas DataFrame or nested sequence)
        :param y: Labels (NumPy array, pandas Series or sequence of ints)
        :param S: Sensitive attribute, one value per sample; only used (and
            required) when ``is_train`` is False
        :param is_train: Flag to indicate if it is training data
        :param random_state: Seed forwarded to SMOTE; the default ``None`` keeps
            the previous unseeded behaviour
        :raises ValueError: If ``is_train`` is False and ``S`` is None
        """
        self.is_train = is_train
        if is_train:
            # Oversample minority classes on the training split only. The
            # sensitive attribute is not resampled, so it is not stored.
            X, y = SMOTE(random_state=random_state).fit_resample(X, y)
            self.S = None
        else:
            if S is None:
                raise ValueError("S is required when is_train is False")
            self.S = self._to_tensor(S, torch.float32)
        # Both modes share one conversion path so the accepted input types
        # no longer depend on the value of ``is_train``.
        self.X = self._to_tensor(X, torch.float32)
        self.y = self._to_tensor(y, torch.long)

    @staticmethod
    def _to_tensor(values, dtype):
        """Copy ``values`` into a new tensor of ``dtype``, unwrapping pandas objects."""
        if hasattr(values, "to_numpy"):
            # pandas containers: use the underlying array so a non-default
            # index cannot influence the conversion.
            values = values.to_numpy()
        return torch.tensor(values, dtype=dtype)

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(X, y)`` in training mode and ``(X, y, S)`` otherwise."""
        if self.is_train:
            return self.X[idx], self.y[idx]
        return self.X[idx], self.y[idx], self.S[idx]


def create_datasets(datapath, test_size=0.2):
    """
    Loads the pickled data, splits it into train and validation datasets and
    standardizes the features.
    :param datapath: Path to a pickle file holding 'X_train' (features),
        'Y' (labels) and 'S_train' (sensitive attribute)
    :param test_size: Size of the validation set
    :return: Tuple ``(train_dataset, val_dataset, scaler)`` where ``scaler`` is
        the StandardScaler fitted on the training features
    """
    # NOTE(security): unpickling can execute arbitrary code; only load files
    # from a trusted source.
    with open(datapath, 'rb') as handle:
        data = pd.read_pickle(handle)
    features = data['X_train']
    labels = data['Y']
    sensitive = data['S_train']
    # Split BEFORE fitting the scaler: fitting on the full data would leak
    # validation-set statistics into the training features.
    X_train, X_val, y_train, y_val, _, S_val = train_test_split(
        features, labels, sensitive, test_size=test_size, stratify=labels, random_state=42)
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_val = scaler.transform(X_val)
    # The training dataset does not carry the sensitive attribute.
    train_dataset = CustomDataset(X_train, y_train, None, is_train=True)
    val_dataset = CustomDataset(X_val, y_val, S_val, is_train=False)
    return train_dataset, val_dataset, scaler

In [8]:
# Build datasets (the fitted scaler is discarded here) and their loaders.
train_dataset, val_dataset, _ = create_datasets('../data/data-challenge-student.pickle')
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
print('Train dataset size:', len(train_dataset))
print('Validation dataset size:', len(val_dataset))

# Print the shape of each tensor in the first batch of every loader. Training
# batches hold two tensors and validation batches three, so zip() stops at the
# right caption for each.
shape_captions = ('Features shape:', 'Labels shape:', 'Sensitive shape:')
for loader in (train_loader, val_loader):
    for batch in loader:
        for caption, tensor in zip(shape_captions, batch):
            print(caption, tensor.shape)
        break



Train dataset size: 185584
Validation dataset size: 5550
Features shape: torch.Size([64, 768])
Labels shape: torch.Size([64])
Features shape: torch.Size([64, 768])
Labels shape: torch.Size([64])
Sensitive shape: torch.Size([64])
