In [12]:
from sklearn.utils import shuffle
from typing import Tuple
import numpy as np
from si.data.dataset import Dataset

def stratified_train_test_split(dataset: "Dataset", test_size=0.2, random_state: int = None) -> Tuple[dict, dict]:
    """
    Split a dataset into training and testing sets while maintaining the class distribution.

    Every class found in ``dataset.y`` is shuffled independently and
    ``floor(test_size * class_count)`` of its samples are assigned to the test
    split; the remaining samples of that class go to the training split.

    Parameters
    ----------
    dataset: Dataset
        The dataset to split. Only its ``X`` (samples) and ``y`` (labels)
        attributes are read.
    test_size: float
        The proportion of each class to include in the test split (between 0 and 1).
    random_state: int
        The seed of the random number generator. ``None`` gives a different
        split on every call.

    Returns
    -------
    train: dict
        The training split, as ``{'data': X_train, 'target': y_train}``.
    test: dict
        The testing split, as ``{'data': X_test, 'target': y_test}``.

    Raises
    ------
    ValueError
        If ``test_size`` is outside [0, 1] or the dataset has no labels.

    Notes
    -----
    The splits are returned as plain dictionaries, not as ``Dataset`` objects.
    Rows inside each split are grouped by class, in the order of the sorted
    unique labels.
    """
    if not 0.0 <= test_size <= 1.0:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size!r}")
    if dataset.y is None:
        raise ValueError("stratified splitting requires labels, but dataset.y is None")

    # Always read from the dataset that was passed in, never from notebook globals.
    samples = np.asarray(dataset.X)
    labels = np.asarray(dataset.y)

    # One local generator for the whole call: reproducible for a given seed,
    # independent permutations per class, and no mutation of numpy's global RNG.
    rng = np.random.default_rng(random_state)

    # Seed both lists with an empty integer array so the concatenated result is
    # a valid integer index even when a split ends up with no samples.
    train_parts = [np.empty(0, dtype=np.intp)]
    test_parts = [np.empty(0, dtype=np.intp)]

    unique_classes, class_counts = np.unique(labels, return_counts=True)
    for label, count in zip(unique_classes, class_counts):
        class_idxs = rng.permutation(np.where(labels == label)[0])

        # floor() keeps the test share of every class at or below test_size.
        num_test = int(np.floor(test_size * count))

        test_parts.append(class_idxs[:num_test])
        train_parts.append(class_idxs[num_test:])

    train_idxs = np.concatenate(train_parts)
    test_idxs = np.concatenate(test_parts)

    # Create training and testing splits
    train_dataset = {'data': samples[train_idxs], 'target': labels[train_idxs]}
    test_dataset = {'data': samples[test_idxs], 'target': labels[test_idxs]}

    # Return the training and testing splits
    return train_dataset, test_dataset

In [16]:
# Load the iris data with scikit-learn
from sklearn.datasets import load_iris

iris = load_iris()
X, y = iris.data, iris.target
features, label = iris.feature_names, "species"

# Build the Dataset object from the arrays and metadata loaded above
dataset = Dataset(
    X=X,
    y=y,
    features=features,
    label=label,
)



In [20]:
# Stratified 80/20 split of the iris dataset; the returned pair is the cell's displayed output.
stratified_train_test_split(dataset=dataset, test_size=0.2)

({'data': array([[5.1, 3.5, 1.4, 0.3],
         [5. , 3.4, 1.6, 0.4],
         [5. , 3. , 1.6, 0.2],
         [4.8, 3. , 1.4, 0.3],
         [5. , 3.5, 1.6, 0.6],
         [5.5, 3.5, 1.3, 0.2],
         [4.6, 3.2, 1.4, 0.2],
         [5. , 3.6, 1.4, 0.2],
         [4.8, 3.4, 1.9, 0.2],
         [5.8, 4. , 1.2, 0.2],
         [5.1, 3.5, 1.4, 0.2],
         [5.1, 3.8, 1.6, 0.2],
         [5. , 3.3, 1.4, 0.2],
         [4.4, 3. , 1.3, 0.2],
         [4.8, 3.1, 1.6, 0.2],
         [5.7, 3.8, 1.7, 0.3],
         [4.8, 3. , 1.4, 0.1],
         [5.2, 3.4, 1.4, 0.2],
         [5.4, 3.9, 1.7, 0.4],
         [4.5, 2.3, 1.3, 0.3],
         [5.3, 3.7, 1.5, 0.2],
         [4.6, 3.6, 1. , 0.2],
         [5.1, 3.7, 1.5, 0.4],
         [4.8, 3.4, 1.6, 0.2],
         [4.4, 3.2, 1.3, 0.2],
         [5.4, 3.9, 1.3, 0.4],
         [4.7, 3.2, 1.3, 0.2],
         [4.6, 3.4, 1.4, 0.3],
         [5. , 3.4, 1.5, 0.2],
         [5.1, 3.8, 1.5, 0.3],
         [5.4, 3.4, 1.7, 0.2],
         [4.3, 3. , 1.1, 0.1],
