In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import scipy.sparse as sp

In [2]:
# Locations of the Madelon feature and label files, relative to the working directory
DATA_DIR = 'data'

madelon_train = f'{DATA_DIR}/madelon_train.data'
madelon_train_labels = f'{DATA_DIR}/madelon_train.labels'

madelon_valid = f'{DATA_DIR}/madelon_valid.data'
madelon_valid_labels = f'{DATA_DIR}/madelon_valid.labels'

In [3]:
# Read the Madelon training split: space-delimited files without a header row
madelon_train_df = pd.read_csv(madelon_train, sep=' ', header=None)
madelon_train_labels_df = pd.read_csv(
    madelon_train_labels, sep=' ', header=None, names=['target']
)

# Put the labels next to the features, then discard column 500
# (presumably an empty column created by a trailing space on each data line -- confirm against the file)
madelon_train_withlabels = pd.concat(
    [madelon_train_df, madelon_train_labels_df], axis=1
).drop(columns=[500])
madelon_train_withlabels

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,491,492,493,494,495,496,497,498,499,target
0,485,477,537,479,452,471,491,476,475,473,...,481,477,485,511,485,481,479,475,496,-1
1,483,458,460,487,587,475,526,479,485,469,...,478,487,338,513,486,483,492,510,517,-1
2,487,542,499,468,448,471,442,478,480,477,...,481,492,650,506,501,480,489,499,498,-1
3,480,491,510,485,495,472,417,474,502,476,...,480,474,572,454,469,475,482,494,461,1
4,484,502,528,489,466,481,402,478,487,468,...,479,452,435,486,508,481,504,495,511,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,490,505,503,474,463,461,519,476,518,467,...,479,449,588,499,506,475,463,507,501,1
1996,480,475,476,480,495,482,515,479,480,484,...,474,473,424,454,570,476,493,465,485,-1
1997,480,517,631,470,485,474,535,476,493,466,...,483,479,687,488,488,483,500,523,481,-1
1998,484,481,505,478,542,477,518,477,510,472,...,483,526,750,486,529,484,473,527,485,1


In [4]:
# Read the Madelon validation split: space-delimited files without a header row
madelon_valid_df = pd.read_csv(madelon_valid, sep=' ', header=None)
madelon_valid_label_df = pd.read_csv(
    madelon_valid_labels, sep=' ', header=None, names=['target']
)

# Put the labels next to the features, then discard column 500
# (presumably an empty column created by a trailing space on each data line -- confirm against the file)
madelon_valid_withlabels = pd.concat(
    [madelon_valid_df, madelon_valid_label_df], axis=1
).drop(columns=[500])
madelon_valid_withlabels

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,491,492,493,494,495,496,497,498,499,target
0,483,454,513,495,523,469,453,477,506,479,...,480,543,259,413,520,485,498,523,510,-1
1,485,508,493,487,478,472,504,476,479,475,...,480,535,534,514,452,484,495,548,477,-1
2,483,521,507,475,493,486,421,475,496,483,...,476,498,495,508,528,486,465,508,503,-1
3,474,504,576,480,553,483,524,478,483,483,...,475,470,463,509,525,479,467,552,517,1
4,495,474,523,479,495,488,485,476,497,478,...,471,522,343,509,520,475,493,506,491,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,493,458,503,478,517,479,472,478,444,477,...,475,485,443,517,486,474,489,506,506,1
596,481,484,481,490,449,481,467,478,469,483,...,485,508,599,498,527,481,490,455,451,1
597,485,485,530,480,444,487,462,475,509,494,...,474,502,368,453,482,478,481,484,517,1
598,477,469,528,485,483,469,482,477,494,476,...,476,453,638,471,538,470,490,613,492,1


In [5]:
# Stack the training rows on top of the validation rows and renumber them 0..n-1
data = pd.concat(
    [madelon_train_withlabels, madelon_valid_withlabels],
    axis=0,
    ignore_index=True,
)
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,491,492,493,494,495,496,497,498,499,target
0,485,477,537,479,452,471,491,476,475,473,...,481,477,485,511,485,481,479,475,496,-1
1,483,458,460,487,587,475,526,479,485,469,...,478,487,338,513,486,483,492,510,517,-1
2,487,542,499,468,448,471,442,478,480,477,...,481,492,650,506,501,480,489,499,498,-1
3,480,491,510,485,495,472,417,474,502,476,...,480,474,572,454,469,475,482,494,461,1
4,484,502,528,489,466,481,402,478,487,468,...,479,452,435,486,508,481,504,495,511,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2595,493,458,503,478,517,479,472,478,444,477,...,475,485,443,517,486,474,489,506,506,1
2596,481,484,481,490,449,481,467,478,469,483,...,485,508,599,498,527,481,490,455,451,1
2597,485,485,530,480,444,487,462,475,509,494,...,474,502,368,453,482,478,481,484,517,1
2598,477,469,528,485,483,469,482,477,494,476,...,476,453,638,471,538,470,490,613,492,1


In [6]:
def normalize_features(data, target_column):
    """Min-max scale every column except the last one and re-attach the labels.

    A fresh ``MinMaxScaler`` is fitted on all rows of ``data``. The last column
    is left out of scaling by position (not by name). Scaled feature values are
    rounded to 3 decimals; the labels are attached afterwards and are therefore
    stored exactly as given.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns followed by one trailing column that is not scaled.
    target_column : pandas.Series or array-like
        One label per row of ``data``, in the same row order as ``data``.

    Returns
    -------
    pandas.DataFrame
        The scaled features under their original column names plus a
        ``'target'`` column, on a fresh 0..n-1 index.
    """
    scaler = MinMaxScaler()

    # Everything but the trailing column is treated as a feature
    features = data.iloc[:, :-1]
    scaled = scaler.fit_transform(features)

    # The scaler returns a plain array by default, so the rebuilt frame gets a
    # new 0..n-1 index instead of the index of ``data``.
    normalized = pd.DataFrame(scaled, columns=features.columns).round(3)

    # Assigning a Series aligns on index labels. If ``data`` carries a
    # non-default index that would yield NaN or mismatched labels (or raise on
    # duplicate labels), so the labels are attached by position instead.
    if isinstance(target_column, pd.Series):
        target_column = target_column.to_numpy()
    normalized['target'] = target_column

    return normalized

In [7]:
def _percent_label(fraction):
    """Return ``fraction`` as a whole-number percentage, e.g. 0.29 -> 29."""
    # Rounding before truncating absorbs binary floating-point error:
    # 0.29 * 100 evaluates to 28.999999999999996, which a bare int() turns into 28.
    return int(round(fraction * 100, 6))


def _save_subdataset(frame, output_dir, dataset_name, label):
    """Write ``frame`` to ``<output_dir>/<dataset_name>_<label>/<dataset_name>_<label>.csv``."""
    stem = f"{dataset_name}_{label}"
    target_dir = os.path.join(output_dir, stem)
    os.makedirs(target_dir, exist_ok=True)
    frame.to_csv(os.path.join(target_dir, f"{stem}.csv"), index=False)


def generate_subdatasets(data, fractions, target_column, dataset_name="madelon", output_dir=""):
    """Build class-balanced subsamples of ``data`` and save them, plus the full set, as CSV.

    For each fraction, the same number of rows is drawn without replacement
    from every class: ``fraction * len(data) / number_of_classes``, truncated
    to an integer. The per-class samples are concatenated class by class (not
    shuffled). Every returned frame gets a leading ``'Index'`` column holding
    its 0..n-1 row number. ``data`` itself is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Source rows, including the label column.
    fractions : iterable of float
        Fractions of ``len(data)`` to sample, e.g. ``[0.33, 0.66]``.
    target_column : str
        Name of the label column used to split ``data`` into classes.
    dataset_name : str, default "madelon"
        Prefix for the output folders and file names.
    output_dir : str, default ""
        Directory under which the output folders are created. The default
        creates them relative to the current working directory.

    Returns
    -------
    dict of str -> pandas.DataFrame
        ``"subdataset_<percent>"`` for each fraction and ``"subdataset_full"``.

    Raises
    ------
    ValueError
        If a class has fewer rows than the per-class sample size requested.
    """
    # Seeds NumPy's global RNG; kept for callers that rely on this side effect.
    # The sampling below is pinned by random_state=42 independently of it.
    np.random.seed(42)

    unique_classes = data[target_column].unique()

    # One frame per class label
    class_data = {cls: data[data[target_column] == cls] for cls in unique_classes}

    subdatasets = {}
    for fraction in fractions:
        label = _percent_label(fraction)
        # Same float-error guard as for the label, so a product such as
        # 56.99999999999999 is not truncated to 56.
        samples_per_class = int(round(fraction * len(data) / len(unique_classes), 6))

        for cls in unique_classes:
            available = len(class_data[cls])
            if samples_per_class > available:
                raise ValueError(
                    f"fraction {fraction} needs {samples_per_class} rows per class, "
                    f"but class {cls!r} only has {available}"
                )

        subdataset = pd.concat([
            class_data[cls].sample(samples_per_class, random_state=42)
            for cls in unique_classes
        ])
        # Add an explicit "Index" column
        subdataset = subdataset.reset_index(drop=True)
        subdataset.insert(0, 'Index', subdataset.index)

        subdatasets[f"subdataset_{label}"] = subdataset
        _save_subdataset(subdataset, output_dir, dataset_name, label)

    # Full dataset; reset_index returns a new frame, so the caller's frame is untouched
    full = data.reset_index(drop=True)
    full.insert(0, 'Index', full.index)
    subdatasets["subdataset_full"] = full
    _save_subdataset(full, output_dir, dataset_name, "full")

    return subdatasets

In [8]:
# Scale the combined features; the labels are passed separately so they can be re-attached
target_labels = data["target"]
normalized_data = normalize_features(data=data, target_column=target_labels)
normalized_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,491,492,493,494,495,496,497,498,499,target
0,0.561,0.429,0.588,0.394,0.280,0.261,0.567,0.455,0.385,0.36,...,0.529,0.478,0.468,0.579,0.429,0.558,0.44,0.399,0.489,-1
1,0.512,0.348,0.317,0.515,0.692,0.348,0.693,0.727,0.470,0.28,...,0.441,0.533,0.288,0.587,0.432,0.605,0.57,0.523,0.600,-1
2,0.610,0.708,0.454,0.227,0.268,0.261,0.390,0.636,0.427,0.44,...,0.529,0.561,0.670,0.559,0.481,0.535,0.54,0.484,0.500,-1
3,0.439,0.489,0.493,0.485,0.412,0.283,0.300,0.273,0.615,0.42,...,0.500,0.461,0.574,0.348,0.377,0.419,0.47,0.466,0.305,1
4,0.537,0.536,0.556,0.545,0.323,0.478,0.245,0.636,0.487,0.26,...,0.471,0.339,0.406,0.478,0.503,0.558,0.69,0.470,0.568,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2595,0.756,0.348,0.468,0.379,0.479,0.435,0.498,0.636,0.120,0.44,...,0.353,0.522,0.416,0.603,0.432,0.395,0.54,0.509,0.542,1
2596,0.463,0.459,0.391,0.561,0.271,0.478,0.480,0.636,0.333,0.56,...,0.647,0.650,0.607,0.526,0.565,0.558,0.55,0.327,0.253,1
2597,0.561,0.464,0.563,0.409,0.256,0.609,0.462,0.364,0.675,0.78,...,0.324,0.617,0.324,0.344,0.419,0.488,0.46,0.431,0.600,1
2598,0.366,0.395,0.556,0.485,0.375,0.217,0.534,0.545,0.547,0.42,...,0.382,0.344,0.655,0.417,0.601,0.302,0.55,0.890,0.468,1


In [None]:
# Build the 33% and 66% class-balanced subsets (plus the full set) and write them to disk
subdatasets = generate_subdatasets(
    normalized_data,
    fractions=[0.33, 0.66],
    target_column='target',
    dataset_name="madelon",
)

# Show the class balance of each generated dataset
for subset_key in ('subdataset_33', 'subdataset_66', 'subdataset_full'):
    print(subdatasets[subset_key]['target'].value_counts())