# DATASETS

## Función que ajusta los datos para que sean todos los atributos entre 0 y 1

In [24]:
from collections import Counter

def minmax_scale_features(df, label_column = "label"):
    import pandas as pd
    from sklearn.preprocessing import MinMaxScaler, LabelEncoder
    import pandas.api.types as ptypes
    """
    Escala todas las columnas de características (todas excepto la columna de etiqueta)
    en el rango [0,1] usando MinMaxScaler y transforma la columna de etiquetas a enteros.
    
    Primero se verifica si la columna de etiquetas es de tipo entero o se puede convertir.
    Si no es posible, se codifica usando LabelEncoder.
    
    Parameters:
        df (pd.DataFrame): DataFrame que contiene las características y la columna de etiqueta.
        label_column (str): Nombre de la columna de etiquetas.
    
    Returns:
        tuple: (df_result, encoder)
            - df_result (pd.DataFrame): DataFrame con las características escaladas y la columna de etiqueta codificada.
            - encoder (LabelEncoder or None): Codificador usado para transformar las etiquetas, o None si no se usó.
    """
    # Se separan las características (excluyendo la columna de etiqueta)
    features = df.drop(columns=[label_column])
    
    # Escalado de las características
    scaler = MinMaxScaler(feature_range=(0, 1))
    features_scaled = pd.DataFrame(
        scaler.fit_transform(features),
        columns=features.columns,
        index=df.index
    )
    
    # Procesamiento de la columna de etiquetas:
    # Si la columna ya es de tipo entero, se deja tal cual.
    # En otro caso, se intenta convertirla a enteros.
    # Si falla, se usa LabelEncoder.
    if ptypes.is_integer_dtype(df[label_column]):
        labels_encoded = df[label_column]
        encoder = None
    else:
        try:
            labels_encoded = df[label_column].astype(int)
            encoder = None
        except (ValueError, TypeError):
            encoder = LabelEncoder()
            labels_encoded = pd.Series(encoder.fit_transform(df[label_column]), index=df.index)
    
    # Se une la columna de etiquetas codificada a las características escaladas
    df_result = features_scaled.join(labels_encoded.rename(label_column))
    
    return df_result


### MNIST with numbers


In [25]:
def mnist_numbers():
    
    import pandas as pd
    from sklearn.datasets import fetch_openml
    
    # Load data from https://www.openml.org/d/554
    X, y = fetch_openml("mnist_784", version=1, return_X_y=True, as_frame=False, parser="pandas")
    
    # Crear nombres de columnas para X: C0, C1, ..., C783
    column_names = [f'C{i}' for i in range(X.shape[1])]
    
    # Crear el DataFrame a partir de X con los nombres de columna adecuados
    df = pd.DataFrame(X, columns=column_names)
    df["label"] = y
    df = minmax_scale_features(df)
    return df
df = mnist_numbers()
print(df.shape)
res = list(set(df["label"]))
print(res)
print(len(res))
Counter(df["label"])

(70000, 785)
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
10


Counter({1: 7877,
         7: 7293,
         3: 7141,
         2: 6990,
         9: 6958,
         0: 6903,
         6: 6876,
         8: 6825,
         4: 6824,
         5: 6313})

### MNIST with clothes

In [27]:
def mnist_with_clothes():
    import pandas as pd
    import gdown
    import os
    
    output = 'fashion-mnist_train.csv'
    # Get the current working directory
    file_path = os.getcwd() + f'\\{output}'
    
    if os.path.exists(file_path):
        df = pd.read_csv(output)
    else:
        file_id = '11k7ZAbGlTwnvvKOTCyfpvzOan5KsM5Au'
        url = f'https://drive.google.com/uc?id={file_id}'
        
        gdown.download(url, output, quiet=False)
        
        # Now you can load it with pandas
        df = pd.read_csv(output)
        df = minmax_scale_features(df)
    return df

df = mnist_with_clothes()
print(df.shape)
res = list(set(df["label"]))
print(res)
print(len(res))
Counter(df["label"])

(60000, 785)
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
10


Counter({2: 6000,
         9: 6000,
         6: 6000,
         0: 6000,
         3: 6000,
         4: 6000,
         5: 6000,
         8: 6000,
         7: 6000,
         1: 6000})

### Chinese MNIST

In [28]:
def chinese_mnist():
    import pandas as pd
    import gdown
    import os
    
    output = 'chineseMNIST.csv'
    file_path = os.getcwd() + f'\\{output}'
    
    if os.path.exists(file_path):
        df = pd.read_csv(output)
    else:
        file_id = '1fFcfjVYtCdDSotcP19VdIGSm14znGzHD'
        url = f'https://drive.google.com/uc?id={file_id}'
        gdown.download(url, output, quiet=False)
        
        # Now you can load it with pandas
        df = pd.read_csv(output)
        df = minmax_scale_features(df)
    return df
df = chinese_mnist()
print(df.shape)
res = list(set(df["label"]))
print(res)
print(len(res))
Counter(df["label"])

(15000, 4097)
[100000000, 0, 1, 2, 100, 3, 4, 5, 1000, 9, 10, 6, 7, 8, 10000]
15


Counter({9: 1000,
         10: 1000,
         100: 1000,
         1000: 1000,
         10000: 1000,
         100000000: 1000,
         0: 1000,
         1: 1000,
         2: 1000,
         3: 1000,
         4: 1000,
         5: 1000,
         6: 1000,
         7: 1000,
         8: 1000})

### Wine quality

In [29]:
def wine():
    import os
    import pandas as pd
    from ucimlrepo import fetch_ucirepo 
  
    # fetch dataset 
    wine_quality = fetch_ucirepo(id=186) 
  
    # data (as pandas dataframes) 
    df = wine_quality.data.features 
    df["label"] = wine_quality.data.targets
    df = minmax_scale_features(df)
    return df
df = wine()
print(df.shape)
res = list(set(df["label"]))
print(res)
print(len(res))
Counter(df["label"])

(6497, 12)
[3, 4, 5, 6, 7, 8, 9]
7


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["label"] = wine_quality.data.targets


Counter({6: 2836, 5: 2138, 7: 1079, 4: 216, 8: 193, 3: 30, 9: 5})

### Gamma telescope

In [30]:
def gamma_telescope():
    from ucimlrepo import fetch_ucirepo 
    import pandas as pd
    
    # fetch dataset 
    magic_gamma_telescope = fetch_ucirepo(id=159) 
      
    # data (as pandas dataframes) 
    df = magic_gamma_telescope.data.features 
    df["label"] = magic_gamma_telescope.data.targets 
    df = minmax_scale_features(df)
    return df
df = gamma_telescope()
print(df.shape)
res = list(set(df["label"]))
print(res)
print(len(res))
Counter(df["label"])

(19020, 11)
[0, 1]
2


Counter({0: 12332, 1: 6688})

### Image Segmentation

In [31]:
def image_segmentation():
    from ucimlrepo import fetch_ucirepo 
    import pandas as pd
    
    # fetch dataset 
    image_segmentation = fetch_ucirepo(id=50) 
      
    # data (as pandas dataframes) 
    df = image_segmentation.data.features 
    df["label"] = image_segmentation.data.targets 
    df = minmax_scale_features(df)
    return df

df = image_segmentation()
print(df.shape)
res = list(set(df["label"]))
print(res)
print(len(res))
Counter(df["label"])

(210, 20)
[0, 1, 2, 3, 4, 5, 6]
7


Counter({0: 30, 5: 30, 2: 30, 1: 30, 6: 30, 4: 30, 3: 30})

### Digits Dataset from Sklearn

In [32]:
def digits_dataset():
    import pandas as pd
    from sklearn.datasets import load_digits
    digits = load_digits()
    df = pd.DataFrame(digits.data, columns = digits.feature_names)
    df["label"] = digits.target
    df = minmax_scale_features(df)
    return df
df = digits_dataset()
print(df.shape)
res = list(set(df["label"]))
print(res)
print(len(res))
Counter(df["label"])

(1797, 65)
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
10


Counter({3: 183,
         1: 182,
         5: 182,
         4: 181,
         6: 181,
         9: 180,
         7: 179,
         0: 178,
         2: 177,
         8: 174})

### Breast Cancer

In [33]:
def load_breast_cancer():
    import pandas as pd
    from sklearn.datasets import load_breast_cancer
    cancer = load_breast_cancer()
    df = pd.DataFrame(cancer.data, columns = cancer.feature_names)
    df["label"] = cancer.target
    df = minmax_scale_features(df)
    return df
df = load_breast_cancer()
print(df.shape)
res = list(set(df["label"]))
print(res)
print(len(res))
Counter(df["label"])

(569, 31)
[0, 1]
2


Counter({1: 357, 0: 212})