In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
import pickle
import os
import sys
import copy
from cleanlab import noise_generation

from utils import *
pd.options.mode.chained_assignment = None # None / 'warn'

In [None]:
# --- experiment configuration ---
# Total probability that a training label is flipped; _helper() splits it
# evenly across the other classes when it builds the noise matrix.
noise_amount = 0.2

# Not referenced by any cell visible in this notebook.
frac_zero_noise_rates = 0

# One processed copy of every dataset is produced per seed.
seeds = [*range(10)]

def _helper():
    """Build and save one symmetric-label-noise copy of the current dataset per seed.

    Relies on module-level names set by other cells: ``seeds`` and
    ``noise_amount`` from the configuration cell, and ``dataset`` and
    ``get_data`` from whichever dataset cell ran last. For every seed the
    training labels returned by ``get_data`` are replaced with noisy labels,
    and the frames are passed to ``save_data`` together with the clean labels.

    Returns:
        None.
    """
    for seed in seeds:
        train_df, test_df, cont_cols, cat_cols = get_data(seed)

        # Seed numpy's global RNG after the data is built and before the
        # noisy labels are generated.
        np.random.seed(seed=seed)

        # Symmetric noise matrix: 1-noise_amount on the diagonal, the
        # remaining mass shared equally by the off-diagonal entries.
        n_classes = len(np.unique(train_df["Target"]))
        noise_matrix = np.ones((n_classes, n_classes)) * noise_amount/(n_classes-1)
        np.fill_diagonal(noise_matrix, 1-noise_amount)

        # Keep an independent copy of the clean labels before overwriting
        # the "Target" column.
        clean_labels = copy.deepcopy(train_df["Target"].to_numpy())
        train_df["Target"] = noise_generation.generate_noisy_labels(clean_labels, noise_matrix)

        # save processed train and test data
        out_dir = os.getcwd()+'/datasets/{}/seed_{}/data'.format(dataset, seed)
        save_data(out_dir, train_df, test_df, cont_cols, cat_cols, y_train_clean_arr=clean_labels)


In [None]:
dataset = 'Cardiotocography'

### read dataset
# source: http://archive.ics.uci.edu/ml/datasets/Cardiotocography
datapath = 'data_raw/Cardiotocography/CTG.xls'
df = pd.read_excel(datapath, sheet_name='Data')

# Columns this pipeline does not use, dropped in a single call.
df = df.drop(columns=['DL.1', 'DS.1', 'DP.1', 'b', 'e', 'AC', 'FM', 'UC'])

### remove empty columns
# A column is treated as empty once it has at least 200 missing entries.
missing_features = df.isnull().sum()
empty_columns = missing_features[missing_features>=200].index
df = df.drop(columns=empty_columns).reset_index(drop=True)

### preprocessing columns
# assign the column name of the target feature as "Target"
df = df.rename(columns={"NSP":"Target"})
# make sure the label ranges from 0 to (num_class-1)
# Targets = Normal=0; Suspect=1; Pathologic=2
df["Target"] = df["Target"].replace({1:0, 2:1, 3:2})

categorical_features = ['A', 'B', 'C', 'D', 'E','AD', 'DE', 'LD', 'FS', 'SUSP', 'CLASS'] ###
continuous_features = list(OrderedSet(df.columns.to_list()) - OrderedSet(["Target"]) - OrderedSet(categorical_features))

### fill NA
# Continuous columns get their mean; every other column (including "Target")
# gets its most frequent value.
for c in df.columns[df.isnull().any(axis=0)]:
    fill_value = df[c].mean() if c in continuous_features else df[c].mode()[0]
    # Assign the result back: an inplace fillna on df[c] acts on a temporary
    # selection, which warns on pandas 2.2 and leaves df unchanged under
    # Copy-on-Write.
    df[c] = df[c].fillna(fill_value)

def _clip_outliers(_df):
    _df.loc[_df["FM.1"]>0.00139*15,"FM.1"] = 0.00139*15
# Cap "FM.1" on the full frame once, before get_data draws any train/test split.
_clip_outliers(df)

# Passed to shuffle_split_data in get_data as the training fraction.
train_fraction = 0.8 ###

def get_data(seed):
    """Return clipped and scaled Cardiotocography train/test frames for one seed.

    The module-level ``df`` is split with ``shuffle_split_data`` using
    ``train_fraction``. Every column that is neither categorical nor
    "Target" is clipped to percentiles computed on the training split only,
    and both splits are then passed to ``scale_features``.

    Args:
        seed: forwarded to ``shuffle_split_data``.

    Returns:
        Tuple ``(df_train_scaled, df_test_scaled, continuous_features,
        categorical_features)``; the two lists are the module-level ones.
    """
    df_train, df_test = shuffle_split_data(df, train_fraction, seed=seed)

    not_clipped = set(categorical_features) | {"Target"}
    tighter_upper = ("FM.1", "ALTV")  # these use the 95th instead of the 98th percentile
    for feature in df_train.columns:
        if feature in not_clipped:
            continue

        # Bounds come from the training split and are reused for the test split.
        lower = np.percentile(df_train[feature], 2)
        upper = np.percentile(df_train[feature], 95 if feature in tighter_upper else 98)
        for frame in (df_train, df_test):
            frame.loc[frame[feature]<lower, feature] = lower
            frame.loc[frame[feature]>upper, feature] = upper

    # One MinMaxScaler per feature group (continuous, categorical).
    df_train_scaled, df_test_scaled = scale_features(
        df_train,
        df_test,
        [continuous_features, categorical_features],
        [MinMaxScaler(clip=True), MinMaxScaler(clip=True)],
    )
    return df_train_scaled, df_test_scaled, continuous_features, categorical_features

_helper()


In [None]:
# Dataset name; _helper() uses it to build the output folder.
dataset = 'CreditFraud'

# Raw CSV, re-read by get_data on every call.
# source: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud
datapath = 'data_raw/CreditFraud/creditcard.csv'

# Passed to shuffle_split_data in get_data as the training fraction.
train_fraction = 0.75

def get_data(seed):
    """Return an undersampled, clipped and scaled CreditFraud split for one seed.

    The raw CSV at ``datapath`` is read on every call. All rows with
    ``Class == 1`` are kept and twice as many ``Class == 0`` rows are sampled
    with ``random_state=seed``. The combined frame is split with
    ``shuffle_split_data``, every feature is clipped to percentiles computed
    on the training split, and both splits are passed to ``scale_features``.

    Args:
        seed: used for the undersampling and forwarded to ``shuffle_split_data``.

    Returns:
        Tuple ``(df_train_scaled, df_test_scaled, continuous_features,
        categorical_features)``; ``categorical_features`` is an empty list.
    """
    raw = pd.read_csv(datapath)
    fraud = raw[raw['Class']==1]
    non_fraud = raw[raw['Class']==0].sample(2*len(fraud), random_state=seed)
    # DataFrame.append was removed in pandas 2.0; pd.concat keeps the same
    # row order (sampled non-fraud rows first, then all fraud rows).
    df = pd.concat([non_fraud, fraud]).reset_index(drop=True)

    df = df.rename(columns={"Class":"Target"})

    categorical_features = [] ###
    continuous_features = list(OrderedSet(df.columns.to_list()) - OrderedSet(["Target"]) - OrderedSet(categorical_features))

    df_train, df_test = shuffle_split_data(df, train_fraction, seed=seed)

    not_clipped = set(categorical_features) | {"Target"}
    for feature in df_train.columns:
        if feature in not_clipped:
            continue

        # Bounds come from the training split; "Amount" uses the 95th
        # percentile as its upper bound, every other feature the 98th.
        lower = np.percentile(df_train[feature], 2)
        upper = np.percentile(df_train[feature], 95 if feature=="Amount" else 98)
        for frame in (df_train, df_test):
            frame.loc[frame[feature]<lower, feature] = lower
            frame.loc[frame[feature]>upper, feature] = upper

    scaler_list = [MinMaxScaler(clip=True), MinMaxScaler(clip=True)]
    feature_list = [continuous_features, categorical_features]
    df_train_scaled, df_test_scaled = scale_features(df_train, df_test, feature_list, scaler_list)
    return df_train_scaled, df_test_scaled, continuous_features, categorical_features

_helper()


In [None]:
dataset = 'HAR'

# source: https://www.kaggle.com/datasets/uciml/human-activity-recognition-with-smartphones
datapath_tr = 'data_raw/HAR/train.csv'
datapath_te = 'data_raw/HAR/test.csv'
df_tr, df_te = (pd.read_csv(path) for path in (datapath_tr, datapath_te))

# get_data uses this row count to split the stacked frame back into the
# provided train and test parts.
num_train = len(df_tr)

# Stack train on top of test (original row labels are kept), drop the
# "subject" column and name the target column "Target".
df = pd.concat([df_tr, df_te], ignore_index=False, copy=False)
df = df.drop("subject", axis=1).rename(columns={"Activity":"Target"})

# make sure the label ranges from 0 to (num_class-1)
df["Target"] = LabelEncoder().fit_transform(df["Target"])

def get_data(seed):
    """Return PCA-reduced, clipped and scaled HAR train/test frames for one seed.

    The first ``num_train`` rows of the module-level ``df`` are the training
    part and the remaining rows the test part; each is passed through
    ``shuffle_data``. The features are passed to ``scale_features`` with a
    ``StandardScaler`` and then projected with ``PCA(n_components=0.9)``
    fitted on the training part only. The components are clipped with
    bounds computed on the training part, and both parts are passed to
    ``scale_features`` again with ``MinMaxScaler(clip=True)``.

    Args:
        seed: forwarded to ``shuffle_data`` and used as the PCA random state.

    Returns:
        Tuple ``(df_train_scaled, df_test_scaled, continuous_features,
        categorical_features)``. The continuous features are the PCA
        components named ``col0``, ``col1``, ...; ``categorical_features``
        is an empty list.
    """
    df_train = shuffle_data(df.iloc[:num_train,:], seed=seed)
    df_test = shuffle_data(df.iloc[num_train:,:], seed=seed)

    # dim reduction
    features = list(OrderedSet(df_train.columns.to_list()) - OrderedSet(["Target"]))
    df_train, df_test = scale_features(df_train, df_test, [features], [StandardScaler()])

    pca = PCA(n_components=0.9, random_state=seed)
    train_components = pca.fit_transform(df_train[features])
    test_components = pca.transform(df_test[features])
    component_names = ['col'+str(i) for i in range(train_components.shape[1])]

    # Build the component frames on the index of the frames they come from.
    # concat(axis=1) aligns on index labels, so a fresh RangeIndex pairs each
    # row with the right "Target" only if shuffle_data/scale_features (from
    # utils, not visible here) return a reset index.
    df_train = pd.concat(
        [pd.DataFrame(train_components, index=df_train.index, columns=component_names), df_train["Target"]],
        axis=1,
    )
    df_test = pd.concat(
        [pd.DataFrame(test_components, index=df_test.index, columns=component_names), df_test["Target"]],
        axis=1,
    )

    for feature in component_names:
        if feature=='col0':
            # First component: clip to the 1st/99th training percentiles.
            lower = np.percentile(df_train[feature], 1)
            upper = np.percentile(df_train[feature], 99)
        else:
            # Remaining components: clip to the 1.5*IQR fences of the training part.
            q1 = df_train[feature].quantile(0.25)
            q3 = df_train[feature].quantile(0.75)
            iqr = q3 - q1
            lower = q1 - 1.5 * iqr
            upper = q3 + 1.5 * iqr

        for frame in (df_train, df_test):
            frame.loc[frame[feature]<lower, feature] = lower
            frame.loc[frame[feature]>upper, feature] = upper

    categorical_features = [] ###
    continuous_features = list(OrderedSet(df_train.columns.to_list()) - OrderedSet(["Target"]) - OrderedSet(categorical_features))

    scaler_list = [MinMaxScaler(clip=True), MinMaxScaler(clip=True)]
    feature_list = [continuous_features, categorical_features]
    df_train_scaled, df_test_scaled = scale_features(df_train, df_test, feature_list, scaler_list)
    return df_train_scaled, df_test_scaled, continuous_features, categorical_features

_helper()


In [None]:
dataset = 'Letter'

# source: https://www.kaggle.com/datasets/nishan192/letterrecognition-using-svm
datapath = 'data_raw/Letter/letter-recognition.csv'

# Read the raw table and name the target column "Target".
df = pd.read_csv(datapath).rename(columns={"letter":"Target"})

# make sure the label ranges from 0 to (num_class-1)
df["Target"] = LabelEncoder().fit_transform(df["Target"])

# Every non-target column is treated as continuous.
categorical_features = []
continuous_features = list(OrderedSet(df.columns.to_list()) - OrderedSet(["Target"]) - OrderedSet(categorical_features))

# Passed to shuffle_split_data in get_data as the training fraction.
train_fraction = 0.75

def get_data(seed):
    """Return clipped and scaled Letter train/test frames for one seed.

    The module-level ``df`` is split with ``shuffle_split_data`` using
    ``train_fraction``. Each continuous feature is clipped to the 1st/99th
    percentiles of the training split, and both splits are passed to
    ``scale_features``.

    Args:
        seed: forwarded to ``shuffle_split_data``.

    Returns:
        Tuple ``(df_train_scaled, df_test_scaled, continuous_features,
        categorical_features)``; the two lists are the module-level ones.
    """
    df_train, df_test = shuffle_split_data(df, train_fraction, seed=seed)

    for feature in continuous_features:
        # Both bounds are taken from the training split before anything is clipped.
        lower, upper = (np.percentile(df_train[feature], q) for q in (1, 99))
        for frame in (df_train, df_test):
            frame.loc[frame[feature]<lower, feature] = lower
            frame.loc[frame[feature]>upper, feature] = upper

    # One MinMaxScaler per feature group (continuous, categorical).
    df_train_scaled, df_test_scaled = scale_features(
        df_train,
        df_test,
        [continuous_features, categorical_features],
        [MinMaxScaler(clip=True), MinMaxScaler(clip=True)],
    )
    return df_train_scaled, df_test_scaled, continuous_features, categorical_features

_helper()


In [None]:
dataset = 'Mushroom'

# source: https://www.kaggle.com/datasets/uciml/mushroom-classification
datapath = 'data_raw/Mushroom/mushrooms.csv'
df = pd.read_csv(datapath)

df = df.rename(columns={"class":"Target"})
# Explicit map + int cast: a string->int replace relies on implicit
# downcasting, which pandas 2.2 deprecates (the column would stay object
# dtype afterwards). Any label other than 'e'/'p' now fails loudly in astype.
df["Target"] = df["Target"].map({'e':0, 'p':1}).astype(int)

# Levels of each column that are merged into a single 'other' level.
_rare_levels = {
    "cap-shape": ['b', 'c', 's'],
    "cap-surface": ['f', 'g'],
    "cap-color": ['b', 'c', 'p', 'r', 'u'],
    "odor": ['c', 'm'],
    "gill-color": ['e', 'o', 'r', 'y'],
    "stalk-root": ['c', 'r'],
    "stalk-surface-above-ring": ['f', 'y'],
    "stalk-color-above-ring": ['c', 'e', 'o', 'y'],
    "stalk-color-below-ring": ['c', 'e', 'o', 'y'],
    "veil-color": ['n', 'o', 'y'],
    "ring-number": ['n', 't'],
    "ring-type": ['f', 'l', 'n'],
    "spore-print-color": ['b', 'o', 'r', 'u', 'y'],
}
for _column, _levels in _rare_levels.items():
    df[_column] = df[_column].replace(dict.fromkeys(_levels, 'other'))

df = df.drop(["veil-type"], axis=1)

# Features with more than two levels are one-hot encoded (first level
# dropped); the rest are label-encoded to integers.
to_dummy = []
to_le = []
for feature in list(OrderedSet(df.columns.to_list()) - OrderedSet(["Target"])):
    if len(np.unique(df[feature]))>2:
        to_dummy.append(feature)
    else:
        to_le.append(feature)
# dtype pinned to uint8: pandas 2.0 changed the get_dummies default from
# uint8 to bool, which would otherwise make the indicator columns boolean.
df = pd.get_dummies(df, prefix=to_dummy, columns=to_dummy, drop_first=True, dtype=np.uint8)
le = LabelEncoder()
df[to_le] = df[to_le].apply(lambda col: le.fit_transform(col))

# Remove rows that became identical after the level merges and encoding.
df = df.drop_duplicates(ignore_index=True)

continuous_features = [] ###
categorical_features = list(OrderedSet(df.columns.to_list()) - OrderedSet(["Target"]) - OrderedSet(continuous_features))

train_fraction = 0.75 ###

def get_data(seed):
    """Return scaled Mushroom train/test frames for one seed.

    The module-level ``df`` is split with ``shuffle_split_data`` using
    ``train_fraction`` and both splits are passed to ``scale_features``;
    no clipping is applied.

    Args:
        seed: forwarded to ``shuffle_split_data``.

    Returns:
        Tuple ``(df_train_scaled, df_test_scaled, continuous_features,
        categorical_features)``; the two lists are the module-level ones.
    """
    df_train, df_test = shuffle_split_data(df, train_fraction, seed=seed)

    # One MinMaxScaler per feature group (continuous, categorical).
    df_train_scaled, df_test_scaled = scale_features(
        df_train,
        df_test,
        [continuous_features, categorical_features],
        [MinMaxScaler(clip=True), MinMaxScaler(clip=True)],
    )
    return df_train_scaled, df_test_scaled, continuous_features, categorical_features

_helper()


In [None]:
dataset = 'SatIm'

# source: https://www.kaggle.com/datasets/markjibrilmononutu/statlog-landsat-satellite-data-set
datapath_tr = 'data_raw/SatIm/sat_train.csv'
datapath_te = 'data_raw/SatIm/sat_test.csv'
df_tr, df_te = (pd.read_csv(path) for path in (datapath_tr, datapath_te))

# get_data uses this row count to split the stacked frame back into the
# provided train and test parts.
num_train = len(df_tr)

# Stack train on top of test (original row labels are kept) and name the
# target column "Target".
df = pd.concat([df_tr, df_te], ignore_index=False, copy=False).rename(columns={"label":"Target"})

# make sure the label ranges from 0 to (num_class-1)
df["Target"] = LabelEncoder().fit_transform(df["Target"])

# Every non-target column is continuous, so the categorical list ends up
# as whatever remains after removing them.
continuous_features = list(OrderedSet(df.columns.to_list()) - OrderedSet(["Target"]))
categorical_features = list(OrderedSet(df.columns.to_list()) - OrderedSet(["Target"]) - OrderedSet(continuous_features))

def get_data(seed):
    """Return clipped and scaled SatIm train/test frames for one seed.

    The first ``num_train`` rows of the module-level ``df`` are the training
    part and the remaining rows the test part; each is passed through
    ``shuffle_data``. Each continuous feature is clipped to the 1st/99th
    percentiles of the training part, and both parts are passed to
    ``scale_features``.

    Args:
        seed: forwarded to ``shuffle_data``.

    Returns:
        Tuple ``(df_train_scaled, df_test_scaled, continuous_features,
        categorical_features)``; the two lists are the module-level ones.
    """
    df_train = shuffle_data(df.iloc[:num_train,:], seed=seed)
    df_test = shuffle_data(df.iloc[num_train:,:], seed=seed)

    for feature in continuous_features:
        # Both bounds are taken from the training part before anything is clipped.
        lower, upper = (np.percentile(df_train[feature], q) for q in (1, 99))
        for frame in (df_train, df_test):
            frame.loc[frame[feature]<lower, feature] = lower
            frame.loc[frame[feature]>upper, feature] = upper

    # One MinMaxScaler per feature group (continuous, categorical).
    df_train_scaled, df_test_scaled = scale_features(
        df_train,
        df_test,
        [continuous_features, categorical_features],
        [MinMaxScaler(clip=True), MinMaxScaler(clip=True)],
    )
    return df_train_scaled, df_test_scaled, continuous_features, categorical_features

_helper()


In [None]:
dataset = 'SenDrive'

# source: https://archive.ics.uci.edu/ml/datasets/dataset+for+sensorless+drive+diagnosis
datapath = 'data_raw/SenDrive/Sensorless_drive_diagnosis.txt'

# The file is read without a header row; each column label is prefixed
# with 'col' and 'col48' is then renamed to "Target".
df = (
    pd.read_csv(datapath, sep=" ", header=None)
    .rename(columns=lambda c: 'col'+str(c))
    .rename(columns={'col48':"Target"})
)

# make sure the label ranges from 0 to (num_class-1)
df["Target"] = LabelEncoder().fit_transform(df["Target"])

# Every non-target column is treated as continuous.
categorical_features = []
continuous_features = list(OrderedSet(df.columns.to_list()) - OrderedSet(["Target"]) - OrderedSet(categorical_features))

# Passed to shuffle_split_data in get_data as the training fraction.
train_fraction = 0.75

def get_data(seed):
    """Return clipped and scaled SenDrive train/test frames for one seed.

    The module-level ``df`` is split with ``shuffle_split_data`` using
    ``train_fraction``. Each continuous feature is clipped to the 2nd/98th
    percentiles of the training split, and both splits are passed to
    ``scale_features``.

    Args:
        seed: forwarded to ``shuffle_split_data``.

    Returns:
        Tuple ``(df_train_scaled, df_test_scaled, continuous_features,
        categorical_features)``; the two lists are the module-level ones.
    """
    df_train, df_test = shuffle_split_data(df, train_fraction, seed=seed)

    for feature in continuous_features:
        # Both bounds are taken from the training split before anything is clipped.
        lower, upper = (np.percentile(df_train[feature], q) for q in (2, 98))
        for frame in (df_train, df_test):
            frame.loc[frame[feature]<lower, feature] = lower
            frame.loc[frame[feature]>upper, feature] = upper

    # One MinMaxScaler per feature group (continuous, categorical).
    df_train_scaled, df_test_scaled = scale_features(
        df_train,
        df_test,
        [continuous_features, categorical_features],
        [MinMaxScaler(clip=True), MinMaxScaler(clip=True)],
    )
    return df_train_scaled, df_test_scaled, continuous_features, categorical_features

_helper()
