# Preprocessing for real data sets

## List of data sets:

1. abalone
1. australian
1. banknote
1. breastcancer
1. cardiotocography
1. cmc
1. htru2
1. phoneme
1. ringnorm
1. texture
1. yeast

In [None]:
import os
from pathlib import Path

import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
from sklearn.preprocessing import StandardScaler

from label_flip_revised.utils import create_dir

In [None]:
# Raw inputs are read from <two levels above the working directory>/data/raw.
PATH_DATA = os.path.join(Path(os.getcwd()).parent.parent, "data", "raw")
abspath = Path(PATH_DATA).absolute()
print(abspath)

# Standardized outputs are written to the sibling folder data/standard.
path_output_std = os.path.join(Path(os.getcwd()).parent.parent, "data", "standard")
print(path_output_std)
# NOTE(review): create_dir is a project helper; presumably it creates the
# folder when it is missing -- confirm.
create_dir(path_output_std)

## 1. Abalone

Read from `./data/raw/abalone.dat`

In [None]:
path_data = os.path.join(abspath, "abalone.dat")
# The file is parsed without a header row, so the column names are given here.
cols = [
    "Sex",
    "Length",
    "Diameter",
    "Height",
    "Whole_weight",
    "Shucked_weight",
    "Viscera_weight",
    "Shell_weight",
    "Rings",
]
# skiprows=13 drops the first 13 lines of the file
# (presumably a KEEL-style metadata header -- TODO confirm against the raw file).
df = pd.read_csv(path_data, skiprows=13, names=cols, index_col=None, header=None)
print(df.shape)

In [None]:
# Number of distinct ring counts, followed by how many rows fall on each
# side of the "Rings >= 10" threshold.
print(df["Rings"].nunique())

print(df["Rings"].ge(10).sum())
print(df["Rings"].lt(10).sum())

In [None]:
# Binary target: 1 when the abalone has 10 or more rings, 0 otherwise.
# The comparison is mapped straight to 0/1, so the label no longer depends on
# which values happen to be present (a category round-trip would encode a
# column containing only 1s as code 0). int8 matches the dtype of `.cat.codes`.
df["Class"] = (df["Rings"] >= 10).astype("int8")

# `Rings` is now encoded in `Class`; `Sex` is not used as a feature.
df = df.drop(["Rings", "Sex"], axis=1)

print(df.shape)
df.head()

In [None]:
# Bar chart of the number of rows per class label.
df["Class"].value_counts().plot(kind="bar")

In [None]:
# Balance the data set: draw 1000 rows per class without replacement.
# A fixed random_state makes the subset identical on every run; without it
# each execution of the notebook produced a different subset.
df0 = df[df["Class"] == 0].sample(1000, replace=False, random_state=0)
df1 = df[df["Class"] == 1].sample(1000, replace=False, random_state=0)
df = pd.concat([df0, df1])

df["Class"].value_counts().plot(kind="bar")

In [None]:
# Save the (unscaled) subset to <parent of the raw folder>/preprocessed.
path_output = os.path.join(Path(abspath).parent, "preprocessed")
# NOTE(review): create_dir is a project helper; presumably it creates the
# folder when it is missing -- confirm.
create_dir(path_output)
path_output = os.path.join(path_output, "abalone_subset.csv")
print(f"Save to: {path_output}")
# index=False: the row index is not written to the file.
df.to_csv(path_output, index=False)

In [None]:
# Preview the first rows of `df`.
df.head()

In [None]:
def preprocessing(df):
    """Return a standardized copy of ``df``.

    Columns that hold a single unique value are dropped (and reported with
    ``print``). Every remaining column except ``"Class"`` is fitted and
    transformed with ``StandardScaler``, rounded to 6 decimals and stored as
    ``float32``. The ``"Class"`` column is cast to ``int``.

    The input frame is left untouched: all assignments happen on an explicit
    copy. ``pd.DataFrame(df)`` does not copy the data, so the previous code
    could write the scaled values into the caller's frame and triggered
    ``SettingWithCopyWarning`` when ``df`` was a filtered slice.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``"Class"`` column; all other columns are passed to
        ``StandardScaler`` (assumed numeric -- not checked here).

    Returns
    -------
    pandas.DataFrame
        New frame with standardized features and an integer ``"Class"``.
    """
    # Constant columns carry no information for a classifier.
    col_redundant = df.columns[df.nunique() == 1]
    if len(col_redundant) > 0:
        print("Remove:", col_redundant)
        df = df.drop(col_redundant, axis=1)

    # Independent copy so nothing below can alias the caller's data.
    df_preprocessing = df.copy()
    col_X = df_preprocessing.columns[~df_preprocessing.columns.isin(["Class"])]

    scaler = StandardScaler()
    df_preprocessing[col_X] = (
        scaler.fit_transform(df_preprocessing[col_X]).round(6).astype(np.float32)
    )
    df_preprocessing["Class"] = df_preprocessing["Class"].astype(int)
    return df_preprocessing

In [None]:
# Standardize the features; the cell output previews the first rows.
df_preprocess = preprocessing(df)
df_preprocess.head()

In [None]:
# Write the standardized table to the data/standard folder (no index column).
df_preprocess.to_csv(
    os.path.join(path_output_std, "abalone_subset_std.csv"), index=False
)

## 2. Australian

Read from `./data/raw/australian.dat`

In [None]:
path_data = os.path.join(abspath, "australian.dat")
# Fourteen unnamed attributes (A1..A14) followed by the label.
col_names = [f"A{i}" for i in range(1, 15)] + ["Class"]
print(col_names)
# The file is whitespace-delimited. The separator is a regular expression, so
# it is written as a raw string: "\s" is not a valid Python escape sequence
# and is reported as an invalid escape by recent interpreters.
df = pd.read_csv(path_data, sep=r"\s+", names=col_names, header=None)
# Replace the raw labels by integer category codes.
df["Class"] = df["Class"].astype("category").cat.codes
print(df.shape)
df.head()

In [None]:
# Bar chart of the number of rows per class label.
df["Class"].value_counts().plot(kind="bar")

In [None]:
# Save the label-encoded (unscaled) table to <parent of the raw folder>/preprocessed.
path_output = os.path.join(Path(abspath).parent, "preprocessed", "australian.csv")
print(f"Save to: {path_output}")
df.to_csv(path_output, index=False)

In [None]:
# Standardize the features; the cell output previews the first rows.
df_preprocess = preprocessing(df)
df_preprocess.head()

In [None]:
# Write the standardized table to the data/standard folder (no index column).
df_preprocess.to_csv(os.path.join(path_output_std, "australian_std.csv"), index=False)

## 3. Banknote

Read from `./data/raw/banknote.csv`

In [None]:
path_data = os.path.join(abspath, "banknote.csv")
# Read with pandas' default header handling; the file must provide a
# "Class" column, which is used below.
df = pd.read_csv(path_data)
# Replace the raw labels by integer category codes.
df["Class"] = df["Class"].astype("category").cat.codes
print(df.shape)
df.head()

In [None]:
# Bar chart of the number of rows per class label.
df["Class"].value_counts().plot(kind="bar")

In [None]:
# Save the label-encoded (unscaled) table to <parent of the raw folder>/preprocessed.
path_output = os.path.join(Path(abspath).parent, "preprocessed", "banknote.csv")
print(f"Save to: {path_output}")
df.to_csv(path_output, index=False)

In [None]:
# Standardize the features; the cell output previews the first rows.
df_preprocess = preprocessing(df)
df_preprocess.head()

In [None]:
# Write the standardized table to the data/standard folder (no index column).
df_preprocess.to_csv(os.path.join(path_output_std, "banknote_std.csv"), index=False)

## 4. Breast Cancer

Read data from `sklearn.datasets`

In [None]:
from sklearn.datasets import load_breast_cancer

# Take features, labels and feature names from the data set bundled with
# scikit-learn instead of reading a file.
dataset = load_breast_cancer()
X, y, col_names = dataset.data, dataset.target, dataset.feature_names

df = pd.DataFrame(data=X, columns=col_names)
# Label-encode the target as integer category codes.
df["Class"] = pd.Series(y).astype("category").cat.codes

print(df.shape)
df.head()

In [None]:
# Bar chart of the number of rows per class label.
df["Class"].value_counts().plot(kind="bar")

In [None]:
# Save the label-encoded (unscaled) table to <parent of the raw folder>/preprocessed.
path_output = os.path.join(Path(abspath).parent, "preprocessed", "breastcancer.csv")
print(f"Save to: {path_output}")
df.to_csv(path_output, index=False)

In [None]:
# Standardize the features; the cell output previews the first rows.
df_preprocess = preprocessing(df)
df_preprocess.head()

In [None]:
# Write the standardized table to the data/standard folder (no index column).
df_preprocess.to_csv(os.path.join(path_output_std, "breastcancer_std.csv"), index=False)

## 6. CMC

Read from: `./data/raw/cmc.data`

In [None]:
path_data = os.path.join(abspath, "cmc.data")
# The file is parsed without a header row, so the column names are given here.
col_names = [
    "W_age",
    "W_edu",
    "H_edu",
    "Children",
    "W_religion",
    "W_work",
    "H_occ",
    "SoL",
    "Media",
    "Class",
]
df = pd.read_csv(path_data, index_col=None, header=None, names=col_names)
# Replace the raw labels by integer category codes (codes start at 0).
df["Class"] = df["Class"].astype("category").cat.codes
print(df.shape)
df.head()

In [None]:
# Bar chart of the number of rows per class label (before merging classes).
df["Class"].value_counts().plot(kind="bar")

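The class attribute is the **contraceptive method used** (1 = No-use, 2 = Long-term, 3 = Short-term). After the label encoding above, these raw values correspond to the category codes 0, 1 and 2.
We merge 2 and 3 (Long-term and Short-term) into a single class.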

In [None]:
# `Class` already holds category codes here (raw 1/2/3 -> codes 0/1/2), so
# "No-use" is code 0. Keep it as class 0 and merge the two usage classes
# (Long-term, Short-term) into class 1. The previous test `x == 1` compared
# against the code of "Long-term" and therefore merged the wrong classes.
df["Class"] = df["Class"].ne(0).astype(int)

In [None]:
# Bar chart of the number of rows per class label (after merging classes).
df["Class"].value_counts().plot(kind="bar")

In [None]:
# Save the binary-labelled (unscaled) table to <parent of the raw folder>/preprocessed.
path_output = os.path.join(Path(abspath).parent, "preprocessed", "cmc.csv")
print(f"Save to: {path_output}")
df.to_csv(path_output, index=False)

In [None]:
# Standardize the features; the cell output previews the first rows.
df_preprocess = preprocessing(df)
df_preprocess.head()

In [None]:
# Write the standardized table to the data/standard folder (no index column).
df_preprocess.to_csv(os.path.join(path_output_std, "cmc_std.csv"), index=False)

## 7. HTRU2

Read from: `./data/raw/HTRU_2.csv`

In [None]:
path_data = os.path.join(abspath, "HTRU_2.csv")
# Eight unnamed feature columns (A1..A8) followed by the label column.
col_names = [f"A{i}" for i in range(1, 9)]
col_names.append("Class")
df = pd.read_csv(path_data, header=None, names=col_names, index_col=None)
# Replace the raw labels by integer category codes.
df["Class"] = df["Class"].astype("category").cat.codes

print(df.shape)
df.head()

In [None]:
# Bar chart of the number of rows per class label.
df["Class"].value_counts().plot(kind="bar")

In [None]:
# Balance the data set: draw 1000 rows per class without replacement.
# A fixed random_state makes the subset identical on every run; without it
# each execution of the notebook produced a different subset.
df0 = df[df["Class"] == 0].sample(1000, replace=False, random_state=0)
df1 = df[df["Class"] == 1].sample(1000, replace=False, random_state=0)
df = pd.concat([df0, df1])

df["Class"].value_counts().plot(kind="bar")

In [None]:
# Save the (unscaled) subset to <parent of the raw folder>/preprocessed.
path_output = os.path.join(Path(abspath).parent, "preprocessed", "htru2_subset.csv")
print(f"Save to: {path_output}")
df.to_csv(path_output, index=False)

In [None]:
# Standardize the features; the cell output previews the first rows.
df_preprocess = preprocessing(df)
df_preprocess.head()

In [None]:
# Write the standardized table to the data/standard folder (no index column).
df_preprocess.to_csv(os.path.join(path_output_std, "htru2_subset_std.csv"), index=False)

## 8. Phoneme

Read from: `./data/raw/phoneme.csv`

In [None]:
path_data = os.path.join(abspath, "phoneme.csv")
# Read with pandas' default header handling; the file must provide a
# "Class" column, which is used below.
df = pd.read_csv(path_data, index_col=None)
# Replace the raw labels by integer category codes.
df["Class"] = df["Class"].astype("category").cat.codes

print(df.shape)
df.head()

In [None]:
# Bar chart of the number of rows per class label.
df["Class"].value_counts().plot(kind="bar")

In [None]:
# Balance the data set: draw 1000 rows per class without replacement.
# A fixed random_state makes the subset identical on every run; without it
# each execution of the notebook produced a different subset.
df0 = df[df["Class"] == 0].sample(1000, replace=False, random_state=0)
df1 = df[df["Class"] == 1].sample(1000, replace=False, random_state=0)
df = pd.concat([df0, df1])

df["Class"].value_counts().plot(kind="bar")

In [None]:
# Save the (unscaled) subset to <parent of the raw folder>/preprocessed.
path_output = os.path.join(Path(abspath).parent, "preprocessed", "phoneme_subset.csv")
print(f"Save to: {path_output}")
df.to_csv(path_output, index=False)

In [None]:
# Standardize the features; the cell output previews the first rows.
df_preprocess = preprocessing(df)
df_preprocess.head()

In [None]:
# Write the standardized table to the data/standard folder (no index column).
df_preprocess.to_csv(
    os.path.join(path_output_std, "phoneme_subset_std.csv"), index=False
)

## 9. Ringnorm

Read from: `./data/raw/ringnorm.dat`

In [None]:
path_data = os.path.join(abspath, "ringnorm.dat")
# Twenty unnamed feature columns (A1..A20) followed by the label column.
col_names = [f"A{i}" for i in range(1, 21)]
col_names.append("Class")
# The first 26 lines of the file are skipped before parsing.
df = pd.read_csv(path_data, header=None, names=col_names, index_col=None, skiprows=26)
# Replace the raw labels by integer category codes.
df["Class"] = df["Class"].astype("category").cat.codes

print(df.shape)
df.head()

In [None]:
# Bar chart of the number of rows per class label.
df["Class"].value_counts().plot(kind="bar")

In [None]:
# Balance the data set: draw 1000 rows per class without replacement.
# A fixed random_state makes the subset identical on every run; without it
# each execution of the notebook produced a different subset.
df0 = df[df["Class"] == 0].sample(1000, replace=False, random_state=0)
df1 = df[df["Class"] == 1].sample(1000, replace=False, random_state=0)
df = pd.concat([df0, df1])

df["Class"].value_counts().plot(kind="bar")

In [None]:
# Save the (unscaled) subset to <parent of the raw folder>/preprocessed.
path_output = os.path.join(Path(abspath).parent, "preprocessed", "ringnorm_subset.csv")
print(f"Save to: {path_output}")
df.to_csv(path_output, index=False)

In [None]:
# Standardize the features; the cell output previews the first rows.
df_preprocess = preprocessing(df)
df_preprocess.head()

In [None]:
# Write the standardized table to the data/standard folder (no index column).
df_preprocess.to_csv(
    os.path.join(path_output_std, "ringnorm_subset_std.csv"), index=False
)

## 10. Texture

Read from: `./data/raw/texture.dat`

In [None]:
path_data = os.path.join(abspath, "texture.dat")
# Forty unnamed feature columns (A1..A40) followed by the label column.
col_names = [f"A{i}" for i in range(1, 41)]
col_names.append("Class")
# The first 45 lines of the file are skipped before parsing.
df = pd.read_csv(path_data, header=None, names=col_names, index_col=None, skiprows=45)
# Replace the raw labels by integer category codes.
df["Class"] = df["Class"].astype("category").cat.codes

print(df.shape)
df.head()

In [None]:
# Bar chart of the number of rows per class label.
df["Class"].value_counts().plot(kind="bar")

To make it a binary classification problem, select only classes 3 and 9 (these are the category codes assigned above).

In [None]:
# Keep only the rows whose encoded label is 3 or 9. `.copy()` turns the
# filtered result into an independent frame, so the column assignment below
# does not operate on a slice of the full table (SettingWithCopyWarning).
df = df[df["Class"].isin([3, 9])].copy()

# Remap classes: 3 -> 0, 9 -> 1
cattype = CategoricalDtype(categories=[3, 9], ordered=True)
df["Class"] = df["Class"].astype(cattype).cat.codes

print(df.shape)
df["Class"].value_counts().plot(kind="bar")

In [None]:
# Save the two-class (unscaled) table to <parent of the raw folder>/preprocessed.
path_output = os.path.join(Path(abspath).parent, "preprocessed", "texture.csv")
print(f"Save to: {path_output}")
df.to_csv(path_output, index=False)

In [None]:
# Standardize the features; the cell output previews the first rows.
df_preprocess = preprocessing(df)
df_preprocess.head()

In [None]:
# Write the standardized table to the data/standard folder (no index column).
df_preprocess.to_csv(
    os.path.join(path_output_std, "texture_subset_std.csv"), index=False
)

## 11. Yeast

Read from: `./data/raw/yeast.dat`

In [None]:
path_data = os.path.join(abspath, "yeast.dat")
# The file is parsed without a header row, so the column names are given here.
col_names = [
    "Mcg",
    "Gvh",
    "Alm",
    "Mit",
    "Erl",
    "Pox",
    "Vac",
    "Nuc",
    "Class",
]
# Presumably the class labels that occur in the file -- verify against the raw data.
output_names = [
    "MIT",
    "NUC",
    "CYT",
    "ME1",
    "ME2",
    "ME3",
    "EXC",
    "VAC",
    "POX",
    "ERL",
]
# skiprows=13 drops the first 13 lines of the file
# (presumably a KEEL-style metadata header -- TODO confirm against the raw file).
df = pd.read_csv(path_data, skiprows=13, names=col_names, header=None, index_col=None)
# NOTE(review): `cattype` (built from `output_names`) is not applied below.
# The labels are encoded with the plain "category" dtype, whose categories
# pandas infers from the data, so the integer codes do not follow the order
# of `output_names`.
cattype = CategoricalDtype(categories=output_names, ordered=False)
df["Class"] = df["Class"].astype("category").cat.codes

print(df.shape)
df.head()

In [None]:
# Bar chart of the number of rows per class label.
df["Class"].value_counts().plot(kind="bar")

In [None]:
# Only select the major classes (category codes 0 and 7). `.copy()` turns the
# filtered result into an independent frame, so the column assignment below
# does not operate on a slice of the full table (SettingWithCopyWarning).
df = df[df["Class"].isin([0, 7])].copy()

# Remap classes: 0 -> 0, 7 -> 1
cattype = CategoricalDtype(categories=[0, 7], ordered=True)
df["Class"] = df["Class"].astype(cattype).cat.codes

print(df.shape)
df["Class"].value_counts().plot(kind="bar")

In [None]:
# Save the two-class (unscaled) table to <parent of the raw folder>/preprocessed.
path_output = os.path.join(Path(abspath).parent, "preprocessed", "yeast.csv")
print(f"Save to: {path_output}")
df.to_csv(path_output, index=False)

In [None]:
# Standardize the features; the cell output previews the first rows.
df_preprocess = preprocessing(df)
df_preprocess.head()

In [None]:
# Write the standardized table to the data/standard folder (no index column).
df_preprocess.to_csv(os.path.join(path_output_std, "yeast_subset_std.csv"), index=False)