# Preprocessing for tabular datasets

## List of datasets

1. australian
2. banknote
3. breastcancer
4. cardiotocography
5. cmc
6. htru2
7. phoneme
8. ringnorm
9. texture
10. yeast

In [1]:
import os
from pathlib import Path
from glob import glob

import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype
from sklearn.preprocessing import MinMaxScaler


In [2]:
# Project root: the parent of the directory this notebook is executed from.
PATH_ROOT = Path.cwd().parent
print('ROOT:', PATH_ROOT)

ROOT: /home/lukec/workspace/baard_v4


In [3]:
# Directory holding the raw tabular files: <root>/data/tabular/raw.
PATH_RAW = PATH_ROOT / 'data' / 'tabular' / 'raw'
print('RAW:', PATH_RAW)

# Show every entry matched by "<raw>/*", one path per line.
files = glob(str(PATH_RAW / '*'))
print('\n'.join(files))

RAW: /home/lukec/workspace/baard_v4/data/tabular/raw
/home/lukec/workspace/baard_v4/data/tabular/raw/australian.doc
/home/lukec/workspace/baard_v4/data/tabular/raw/australian.dat
/home/lukec/workspace/baard_v4/data/tabular/raw/cmc.names
/home/lukec/workspace/baard_v4/data/tabular/raw/banknote.csv
/home/lukec/workspace/baard_v4/data/tabular/raw/yeast.dat
/home/lukec/workspace/baard_v4/data/tabular/raw/cmc.data
/home/lukec/workspace/baard_v4/data/tabular/raw/texture.dat
/home/lukec/workspace/baard_v4/data/tabular/raw/HTRU_2.csv
/home/lukec/workspace/baard_v4/data/tabular/raw/phoneme.csv
/home/lukec/workspace/baard_v4/data/tabular/raw/ringnorm.dat


In [4]:
def apply_minmax(df):
    """Return a min-max scaled copy of ``df`` with an integer ``Class`` column.

    Every column except ``"Class"`` is passed through a freshly fitted
    ``MinMaxScaler`` (default settings). The ``"Class"`` column is left
    unscaled and cast to ``int``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the feature columns and a ``"Class"`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df``. The input frame is not
        modified.

    Raises
    ------
    KeyError
        If ``df`` has no ``"Class"`` column.
    """
    # Explicit copy: ``pd.DataFrame(df)`` does not guarantee an independent
    # frame, so assigning columns on it could write through to the caller's
    # ``df``.
    df_preprocessing = df.copy()

    # All columns other than the label are treated as features.
    col_X = df.columns[~df.columns.isin(["Class"])]
    scaler = MinMaxScaler()
    df_preprocessing[col_X] = scaler.fit_transform(df[col_X])

    df_preprocessing["Class"] = df_preprocessing["Class"].astype(int)
    return df_preprocessing

In [5]:
# Australian
# Whitespace-delimited file without a header row: columns A1..A14 followed by
# the class label.
path_data = os.path.join(PATH_RAW, "australian.dat")
col_names = ["A{}".format(i) for i in range(1, 15)] + ["Class"]
print(col_names)
# Raw string literal: "\s" is not a valid string escape, so the non-raw form
# raises an invalid-escape warning on recent Python versions.
df = pd.read_csv(path_data, sep=r"\s+", names=col_names, header=None)
# Replace the labels with integer category codes.
df["Class"] = df["Class"].astype("category").cat.codes

# Min-max scale every column except `Class`, then preview the result.
df_preprocess = apply_minmax(df)
print(df_preprocess.head())

print('Shape:', df_preprocess.shape)
print('# classes:', len(df_preprocess['Class'].unique()))

['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11', 'A12', 'A13', 'A14', 'Class']
    A1        A2        A3   A4        A5     A6        A7   A8   A9  \
0  1.0  0.125263  0.409286  0.5  0.230769  0.375  0.055614  0.0  0.0   
1  0.0  0.134135  0.250000  0.5  0.538462  0.375  0.005789  0.0  0.0   
2  0.0  0.238045  0.062500  0.0  0.230769  0.375  0.043860  0.0  0.0   
3  0.0  0.119098  0.410714  0.0  0.307692  0.250  0.000000  1.0  1.0   
4  1.0  0.096541  0.291786  0.5  0.384615  0.375  0.068772  1.0  1.0   

        A10  A11  A12   A13      A14  Class  
0  0.000000  1.0  0.5  0.05  0.01212      0  
1  0.000000  0.0  0.5  0.08  0.00000      0  
2  0.000000  1.0  0.5  0.14  0.00000      0  
3  0.164179  1.0  0.5  0.00  0.00000      1  
4  0.208955  0.0  0.5  0.03  0.00158      1  
Shape: (690, 15)
# classes: 2


In [6]:
# Write the scaled Australian table to the parent of the raw directory,
# leaving out the DataFrame index.
df_preprocess.to_csv(PATH_RAW.parent / 'australian_preprocessed.csv', index=False)

In [7]:
# Banknote
# Read with pandas' default settings, so the file's first row provides the
# column names (a "Class" column is required below).
path_data = str(PATH_RAW / "banknote.csv")
df = pd.read_csv(path_data)
# Replace the labels with integer category codes.
df["Class"] = df["Class"].astype("category").cat.codes

# Min-max scale every column except `Class`, then preview the result.
df_preprocess = apply_minmax(df)
print(df_preprocess.head())

print('Shape:', df_preprocess.shape)
print('# classes:', df_preprocess['Class'].nunique())


   variance  skewness  curtosis   entropy  Class
0  0.769004  0.839643  0.106783  0.736628      0
1  0.835659  0.820982  0.121804  0.644326      0
2  0.786629  0.416648  0.310608  0.786951      0
3  0.757105  0.871699  0.054921  0.450440      0
4  0.531578  0.348662  0.424662  0.687362      0
Shape: (1372, 5)
# classes: 2


In [8]:
# Write the scaled Banknote table to the parent of the raw directory,
# leaving out the DataFrame index.
df_preprocess.to_csv(PATH_RAW.parent / 'banknote_preprocessed.csv', index=False)

In [9]:
# Breast Cancer
# Loaded through scikit-learn's dataset loader rather than from PATH_RAW.
from sklearn.datasets import load_breast_cancer

dataset = load_breast_cancer()
X, y = dataset.data, dataset.target
col_names = dataset.feature_names

# Assemble one table: the feature matrix plus the target as integer
# category codes in a "Class" column.
df = pd.DataFrame(data=X, columns=col_names)
df["Class"] = pd.Series(y, dtype="category").cat.codes

# Min-max scale every column except `Class`, then preview the result.
df_preprocess = apply_minmax(df)
print(df_preprocess.head())

print('Shape:', df_preprocess.shape)
print('# classes:', df_preprocess['Class'].nunique())


   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0     0.521037      0.022658        0.545989   0.363733         0.593753   
1     0.643144      0.272574        0.615783   0.501591         0.289880   
2     0.601496      0.390260        0.595743   0.449417         0.514309   
3     0.210090      0.360839        0.233501   0.102906         0.811321   
4     0.629893      0.156578        0.630986   0.489290         0.430351   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0          0.792037        0.703140             0.731113       0.686364   
1          0.181768        0.203608             0.348757       0.379798   
2          0.431017        0.462512             0.635686       0.509596   
3          0.811361        0.565604             0.522863       0.776263   
4          0.347893        0.463918             0.518390       0.378283   

   mean fractal dimension  ...  worst texture  worst perimeter  worst area  \
0             

In [10]:
# Write the scaled Breast Cancer table to the parent of the raw directory,
# leaving out the DataFrame index.
df_preprocess.to_csv(PATH_RAW.parent / 'breastcancer_preprocessed.csv', index=False)

In [11]:
# CMC
# The data file is read without a header row, so the nine feature names and
# the label name are supplied here.
path_data = str(PATH_RAW / "cmc.data")
col_names = [
    "W_age", "W_edu", "H_edu",
    "Children", "W_religion", "W_work",
    "H_occ", "SoL", "Media",
    "Class",
]
df = pd.read_csv(path_data, names=col_names, header=None, index_col=None)
# Replace the labels with integer category codes.
df["Class"] = df["Class"].astype("category").cat.codes

# Min-max scale every column except `Class`, then preview the result.
df_preprocess = apply_minmax(df)
print(df_preprocess.head())

print('Shape:', df_preprocess.shape)
print('# classes:', df_preprocess['Class'].nunique())


      W_age     W_edu     H_edu  Children  W_religion  W_work     H_occ  \
0  0.242424  0.333333  0.666667    0.1875         1.0     1.0  0.333333   
1  0.878788  0.000000  0.666667    0.6250         1.0     1.0  0.666667   
2  0.818182  0.333333  0.666667    0.4375         1.0     1.0  0.666667   
3  0.787879  0.666667  0.333333    0.5625         1.0     1.0  0.666667   
4  0.606061  0.666667  0.666667    0.5000         1.0     1.0  0.666667   

        SoL  Media  Class  
0  0.666667    0.0      0  
1  1.000000    0.0      0  
2  1.000000    0.0      0  
3  0.666667    0.0      0  
4  0.333333    0.0      0  
Shape: (1473, 10)
# classes: 3


In [12]:
# Write the scaled CMC table to the parent of the raw directory,
# leaving out the DataFrame index.
df_preprocess.to_csv(PATH_RAW.parent / 'cmc_preprocessed.csv', index=False)

In [13]:
# HTRU2
# Read without a header row: eight features named A1..A8 plus the label.
path_data = str(PATH_RAW / "HTRU_2.csv")
col_names = list(map("A{}".format, range(1, 9))) + ["Class"]
df = pd.read_csv(path_data, header=None, names=col_names, index_col=None)
# Replace the labels with integer category codes.
df["Class"] = df["Class"].astype("category").cat.codes

# Min-max scale every column except `Class`, then preview the result.
df_preprocess = apply_minmax(df)
print(df_preprocess.head())

print('Shape:', df_preprocess.shape)
print('# classes:', df_preprocess['Class'].nunique())


         A1        A2        A3        A4        A5        A6        A7  \
0  0.721342  0.417687  0.165043  0.015627  0.013382  0.113681  0.294986   
1  0.517628  0.460908  0.235415  0.018268  0.006560  0.072524  0.364015   
2  0.520346  0.196868  0.221138  0.040677  0.013030  0.139188  0.288624   
3  0.700933  0.437884  0.181750  0.016534  0.015368  0.131583  0.266348   
4  0.443854  0.214847  0.249044  0.041712  0.004327  0.039684  0.462029   

         A8  Class  
0  0.063890      0  
1  0.108443      0  
2  0.054610      0  
3  0.046581      0  
4  0.213369      0  
Shape: (17898, 9)
# classes: 2


In [14]:
# Write the scaled HTRU2 table to the parent of the raw directory,
# leaving out the DataFrame index.
df_preprocess.to_csv(PATH_RAW.parent / 'htru2_preprocessed.csv', index=False)

In [15]:
# Phoneme
# The file's first row provides the column names (a "Class" column is
# required below); no column is used as the index.
path_data = str(PATH_RAW / "phoneme.csv")
df = pd.read_csv(path_data, index_col=None)
# Replace the labels with integer category codes.
df["Class"] = df["Class"].astype("category").cat.codes

# Min-max scale every column except `Class`, then preview the result.
df_preprocess = apply_minmax(df)
print(df_preprocess.head())

print('Shape:', df_preprocess.shape)
print('# classes:', df_preprocess['Class'].nunique())


         V1        V2        V3        V4        V5  Class
0  0.506286  0.385946  0.322087  0.341025  0.337504      0
1  0.339008  0.469601  0.569034  0.283524  0.374984      0
2  0.562579  0.384703  0.621742  0.595047  0.460548      0
3  0.340904  0.406122  0.871641  0.191285  0.320799      0
4  0.345642  0.455681  0.891710  0.144044  0.204100      0
Shape: (5404, 6)
# classes: 2


In [16]:
# Write the scaled Phoneme table to the parent of the raw directory,
# leaving out the DataFrame index.
df_preprocess.to_csv(PATH_RAW.parent / 'phoneme_preprocessed.csv', index=False)

In [17]:
# Ringnorm
# Twenty features named A1..A20 plus the label.
path_data = str(PATH_RAW / "ringnorm.dat")
col_names = list(map("A{}".format, range(1, 21))) + ["Class"]
# The first 26 lines of the file are skipped before parsing
# (presumably a metadata preamble — confirm against the raw file).
df = pd.read_csv(path_data, names=col_names, header=None, index_col=None, skiprows=26)
# Replace the labels with integer category codes.
df["Class"] = df["Class"].astype("category").cat.codes

# Min-max scale every column except `Class`, then preview the result.
df_preprocess = apply_minmax(df)
print(df_preprocess.head())

print('Shape:', df_preprocess.shape)
print('# classes:', df_preprocess['Class'].nunique())


         A1        A2        A3        A4        A5        A6        A7  \
0  0.594500  0.546437  0.554969  0.448461  0.528930  0.603550  0.606924   
1  0.580219  0.485493  0.562529  0.354530  0.677158  0.589013  0.617944   
2  0.562899  0.605533  0.537700  0.570049  0.612221  0.507123  0.465946   
3  0.412944  0.583559  0.638970  0.496080  0.465763  0.449996  0.741931   
4  0.502963  0.173873  0.470186  0.950639  0.391096  0.466725  0.548770   

         A8        A9       A10  ...       A12       A13       A14       A15  \
0  0.515220  0.599005  0.637501  ...  0.494361  0.618935  0.548429  0.343211   
1  0.530100  0.456342  0.506684  ...  0.529184  0.569968  0.539333  0.636673   
2  0.691820  0.563493  0.580456  ...  0.596222  0.707460  0.547003  0.448416   
3  0.492182  0.555858  0.492601  ...  0.817003  0.517001  0.324306  0.499932   
4  0.940705  0.485447  0.646579  ...  0.612858  0.382991  0.511301  0.299103   

        A16       A17       A18       A19       A20  Class  
0  0.57

In [18]:
# Write the scaled Ringnorm table to the parent of the raw directory,
# leaving out the DataFrame index.
df_preprocess.to_csv(PATH_RAW.parent / 'ringnorm_preprocessed.csv', index=False)

In [19]:
# Texture
# Forty features named A1..A40 plus the label.
path_data = str(PATH_RAW / "texture.dat")
col_names = list(map("A{}".format, range(1, 41))) + ["Class"]
# The first 45 lines of the file are skipped before parsing
# (presumably a metadata preamble — confirm against the raw file).
df = pd.read_csv(path_data, names=col_names, header=None, index_col=None, skiprows=45)
# Replace the labels with integer category codes.
df["Class"] = df["Class"].astype("category").cat.codes

# Min-max scale every column except `Class`, then preview the result.
df_preprocess = apply_minmax(df)
print(df_preprocess.head())

print('Shape:', df_preprocess.shape)
print('# classes:', df_preprocess['Class'].nunique())


         A1        A2        A3        A4        A5        A6        A7  \
0  0.102068  0.262745  0.267836  0.277385  0.340483  0.406186  0.238315   
1  0.017986  0.111765  0.179565  0.126620  0.195040  0.224742  0.095211   
2  0.154227  0.360131  0.411729  0.363958  0.459786  0.484536  0.303520   
3  0.080935  0.225490  0.212817  0.237338  0.290885  0.367010  0.193883   
4  0.053507  0.220915  0.331923  0.247939  0.371984  0.441237  0.202539   

         A8        A9       A10  ...       A32       A33       A34       A35  \
0  0.305142  0.341365  0.241785  ...  0.236812  0.324918  0.284819  0.346863   
1  0.154939  0.188755  0.137012  ...  0.149083  0.206915  0.192201  0.259779   
2  0.407307  0.443106  0.357719  ...  0.279243  0.385291  0.372563  0.475277   
3  0.253721  0.292503  0.167390  ...  0.200688  0.234907  0.243036  0.293727   
4  0.324763  0.375502  0.296962  ...  0.182339  0.316136  0.236072  0.346863   

        A36       A37       A38       A39       A40  Class  
0  0.37

In [20]:
# Write the scaled Texture table to the parent of the raw directory,
# leaving out the DataFrame index.
df_preprocess.to_csv(PATH_RAW.parent / 'texture_preprocessed.csv', index=False)

In [21]:
# Yeast
# Eight features plus the label; names are supplied here because the file is
# read without a header row.
path_data = str(PATH_RAW / "yeast.dat")
col_names = ["Mcg", "Gvh", "Alm", "Mit", "Erl", "Pox", "Vac", "Nuc", "Class"]
output_names = ["MIT", "NUC", "CYT", "ME1", "ME2", "ME3", "EXC", "VAC", "POX", "ERL"]
# The first 13 lines of the file are skipped before parsing
# (presumably a metadata preamble — confirm against the raw file).
df = pd.read_csv(path_data, names=col_names, header=None, index_col=None, skiprows=13)
# NOTE(review): `cattype` is built but never applied. The cast below uses the
# generic "category" dtype, so the order of `output_names` has no effect on the
# integer codes. Confirm which label encoding is intended before changing it.
cattype = CategoricalDtype(categories=output_names, ordered=False)
df["Class"] = df["Class"].astype("category").cat.codes

# Min-max scale every column except `Class`, then preview the result.
df_preprocess = apply_minmax(df)
print(df_preprocess.head())

print('Shape:', df_preprocess.shape)
print('# classes:', df_preprocess['Class'].nunique())


        Mcg       Gvh       Alm   Mit  Erl  Pox       Vac   Nuc  Class
0  0.528090  0.551724  0.329114  0.13  0.0  0.0  0.657534  0.22      6
1  0.359551  0.620690  0.341772  0.27  0.0  0.0  0.726027  0.22      6
2  0.595506  0.563218  0.354430  0.15  0.0  0.0  0.726027  0.22      6
3  0.528090  0.356322  0.455696  0.13  0.0  0.0  0.739726  0.22      7
4  0.348315  0.356322  0.341772  0.54  0.0  0.0  0.657534  0.22      6
Shape: (1484, 9)
# classes: 10


In [22]:
# Write the scaled Yeast table to the parent of the raw directory,
# leaving out the DataFrame index.
df_preprocess.to_csv(PATH_RAW.parent / 'yeast_preprocessed.csv', index=False)