In [1]:
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np

## Yeast Dataset
https://sci2s.ugr.es/keel/dataset.php?cod=112

In [2]:
# Column layout used for the yeast file: eight numeric attributes followed by
# the class label.
COLUMN_NAMES = ['Mcg', 'Gvh', 'Alm', 'Mit', 'Erl', 'Pox', 'Vac', 'Nuc', 'Class']
yeast = pd.read_csv(
    './data/yeast.dat',
    sep=r',\s',        # raw string: '\s' is a regex class, not a Python string escape
    engine='python',   # regex separators are only supported by the python parser;
                       # naming it explicitly avoids the fallback ParserWarning
    names=COLUMN_NAMES,
    header=None,
    skiprows=13,       # skip the 13 leading lines (presumably the KEEL metadata header - confirm against the file)
)
yeast.head()

  


Unnamed: 0,Mcg,Gvh,Alm,Mit,Erl,Pox,Vac,Nuc,Class
0,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22,MIT
1,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22,MIT
2,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22,MIT
3,0.58,0.44,0.57,0.13,0.5,0.0,0.54,0.22,NUC
4,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22,MIT


In [3]:
# Fixed label order: a label's position in this list becomes its integer class id.
CLASSES = ['MIT', 'NUC', 'CYT', 'ME1', 'EXC', 'ME2', 'ME3', 'VAC', 'POX', 'ERL']
yeast_types = CategoricalDtype(categories=CLASSES)

yeast_labels = yeast['Class'].astype(yeast_types)
# Values that are not listed in CLASSES turn into NaN (category code -1).
# This also happens when the cell is run a second time on the already-encoded
# column, so fail loudly instead of silently writing -1 labels.
unknown_labels = yeast.loc[yeast_labels.isna(), 'Class'].unique()
if len(unknown_labels):
    raise ValueError(
        "Unexpected values in yeast['Class'] (column already encoded?): "
        "{}".format(list(unknown_labels))
    )

# 'int64' is spelled out because 'long' follows the platform's C long width.
yeast['Class'] = yeast_labels.cat.codes.astype('int64')
print(yeast.shape)
yeast.head()

(1484, 9)


Unnamed: 0,Mcg,Gvh,Alm,Mit,Erl,Pox,Vac,Nuc,Class
0,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22,0
1,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22,0
2,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22,0
3,0.58,0.44,0.57,0.13,0.5,0.0,0.54,0.22,1
4,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22,0


In [4]:
# Write the encoded yeast table to disk; index=False leaves the row index out of the CSV.
yeast.to_csv(
    path_or_buf='./data/yeast_preprocessed.csv',
    index=False,
)

## Abalone Dataset
https://sci2s.ugr.es/keel/dataset.php?cod=52

In [5]:
# Column layout used for the abalone file. The first column (F/M/I values) is
# the one this notebook treats as the class label.
COLUMN_NAMES = ['Class', 'Length', 'Diameter', 'Height', 'Whole_weight', 'Shucked_weight', 'Viscera_weight', 'Shell_weight', 'Rings']
abalone = pd.read_csv(
    './data/abalone.dat',
    sep=r',\s',        # fields are separated by a comma followed by one whitespace character
    engine='python',   # regex separators are only supported by the python parser;
                       # naming it explicitly avoids the fallback ParserWarning
    names=COLUMN_NAMES,
    header=None,
    skiprows=13,       # skip the 13 leading lines (presumably the KEEL metadata header - confirm against the file)
)
print(abalone.shape)
abalone.head()

(4174, 9)


  


Unnamed: 0,Class,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [6]:
# Fixed label order: F -> 0, M -> 1, I -> 2.
SEX = ['F', 'M', 'I']
sex_types = CategoricalDtype(categories=SEX)

sex_labels = abalone['Class'].astype(sex_types)
# Values that are not listed in SEX turn into NaN (category code -1).
# This also happens when the cell is run a second time on the already-encoded
# column, so fail loudly instead of silently writing -1 labels.
unknown_labels = abalone.loc[sex_labels.isna(), 'Class'].unique()
if len(unknown_labels):
    raise ValueError(
        "Unexpected values in abalone['Class'] (column already encoded?): "
        "{}".format(list(unknown_labels))
    )
abalone['Class'] = sex_labels.cat.codes

# Move the label to the last position, matching the other datasets in this notebook.
# Plain column selection is used instead of reindex(): a misspelled name raises
# KeyError rather than silently producing an all-NaN column.
CLASS_AT_END = ['Length', 'Diameter', 'Height', 'Whole_weight', 'Shucked_weight', 'Viscera_weight', 'Shell_weight', 'Rings', 'Class']
abalone = abalone[CLASS_AT_END]
abalone.head()

Unnamed: 0,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings,Class
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,1
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,1
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,0
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,1
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,2


In [7]:
# Write the encoded, reordered abalone table to disk without the row index.
abalone.to_csv(
    path_or_buf='./data/abalone_preprocessed.csv',
    index=False,
)

## Banknote
https://archive.ics.uci.edu/ml/datasets/banknote+authentication

In [8]:
# The banknote file is read without a header row, so the column names are
# supplied here: four numeric features followed by the class label.
COLUMN_NAMES = [
    'variance',
    'skewness',
    'curtosis',
    'entropy',
    'Class',
]
banknote = pd.read_csv(
    './data/data_banknote_authentication.txt',
    names=COLUMN_NAMES,
    header=None,
    sep=',',
)
print(banknote.shape)
banknote.head()

(1372, 5)


Unnamed: 0,variance,skewness,curtosis,entropy,Class
0,3.6216,8.6661,-2.8073,-0.44699,0
1,4.5459,8.1674,-2.4586,-1.4621,0
2,3.866,-2.6383,1.9242,0.10645,0
3,3.4566,9.5228,-4.0112,-3.5944,0
4,0.32924,-4.4552,4.5718,-0.9888,0


In [9]:
# No transformation is applied to this dataset; it is only re-saved with a header row.
banknote.to_csv(
    path_or_buf='./data/banknote_preprocessed.csv',
    index=False,
)

## Texture
https://sci2s.ugr.es/keel/dataset.php?cod=72

In [10]:
# Forty feature columns named A1 through A40, followed by the class label.
COLUMN_NAMES = [*('A{}'.format(i) for i in range(1, 41)), 'Class']

texture = pd.read_csv(
    './data/texture.dat',
    names=COLUMN_NAMES,
    header=None,
    sep=',',
    skiprows=45,  # skip the 45 leading lines before the data rows
)
print(texture.shape)
texture.head()

(5500, 41)


Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,A32,A33,A34,A35,A36,A37,A38,A39,A40,Class
0,-1.223,-0.798,-0.867,-0.639,-0.545,-0.412,-0.795,-0.629,-0.547,-0.868,...,-0.766,-0.555,-0.714,-0.545,-0.587,-0.871,-0.62,-0.568,-0.607,2
1,-1.41,-1.029,-1.013,-0.895,-0.762,-0.676,-1.043,-0.851,-0.775,-1.037,...,-0.919,-0.77,-0.847,-0.663,-0.723,-1.013,-0.748,-0.698,-0.817,2
2,-1.107,-0.649,-0.629,-0.492,-0.367,-0.298,-0.682,-0.478,-0.395,-0.681,...,-0.692,-0.445,-0.588,-0.371,-0.368,-0.746,-0.457,-0.379,-0.469,2
3,-1.27,-0.855,-0.958,-0.707,-0.619,-0.469,-0.872,-0.705,-0.62,-0.988,...,-0.829,-0.719,-0.774,-0.617,-0.688,-0.937,-0.693,-0.657,-0.779,2
4,-1.331,-0.862,-0.761,-0.689,-0.498,-0.361,-0.857,-0.6,-0.496,-0.779,...,-0.861,-0.571,-0.784,-0.545,-0.562,-0.952,-0.642,-0.578,-0.648,2


In [11]:
# Original (non-contiguous) class ids; a value's position in this list becomes
# its new zero-based class id.
CLASSES = [2, 3, 4, 9, 10, 7, 6, 8, 12, 13, 14]
class_type = CategoricalDtype(categories=CLASSES)

texture_labels = texture['Class'].astype(class_type)
# Values that are not listed in CLASSES turn into NaN (category code -1).
# Re-running this cell on the already-encoded column would remap some codes
# and turn the rest into -1, so fail loudly instead of corrupting the labels.
unknown_labels = texture.loc[texture_labels.isna(), 'Class'].unique()
if len(unknown_labels):
    raise ValueError(
        "Unexpected values in texture['Class'] (column already encoded?): "
        "{}".format(list(unknown_labels))
    )
texture['Class'] = texture_labels.cat.codes
print(texture.shape)
texture.head()

(5500, 41)


Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,A32,A33,A34,A35,A36,A37,A38,A39,A40,Class
0,-1.223,-0.798,-0.867,-0.639,-0.545,-0.412,-0.795,-0.629,-0.547,-0.868,...,-0.766,-0.555,-0.714,-0.545,-0.587,-0.871,-0.62,-0.568,-0.607,0
1,-1.41,-1.029,-1.013,-0.895,-0.762,-0.676,-1.043,-0.851,-0.775,-1.037,...,-0.919,-0.77,-0.847,-0.663,-0.723,-1.013,-0.748,-0.698,-0.817,0
2,-1.107,-0.649,-0.629,-0.492,-0.367,-0.298,-0.682,-0.478,-0.395,-0.681,...,-0.692,-0.445,-0.588,-0.371,-0.368,-0.746,-0.457,-0.379,-0.469,0
3,-1.27,-0.855,-0.958,-0.707,-0.619,-0.469,-0.872,-0.705,-0.62,-0.988,...,-0.829,-0.719,-0.774,-0.617,-0.688,-0.937,-0.693,-0.657,-0.779,0
4,-1.331,-0.862,-0.761,-0.689,-0.498,-0.361,-0.857,-0.6,-0.496,-0.779,...,-0.861,-0.571,-0.784,-0.545,-0.562,-0.952,-0.642,-0.578,-0.648,0


In [12]:
# Write the texture table with re-numbered class ids to disk without the row index.
texture.to_csv(
    path_or_buf='./data/texture_preprocessed.csv',
    index=False,
)

## HTRU2
https://archive.ics.uci.edu/ml/datasets/HTRU2

In [13]:
# The HTRU2 file is read without a header row, so the column names are supplied
# here: four "Profile_*" statistics, four "DM_*" statistics, then the class label.
COLUMN_NAMES = [
    'Profile_mean', 'Profile_stdev', 'Profile_skewness', 'Profile_kurtosis',
    'DM_mean', 'DM_stdev', 'DM_skewness', 'DM_kurtosis',
    'Class',
]
htru2 = pd.read_csv(
    './data/HTRU_2.csv',
    names=COLUMN_NAMES,
    header=None,
    sep=',',
)
print(htru2.shape)
htru2.head()

(17898, 9)


Unnamed: 0,Profile_mean,Profile_stdev,Profile_skewness,Profile_kurtosis,DM_mean,DM_stdev,DM_skewness,DM_kurtosis,Class
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0


In [14]:
# No transformation is applied to this dataset; it is only re-saved with a header row.
htru2.to_csv(
    path_or_buf='./data/htru2_preprocessed.csv',
    index=False,
)

## Segment
https://sci2s.ugr.es/keel/dataset.php?cod=107

In [15]:
# Column layout used for the segment file: nineteen attributes followed by the
# class label.
# NOTE: 'Vegde-sd' and 'Saturatoin-mean' look like misspellings, but they are
# kept as-is because these names end up in the header of the saved CSV.
COLUMN_NAMES = ['Region-centroid-col', 'Region-centroid-row', 'Region-pixel-count', 
                'Short-line-density-5', 'Short-line-density-2', 'Vedge-mean', 
                'Vegde-sd', 'Hedge-mean', 'Hedge-sd', 'Intensity-mean', 'Rawred-mean', 
                'Rawblue-mean', 'Rawgreen-mean', 'Exred-mean', 'Exblue-mean', 'Exgreen-mean', 
                'Value-mean', 'Saturatoin-mean', 'Hue-mean', 'Class']
segment = pd.read_csv(
    './data/segment.dat',
    sep=r',\s',        # raw string: '\s' is a regex class, not a Python string escape
    engine='python',   # regex separators are only supported by the python parser;
                       # naming it explicitly avoids the fallback ParserWarning
    names=COLUMN_NAMES,
    header=None,
    skiprows=24,       # skip the 24 leading lines (presumably the KEEL metadata header - confirm against the file)
)
print(segment.shape)
segment.head()

(2310, 20)


  


Unnamed: 0,Region-centroid-col,Region-centroid-row,Region-pixel-count,Short-line-density-5,Short-line-density-2,Vedge-mean,Vegde-sd,Hedge-mean,Hedge-sd,Intensity-mean,Rawred-mean,Rawblue-mean,Rawgreen-mean,Exred-mean,Exblue-mean,Exgreen-mean,Value-mean,Saturatoin-mean,Hue-mean,Class
0,218.0,178.0,9.0,0.111111,0.0,0.833333,0.547722,1.111109,0.544331,59.62963,52.444443,75.22222,51.22222,-21.555555,46.77778,-25.222221,75.22222,0.318996,-2.040554,6
1,113.0,130.0,9.0,0.0,0.0,0.277778,0.250924,0.333333,0.365148,0.888889,0.0,2.555556,0.111111,-2.666667,5.0,-2.333333,2.555556,1.0,-2.123254,3
2,32.0,173.0,9.0,0.0,0.0,1.722222,1.781593,9.0,6.749488,43.592594,39.555557,52.88889,38.333336,-12.111111,27.88889,-15.777778,52.88889,0.266914,-1.998858,6
3,61.0,197.0,9.0,0.0,0.0,1.444444,1.515353,2.611111,1.925463,49.592594,44.22222,61.555557,43.0,-16.11111,35.88889,-19.777779,61.555557,0.302925,-2.022274,6
4,149.0,185.0,9.0,0.0,0.0,1.555555,1.068055,3.055555,1.925463,49.333332,45.333332,59.555557,43.11111,-12.0,30.666666,-18.666666,59.555557,0.275889,-1.95277,6


In [16]:
# Class ids 1..7 in the raw file are shifted to zero-based codes 0..6.
CLASSES = list(range(1, 8))
class_type = CategoricalDtype(categories=CLASSES)

segment_labels = segment['Class'].astype(class_type)
# Values that are not listed in CLASSES turn into NaN (category code -1).
# Re-running this cell on the already-encoded column would shift every label
# down again and turn 0 into -1, so fail loudly instead of corrupting the labels.
unknown_labels = segment.loc[segment_labels.isna(), 'Class'].unique()
if len(unknown_labels):
    raise ValueError(
        "Unexpected values in segment['Class'] (column already encoded?): "
        "{}".format(list(unknown_labels))
    )
segment['Class'] = segment_labels.cat.codes
segment.head()

Unnamed: 0,Region-centroid-col,Region-centroid-row,Region-pixel-count,Short-line-density-5,Short-line-density-2,Vedge-mean,Vegde-sd,Hedge-mean,Hedge-sd,Intensity-mean,Rawred-mean,Rawblue-mean,Rawgreen-mean,Exred-mean,Exblue-mean,Exgreen-mean,Value-mean,Saturatoin-mean,Hue-mean,Class
0,218.0,178.0,9.0,0.111111,0.0,0.833333,0.547722,1.111109,0.544331,59.62963,52.444443,75.22222,51.22222,-21.555555,46.77778,-25.222221,75.22222,0.318996,-2.040554,5
1,113.0,130.0,9.0,0.0,0.0,0.277778,0.250924,0.333333,0.365148,0.888889,0.0,2.555556,0.111111,-2.666667,5.0,-2.333333,2.555556,1.0,-2.123254,2
2,32.0,173.0,9.0,0.0,0.0,1.722222,1.781593,9.0,6.749488,43.592594,39.555557,52.88889,38.333336,-12.111111,27.88889,-15.777778,52.88889,0.266914,-1.998858,5
3,61.0,197.0,9.0,0.0,0.0,1.444444,1.515353,2.611111,1.925463,49.592594,44.22222,61.555557,43.0,-16.11111,35.88889,-19.777779,61.555557,0.302925,-2.022274,5
4,149.0,185.0,9.0,0.0,0.0,1.555555,1.068055,3.055555,1.925463,49.333332,45.333332,59.555557,43.11111,-12.0,30.666666,-18.666666,59.555557,0.275889,-1.95277,5


In [17]:
# Per the original author's note, 'Region-pixel-count' holds a single unique
# value, so the column is removed.
segment = segment.drop(columns=['Region-pixel-count'])
segment.head()

Unnamed: 0,Region-centroid-col,Region-centroid-row,Short-line-density-5,Short-line-density-2,Vedge-mean,Vegde-sd,Hedge-mean,Hedge-sd,Intensity-mean,Rawred-mean,Rawblue-mean,Rawgreen-mean,Exred-mean,Exblue-mean,Exgreen-mean,Value-mean,Saturatoin-mean,Hue-mean,Class
0,218.0,178.0,0.111111,0.0,0.833333,0.547722,1.111109,0.544331,59.62963,52.444443,75.22222,51.22222,-21.555555,46.77778,-25.222221,75.22222,0.318996,-2.040554,5
1,113.0,130.0,0.0,0.0,0.277778,0.250924,0.333333,0.365148,0.888889,0.0,2.555556,0.111111,-2.666667,5.0,-2.333333,2.555556,1.0,-2.123254,2
2,32.0,173.0,0.0,0.0,1.722222,1.781593,9.0,6.749488,43.592594,39.555557,52.88889,38.333336,-12.111111,27.88889,-15.777778,52.88889,0.266914,-1.998858,5
3,61.0,197.0,0.0,0.0,1.444444,1.515353,2.611111,1.925463,49.592594,44.22222,61.555557,43.0,-16.11111,35.88889,-19.777779,61.555557,0.302925,-2.022274,5
4,149.0,185.0,0.0,0.0,1.555555,1.068055,3.055555,1.925463,49.333332,45.333332,59.555557,43.11111,-12.0,30.666666,-18.666666,59.555557,0.275889,-1.95277,5


In [18]:
# Write the encoded segment table (minus the dropped column) to disk without the row index.
segment.to_csv(
    path_or_buf='./data/segment_preprocessed.csv',
    index=False,
)