In [1]:
import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(message)s', level=logging.INFO)

import math
import numpy as np
import pandas as pd

import h5py
import scipy.io as sio



In [2]:
def read_matlab_data_file(filepath):
    """Load a .mat file through scipy and return it as a DataFrame.

    The 'X' entry supplies the attribute columns, named 'A1', 'A2', ... in
    order (one per column of 'X'), and the 'y' entry is attached as the
    'label' column.

    Parameters
    ----------
    filepath : path of the .mat file, passed straight to scipy.io.loadmat.

    Returns
    -------
    pandas.DataFrame with one row per row of 'X'.
    """
    contents = sio.loadmat(filepath)
    features = contents['X']
    # Attribute names are 1-based: A1 ... A<number of columns in X>.
    attribute_names = [f'A{idx}' for idx in range(1, features.shape[1] + 1)]
    frame = pd.DataFrame(data=features, columns=attribute_names)
    frame['label'] = contents['y']
    return frame

In [3]:
def read_matlab_data_with_h5py(filepath):
    """Load the 'X' and 'y' datasets of an HDF5-based .mat file as a DataFrame.

    'X' is read with attributes along its first axis, so it is transposed to
    give one row per sample and columns 'A1', 'A2', ... (one per row of the
    stored 'X'). 'y' is transposed the same way and attached as the 'label'
    column.

    Parameters
    ----------
    filepath : path of the file, passed to h5py.File.

    Returns
    -------
    pandas.DataFrame with the attribute columns followed by 'label'.

    Raises
    ------
    KeyError
        If the file has no 'X' or no 'y' entry.
    """
    # Open read-only inside a context manager so the HDF5 handle is always
    # released, even when a key is missing or the conversion fails.
    with h5py.File(filepath, 'r') as f:
        # np.array() copies each dataset into memory, so the values stay
        # usable once the file has been closed.
        features = np.array(f['X'])
        labels = np.array(f['y'])
    columns = [f'A{i+1}' for i in range(features.shape[0])]
    df = pd.DataFrame(data=features.T, columns=columns)
    df['label'] = labels.T
    return df

In [29]:
# Convert the mammography dataset from .mat to CSV and summarise its labels.
# `label` is the name of the label column that the summary prints index by.
label = 'label'
filepath = r'../data/odds/mammography.mat'
df = read_matlab_data_file(filepath)
df.to_csv('../data/odds/mammography.csv', index=False)
# Read the freshly written CSV back in; new_df is not used further in this cell.
new_df = pd.read_csv('../data/odds/mammography.csv')
# Rows labelled 1 are reported as outliers, rows labelled 0 as inliers.
print(f'outliers: {(df[label] == 1).sum()}')
print(f'inliers: {(df[label] == 0).sum()}')
print(f'# data points: {len(df)}')
# Every column except the label column counts as an attribute.
print(f'# attributes: {len(df.columns) - 1}')

outliers: 260
inliers: 10923
# data points: 11183
# attributes: 6


In [28]:
# Convert the shuttle dataset from .mat to CSV and summarise its labels.
filepath = r'../data/odds/shuttle.mat'
df = read_matlab_data_file(filepath)
df.to_csv('../data/odds/shuttle.csv', index=False)
# Read the freshly written CSV back in; new_df is not used further in this cell.
new_df = pd.read_csv('../data/odds/shuttle.csv')
# Rows labelled 1 are reported as outliers, rows labelled 0 as inliers.
print(f'outliers: {(df[label] == 1).sum()}')
print(f'inliers: {(df[label] == 0).sum()}')
print(f'# data points: {len(df)}')
# Every column except the label column counts as an attribute.
print(f'# attributes: {len(df.columns) - 1}')

outliers: 3511
inliers: 45586
# data points: 49097
# attributes: 9


In [27]:
# Convert the wine dataset from .mat to CSV and summarise its labels.
filepath = r'../data/odds/wine.mat'
df = read_matlab_data_file(filepath)
df.to_csv('../data/odds/wine.csv', index=False)
# Read the freshly written CSV back in; new_df is not used further in this cell.
new_df = pd.read_csv('../data/odds/wine.csv')
# Rows labelled 1 are reported as outliers, rows labelled 0 as inliers.
print(f'outliers: {(df[label] == 1).sum()}')
print(f'inliers: {(df[label] == 0).sum()}')
print(f'# data points: {len(df)}')
# Every column except the label column counts as an attribute.
print(f'# attributes: {len(df.columns) - 1}')

outliers: 10
inliers: 119
# data points: 129
# attributes: 13


In [26]:
# Convert the cover dataset from .mat to CSV and summarise its labels.
filepath = r'../data/odds/cover.mat'
df = read_matlab_data_file(filepath)
df.to_csv('../data/odds/cover.csv', index=False)
# Read the freshly written CSV back in; new_df is not used further in this cell.
new_df = pd.read_csv('../data/odds/cover.csv')
# Rows labelled 1 are reported as outliers, rows labelled 0 as inliers.
print(f'outliers: {(df[label] == 1).sum()}')
print(f'inliers: {(df[label] == 0).sum()}')
# NOTE(review): the next two lines print the row count and the attribute count
# without the '# data points:' / '# attributes:' prefixes used by the other
# dataset cells -- confirm whether that is intentional.
print(f'{len(df)}')
print(f'{len(df.columns) - 1}')

outliers: 2747
inliers: 283301
286048
10


In [31]:
# Convert the smtp dataset to CSV (this file is read through the h5py-based
# loader) and summarise its labels.
filepath = r'../data/odds/smtp.mat'
df = read_matlab_data_with_h5py(filepath)
df.to_csv('../data/odds/smtp.csv', index=False)
# Read the freshly written CSV back in; new_df is not used further in this cell.
new_df = pd.read_csv('../data/odds/smtp.csv')
# Rows labelled 1 are reported as outliers, rows labelled 0 as inliers.
print(f'outliers: {(df[label] == 1).sum()}')
print(f'inliers: {(df[label] == 0).sum()}')
print(f'# data points: {len(df)}')
# Every column except the label column counts as an attribute.
print(f'# attributes: {len(df.columns) - 1}')

outliers: 30
inliers: 95126
# data points: 95156
# attributes: 3


In [32]:
# Convert the http dataset to CSV (this file is read through the h5py-based
# loader) and summarise its labels.
filepath = r'../data/odds/http.mat'
df = read_matlab_data_with_h5py(filepath)
df.to_csv('../data/odds/http.csv', index=False)
# Read the freshly written CSV back in; new_df is not used further in this cell.
new_df = pd.read_csv('../data/odds/http.csv')
# Rows labelled 1 are reported as outliers, rows labelled 0 as inliers.
print(f'outliers: {(df[label] == 1).sum()}')
print(f'inliers: {(df[label] == 0).sum()}')
print(f'# data points: {len(df)}')
# Every column except the label column counts as an attribute.
print(f'# attributes: {len(df.columns) - 1}')

outliers: 2211
inliers: 565287
# data points: 567498
# attributes: 3


In [11]:
# Export the leading tenth of the cover dataset as forest_cover.csv and
# display the result read back from disk.
filepath = '../data/odds/cover.mat'
df = read_matlab_data_file(filepath)
# Integer division gives floor(number of rows / 10).
n = len(df) // 10
# Keep only the first n rows; df now refers to the truncated frame.
df = df.head(n)
df.to_csv('../data/odds/forest_cover.csv', index=False)
new_df = pd.read_csv('../data/odds/forest_cover.csv')
new_df

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,label
0,2804,139,9,268,65,3180,234,238,135,6121,0
1,2785,155,18,242,118,3090,238,238,122,6211,0
2,2579,132,6,300,-15,67,230,237,140,6031,0
3,2886,151,11,371,26,5253,234,240,136,4051,0
4,2742,134,22,150,69,3215,248,224,92,6091,0
...,...,...,...,...,...,...,...,...,...,...,...
28599,3008,107,11,541,89,4968,239,226,116,6081,0
28600,2998,111,13,540,79,4944,242,225,110,6108,0
28601,2995,113,6,540,76,4920,231,234,136,6134,0
28602,2991,319,4,541,72,4897,208,235,166,6161,0


In [12]:
# Convert the annthyroid dataset from .mat to CSV and display the CSV as
# read back from disk.
filepath = '../data/odds/annthyroid.mat'
df = read_matlab_data_file(filepath)
df.to_csv('../data/odds/annthyroid.csv', index=False)
new_df = pd.read_csv('../data/odds/annthyroid.csv')
new_df

Unnamed: 0,A1,A2,A3,A4,A5,A6,label
0,0.73,0.00060,0.0150,0.120,0.082,0.1460,0
1,0.24,0.00025,0.0300,0.143,0.133,0.1080,0
2,0.47,0.00190,0.0240,0.102,0.131,0.0780,0
3,0.64,0.00090,0.0170,0.077,0.090,0.0850,0
4,0.23,0.00025,0.0260,0.139,0.090,0.1530,0
...,...,...,...,...,...,...,...
7195,0.59,0.00250,0.0208,0.079,0.099,0.0800,0
7196,0.51,0.10600,0.0060,0.005,0.089,0.0055,1
7197,0.51,0.00076,0.0201,0.090,0.067,0.1340,0
7198,0.35,0.00280,0.0201,0.090,0.089,0.1010,0


In [13]:
# Convert the full cover dataset from .mat to CSV and display the CSV as
# read back from disk.
filepath = '../data/odds/cover.mat'
df = read_matlab_data_file(filepath)
df.to_csv('../data/odds/cover.csv', index=False)
new_df = pd.read_csv('../data/odds/cover.csv')
new_df

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,label
0,2804,139,9,268,65,3180,234,238,135,6121,0
1,2785,155,18,242,118,3090,238,238,122,6211,0
2,2579,132,6,300,-15,67,230,237,140,6031,0
3,2886,151,11,371,26,5253,234,240,136,4051,0
4,2742,134,22,150,69,3215,248,224,92,6091,0
...,...,...,...,...,...,...,...,...,...,...,...
286043,2617,29,13,390,128,2081,215,211,130,592,0
286044,2614,21,13,379,125,2051,211,212,135,618,0
286045,2612,17,13,371,123,2021,208,211,138,644,0
286046,2610,16,14,365,110,1991,208,211,138,671,0
