In [39]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler


# display top 5 rows
def display(data, n=5):
    """Return the first ``n`` rows of ``data`` as a pandas DataFrame.

    Parameters
    ----------
    data : array-like
        Anything accepted by the ``pd.DataFrame`` constructor.
    n : int, optional
        Number of leading rows to keep. Defaults to 5, which matches the
        previous hard-coded behaviour.

    Returns
    -------
    pandas.DataFrame
        The first ``n`` rows of ``data`` (fewer if ``data`` is shorter).
    """
    # NOTE(review): inside an IPython/Jupyter kernel this name presumably
    # shadows the injected `display` helper -- confirm that is intended.
    return pd.DataFrame(data).head(n)

# scale each feature to [0, 1] (all but last column)
def scale(data):
    """Min-max scale every column of ``data`` except the last one.

    The last column is left untouched and re-attached to the right of the
    scaled columns, so the result has the same shape as the 2-D input.
    """
    features, last_col = data[:, :-1], data[:, -1:]
    scaler = MinMaxScaler()
    features_scaled = scaler.fit_transform(features)
    # Both pieces are 2-D, so joining along axis 1 appends the last column.
    return np.concatenate((features_scaled, last_col), axis=1)

# one-hot encode features (all but last column)
def oneHotEncode(data):
    """One-hot encode every column of ``data`` except the last one.

    The encoder's sparse result is converted to a dense array, and the
    untouched last column is appended on the right.
    """
    features, last_col = data[:, :-1], data[:, -1:]
    encoder = OneHotEncoder()
    dense = encoder.fit_transform(features).toarray()
    # Both pieces are 2-D, so joining along axis 1 appends the last column.
    return np.concatenate((dense, last_col), axis=1)


In [50]:
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import make_classification

def loadAdult(n, path='data/adult_raw.txt', seed=None):
    """Load the raw 'adult' data and return ``n`` randomly drawn rows.

    Rows containing the missing-value marker ``' ?'`` in any column are
    dropped before sampling. Rows are drawn uniformly *with replacement*,
    so the result may contain duplicates and ``n`` may exceed the number
    of available rows.

    Parameters
    ----------
    n : int
        Number of rows to draw.
    path : str, optional
        Location of the comma-separated raw file. Defaults to the path
        that was previously hard-coded.
    seed : int or None, optional
        When given, a dedicated ``np.random.RandomState(seed)`` is used so
        the draw is reproducible. When ``None`` (default) the global NumPy
        random state is used, exactly as before.

    Returns
    -------
    numpy.ndarray
        2-D array of strings with ``n`` rows.

    Raises
    ------
    ValueError
        If no rows remain after removing rows with missing values.
    """
    # ndmin=2 keeps a single-row file 2-D so the axis=1 reduction below works.
    data = np.loadtxt(path, delimiter=',', dtype='str', ndmin=2)
    complete_rows = np.sum(data == ' ?', axis=1) == 0  # rows without any ' ?'
    data = data[complete_rows]
    if data.shape[0] == 0:
        raise ValueError('no rows without missing values in {0!r}'.format(path))
    if seed is None:
        picks = np.random.randint(data.shape[0], size=n)
    else:
        picks = np.random.RandomState(seed).randint(data.shape[0], size=n)
    return data[picks]

# extract only continuous features from 'adult' dataset
def loadAdultContinuous(n=1000):
    """Build a scaled dataset from the numerical columns of 'adult'.

    Draws ``n`` rows via ``loadAdult``, keeps columns 0, 2, 4, 10, 11, 12
    as features and column 14 as the label, label-encodes the label, and
    passes the result through ``scale`` (which leaves the last column
    untouched).

    Parameters
    ----------
    n : int, optional
        Number of rows to draw; defaults to the previously hard-coded 1000.

    Returns
    -------
    numpy.ndarray
        Whatever ``scale`` returns for the float64 feature/label matrix.
    """
    sample = loadAdult(n)
    raw = sample[:, [0, 2, 4, 10, 11, 12, 14]]  # numerical features + label
    features = raw[:, :-1].astype(np.float64)
    # Encode labels straight to numbers instead of writing the integer codes
    # back into the fixed-width string array, which could truncate them.
    labels = LabelEncoder().fit_transform(raw[:, -1]).astype(np.float64)
    return scale(np.column_stack((features, labels)))

# extract only categorical features from 'adult' dataset
def loadAdultDiscrete(n=1000):
    """Build a one-hot encoded dataset from the discrete columns of 'adult'.

    Draws ``n`` rows via ``loadAdult``, keeps columns 1, 3, 5, 6, 7, 8, 9,
    13 as features and column 14 as the label, label-encodes every column
    independently, and passes the integer matrix through ``oneHotEncode``
    (which leaves the last column untouched).

    Parameters
    ----------
    n : int, optional
        Number of rows to draw; defaults to the previously hard-coded 1000.

    Returns
    -------
    numpy.ndarray
        Whatever ``oneHotEncode`` returns for the integer-coded matrix.
    """
    sample = loadAdult(n)
    raw = sample[:, [1, 3, 5, 6, 7, 8, 9, 13, 14]]  # discrete features + label
    # Collect the integer codes in a fresh integer array instead of writing
    # them back into the fixed-width string array, which could truncate them.
    encoded = np.column_stack(
        [LabelEncoder().fit_transform(raw[:, c]) for c in range(raw.shape[1])]
    ).astype(int)
    return oneHotEncode(encoded)

def loadPima():
    """Read the comma-separated file 'data/pima_raw.txt' and return it after ``scale``."""
    return scale(np.loadtxt('data/pima_raw.txt', delimiter=','))

# generate synthetic data using scikit's make_classification class
def loadLargeSynth():
    """Generate a synthetic classification dataset and return it after ``scale``.

    The features come from ``make_classification`` with a fixed
    ``random_state``; the class labels are appended as the last column
    before scaling.
    """
    synth_params = dict(n_samples=10000, n_features=200, n_informative=170,
                        n_clusters_per_class=1, flip_y=0.2, random_state=12)
    features, labels = make_classification(**synth_params)
    # column_stack turns the 1-D label vector into a trailing column.
    return scale(np.column_stack((features, labels)))

In [51]:
# Build the continuous-feature 'adult' sample, save it as .npy, and preview the first rows.
# NOTE(review): no seed is set in the visible cells, so the saved sample presumably differs per run.
data = loadAdultContinuous()
np.save('data/adult_continuous.npy', data)
display(data)

Unnamed: 0,0,1,2,3,4,5,6
0,0.232877,0.457811,0.533333,0.0,0.0,0.378947,1.0
1,0.60274,0.181955,0.933333,0.0,0.0,0.326316,1.0
2,0.178082,0.239528,0.733333,0.0,0.0,0.221053,0.0
3,0.39726,0.016996,0.533333,0.0,0.0,0.378947,0.0
4,0.369863,0.161626,0.6,0.143441,0.0,0.378947,1.0


In [52]:
# Build the one-hot encoded 'adult' sample, save it as .npy, and preview the first rows.
# NOTE(review): the output filename is spelled 'adult_dicrete' (sic); if it is renamed,
# any code that loads this file must be updated at the same time.
data = loadAdultDiscrete()
np.save('data/adult_dicrete.npy', data)
display(data)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,76,77,78,79,80,81,82,83,84,85
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [53]:
# Load and scale the Pima data, save it as .npy, and preview the first rows.
data = loadPima()
np.save('data/pima.npy', data)
display(data)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.352941,0.743719,0.590164,0.353535,0.0,0.500745,0.234415,0.483333,1.0
1,0.058824,0.427136,0.540984,0.292929,0.0,0.396423,0.116567,0.166667,0.0
2,0.470588,0.919598,0.52459,0.0,0.0,0.347243,0.253629,0.183333,1.0
3,0.058824,0.447236,0.540984,0.232323,0.111111,0.418778,0.038002,0.0,0.0
4,0.0,0.688442,0.327869,0.353535,0.198582,0.642325,0.943638,0.2,1.0


In [54]:
# Generate the scaled synthetic dataset, save it as .npy, and preview the first rows.
data = loadLargeSynth()
np.save('data/large_synthetic.npy', data)
display(data)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,191,192,193,194,195,196,197,198,199,200
0,0.606511,0.312821,0.163949,0.410291,0.317182,0.288538,0.656023,0.213349,0.539617,0.302991,...,0.493758,0.382823,0.456633,0.813516,0.481122,0.53135,0.274647,0.243685,0.541073,0.0
1,0.573608,0.357763,0.608406,0.384233,0.678324,0.528074,0.542219,0.487778,0.568581,0.716354,...,0.610443,0.556186,0.48603,0.380567,0.627104,0.458685,0.462534,0.644005,0.554919,1.0
2,0.415048,0.495885,0.534118,0.605249,0.473891,0.447872,0.719504,0.415232,0.318348,0.444825,...,0.396201,0.578941,0.454478,0.718261,0.440631,0.475118,0.67181,0.1944,0.499046,0.0
3,0.680612,0.59055,0.518725,0.516232,0.656417,0.615236,0.441226,0.417292,0.681163,0.610486,...,0.585998,0.778787,0.56286,0.591823,0.646833,0.566789,0.515572,0.469853,0.448799,0.0
4,0.435177,0.437659,0.575668,0.441295,0.586334,0.691105,0.476023,0.518119,0.30789,0.579665,...,0.501644,0.61665,0.430989,0.49228,0.351221,0.452206,0.527056,0.463266,0.503279,0.0
