In [2]:
import os
import random
import time

import h5py
import numpy as np

from function import RVFL_train_val
from option import option as op

#### Key parameters for assignment:
Q1) option.link = 0 or 1 <br>
Q2) option.bias = 0 or 1 <br>
Q3) option.Scale = 2 ** np.linspace(-5, 5, 21) <br>
Q4) option.ActivationFunction = sigmoid, radbas, sine, sign OR hardlim, tribas. <br>
Q5) option.mode = 1 (regularised least squares) or 2 (Moore-Penrose pseudoinverse) <br>

In [37]:
def read_data(dataset_name):
    """Load one dataset together with its train/test and k-fold index files.

    Three HDF5-readable ``.mat`` files are read from the ``UCI data python``
    directory: ``<name>_R.mat`` (key ``data``), ``<name>_conxuntos.mat``
    (keys ``index1``/``index2``) and ``<name>_conxuntos_kfold.mat``
    (key ``index``).

    Parameters
    ----------
    dataset_name : str
        File-name prefix of the dataset, e.g. ``'balance_scale'``.

    Returns
    -------
    trainX, trainY, testX, testY : np.ndarray
        Standardised features and column-vector labels selected by
        ``index1`` (train) and ``index2`` (test).
    (index, dataX, dataY) : tuple
        ``index`` is a list of 8 index arrays read from the k-fold file;
        ``dataX``/``dataY`` are the full standardised features and labels.
    """
    data_dir = "UCI data python"

    # Each file is opened in a `with` block so the HDF5 handle is released
    # even if reading fails part-way.
    with h5py.File(os.path.join(data_dir, dataset_name + "_R.mat"), 'r') as mat:
        data = np.array(mat['data']).T

    # skip the first (index) column and split features/labels
    data = data[:, 1:]
    dataX = data[:, 0:-1]
    dataY = np.expand_dims(data[:, -1], 1)

    # do normalization for each feature
    dataX_mean = np.mean(dataX, axis=0)
    dataX_std = np.std(dataX, axis=0)
    # A constant feature has zero spread; divide by 1 instead so the column
    # becomes all zeros rather than NaN/inf.
    dataX_std[dataX_std == 0] = 1.0
    dataX = (dataX - dataX_mean) / dataX_std

    # Do train/test split according to the conxuntos file.
    # Stored indices are shifted down by one (presumably 1-based MATLAB
    # indices -- confirm against the data files).
    with h5py.File(os.path.join(data_dir, dataset_name + "_conxuntos.mat"), 'r') as mat:
        index1 = np.array(mat['index1']).astype(np.int32) - 1
        index2 = np.array(mat['index2']).astype(np.int32) - 1
    index1 = np.squeeze(index1, axis=1)
    index2 = np.squeeze(index2, axis=1)

    trainX = dataX[index1, :]
    trainY = dataY[index1, :]
    testX = dataX[index2, :]
    testY = dataY[index2, :]

    # Get indexes for KFolds: 8 index arrays, each dereferenced from the
    # references stored under 'index'.
    index = []
    with h5py.File(os.path.join(data_dir, dataset_name + "_conxuntos_kfold.mat"), 'r') as mat:
        for i in range(8):
            index_temp = np.array([mat[element[i]][:] for element in mat['index']]).astype(np.int32) - 1
            index_temp = np.squeeze(index_temp, axis=0)
            index_temp = np.squeeze(index_temp, axis=1)
            index.append(index_temp)

    return trainX, trainY, testX, testY, (index, dataX, dataY)

In [29]:
# Code to search optimal N and C using default params
# Initialise options and params
def tune_params(trainX, trainY, testX, testY):
    """Grid-search the RVFL hyper-parameters N and C with the other options fixed.

    ``N`` is swept over ``range(3, 204, 20)`` and the exponent of ``C`` over
    ``range(-5, 15)`` (the model receives ``2 ** C``). The remaining options
    are fixed to link=1, bias=1, Scale=1, ActivationFunction='radbas', mode=1.

    Note that the pair is selected by accuracy on ``testX``/``testY``, i.e.
    the same data the reported maximum accuracy is measured on.

    Parameters
    ----------
    trainX, trainY : arrays passed to ``RVFL_train_val`` as training data.
    testX, testY : arrays passed to ``RVFL_train_val`` as evaluation data.

    Returns
    -------
    (best_N, best_C) : tuple
        ``best_N`` is the selected ``N``; ``best_C`` is the selected
        *exponent*, so the regularisation value itself is ``2 ** best_C``.
    """
    print('Tuning RVFL....')
    # Start below any attainable accuracy so the first grid point is always
    # recorded; starting at 0 could return N = 0, C = 0 (not a grid point)
    # when no configuration scores above zero.
    max_acc, best_N, best_C = -np.inf, 0, 0
    option = op()
    option.link = 1
    option.bias = 1
    option.Scale = 1
    option.ActivationFunction = 'radbas'
    option.mode = 1

    # perf_counter is monotonic, so the elapsed time cannot be distorted by
    # system clock adjustments.
    start = time.perf_counter()
    for N in range(3, 204, 20):
        for C in range(-5, 15):
            # Tune these 2
            option.N = N
            option.C = 2 ** C

            _, test_accuracy = RVFL_train_val(trainX, trainY, testX, testY, option)

            # parameter tuning: we prefer the parameter which lead to better accuracy on the test data
            if test_accuracy > max_acc:
                max_acc = test_accuracy
                best_N, best_C = N, C

    print(f'Time taken = {time.perf_counter()-start:0.2f} seconds')
    print(f'Maximum test accuracy was {max_acc*100:0.2f}%, using N = {best_N} and C = {best_C}')
    return best_N, best_C
    

In [34]:
def validate(data, link = 1, bias = 1, scale = 1, act_fn = 'radbas', mode = 1, N = 3, C = 1):
    """Return the mean 4-fold accuracy of an RVFL built with the given options.

    Parameters
    ----------
    data : tuple
        ``(index, dataX, dataY)``. For fold ``k`` (0..3), ``index[2 * k]``
        selects the training rows and ``index[2 * k + 1]`` the evaluation rows.
    link, bias, scale, act_fn, mode, N :
        Forwarded to the option object as ``link``, ``bias``, ``Scale``,
        ``ActivationFunction``, ``mode`` and ``N``.
    C :
        Exponent of the C option; the option object receives ``2 ** C``.

    Returns
    -------
    Mean over the 4 folds of the second value returned by ``RVFL_train_val``.
    """
    fold_rows, features, labels = data

    # One option object is shared by every fold.
    settings = op(
        N=N,
        C=2 ** C,
        link=link,
        bias=bias,
        Scale=scale,
        ActivationFunction=act_fn,
        mode=mode,
    )

    n_folds = 4
    fold_acc = np.zeros(n_folds)
    for fold in range(n_folds):
        # Even entries hold the training rows, odd entries the held-out rows.
        train_rows = fold_rows[2 * fold]
        held_out_rows = fold_rows[2 * fold + 1]

        fold_trainX = features[train_rows, :]
        fold_trainY = labels[train_rows, :]
        fold_testX = features[held_out_rows, :]
        fold_testY = labels[held_out_rows, :]

        _, fold_acc[fold] = RVFL_train_val(
            fold_trainX, fold_trainY, fold_testX, fold_testY, settings
        )

    return fold_acc.mean()

In [65]:
def main(dataset):
    """Run the full experiment for one dataset and return the k-fold results.

    Steps: load the data, print basic statistics, tune N and C with
    ``tune_params``, then run ``validate`` for the default options and for
    each single-option variation (no direct link, no bias, mode 2, 'tribas'
    activation), and finally sweep the scale over ``2 ** np.linspace(-5, 5, 21)``.

    Parameters
    ----------
    dataset : str
        Dataset name passed to ``read_data``.

    Returns
    -------
    list
        Five mean k-fold accuracies in percent (default, link=0, bias=0,
        mode=2, act_fn='tribas'), followed by a tuple
        ``(best_scale_exponent, best_accuracy_percent)`` from the scale sweep.
    """
    trainX, trainY, testX, testY, data = read_data(dataset)

    print('Training Data Shape: ',trainX.shape,'\nTesting Data Shape:', testX.shape)
    print('TrainX Mean and STD', trainX.mean(), trainX.std())
    print('TestX Mean and STD', testX.mean(), testX.std())
    print('No. of classes:', len(np.unique(trainY)),'\nClass Labels:', np.unique(trainY))

    N, C = tune_params(trainX, trainY, testX, testY)

    # One validation run per option variation, all using the tuned N and C.
    kfold_acc = [
        validate(data, N = N, C = C),
        validate(data, link = 0, N = N, C = C),
        validate(data, bias = 0, N = N, C = C),
        validate(data, mode = 2, N = N, C = C),
        validate(data, act_fn='tribas', N = N, C = C),
    ]
    kfold_acc = [round(x*100, 2) for x in kfold_acc]

    # Sweep the scale; `scale` is the exponent, the model receives 2**scale.
    max_acc, best_s = 0, 0
    for scale in np.linspace(-5, 5, 21):
        acc = validate(data, scale = 2**scale, N = N, C = C)
        if acc > max_acc:
            max_acc = acc
            best_s = scale
    kfold_acc.append((best_s,round(max_acc*100,2)))

    # Return the collected results; without this the whole validation sweep
    # was computed and then thrown away.
    return kfold_acc

In [55]:
main('balance_scale')

Training Data Shape:  (313, 4) 
Testing Data Shape: (312, 4)
Train Mean and STD 0.011295632284761216 0.990707740448745
Train Mean and STD -0.011331836234391865 1.0091090383419214
No. of classes: 3 
Class Labels: [0. 1. 2.]
Tuning RVFL....
Time taken = 1.26 seconds
Maximum test accuracy was 92.63%, using N = 83 and C = 7
[91.03, 91.03, 91.03, 90.87, 92.47, (1.5, 93.27)]


In [56]:
main('car')

Training Data Shape:  (864, 6) 
Testing Data Shape: (864, 6)
Train Mean and STD -0.007730272451021512 0.9897391912291658
Train Mean and STD 0.007730272451021451 1.01009743050891
No. of classes: 4 
Class Labels: [0. 1. 2. 3.]
Tuning RVFL....
Time taken = 4.56 seconds
Maximum test accuracy was 95.95%, using N = 183 and C = 8
[95.2, 95.14, 95.14, 95.08, 92.36, (0.0, 95.2)]


In [66]:
main('contrac')

Training Data Shape:  (737, 9) 
Testing Data Shape: (736, 9)
TrainX Mean and STD -0.0009095826382162517 0.9897626551816237
TestX Mean and STD 0.0009108184841921084 1.0101464769175452
No. of classes: 3 
Class Labels: [0. 1. 2.]
Tuning RVFL....
Time taken = 3.70 seconds
Maximum test accuracy was 52.85%, using N = 163 and C = -4


In [57]:
main('credit_approval')

Training Data Shape:  (345, 15) 
Testing Data Shape: (345, 15)
Train Mean and STD 0.009789348477715634 1.0404599986917131
Train Mean and STD -0.009789348477715652 0.9577323887366471
No. of classes: 2 
Class Labels: [0. 1.]
Tuning RVFL....
Time taken = 1.36 seconds
Maximum test accuracy was 89.86%, using N = 23 and C = 5
[86.48, 83.28, 86.48, 86.92, 86.34, (-4.0, 88.08)]


In [58]:
main('led_display')

Training Data Shape:  (500, 7) 
Testing Data Shape: (500, 7)
Train Mean and STD 0.0028891534124725403 0.9992752571280451
Train Mean and STD -0.002889153412472573 1.0007158767986035
No. of classes: 10 
Class Labels: [0. 1. 2. 3. 4. 5. 6. 7. 8. 9.]
Tuning RVFL....
Time taken = 3.82 seconds
Maximum test accuracy was 72.20%, using N = 3 and C = 1
[72.7, 38.4, 73.0, 72.7, 72.8, (0.5, 72.9)]


In [59]:
main('molec_biol_splice')

Training Data Shape:  (1595, 60) 
Testing Data Shape: (1595, 60)
Train Mean and STD -0.002785969381028657 1.0016319161737697
Train Mean and STD 0.002785969381028664 0.9983576419556661
No. of classes: 3 
Class Labels: [0. 1. 2.]
Tuning RVFL....
Time taken = 9.12 seconds
Maximum test accuracy was 80.19%, using N = 23 and C = -3
[80.43, 51.76, 80.4, 81.37, 74.69, (2.5, 80.9)]


In [60]:
main('ringnorm')

Training Data Shape:  (3700, 20) 
Testing Data Shape: (3700, 20)
Train Mean and STD -0.0029062757604696406 1.0011692391315408
Train Mean and STD 0.002906275760469651 0.9988209357733648
No. of classes: 2 
Class Labels: [0. 1.]
Tuning RVFL....
Time taken = 18.44 seconds
Maximum test accuracy was 96.38%, using N = 143 and C = 5
[95.97, 95.51, 91.42, 95.96, 96.76, (2.0, 97.74)]


In [61]:
main('spambase')

Training Data Shape:  (2301, 57) 
Testing Data Shape: (2300, 57)
Train Mean and STD 0.0058683669028863445 1.0176719956538907
Train Mean and STD -0.005870918366757164 0.981967049417251
No. of classes: 2 
Class Labels: [0. 1.]
Tuning RVFL....
Time taken = 13.44 seconds
Maximum test accuracy was 91.52%, using N = 183 and C = 4
[91.17, 89.39, 91.09, 91.33, 90.76, (3.0, 91.85)]


In [64]:
main('semeion')

Training Data Shape:  (797, 256) 
Testing Data Shape: (796, 256)
Train Mean and STD 0.002165341274335672 0.9997924251729191
Train Mean and STD -0.002168061552318513 1.000203095862763
No. of classes: 10 
Class Labels: [0. 1. 2. 3. 4. 5. 6. 7. 8. 9.]
Tuning RVFL....
Time taken = 8.87 seconds
Maximum test accuracy was 90.20%, using N = 83 and C = 0
[89.01, 57.6, 89.13, 86.12, 87.81, (0.0, 89.01)]


In [63]:
main('waveform')

Training Data Shape:  (2500, 21) 
Testing Data Shape: (2500, 21)
Train Mean and STD -0.001999555627883342 1.0022877285362837
Train Mean and STD 0.0019995556278833435 0.9977030183276777
No. of classes: 3 
Class Labels: [0. 1. 2.]
Tuning RVFL....
Time taken = 12.08 seconds
Maximum test accuracy was 87.32%, using N = 103 and C = -5
[86.02, 77.82, 86.04, 86.28, 84.78, (0.0, 86.02)]
