In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.io import arff

from weka.core import jvm
from weka.classifiers import Classifier
from weka.core.converters import Loader
from weka.filters import Filter

from copy import deepcopy
import sklearn

In [2]:
# Launch the JVM that the python-weka-wrapper objects used below (Loader, Filter,
# Classifier) run on.
# NOTE(review): no matching jvm.stop() appears in the visible part of the notebook.
jvm.start()

DEBUG:weka.core.jvm:Adding bundled jars
DEBUG:weka.core.jvm:Classpath=['/home/edoardobucheli/anaconda3/envs/tf-gpu/lib/python3.6/site-packages/javabridge/jars/rhino-1.7R4.jar', '/home/edoardobucheli/anaconda3/envs/tf-gpu/lib/python3.6/site-packages/javabridge/jars/runnablequeue.jar', '/home/edoardobucheli/anaconda3/envs/tf-gpu/lib/python3.6/site-packages/javabridge/jars/cpython.jar', '/home/edoardobucheli/anaconda3/envs/tf-gpu/lib/python3.6/site-packages/weka/lib/python-weka-wrapper.jar', '/home/edoardobucheli/anaconda3/envs/tf-gpu/lib/python3.6/site-packages/weka/lib/weka.jar']
DEBUG:weka.core.jvm:MaxHeapSize=default
DEBUG:weka.core.jvm:Package support disabled


### Import Data

In [5]:
# Parse the ARFF file with SciPy. loadarff returns a (records, metadata) pair; only
# the records (element 0) are wrapped in a DataFrame.
arff_data = arff.loadarff('./breast-cancer.arff')
df = pd.DataFrame(arff_data[0])

In [6]:
# Column labels of the pandas copy of the dataset (same file as the Weka `data` object).
cols = df.columns

In [7]:
# Load the same file a second time through Weka's ArffLoader; `data` is the Weka
# dataset that the filters and classifiers below operate on.
loader = Loader(classname='weka.core.converters.ArffLoader')
data = loader.load_file('./breast-cancer.arff')

In [8]:
# Attribute names of the Weka dataset, in dataset order.
attributes = [attribute.name for attribute in data.attributes()]

In [9]:
# Attribute name -> position of that attribute in the Weka dataset.
map_att_toint = {attribute.name: position for position, attribute in enumerate(data.attributes())}

In [10]:
# Attribute name -> {label: index of that label within attribute.values}.
map_att_label_toint = {
    attribute.name: {label: label_idx for label_idx, label in enumerate(attribute.values)}
    for attribute in data.attributes()
}

Get Normal Data Split

In [11]:
# RemoveWithValues with -C last / -L last drops every instance whose last attribute
# takes its last nominal label; what remains is used as the "normal" split.
keep_normal = Filter(classname='weka.filters.unsupervised.instance.RemoveWithValues',
                     options = ['-C','last','-L','last'])
# The filter has to be told the input structure before it can be applied.
keep_normal.inputformat(data)
data_normal = keep_normal.filter(data)

Remove class

In [12]:
# Remove with -R last deletes the last attribute, so the class column is not part of
# the data the per-attribute trees are trained on.
remove = Filter(classname='weka.filters.unsupervised.attribute.Remove',options = ['-R','last'])
remove.inputformat(data_normal)
data_normal_v2 = remove.filter(data_normal)

In [13]:
# Number of instances in the normal split (used to size arrays later on).
N = data_normal_v2.num_instances

Get Anomalous Data Split

In [14]:
# Mirror image of keep_normal: -L first drops the instances whose last attribute takes
# its first nominal label; what remains is used as the "anomalous" split.
keep_anom = Filter(classname='weka.filters.unsupervised.instance.RemoveWithValues',
                     options = ['-C','last','-L','first'])
keep_anom.inputformat(data)
data_anom = keep_anom.filter(data)

In [15]:
# Reuses the `remove` filter that was initialised on data_normal to strip the last
# attribute from the anomalous split as well.
data_anom_v2 = remove.filter(data_anom)

In [16]:
# Number of instances in the anomalous split.
N_anom = data_anom_v2.num_instances

### Train Trees

In [18]:
from weka.classifiers import Evaluation
from weka.core.classes import Random

In [65]:
# Sanity check on a single target: use attribute 1 as the class, fit a J48 tree on the
# normal split and cross-validate it (10 folds, seed 1).
data_normal_v2.class_index = 1

# The classifier has to exist before it is handed to crossvalidate_model; creating it
# afterwards only worked when `this_clf` was left over from an earlier execution.
this_clf = Classifier(classname='weka.classifiers.trees.J48',options = ['-C','0.25','-M','2'])
this_clf.build_classifier(data_normal_v2)

evl = Evaluation(data_normal_v2)
evl.crossvalidate_model(this_clf,data_normal_v2,10,Random(1))

In [68]:
# Train one J48 tree per non-class attribute: attribute i becomes the class and the
# tree predicts it from the remaining attributes of the normal split.
clfs = []   # fitted tree for each attribute
evls = []   # 10-fold cross-validation result for each attribute

# Per attribute: class distributions predicted by the tree for the normal split
# (the same instances the tree was fitted on).
dt_y_hat = []

for i,att in enumerate(attributes[:-1]):
    
    data_normal_v2.class_index = i
    
    this_clf = Classifier(classname='weka.classifiers.trees.J48',options = ['-C','0.25','-M','2'])
    this_clf.build_classifier(data_normal_v2)
    
    this_evl = Evaluation(data_normal_v2)
    this_evl.crossvalidate_model(this_clf,data_normal_v2,10,Random(1))
    
    dt_y_hat.append(this_clf.distributions_for_instances(data_normal_v2))
    
    clfs.append(this_clf)
    evls.append(this_evl)

### Get initial weights

In [16]:
from sklearn.metrics import recall_score, roc_auc_score

Initial weights for layer 2 (`w2_init`): one weight per attribute, the mean one-vs-rest ROC AUC of that attribute's tree

In [17]:
# Starting weight per attribute: the mean one-vs-rest ROC AUC of that attribute's tree,
# computed from its arg-max predictions on the normal split.
w2_init = []

for i,att in enumerate(attributes[:-1]):

    this_y_hat = np.argmax(dt_y_hat[i], axis=1)
    this_y = data_normal_v2.values(i)

    rocs = []

    for j in np.unique(this_y):

        # Binary indicators "is label j" for predictions and ground truth.
        new_y_hat = (this_y_hat == j).astype(int)
        new_y = (this_y == j).astype(int)

        # A label without a single positive row contributes nothing.
        if new_y.any():
            rocs.append(roc_auc_score(new_y, new_y_hat))

    mean_auc = np.mean(rocs)
    w2_init.append(mean_auc)

    print('AUC: {:0.4f}'.format(mean_auc))

AUC: 0.8657
AUC: 0.8710
AUC: 0.7019
AUC: 0.7974
AUC: 0.9000
AUC: 0.8015
AUC: 0.8341
AUC: 0.7255
AUC: 0.7611


In [26]:
# Starting layer-1 weights: for every attribute, one weight per label, set to the
# recall the attribute's tree achieves for that label on the normal split.
w1_init = []

for i,att in enumerate(attributes[:-1]):
    
    # Labels that never occur in truth or prediction keep a weight of 0.
    temp_w = np.zeros(len(data_normal_v2.attribute(i).values))
    this_y_hat = np.argmax(dt_y_hat[i],axis = 1)
    this_y = data_normal_v2.values(i)
    # NOTE(review): NaN entries (presumably missing values) are relabelled as index 1.
    this_y[np.isnan(this_y)] = 1
    
    # average=None returns one recall per label, ordered like the sorted union of the
    # labels found in the truth and the predictions.
    this_recs = recall_score(this_y,this_y_hat,average=None)
    seen_labels = np.unique(np.concatenate((this_y_hat,this_y)))
    
    # Uses its own loop variable so the attribute index `i` is not overwritten.
    for label_idx,rec in zip(seen_labels,this_recs):
        
        temp_w[int(label_idx)] = rec
    
    w1_init.append(temp_w)

In [27]:
w1_init

[array([0.        , 1.        , 0.76190476, 0.82539683, 0.69014085,
        0.825     , 0.6       , 0.        , 0.        ]),
 array([0.6       , 0.90425532, 0.91176471]),
 array([0.42857143, 1.        , 0.62962963, 0.65217391, 0.55882353,
        0.58333333, 0.45714286, 0.25      , 0.4375    , 0.        ,
        0.        , 0.        ]),
 array([0.99401198, 0.52631579, 0.42857143, 1.        , 1.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ]),
 array([0.84      , 0.97159091]),
 array([0.83050847, 0.75490196, 0.625     ]),
 array([0.90291262, 0.76530612]),
 array([0.76056338, 0.8       , 0.55      , 0.22222222, 0.41176471]),
 array([0.54054054, 0.98170732])]

In [28]:
w2_init

[0.8657342070508022,
 0.8709659556120252,
 0.7018916614472068,
 0.7974086544593689,
 0.8999737506645402,
 0.8015096091074447,
 0.8341093719041015,
 0.7255382886872905,
 0.7611239288068556]

In [29]:
# One bias per attribute, all starting at zero.
bias_init = np.zeros((len(w2_init)))

In [30]:
def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, evaluated element-wise by NumPy."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

### Define some network functions

In [31]:
def neuron_l1(x_prime,weights,bias,indxs):
    """Score each row by its target-label evidence against the competing labels.

    For row ``i`` with target position ``k = indxs[i]`` the activation is
    ``sigmoid(x[k]*w[k] - mean(x[others]*w[others]) + bias)``.

    Parameters
    ----------
    x_prime : sequence of 1-D sequences
        One row per instance; each row is indexed by label position.
    weights : 1-D sequence
        One weight per label position.
    bias : scalar
        Added inside the sigmoid for every row.
    indxs : sequence of int
        Target label position of each row. Positions past the end of ``weights``
        are clamped to the last valid position. The sequence is not modified.

    Returns
    -------
    numpy.ndarray
        1-D float array holding one activation per row.
    """
    my_res = np.zeros((len(x_prime)))
    last_valid = len(weights) - 1

    for i,this_x_prime in enumerate(x_prime):

        # Clamp into a local variable: the caller's index array stays untouched and an
        # index more than one step out of range is still brought back in range.
        k = min(int(indxs[i]), last_valid)

        this_x_wrong = np.delete(this_x_prime,k)
        w_wrong = np.delete(weights,k)

        # With a single label there are no competitors; use a zero penalty instead of
        # the NaN that the mean of an empty array would produce.
        penalty = np.mean(this_x_wrong*w_wrong) if this_x_wrong.size else 0.0

        my_res[i] = sigmoid(this_x_prime[k]*weights[k]-penalty+bias)

    return my_res

Encoder

In [32]:
# Fit the layer-1 parameters on the normal split. Each pass adds lrw1 * (slope of the
# sigmoid activation) to w1[j][k] and b1[j], i.e. the updates push the activations of
# the normal rows upwards.
# Deep copies keep w1_init / w2_init / bias_init unchanged for later comparison.
w1 = deepcopy(w1_init)
w2 = deepcopy(w2_init)
b1 = deepcopy(bias_init)

# NOTE(review): lr, losses, accs and y_norm are assigned but not read anywhere in this
# cell; only lrw1 and iterations drive the loop. w2 is copied but never updated here.
lr = 0.001
iterations = 100
lrw1 = 0.01

losses = []
accs = []

y_norm = np.ones(N)

for i in range(iterations):
    
    # Layer-1 activations of this pass: one row per instance, one column per attribute.
    hl1_this = np.zeros((N,len(dt_y_hat)))
    
    for j,x_prime in enumerate(dt_y_hat):
    
        # NOTE(review): x_prime_prime and `label` are computed but not used below.
        x_prime_prime = x_prime*w1[j]
        
        # NOTE(review): cols[1:] skips the first column and includes the last one,
        # whereas other cells use cols[:-1]; harmless here because `label` is unused.
        label = cols[1:][j]
        
        # Observed label index of attribute j for every normal instance (float array).
        num_labels = data_normal_v2.values(j)
        
        #num_labels = [map_dicts_v2[label][f] for f in ys[j]]
        
        for k in np.unique(num_labels):
            
            # NaN (presumably a missing value) is mapped to label 0. The selection
            # `num_labels==k` below then picks the rows labelled 0, not the NaN rows,
            # so label 0 appears to be updated a second time in that case.
            if np.isnan(k):
                k = 0
            
            k = int(k)
            
            
            # Stop once the label index exceeds the number of distribution columns.
            if k >= x_prime.shape[1]:
                break
            
            # Rows whose observed label is k (this_probs duplicates x_prime_this_this
            # and is not used afterwards).
            indices = np.where(num_labels==k)
            this_probs = x_prime[indices]
            x_prime_this_this = x_prime[indices]
            x_prime_prime_this_this = x_prime_prime[indices]
            
            # Activation of every selected row with k as the target position.
            edo = neuron_l1(x_prime_this_this,w1[j],b1[j],np.ones((len(x_prime_this_this)),dtype = int)*k)
            
            # edo*(1-edo) is the sigmoid derivative; the weight term sums it against
            # the k-th distribution column, the bias term averages it.
            grad_wj = np.dot(edo*(1-edo),x_prime_this_this[:,k])
            grad_bias = np.mean(edo*(1-edo))
            
    
            # Additive step: only the weight of label k and the attribute bias change.
            w1[j][k] = w1[j][k] + lrw1*grad_wj
            b1[j] = b1[j] + lrw1*grad_bias
            #for this_indx in [f for f in np.arange(np.max(num_labels)) if f != k]:
                
            #    if this_indx < k:
            #        grad_indx = this_indx
            #    elif this_indx > k:
            #        grad_indx = this_indx-1
                    
            #    w1[j][this_indx] = w1[j][grad_indx] - lrw1*grad_wl[grad_indx]
    
    #this_loss = []
    
    # Re-score the whole normal split with the updated parameters.
    for j,x_prime in enumerate(dt_y_hat):
        
        label = cols[1:][j]
        num_labels = np.array(data_normal_v2.values(j),dtype = np.int64)
        
        # Negative indices are reset to 0 (presumably NaN shows up as a negative
        # number after the int64 cast -- TODO confirm).
        for dani,f in enumerate(num_labels):
            if f < 0:
                num_labels[dani] = 0
        #print(num_labels)
        
        hl1_this[:,j] = neuron_l1(x_prime,w1[j],b1[j],num_labels)
        
    # Per-attribute mean of the layer-1 activations; this is what gets printed.
    this_loss = np.mean(hl1_this,axis = 0)
    #print(this_loss.shape)
    # Report on every 50th pass and on the final pass.
    if i % 50 == 0 or i == iterations-1:
        print('Iteration {}:'.format(i+1))
        for m, loss_part in enumerate(this_loss):
            print('\tAttribute {}: {:0.4f}'.format(m+1,loss_part))

Iteration 1:
	Attribute 1: 0.6436
	Attribute 2: 0.7000
	Attribute 3: 0.5620
	Attribute 4: 0.7436
	Attribute 5: 0.7548
	Attribute 6: 0.6154
	Attribute 7: 0.6396
	Attribute 8: 0.6000
	Attribute 9: 0.6967
Iteration 51:
	Attribute 1: 0.9004
	Attribute 2: 0.9179
	Attribute 3: 0.8446
	Attribute 4: 0.9476
	Attribute 5: 0.9505
	Attribute 6: 0.8307
	Attribute 7: 0.8239
	Attribute 8: 0.8374
	Attribute 9: 0.8770
Iteration 100:
	Attribute 1: 0.9423
	Attribute 2: 0.9331
	Attribute 3: 0.9193
	Attribute 4: 0.9677
	Attribute 5: 0.9565
	Attribute 6: 0.8656
	Attribute 7: 0.8365
	Attribute 8: 0.8915
	Attribute 9: 0.8880


### Test

Normal Data

In [33]:
# Score the normal split with the trained layer-1 parameters: `preds` receives one
# activation vector per attribute.
hl1_this_all = np.zeros((N, len(cols[:-1])))
preds = []

for j, x_prime in enumerate(dt_y_hat):
    label = cols[:-1][j]
    num_labels = np.array(data_normal_v2.values(j), dtype=np.int64)

    # Negative label indices are replaced by 0 in one masked assignment.
    num_labels[num_labels < 0] = 0

    preds.append(neuron_l1(x_prime, w1[j], b1[j], num_labels))

In [34]:
# Mean layer-1 activation of each attribute on the normal split (1-based numbering).
for i,this_pred in enumerate(preds):
    print('Attribute {}: Score {:0.4f}'.format(i+1,np.mean(this_pred)))

Attribute 1: Score 0.9423
Attribute 2: Score 0.9331
Attribute 3: Score 0.9193
Attribute 4: Score 0.9677
Attribute 5: Score 0.9565
Attribute 6: Score 0.8656
Attribute 7: Score 0.8365
Attribute 8: Score 0.8915
Attribute 9: Score 0.8880


In [35]:
np.mean(np.array(preds))

0.9111590841599293

Anomalous Data

In [36]:
# Class distributions of every per-attribute tree on the anomalous split; the class
# index is moved to attribute i before tree i is queried.
dt_anom = []

for i,att in enumerate(attributes[:-1]):
    
    data_anom_v2.class_index = i
    dt_anom.append(clfs[i].distributions_for_instances(data_anom_v2))

In [37]:
# Score the anomalous split with the trained layer-1 parameters: `preds_anom` receives
# one activation vector per attribute.
hl1_this_anom = np.zeros((N_anom,len(cols[:-1])))
preds_anom = []

for j,x_prime in enumerate(dt_anom):

    # Feature names are all columns except the last one (the last attribute is the one
    # removed from the Weka data), so the slice is cols[:-1], matching the line above.
    label = cols[:-1][j]
    num_labels = np.array(data_anom_v2.values(j),dtype = np.int64)

    # Negative label indices are replaced by 0 in one masked assignment.
    num_labels[num_labels < 0] = 0

    preds_anom.append(neuron_l1(x_prime,w1[j],b1[j],num_labels))

In [38]:
# Mean layer-1 activation of each attribute on the anomalous split (1-based numbering).
for i,this_pred in enumerate(preds_anom):
    print('Attribute {}: Score {:0.4f}'.format(i+1,np.mean(this_pred)))

Attribute 1: Score 0.7834
Attribute 2: Score 0.8359
Attribute 3: Score 0.8690
Attribute 4: Score 0.8568
Attribute 5: Score 0.7234
Attribute 6: Score 0.4948
Attribute 7: Score 0.5720
Attribute 8: Score 0.6778
Attribute 9: Score 0.6099


In [39]:
np.mean(np.array(preds_anom))

0.7136742790402084

In [40]:
b1

array([0.84123603, 0.31293194, 1.72558359, 0.92835619, 0.34460285,
       0.39311397, 0.17726547, 0.81791636, 0.1428926 ])

Baseline: scores with the initial (untrained) weights

In [41]:
# Baseline: score the normal split with the untrained parameters (w1_init, bias_init).
hl1_this_all = np.zeros((N,len(cols[:-1])))
preds = []

for j,x_prime in enumerate(dt_y_hat):

    # Feature names are all columns except the last one (the last attribute is the one
    # removed from the Weka data), hence cols[:-1] rather than cols[1:].
    label = cols[:-1][j]
    num_labels = np.array(data_normal_v2.values(j),dtype = np.int64)

    # Negative label indices are replaced by 0 in one masked assignment.
    num_labels[num_labels < 0] = 0

    preds.append(neuron_l1(x_prime,w1_init[j],bias_init[j],num_labels))

In [42]:
# Mean activation per attribute on the normal split with the untrained parameters
# (printed without rounding).
for i,this_pred in enumerate(preds):
    print('Attribute {}: Score {}'.format(i+1,np.mean(this_pred)))

Attribute 1: Score 0.6263584326780358
Attribute 2: Score 0.6697571959533569
Attribute 3: Score 0.5524379574537233
Attribute 4: Score 0.6944250283745258
Attribute 5: Score 0.6978110974646121
Attribute 6: Score 0.5981608870097443
Attribute 7: Score 0.6169912338960316
Attribute 8: Score 0.5867774235915114
Attribute 9: Score 0.6561085378480972


In [43]:
np.mean(np.array(preds))

0.633203088252182

In [44]:
# Baseline: score the anomalous split with the untrained parameters (w1_init, bias_init).
hl1_this_anom = np.zeros((N_anom,len(cols[:-1])))
preds_anom = []

for j,x_prime in enumerate(dt_anom):

    # Feature names are all columns except the last one (the last attribute is the one
    # removed from the Weka data), hence cols[:-1] rather than cols[1:].
    label = cols[:-1][j]
    num_labels = np.array(data_anom_v2.values(j),dtype = np.int64)

    # Negative label indices are replaced by 0 in one masked assignment.
    num_labels[num_labels < 0] = 0

    preds_anom.append(neuron_l1(x_prime,w1_init[j],bias_init[j],num_labels))

In [45]:
# Mean activation per attribute on the anomalous split with the untrained parameters
# (printed without rounding).
for i,this_pred in enumerate(preds_anom):
    print('Attribute {}: Score {}'.format(i+1,np.mean(this_pred)))

Attribute 1: Score 0.5532686320614217
Attribute 2: Score 0.6369465944104618
Attribute 3: Score 0.5143079719426659
Attribute 4: Score 0.6102281946353203
Attribute 5: Score 0.5905296405214252
Attribute 6: Score 0.4970453985576635
Attribute 7: Score 0.5224972798925632
Attribute 8: Score 0.5198206946978073
Attribute 9: Score 0.5461818075939341


In [46]:
np.mean(np.array(preds_anom))

0.554536246034807

### Final Test

In [47]:
# Full dataset (both splits) with the last attribute stripped by the `remove` filter.
data_noclass = remove.filter(data)

In [48]:
# Number of instances in the full dataset.
N_all = data_noclass.num_instances

In [49]:
# Class distributions of every per-attribute tree on the full dataset; the class index
# is moved to attribute i before tree i is queried.
# NOTE(review): this includes the normal instances the trees were fitted on.
dt_all = []

for i,att in enumerate(attributes[:-1]):
    
    data_noclass.class_index = i
    dt_all.append(clfs[i].distributions_for_instances(data_noclass))

In [51]:
# Score every instance of the full dataset with the trained layer-1 parameters:
# `preds` receives one activation vector per attribute.
hl1_this_all = np.zeros((N_all, len(cols[:-1])))
preds = []

for j, x_prime in enumerate(dt_all):
    label = cols[:-1][j]
    num_labels = np.array(data_noclass.values(j), dtype=np.int64)

    # Negative label indices are replaced by 0 in one masked assignment.
    num_labels[num_labels < 0] = 0

    preds.append(neuron_l1(x_prime, w1[j], b1[j], num_labels))

In [66]:
# Second layer: per-instance weighted sum of the attribute activations using w2,
# divided by the number of attributes. w2 is a copy of w2_init that the visible
# training cell never updates.
res = np.dot(np.array(preds).T,w2)/len(attributes[:-1])

In [67]:
# Label indices of the class column. The class is the last attribute of `data`, so its
# position is derived from the attribute list instead of the hard-coded 9.
y = data.values(len(attributes) - 1)

In [68]:
# Swap label indices 0 and 1 (assuming a two-label class): the rows kept in the normal
# split, which are those not carrying the last class label, become the positive class.
y = np.abs(y-1)

In [69]:
# ROC AUC of the network score against the flipped class labels (trained w1 / b1).
roc_auc_score(y,res)

0.9165642376353527

In [82]:
data.class_index

-1

In [83]:
# Same computation as in the earlier Final Test cell, repeated before the baseline run.
data_noclass = remove.filter(data)

In [84]:
# Number of instances in the full dataset (recomputed).
N_all = data_noclass.num_instances

In [85]:
# Recompute the class distributions of every per-attribute tree on the full dataset;
# the class index is moved to attribute i before tree i is queried.
dt_all = []

for i,att in enumerate(attributes[:-1]):
    
    data_noclass.class_index = i
    dt_all.append(clfs[i].distributions_for_instances(data_noclass))

In [86]:
# Baseline: score every instance of the full dataset with the untrained parameters
# (w1_init, bias_init).
hl1_this_all = np.zeros((N_all, len(cols[:-1])))
preds = []

for j, x_prime in enumerate(dt_all):
    label = cols[:-1][j]
    num_labels = np.array(data_noclass.values(j), dtype=np.int64)

    # Negative label indices are replaced by 0 in one masked assignment.
    num_labels[num_labels < 0] = 0

    preds.append(neuron_l1(x_prime, w1_init[j], bias_init[j], num_labels))

In [87]:
# Second layer for the baseline: per-instance weighted sum of the attribute activations
# using w2_init, divided by the number of attributes.
res = np.dot(np.array(preds).T,w2_init)/len(attributes[:-1])

In [88]:
# Label indices of the class column. The class is the last attribute of `data`, so its
# position is derived from the attribute list instead of the hard-coded 9.
y = data.values(len(attributes) - 1)

In [89]:
# Swap label indices 0 and 1 (assuming a two-label class): the rows kept in the normal
# split, which are those not carrying the last class label, become the positive class.
y = np.abs(y-1)

In [90]:
# ROC AUC of the baseline score (w1_init / bias_init / w2_init) against the flipped labels.
roc_auc_score(y,res)

0.9135206321334505

In [91]:
w1_init

[array([0.        , 1.        , 0.76190476, 0.82539683, 0.69014085,
        0.825     , 0.6       , 0.        , 0.        ]),
 array([0.6       , 0.90425532, 0.91176471]),
 array([0.42857143, 1.        , 0.62962963, 0.65217391, 0.55882353,
        0.58333333, 0.45714286, 0.25      , 0.4375    , 0.        ,
        0.        , 0.        ]),
 array([0.99401198, 0.52631579, 0.42857143, 1.        , 1.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ]),
 array([0.84      , 0.97159091]),
 array([0.83050847, 0.75490196, 0.625     ]),
 array([0.90291262, 0.76530612]),
 array([0.76056338, 0.8       , 0.55      , 0.22222222, 0.41176471]),
 array([0.54054054, 0.98170732])]

In [92]:
w1

[array([0.        , 1.10570678, 2.54090261, 4.07145302, 4.25604559,
        3.4634775 , 1.17544487, 0.        , 0.        ]),
 array([1.15591243, 4.87687463, 5.08489828]),
 array([0.7829456 , 1.28248344, 2.10744995, 2.0258428 , 2.45673093,
        2.53735764, 2.45070698, 1.05458955, 1.55718043, 0.05993642,
        0.23469907, 0.        ]),
 array([4.80590494, 2.17340369, 1.1618797 , 1.46604048, 1.14152089,
        0.21113271, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ]),
 array([6.02226071, 5.18043671]),
 array([5.02215376, 5.82400625, 4.07067964]),
 array([6.33136659, 6.38582259]),
 array([4.66328357, 4.67576166, 2.38637668, 1.7567905 , 2.00800826]),
 array([3.23901488, 5.70109353])]