In [1]:
#!/usr/bin/env python

import wuml 
import numpy as np
import scipy.stats
from wplotlib import histograms
from wplotlib import lines


# Load the example regression dataset from CSV and print it.
# first_row_is_label=True is passed so the first CSV row is presumably used as
# the column names (the printed table shows columns A, B, C, D and label).
data = wuml.wData('../../data/shap_regress_example_uniform.csv', first_row_is_label=True)
print(data)

2021-10-19 13:55:17.536767: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


         A       B       C       D   label
0   0.5488  0.7152  0.6028  0.5449 -1.1010
1   0.4237  0.6459  0.4376  0.8918 -5.6690
2   0.9637  0.3834  0.7917  0.5289  0.7819
3   0.5680  0.9256  0.0710  0.0871  3.5834
4   0.0202  0.8326  0.7782  0.8700 -7.5046
5   0.9786  0.7992  0.4615  0.7805 -1.0018
6   0.1183  0.6399  0.1434  0.9447 -8.0479
7   0.5218  0.4147  0.2646  0.7742 -4.1428
8   0.4562  0.5684  0.0188  0.6176 -2.6073
9   0.6121  0.6169  0.9437  0.6818 -2.3098
10  0.3595  0.4370  0.6976  0.0602  1.8985
11  0.6668  0.6706  0.2104  0.1289  3.3795
12  0.3154  0.3637  0.5702  0.4386 -1.8187
13  0.9884  0.1020  0.2089  0.1613  3.8171
14  0.6531  0.2533  0.4663  0.2444  1.6280
15  0.1590  0.1104  0.6563  0.1382 -0.2118
16  0.1966  0.3687  0.8210  0.0971  0.6198
17  0.8379  0.0961  0.9765  0.4687  0.1970
18  0.9768  0.6048  0.7393  0.0392  5.7602
19  0.2828  0.1202  0.2961  0.1187  0.5982
20  0.3180  0.4143  0.0641  0.6925 -4.3534
21  0.5666  0.2654  0.5232  0.0939  2.4780
22  0.5759 

In [2]:
# Print the HSIC dependence score between every pair of columns (features and label).
# In the recorded output this is a symmetric table with 1.0 on the diagonal.
print(wuml.feature_wise_HSIC(data))

              A         B         C         D     label
A      1.000000  0.020198  0.064668  0.013744  0.124390
B      0.020198  1.000000  0.060263  0.166303  0.121114
C      0.064668  0.060263  1.000000  0.010036  0.034319
D      0.013744  0.166303  0.010036  1.000000  0.642688
label  0.124390  0.121114  0.034319  0.642688  1.000000


In [3]:
# With get_top_dependent_pairs=True the result is a ranked list of column pairs,
# ordered from the highest dependence to the lowest, instead of the full matrix.
# NOTE(review): the recorded output shows only five pairs -- confirm whether the
# function truncates the list or the output was cut.
print(wuml.feature_wise_HSIC(data, get_top_dependent_pairs=True))

                0
D label  0.642688
B D      0.166303
A label  0.124390
B label  0.121114
A C      0.064668


In [4]:
# Passing label_name='label' restricts the ranking to dependencies against the
# 'label' column only: each feature is listed with its score, highest first.
print(wuml.feature_wise_HSIC(data, label_name='label', get_top_dependent_pairs=True))

      label
D  0.642688
A  0.124390
B  0.121114
C  0.034319


In [5]:
# Compare a group of features, taken jointly, against each column in a list of labels.
# X holds columns 0-2 and Ys holds columns 3-4; the recorded output has one row
# per Ys column (D and label) and a single 'feature_group' score column.
# NOTE(review): assumes wData slicing follows numpy-style [rows, cols] semantics -- confirm.
X = data[:,0:3]
Ys = data[:,3:5]
depList = wuml.HSIC_of_feature_groups_vs_label_list(X, Ys)
print(depList)

       feature_group
label       0.922283
D           0.525231


In [8]:
# feature_wise_HSIC also handles missing data: entries that are missing are
# removed on a per-pair basis when the pairwise HSIC is computed.
# NOTE(review): this rebinds the global `data` to a different dataset, so any
# earlier cell re-run after this one will see the missing-value example instead.
# NOTE(review): the file name 'missin_example.csv' looks like a typo for
# "missing", but the cell ran successfully, so the file exists under this name.
data = wuml.wData('../../data/missin_example.csv', first_row_is_label=True)
print(data)
# Blank lines to visually separate the raw table from the HSIC matrix.
print('\n\n')
print(wuml.feature_wise_HSIC(data))

     A    B    C    D    E     F
0   NaN  NaN  NaN  NaN  NaN  7.0
1   1.0  2.0  3.0  4.0  5.0  6.0
2   1.0  2.0  3.0  3.0  3.0  4.0
3   2.0  2.0  3.0  3.0  3.0  5.0
4   NaN  2.0  3.0  3.0  3.0  3.0
5   NaN  2.0  3.0  3.0  3.0  NaN
6   NaN  2.0  3.0  3.0  3.0  NaN
7   NaN  2.0  3.0  NaN  3.0  NaN
8   NaN  2.0  3.0  3.0  3.0  NaN
9   NaN  2.0  NaN  3.0  3.0  NaN
10  NaN  2.0  NaN  NaN  3.0  NaN
11  NaN  2.0  NaN  NaN  3.0  NaN
12  NaN  2.0  NaN  NaN  3.0  NaN
13  NaN  2.0  NaN  NaN  NaN  NaN
14  NaN  2.0  NaN  NaN  NaN  NaN



          A    B    C         D         E          F
A   1.000000  0.0  0.0  0.222222  0.222222  0.241344
B   0.000000  1.0  0.0  0.000000  0.000000  0.000000
C   0.000000  0.0  1.0  0.000000  0.000000  0.000000
D   0.222222  0.0  0.0  1.000000  0.437500  0.521159
E   0.222222  0.0  0.0  0.437500  1.000000  0.521159
F   0.241344  0.0  0.0  0.521159  0.521159  1.000000
