In [1]:
#!/usr/bin/env python

import wuml 
import numpy as np
import scipy.stats
from wplotlib import histograms
from wplotlib import lines


	
	
'''
	Assigns a weight to each sample based on its likelihood, estimated with KDE. 
	Let X1 be the most likely sample, i.e., p(X1) >= p(Xi) for all i.
	X1 receives a weight of 1, and every other sample Xi is weighted by the 
	ratio p(X1)/p(Xi): if p(X1)/p(X2) = 2, the weight for X1 is 1 and for X2 is 2.
	In other words, if X1 is the most likely sample and X1 is 
	2 times more likely than X2, then X1 has a weight of 1
	and X2 has a weight of 2.
	These weights can then be used to balance the sample importance for regression.

'''

# Read the CSV into a wData object, then drop the 'id' column.
data = wuml.wData(
	'../../data/Chem_decimated_imputed.csv',
	row_id_with_label=0,
)
data.delete_column('id')	# the id should not be part of the likelihood

# One weight per sample, derived from the likelihood of the 'finalga_best' column.
sample_weights = wuml.get_likelihood_weight(data['finalga_best'])

# Build the two-column (value, weight) table first, then print it.
side_by_side = wuml.output_two_columns_side_by_side(
	data['finalga_best'],
	sample_weights,
	labels=['age', 'weight'],
	rounding=3,
)
print(side_by_side)




['age', 'weight']
[ 33.857  24.243]
[ 37.429   2.861]
[ 39.571   1.011]
[ 38.857   1.094]
[ 37.286   3.231]
[ 38.857   1.094]
[ 39.      1.048]
[ 39.714   1.035]
[ 36.571   6.326]
[ 35.     19.745]
[ 39.143   1.017]
[ 38.857   1.094]
[ 38.429   1.345]
[ 38.714   1.158]
[ 38.571   1.241]
[ 39.143   1.017]
[ 39.714   1.035]
[ 39.714   1.035]
[ 37.714   2.261]
[ 38.571   1.241]
[ 39.429   1.   ]
[ 39.143   1.017]
[ 39.857   1.072]
[ 37.429   2.861]
[ 36.     10.57 ]
[ 38.857   1.094]
[ 40.571   1.529]
[ 38.143   1.625]
[ 38.286   1.472]
[ 38.714   1.158]
[ 39.286   1.002]
[ 39.286   1.002]
[ 39.143   1.017]
[ 36.857   4.772]
[ 39.571   1.011]
[ 39.      1.048]
[ 37.      4.169]
[ 38.571   1.241]
[ 34.143  23.303]
[ 39.286   1.002]
[ 39.      1.048]
[ 35.571  13.981]
[ 39.286   1.002]
[ 35.857  11.691]
[ 38.857   1.094]
[ 37.143   3.661]
[ 37.714   2.261]
[ 38.143   1.625]
[ 38.857   1.094]
[ 38.857   1.094]
[ 39.714   1.035]
[ 38.857   1.094]
[ 41.571   4.761]
[ 38.429   1.345]
[ 40.     