In [None]:
#| eval: false
! [ -e /content ] && pip install -Uqq xcube # upgrade xcube on colab

In [None]:
from xcube.l2r.all import *

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from fastai.data.core import *
# from fastai.data.transforms import *
# from fastai.text.core import *
# from fastai.text.data import *

# Boot L2R 

> Bootstrapping a learning-to-rank model

In this tutorial we will find a needle in the haystack with mutual information gain:

In [None]:
paths = make_paths(Path.cwd(), 'mimic3-9k')

#### Mutual-Information Computation

In [None]:
source = untar_xxx(XURLs.MIMIC3_L2R)
data = source/'mimic3-9k.csv'
data

Path('/home/deb/.xcube/data/mimic3_l2r/mimic3-9k.csv')

In [None]:
df = pd.read_csv(data,
                 header=0,
                 names=['subject_id', 'hadm_id', 'text', 'labels', 'length', 'is_valid'],
                 dtype={'subject_id': str, 'hadm_id': str, 'text': str, 'labels': str, 'length': np.int64, 'is_valid': bool})
df[['text', 'labels']] = df[['text', 'labels']].astype(str)
len(df)

52726

In [None]:
df.head(3)

Unnamed: 0,subject_id,hadm_id,text,labels,length,is_valid
0,86006,111912,admission date discharge date date of birth sex f service surgery allergies patient recorded as having no known allergies to drugs attending first name3 lf chief complaint 60f on coumadin was found slightly drowsy tonight then fell down stairs paramedic found her unconscious and she was intubated w o any medication head ct shows multiple iph transferred to hospital1 for further eval major surgical or invasive procedure none past medical history her medical history is significant for hypertension osteoarthritis involving bilateral knee joints with a dependence on cane for ambulation chronic...,801.35;348.4;805.06;807.01;998.30;707.24;E880.9;427.31;414.01;401.9;V58.61;V43.64;707.00;E878.1;96.71,230,False
1,85950,189769,admission date discharge date service neurosurgery allergies sulfa sulfonamides attending first name3 lf chief complaint cc cc contact info major surgical or invasive procedure none history of present illness hpi 88m who lives with family had fall yesterday today had decline in mental status ems called pt was unresponsive on arrival went to osh head ct showed large r sdh pt was intubated at osh and transferred to hospital1 for further care past medical history cad s p mi in s p cabg in ventricular aneurysm at that time cath in with occluded rca unable to intervene chf reported ef 1st degre...,852.25;E888.9;403.90;585.9;250.00;414.00;V45.81;96.71,304,False
2,88025,180431,admission date discharge date date of birth sex f service surgery allergies no known allergies adverse drug reactions attending first name3 lf chief complaint s p fall major surgical or invasive procedure none history of present illness 45f etoh s p fall from window at feet found ambulating and slurring speech on scene intubated en route for declining mental status in the er the patient was found to be bradycardic to the s with bp of systolic she was given atropine dilantin and was started on saline past medical history unknown social history unknown family history unknown physical exam ex...,518.81;348.4;348.82;801.25;427.89;E882;V49.86;305.00;96.71;38.93,359,False


*Technical Point:* If we want to sample to perform quick iterations, we need to make sure the number of data points in the sample is a multiple of `bs`, so that we do not have to do a `drop_last=True` while creating the `Dataloaders`. This is because we are about to do some probability computations, and dropping data points is not a good idea as probabilities would not sum to 1.

In [None]:
# Trim `df` so that its length is an exact multiple of the batch size:
# no sample is then dropped when batching, and the estimated probabilities sum to 1.
bs = 8  # batch size (the loaders built further down also use 8)
cut = len(df) - len(df) % bs
df = df[:cut]
len(df)

52720

**Run the cell below only if you want to sample from the full dataset to create a tiny dataset for the purpose of quick iterations.**

In [None]:
bs = 8
# Candidate sample sizes: every multiple of `bs` strictly between 4000 and 5000.
_arr = np.arange(0, len(df), bs)
mask = (_arr > 4000) & (_arr < 5000)
# Seed the draw so that the sample size (and hence the whole notebook) is reproducible,
# and hand `DataFrame.sample` a plain int rather than a 1-element array.
_n = int(np.random.default_rng(89).choice(_arr[mask]))
df = df.sample(n=_n, random_state=89, ignore_index=True)
len(df)

4840

In [None]:
splits = ColSplitter()(df)
splits

((#4503) [0,1,2,3,5,6,9,10,11,12...],
 (#337) [4,7,8,51,64,74,120,125,132,141...])

In [None]:
lm_vocab = torch.load(dls_lm_vocab_path)

In [None]:
@Transform
def Cleanser(toks):
    "Drop every token of `toks` that is missing from `lm_vocab`, keeping the original order"
    kept = []
    for tok in toks:
        if tok in lm_vocab: kept.append(tok)
    return kept

In [None]:
class MyNumericalize(Transform):
    "Numericalize tokens with `vocab`, dropping the tokens that are not present in it"
    def __init__(self, vocab=None, min_freq=3, max_vocab=60000, special_toks=None):
        store_attr('vocab,min_freq,max_vocab,special_toks')
        # token -> index lookup; it can only be built once a vocab is known
        if vocab is None: self.o2i = None
        else: self.o2i = defaultdict(int, {tok: idx for idx, tok in enumerate(vocab)})

    def setups(self, dsets):
        "Build `vocab` and `o2i` from `dsets` when no vocab was passed to the constructor"
        if dsets is None or self.vocab is not None: return
        # Reuse a pre-computed token counter if `dsets` carries one, otherwise count the tokens here
        count = getattr(dsets, 'counter', None)
        if count is None: count = Counter(tok for item in dsets for tok in item)
        if self.special_toks is None and hasattr(dsets, 'special_toks'):
            self.special_toks = dsets.special_toks
        self.vocab = make_vocab(count, min_freq=self.min_freq, max_vocab=self.max_vocab, special_toks=self.special_toks)
        # 'xxfake' entries are left out of the lookup table
        lookup = {tok: idx for idx, tok in enumerate(self.vocab) if tok != 'xxfake'}
        self.o2i = defaultdict(int, lookup)

    def encodes(self, o):
        "Map the tokens of `o` that are in `vocab` to their ids"
        ids = [self.o2i[tok] for tok in o if tok in self.vocab]
        return TensorText(tensor(ids))

    def decodes(self, o):
        "Map ids back to their tokens"
        return L(self.vocab[idx] for idx in o)

In [None]:
# resort to this if anything goes wrong below
x_tfms = [Tokenizer.from_df('text', n_workers=num_cpus()), attrgetter("text"), Cleanser, MultiCategorize(vocab=lm_vocab), OneHotEncode()]
y_tfms = [ColReader('labels', label_delim=';'), MultiCategorize(), OneHotEncode()]
tfms = [x_tfms, y_tfms]

In [None]:
class Chunkifize(Transform):
    "Split a tensor into `num_chunks` pieces when encoding and concatenate the pieces back when decoding"
    order = 4
    def __init__(self, num_chunks=3): store_attr('num_chunks')

    def encodes(self, o):
        # `torch.chunk` returns a tuple; callers get a plain list of chunks
        pieces = torch.chunk(o, self.num_chunks)
        return list(pieces)

    def decodes(self, o):
        # inverse of `encodes`: glue the chunks back together
        return torch.cat(o)

In [None]:
chnk_tfm = Chunkifize()
chnks = chnk_tfm(torch.arange(10))
test_eq(type(chnks), list)
test_eq(chnks, [tensor([0, 1, 2, 3]), tensor([4, 5, 6, 7]), tensor([8, 9])])
# test_fail(lambda: chnk_tfm.decode(chnks), tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))
test_eq(chnk_tfm.decode(chnks), tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))

In [None]:
# y_tfms = [ColReader('labels', label_delim=';'), MultiCategorize(), OneHotEncode(), Chunkifize()]
# tfmd_y = TfmdLists(df, tfms=y_tfms)
# tfmd_y.decode(tfmd_y[0])

In [None]:
x_tfms = [Tokenizer.from_df('text', n_workers=num_cpus()), attrgetter("text"), Numericalize(), OneHotEncode()]
y_tfms = [ColReader('labels', label_delim=';'), MultiCategorize(), OneHotEncode()]
tfms = [x_tfms, y_tfms]

In [None]:
dsets = Datasets(df, tfms=[x_tfms, y_tfms])

In [None]:
dsets = onehot_dsets(df)

In [None]:
torch.save(dsets, 'dsets.pkl')

In [None]:
dsets = torch.load('dsets.pkl')

In [None]:
toks, lbs = dsets.vocab
L(toks), L(lbs), len(toks)*len(lbs)

((#57352) ['xxunk','xxpad','xxbos','xxeos','xxfld','xxrep','xxwrep','xxup','xxmaj','the'...],
 (#8922) ['003.0','003.1','003.8','003.9','004.1','004.8','004.9','005.1','005.81','005.9'...],
 511694544)

In [None]:
x, y = dsets[0]

In [None]:
' '.join(L(toks)[torch.where(x==1)[0]])

'xxbos and to of was with a on in for no patient is name she discharge or as last history her right by admission date pain hospital first has ct medications hospital1 o well also given course after known service any medical past instructions exam md physical family allergies transferred condition then number procedure surgery surgical head chronic further w hypertension diagnosis non birth lf found social received sex bilateral followup name3 namepattern4 disposition lung back revealed severe attending complaint mm major invasive brief coumadin f chief significant multiple since intubated none completed medication down having made cancer name11 both scan very pattern1 drugs poor arrival requiring slightly recorded nc pupils unknown expired reactive knee eye considered le sedated withdrawal involving min toes shows bilat eval fell shortly stairs cane osteoarthritis ambulation prognosis cmo ue fixed recovered pupil metastasis flexion opening rt triple lobectomy extends corneal upgoing lt

In [None]:
lbs.map_ids(torch.where(y==1)[0])

(#15) ['348.4','401.9','414.01','427.31','707.00','707.24','801.35','805.06','807.01','96.71'...]

In [None]:
class BatchLbsChunkify(ItemTransform):
    "Keep only the label columns `chnk_st:chnk_end` of the targets in an `(x, y)` batch"
    order = 100
    def __init__(self, chnk_st, chnk_end): store_attr('chnk_st,chnk_end')

    def encodes(self, x):
        # `x` is the whole batch: inputs pass through untouched, targets are sliced along dim 1
        xb, yb = x[0], x[1]
        lbl_cols = slice(self.chnk_st, self.chnk_end)
        return (xb, yb[:, lbl_cols])

In [None]:
dls = lbs_chunked(dsets, chnk_sz=len(lbs))

In [None]:
len(dls)

1

In [None]:
bs = 8
len(dls[0]), len(dsets)//bs

(6590, 6590)

In [None]:
for x, y in itertools.islice(dls[0], 5):
    print(f'{x.shape = }, {y.shape =}' )

x.shape = torch.Size([8, 57352]), y.shape =torch.Size([8, 8922])
x.shape = torch.Size([8, 57352]), y.shape =torch.Size([8, 8922])
x.shape = torch.Size([8, 57352]), y.shape =torch.Size([8, 8922])
x.shape = torch.Size([8, 57352]), y.shape =torch.Size([8, 8922])
x.shape = torch.Size([8, 57352]), y.shape =torch.Size([8, 8922])


In [None]:
dls = lbs_chunked(dsets)

In [None]:
assert isinstance(dls[0], TfmdDL)
test_eq(len(dls),  np.ceil(len(lbs)/200))
test_eq(len(dls[0]), len(dsets)//bs)

In [None]:
for x, y in itertools.islice(dls[0], 5):
    print(f'{x.shape = }, {y.shape =}' )

x.shape = torch.Size([8, 57352]), y.shape =torch.Size([8, 200])
x.shape = torch.Size([8, 57352]), y.shape =torch.Size([8, 200])
x.shape = torch.Size([8, 57352]), y.shape =torch.Size([8, 200])
x.shape = torch.Size([8, 57352]), y.shape =torch.Size([8, 200])
x.shape = torch.Size([8, 57352]), y.shape =torch.Size([8, 200])


In [None]:
bs, chnk_sz = 8, 200
# One `TfmdDL` per chunk of (at most) `chnk_sz` labels; each loader yields all the tokens
# but only its own slice of label columns.
dls = [
    TfmdDL(dsets, bs=bs, after_batch=[BatchLbsChunkify(st, min(st+chnk_sz, len(lbs)))], device=default_device())
    for st in range(0, len(lbs), chnk_sz)
]
len(dls)

45

In [None]:
# x, y = dsets[0]

In [None]:
# test_eq(tensor(dsets.tfms[1][2].decode(y)), torch.where(y==1)[0])

In [None]:
# test_eq(tensor(dsets.tfms[0][-1].decode(x)), torch.where(x==1)[0])

**[Mutual Information](https://en.wikipedia.org/wiki/Mutual_information#)**

<img alt="Pictorial representation of mutual information" width="400" src="info-gain.svg" caption="Pictorial representation of mutual information (information gain)" id="img_simple_nn">

In [None]:
def mutual_info_gain(dl):
    """
    Estimates the joint distribution of every (token, label) pair seen in `dl`.

    Despite its name, this returns the joint probabilities from which the
    [mutual information gain](https://en.wikipedia.org/wiki/Mutual_information) is derived later,
    not the gain itself.

    `dl` yields `(x, y)` batches: `x` is a (batch, n_tokens) bag-of-words indicator and
    `y` a (batch, n_labels) one-hot target. Any non-zero entry counts as "present".

    Returns a float tensor `p` of shape (n_tokens, n_labels, 2, 2) where `p[t, l, i, j]` is the
    fraction of samples with token state `i` and label state `j` (index 0: present, index 1: absent).
    The normaliser is the number of samples actually yielded by `dl`, so the four entries of each
    pair sum to 1 even when the last batch is smaller than the others.

    Raises `ValueError` if `dl` yields no samples.
    """
    xb, yb = dl.one_batch()
    toksize, lblsize = xb.size(1), yb.size(1)
    counts = torch.zeros(toksize, lblsize, 4, dtype=torch.float, device=xb.device)
    n_seen = 0
    for x, y in dl:
        n = x.size(0)
        assert tuple(x.shape) == (n, toksize), f"unexpected x shape {tuple(x.shape)}"
        assert tuple(y.shape) == (n, lblsize), f"unexpected y shape {tuple(y.shape)}"
        # 0/1 indicators and their complements
        t, l = (x != 0).float(), (y != 0).float()
        not_t, not_l = 1 - t, 1 - l
        # Co-occurrence counts over the batch: (t.T @ l)[i, j] = number of samples having both
        # token i and label j. This equals summing the element-wise logical AND over the batch,
        # without materialising a (batch, n_tokens, n_labels) tensor.
        tt, tf = t.t() @ l, t.t() @ not_l
        ft, ff = not_t.t() @ l, not_t.t() @ not_l
        counts = counts + torch.stack((tt, tf, ft, ff), dim=-1)
        n_seen += n
    if n_seen == 0: raise ValueError("`dl` did not yield any sample")
    p_TL = counts / n_seen
    # last axis: lbl axis, 2nd last axis: token axis
    p_TL = p_TL.view(toksize, lblsize, 2, 2)
    assert tuple(p_TL.shape) == (toksize, lblsize, 2, 2)
    return p_TL

In [None]:
%%time
p_TL_full = []
for dl in progress_bar(dls):
    p_TL = mutual_info_gain(dl)
    p_TL_full.append(p_TL)
    # Only `p_TL` exists at this point: p_T, p_L, p_TxL and I_TL are created by later cells,
    # so deleting them here raised a NameError on a fresh kernel.
    del p_TL; torch.cuda.empty_cache()
# Each loader covers one chunk of labels, so the chunks are stitched back along the label axis
p_TL_full = torch.cat(p_TL_full, dim=1); test_eq(p_TL_full.shape, (len(toks), len(lbs), 2, 2))
# NOTE(review): the cells below load 'p_TL.pkl' — uncomment the save if that file must be (re)generated
# torch.save(p_TL_full, 'p_TL.pkl')

CPU times: user 3h 10min 23s, sys: 3min 22s, total: 3h 13min 46s
Wall time: 3h 13min 58s


In [None]:
def _compute(p_TL):
    eps = p_TL.new_empty(1).fill_(1e-15)
    toksize, lblsize = p_TL.size(0), p_TL.size(1)
    p_T = p_TL[:,0].sum(-1, keepdim=True); test_eq(p_T.shape, (toksize, 2, 1))# 0 because we can pick any label and apply total prob law
    p_L = p_TL[0,:].sum(-2, keepdim=True); test_eq(p_L.shape, (lblsize, 1, 2)) # 0 becuase we can pick any token and apply total prob law
    p_TxL = p_TL.sum(-1, keepdim=True) @ p_TL.sum(-2, keepdim=True); test_eq(p_TxL.shape, (toksize, lblsize, 2, 2))
    H_T = -(p_T * torch.log(p_T+eps)).sum(-2).squeeze(); test_eq(H_T.shape, [toksize])
    H_L = -(p_L * torch.log(p_L+eps)).sum(-1).squeeze(); test_eq(H_L.shape, [lblsize])
    I_TL = (p_TL * torch.log((p_TL + eps)/(p_TxL + eps))).flatten(start_dim=-2).sum(-1); test_eq(I_TL.shape, (toksize, lblsize))
    return p_T, p_L, p_TxL, H_T, H_L, I_TL

In [None]:
%%time
p_TL = torch.load('p_TL.pkl')
p_T, p_L, p_TxL, H_T, H_L, I_TL = _compute(p_TL)
torch.save((p_T, p_L, p_TxL, H_T, H_L, I_TL), 'info.pkl')

CPU times: user 12.5 s, sys: 11.5 s, total: 24 s
Wall time: 30 s


In [None]:
%%time 
p_TL = torch.load('p_TL.pkl')
p_T, p_L, p_TxL, H_T, H_L, I_TL = torch.load('info.pkl')

CPU times: user 8.61 s, sys: 3.62 s, total: 12.2 s
Wall time: 13.3 s


Make sure that there aren't any of those pesky nans or negs:

In [None]:
for o in (p_TL, p_T, p_L, p_TxL, H_T, H_L, I_TL):
    try:
        # `.any()`, not `.all()`: a single nan is already a failure
        assert not o.isnan().any() # check for nans
        test_eq(torch.where(o>=0, True, False).all(), True) # check for negs
    except AssertionError:
        print(f"{namestr(o, globals())[0]} failed")

I_TL failed


Theoretically, Mutual-Info as defined [here](https://en.wikipedia.org/wiki/Mutual_information) is supposed to be non-negative (can be proved by tossing in [Jensen](https://en.wikipedia.org/wiki/Jensen%27s_inequality)). But, practically, it turns out `I_TL` has some negs because we distorted the `p_TL` and `p_TxL`  with `eps` in the `I_TL` computation.

In [None]:
torch.topk(I_TL.flatten(), 10, largest=False)

torch.return_types.topk(
values=TensorMultiCategory([-1.9016e-07, -1.8314e-07, -1.8314e-07, -1.7385e-07, -1.7277e-07, -1.7277e-07, -1.6798e-07, -1.6798e-07, -1.6798e-07, -1.6767e-07], device='cuda:0'),
indices=TensorMultiCategory([22423614,  2731838,  2735913,  1911099,  6389159,  6393113,  6693073,  6695018,  6695355, 32253137], device='cuda:0'))

In [None]:
howmany = torch.where(I_TL < 0, True, False).sum().item()
negs = torch.where(I_TL < 0, I_TL, I_TL.new_zeros(I_TL.shape))
negs.sum()/howmany

TensorMultiCategory(-3.9054e-08, device='cuda:0')

Those negs on an avg are pretty close to zero. So we need not worry. Let's roll!

In [None]:
test_eq(p_TL.shape, (len(toks), len(lbs), 2, 2))
test_eq(p_T.shape, (len(toks), 2, 1))
test_eq(p_L.shape, (len(lbs), 1, 2))
test_eq(p_TxL.shape, (len(toks), len(lbs), 2, 2))
test_eq(H_T.shape, [len(toks)])
test_eq(H_L.shape, [len(lbs)])
test_eq(I_TL.shape, (len(toks), len(lbs)))

In [None]:
# r_t, r_l = random.randrange(0, len(toks)), random.randrange(0, len(lbs))
# toks[r_t], lbs[r_l]

In [None]:
# test_close(p_TL[r_t,r_l].sum(), 1, eps=1e-1)
# test_eq(p_T[r_t].sum(), 1)
# test_eq(p_L[r_l].sum(), 1)

In [None]:
# p_TL[r_t,r_l].sum(-1), p_TL[r_t, 400].sum(-1) 

In [None]:
# p_T[r_t], p_L[r_l]
# I_TL[r_t,r_l]

Let's save the `info`; we will use this to bootstrap the collab model:

In [None]:
eps = I_TL.new_empty(1).fill_(1e-15)  # avoids 0/0 where an entropy is exactly 0
# Mutual information normalised by the label entropy ...
info_lbl_entropy = I_TL/(H_L + eps)
# ... and by the joint entropy H_T + H_L - I_TL (Jaccard-style normalisation)
info_jaccard = I_TL/(H_T.unsqueeze(-1) + H_L.unsqueeze(0) - I_TL + eps)
# `.any()`, not `.all()`: a single nan is already a failure
assert not info_lbl_entropy.isnan().any(); assert not info_jaccard.isnan().any()
collab_bootstrap = {'toks': toks, 'lbs': lbs, 'mut_info_lbl_entropy': info_lbl_entropy, 'mutual_info_jaccard': info_jaccard}

In [None]:
torch.save(collab_bootstrap, collab_bootst_path)
assert collab_bootst_path.exists()

#### Save those Mutual Information Gain values

Let's take a look at the *Mutual Information Gain* (`I_TL`) for each of the labels:

In [None]:
f = ColReader('labels', label_delim=';')
# Count how many rows of `df` carry each label
lbs_frqs = Counter()
for o in df.itertuples(): lbs_frqs.update(f(o))
# The file handle gets its own name so that `f` stays bound to the `ColReader`.
# NOTE(review): `pickle.load` can execute arbitrary code — only load a `code_desc.pkl` you trust.
with open(path.parent/'data'/'code_desc.pkl', 'rb') as fh: lbs_desc = pickle.load(fh)

In [None]:
def _gen(toks, lbs, lbs_frqs, lbs_desc, p_TL, p_T, p_L, info, H_T, H_L, k=5):
    sorted_by_tok, tok_idxs = torch.sort(info, dim=0, descending=True) 
    for i,o in enumerate(lbs):
        topk_tok_idxs = tok_idxs[:k, i].cpu()
        topk_toks = toks[topk_tok_idxs]
        topk_toks_probs = p_T.squeeze()[:,0][topk_tok_idxs].cpu().numpy()
        topk_info_gains = sorted_by_tok[:k, i].cpu().numpy()
        topk_jnt_probs = p_TL[topk_tok_idxs, [i]][:,0,0].cpu().numpy()
        lbl_entropy = H_L[i].cpu().numpy()
        topk_tok_entrops = H_T[topk_tok_idxs].cpu().numpy()
        yield (o, lbs_frqs[o], p_L[i][0,0].cpu().numpy(), lbl_entropy, lbs_desc.get(o, 'Not Found'), 
               array(list(zip(topk_toks, topk_toks_probs, topk_tok_entrops, topk_jnt_probs, topk_info_gains))))

In [None]:
def show_infogain(data, save_as=None):
    """
    Collects the per-label rows in `data` into a `DataFrame`, optionally saving it as a feather file.

    `data` is an iterable of 6-tuples `(label, freq, prob, entropy, description, top-k)`.
    `prob` and `entropy` are cast to float and the top-k column is stringified.
    If `save_as` is given, the frame is written there with `DataFrame.to_feather`.

    Returns the resulting `DataFrame`.
    """
    topk_col = 'top-k (token, prob, entropy, joint, info)'
    df = pd.DataFrame(data, columns=['label', 'freq', 'prob', 'entropy', 'description', topk_col],)
    # builtin `float` instead of the `np.float` alias, which was removed in NumPy 1.24
    df[['prob', 'entropy',]] = df[['prob', 'entropy']].astype(float)
    df[[topk_col]] = df[[topk_col]].astype(np.str_)
    if save_as is not None: df.to_feather(save_as)
    return df

In [None]:
%%time
eps = I_TL.new_empty(1).fill_(1e-15)
# Both normalisations are exported, because the next section reads back both feather files.
# 1. mutual information normalised by the label entropy
info = I_TL/(H_L + eps)
_data = _gen(array(toks), lbs, lbs_frqs, lbs_desc, p_TL, p_T, p_L, info, H_T, H_L, k=10)
show_infogain(_data, save_as='mut_info_lbl_entropy.ft');
# 2. mutual information normalised by the joint entropy (Jaccard-style)
info = I_TL/(H_T.unsqueeze(-1) + H_L.unsqueeze(0) - I_TL + eps)
_data = _gen(array(toks), lbs, lbs_frqs, lbs_desc, p_TL, p_T, p_L, info, H_T, H_L, k=10)
show_infogain(_data, save_as='mut_info_jaccard.ft');

CPU times: user 14.5 s, sys: 593 ms, total: 15.1 s
Wall time: 17.3 s


#### Let's look at those Mutual-Information Gain values:

In [None]:
df_jc = pd.read_feather('mut_info_jaccard.ft')
df_le = pd.read_feather('mut_info_lbl_entropy.ft')

In [None]:
# df_lbs.sort_values(by='freq', ascending=False).head(20)

In [None]:
pd.options.display.max_colwidth = None
df_jc[df_jc.label == '032.9']

In [None]:
mask = (df_le.freq>50) & (df_le.freq<150)
# with pd.option_context('display.max_colwidth', 100):
# pd.reset_option('all')
_df_jc = df_jc[mask].reset_index(drop=True)
_df_le = df_le[mask].reset_index(drop=True)
len(_df_jc), len(_df_le)

(822, 822)

In [None]:
pd.set_option('display.max_colwidth', None)

In [None]:
_df_jc.head()

Unnamed: 0,label,freq,prob,entropy,description,"top-k (token, prob, entropy, joint, info)"
0,8.8,132,0.002504,0.017498,"Intestinal infection due to other organism, not elsewhere classified",[['gastroenteritis' '0.008327011' '0.048164062' '0.0018778453' '0.135108']\n ['gasteroenteritis' '0.000113808805' '0.0011472754' '7.587254e-05' '0.020989483']\n ['viral' '0.06646434' '0.24439576' '0.002124431' '0.018479552']\n ['norovirus' '0.0006259484' '0.0052429195' '0.000113808805' '0.017389983']\n ['watery' '0.013770865' '0.07268653' '0.00056904403' '0.01273447']\n ['monobasic' '5.6904402e-05' '0.00061311224' '3.793627e-05' '0.010679064']\n ['profuse' '0.006866465' '0.041045412' '0.00026555388' '0.008544236']\n ['gestures' '0.0008345979' '0.0067503336' '7.587254e-05' '0.00845181']\n ['virally' '0.000113808805' '0.0011472754' '3.793627e-05' '0.008393114']\n ['ksb' '0.000113808805' '0.0011472754' '3.793627e-05' '0.008393114']]
1,38.12,116,0.0022,0.015662,Not Found,[['carbacephems' '0.0061836117' '0.037613403' '0.0006638847' '0.040515352']\n ['carbapenems' '0.0064491658' '0.038956657' '0.0006638847' '0.03890332']\n ['staphylococci' '0.0067336876' '0.04038363' '0.0006828528' '0.038864423']\n ['combinations' '0.006904401' '0.041234046' '0.0006638847' '0.036400057']\n ['consultations' '0.0042109257' '0.027236082' '0.00041729896' '0.029816346']\n ['rifampin' '0.013694993' '0.072362244' '0.00092943857' '0.029683795']\n ['lactamase' '0.010944613' '0.060298413' '0.0006638847' '0.022703482']\n ['protochol' '0.001972686' '0.014257325' '0.00018968133' '0.018664824']\n ['dysthesia' '9.484067e-05' '0.0009733652' '5.6904402e-05' '0.017445711']\n ['fungi' '0.0027883158' '0.019186173' '0.00020864948' '0.016050713']]
2,38.19,148,0.002807,0.019298,Other staphylococcal septicemia,[['epidermidis' '0.0024279212' '0.01704282' '0.00024658575' '0.0187256']\n ['coagulase' '0.02348255' '0.111299396' '0.0011001518' '0.018020378']\n ['coag' '0.04516313' '0.18401921' '0.0015174507' '0.014196403']\n ['staph' '0.06320182' '0.23568806' '0.0018778453' '0.013870501']\n ['staphylococcus' '0.041388467' '0.17233191' '0.0011191199' '0.0092715']\n ['staphlococcus' '0.0003414264' '0.003066752' '5.6904402e-05' '0.008194677']\n ['surveillance' '0.021509863' '0.103858486' '0.0006259484' '0.007927067']\n ['mrse' '0.0008156298' '0.006615689' '7.587254e-05' '0.0076404638']\n ['rvg' '0.00013277694' '0.0013180688' '3.793627e-05' '0.007032873']\n ['oxacillin' '0.028414264' '0.12918602' '0.00070182094' '0.0067001204']]
3,38.2,85,0.001612,0.011978,Pneumococcal septicemia [Streptococcus pneumoniae septicemia],[['pneumococcus' '0.0013277694' '0.010122354' '0.00030349015' '0.060944773']\n ['streptococcal' '0.0020675266' '0.014845582' '0.00030349015' '0.043650616']\n ['pneumo' '0.0066767833' '0.040099256' '0.0006259484' '0.041905276']\n ['pneumococcal' '0.008421851' '0.048616834' '0.0006828528' '0.03765242']\n ['pneumoniae' '0.013808802' '0.0728485' '0.00091047044' '0.03456903']\n ['breakpoints' '0.00036039454' '0.0032176247' '9.484067e-05' '0.027405556']\n ['asplenia' '0.00037936267' '0.0033675581' '7.587254e-05' '0.019955589']\n ['streptococcus' '0.018209409' '0.09098615' '0.0007397572' '0.018613825']\n ['mus' '5.6904402e-05' '0.00061311224' '3.793627e-05' '0.016812751']\n ['mucousa' '5.6904402e-05' '0.00061311224' '3.793627e-05' '0.016812751']]
4,38.3,108,0.002049,0.014728,Septicemia due to anaerobes,[['septicum' '0.00085356605' '0.006884547' '0.000113808805' '0.017611679']\n ['perfringens' '0.00142261' '0.010747153' '0.00013277694' '0.015447679']\n ['megacolon' '0.0030728378' '0.020844972' '0.00018968133' '0.013581591']\n ['bacteroides' '0.0023141124' '0.01635513' '0.00013277694' '0.010470199']\n ['ulitmately' '0.000113808805' '0.0011472754' '3.793627e-05' '0.010368616']\n ['pancolitis' '0.0023520486' '0.016584992' '0.00013277694' '0.010324825']\n ['klebisella' '0.00013277694' '0.0013180688' '3.793627e-05' '0.009809668']\n ['clostridial' '0.00013277694' '0.0013180688' '3.793627e-05' '0.009809668']\n ['citracel' '0.00015174507' '0.0014860831' '3.793627e-05' '0.0093412995']\n ['culutures' '0.0001707132' '0.0016517199' '3.793627e-05' '0.008926498']]


In [None]:
_df_le.head()

Unnamed: 0,label,freq,prob,entropy,description,"top-k (token, prob, entropy, joint, info)"
0,8.8,132,0.002504,0.017498,"Intestinal infection due to other organism, not elsewhere classified",[['gastroenteritis' '0.008327011' '0.048164062' '0.0018778453' '0.44664875']\n ['viral' '0.06646434' '0.24439576' '0.002124431' '0.2715633']\n ['diarrhea' '0.23753795' '0.5482253' '0.0020295903' '0.10494876']\n ['vomiting' '0.31278452' '0.62131053' '0.002143399' '0.09137799']\n ['nausea' '0.3579097' '0.652206' '0.0021054628' '0.07120143']\n ['watery' '0.013770865' '0.07268653' '0.00056904403' '0.064807415']\n ['medicine' '0.47397572' '0.691792' '0.0022382399' '0.056979857']\n ['sick' '0.049734447' '0.19773257' '0.0008156298' '0.054789137']\n ['emesis' '0.06274659' '0.23445892' '0.0008725342' '0.051838394']\n ['ns' '0.12010623' '0.3671366' '0.0011191199' '0.047479752']]
1,38.12,116,0.0022,0.015662,Not Found,[['mrsa' '0.09195752' '0.30704355' '0.0019347497' '0.24584907']\n ['bacteremia' '0.068304256' '0.24923033' '0.0014984825' '0.17290637']\n ['rifampin' '0.013694993' '0.072362244' '0.00092943857' '0.16202216']\n ['vancomycin' '0.2591047' '0.5721184' '0.0020864948' '0.15388577']\n ['aureus' '0.05064492' '0.20040981' '0.001270865' '0.15034734']\n ['staph' '0.06320182' '0.23568806' '0.0013467375' '0.14798154']\n ['vegetations' '0.032720033' '0.14407371' '0.0011001518' '0.14625418']\n ['staphylococci' '0.0067336876' '0.04038363' '0.0006828528' '0.13387245']\n ['carbacephems' '0.0061836117' '0.037613403' '0.0006638847' '0.13245058']\n ['carbapenems' '0.0064491658' '0.038956657' '0.0006638847' '0.1305896']]
2,38.19,148,0.002807,0.019298,Other staphylococcal septicemia,[['staph' '0.06320182' '0.23568806' '0.0018778453' '0.18076809']\n ['coag' '0.04516313' '0.18401921' '0.0015174507' '0.14747754']\n ['vancomycin' '0.2591047' '0.5721184' '0.0024658574' '0.124441765']\n ['coagulase' '0.02348255' '0.111299396' '0.0011001518' '0.11979452']\n ['grew' '0.11272762' '0.3521803' '0.0016312596' '0.09334226']\n ['staphylococcus' '0.041388467' '0.17233191' '0.0011191199' '0.09122223']\n ['cultures' '0.28729135' '0.59970856' '0.0023141124' '0.09089726']\n ['line' '0.30273142' '0.61316085' '0.002143399' '0.06575568']\n ['bacteremia' '0.068304256' '0.24923033' '0.0010811837' '0.060057342']\n ['sepsis' '0.18167679' '0.4739266' '0.0016502277' '0.059517227']]
3,38.2,85,0.001612,0.011978,Pneumococcal septicemia [Streptococcus pneumoniae septicemia],[['pneumoniae' '0.013808802' '0.0728485' '0.00091047044' '0.23663042']\n ['strep' '0.048691202' '0.19464338' '0.0010811837' '0.19061059']\n ['pneumococcal' '0.008421851' '0.048616834' '0.0006828528' '0.1835643']\n ['pneumo' '0.0066767833' '0.040099256' '0.0006259484' '0.174864']\n ['ceftriaxone' '0.122742794' '0.3723544' '0.001270865' '0.1572411']\n ['streptococcus' '0.018209409' '0.09098615' '0.0007397572' '0.15708087']\n ['sepsis' '0.18167679' '0.4739266' '0.0012329287' '0.10871605']\n ['pneumococcus' '0.0013277694' '0.010122354' '0.00030349015' '0.1059879']\n ['pneumonia' '0.29294384' '0.60476655' '0.0014036419' '0.09822925']\n ['vancomycin' '0.2591047' '0.5721184' '0.0013277694' '0.09428377']]
4,38.3,108,0.002049,0.014728,Septicemia due to anaerobes,[['sepsis' '0.18167679' '0.4739266' '0.0014036419' '0.084921986']\n ['septic' '0.06329666' '0.23594368' '0.00092943857' '0.08369133']\n ['diff' '0.08988999' '0.3022832' '0.0010432474' '0.08100143']\n ['colitis' '0.05694234' '0.21847004' '0.00085356605' '0.07671062']\n ['flagyl' '0.14711685' '0.4176752' '0.0012329287' '0.07597934']\n ['clostridium' '0.034427166' '0.14980963' '0.0006828528' '0.07129074']\n ['difficile' '0.050929442' '0.20124286' '0.0007397572' '0.06350024']\n ['metronidazole' '0.051915783' '0.20411795' '0.00064491655' '0.048172895']\n ['vancomycin' '0.2591047' '0.5721184' '0.0013277694' '0.046320684']\n ['shock' '0.06874052' '0.25036877' '0.0006638847' '0.039911266']]


In [None]:
pd.reset_option('all')

In [None]:
# `.xlsx` rather than `.xls`: pandas dropped its legacy `.xls` writer (xlwt) in 2.0
_df_jc.to_excel('jaccard.xlsx', index=False)
_df_le.to_excel('label-entropy.xlsx', index=False)