In [None]:
#| eval: false
! [ -e /content ] && pip install -Uqq xcube # upgrade xcube on colab

In [None]:
from fastai.data.core import *
from xcube.l2r.all import *

In [None]:
%load_ext autoreload
%autoreload 2

# Boot L2R 

> Bootstrapping a learning-to-rank model

In this tutorial we will find a needle in the haystack with mutual information gain:

In [None]:
paths = make_paths(Path.cwd(), 'mimic3-9k')

#### Mutual-Information Computation

In [None]:
source = untar_xxx(XURLs.MIMIC3_L2R)

In [None]:
source.ls()

(#3) [Path('/home/deb/.xcube/data/mimic3_l2r/info.pkl'),Path('/home/deb/.xcube/data/mimic3_l2r/p_TL.pkl'),Path('/home/deb/.xcube/data/mimic3_l2r/mimic3-9k.csv')]

In [None]:
# Read the MIMIC-III notes table that `untar_xxx` placed under `source`.
data = source/'mimic3-9k.csv'
df = pd.read_csv(
    data,
    header=0,  # row 0 is the file's own header; `names` below replaces it
    names=['subject_id', 'hadm_id', 'text', 'labels', 'length', 'is_valid'],
    dtype={
        'subject_id': str,
        'hadm_id': str,
        'text': str,
        'labels': str,
        'length': np.int64,
        'is_valid': bool,
    },
)
# Make sure both columns consumed by the tokenizer / label reader hold plain strings.
df[['text', 'labels']] = df[['text', 'labels']].astype(str)
len(df)

52726

In [None]:
df.head(3)

Unnamed: 0,subject_id,hadm_id,text,labels,length,is_valid
0,86006,111912,admission date discharge date date of birth sex f service surgery allergies patient recorded as having no known allergies to drugs attending first name3 lf chief complaint 60f on coumadin was found slightly drowsy tonight then fell down stairs paramedic found her unconscious and she was intubated w o any medication head ct shows multiple iph transferred to hospital1 for further eval major surgical or invasive procedure none past medical history her medical history is significant for hypertension osteoarthritis involving bilateral knee joints with a dependence on cane for ambulation chronic...,801.35;348.4;805.06;807.01;998.30;707.24;E880.9;427.31;414.01;401.9;V58.61;V43.64;707.00;E878.1;96.71,230,False
1,85950,189769,admission date discharge date service neurosurgery allergies sulfa sulfonamides attending first name3 lf chief complaint cc cc contact info major surgical or invasive procedure none history of present illness hpi 88m who lives with family had fall yesterday today had decline in mental status ems called pt was unresponsive on arrival went to osh head ct showed large r sdh pt was intubated at osh and transferred to hospital1 for further care past medical history cad s p mi in s p cabg in ventricular aneurysm at that time cath in with occluded rca unable to intervene chf reported ef 1st degre...,852.25;E888.9;403.90;585.9;250.00;414.00;V45.81;96.71,304,False
2,88025,180431,admission date discharge date date of birth sex f service surgery allergies no known allergies adverse drug reactions attending first name3 lf chief complaint s p fall major surgical or invasive procedure none history of present illness 45f etoh s p fall from window at feet found ambulating and slurring speech on scene intubated en route for declining mental status in the er the patient was found to be bradycardic to the s with bp of systolic she was given atropine dilantin and was started on saline past medical history unknown social history unknown family history unknown physical exam ex...,518.81;348.4;348.82;801.25;427.89;E882;V49.86;305.00;96.71;38.93,359,False


Note that performing some computations in this notebook on the full dataset is going to take a lot of time. But don't worry `untar_xxx` has already downloaded everything you need. But you can still run the following cells if you want to generate everything from scratch. Preferably, run the following cells on a sampled dataset for quick iterations. 

**Run the cell below only if you want to sample from the full dataset to create a tiny dataset for the purpose of quick iterations.**

*Technical Point:* If we want to sample to perform quick iterations, we need to make sure the number of data points in the sample is a multiple of `bs`, so that we do not have to set `drop_last=True` while creating the `Dataloaders`. This is because we are about to do some probability computations, and dropping data points is not a good idea as the probabilities would no longer sum to 1.

In [None]:
# Trim the frame to an exact multiple of the batch size so that no data point has to be
# dropped when batching (the probability estimates below need every row).
bs = 8  # batch size; the same value is used for the dataloaders further down
cut = len(df) - len(df) % bs
df = df[:cut]
len(df)

52720

In [None]:
bs = 8
# Candidate sample sizes: the multiples of `bs` strictly between 4000 and 5000.
_arr = np.arange(0, len(df), bs)
mask = (_arr > 4000) & (_arr < 5000)
# Draw the size from a *seeded* generator so that re-running the notebook selects the same
# number of rows, and hand `DataFrame.sample` a plain int rather than a 1-element array.
_n = int(np.random.default_rng(89).choice(_arr[mask]))
df = df.sample(n=_n, random_state=89, ignore_index=True)
len(df)

4704

In [None]:
df.head(3)

Unnamed: 0,subject_id,hadm_id,text,labels,length,is_valid
0,2258,139169,admission date discharge date date of birth sex m service cardiothoracic surgery history of present illness the patient is a year old male with a past medical history significant for poorly controlled diabetes mellitus and hypertension as well as known coronary disease and a previous non q myocardial infarction and right coronary artery stenting in he was admitted to an outside hospital on the day prior to admission with unstable angina and found to have borderline positive troponin hypertension and st depressions in the lateral lead he was given aspirin nitrates beta blockers morphine and...,414.01;998.31;411.1;599.0;412;V45.82;250.00;401.9;530.81;36.13;37.22;36.15;36.19;39.61;39.64;88.56;88.53;33.23;96.56;33.24;78.41,1271,False
1,41217,161582,admission date discharge date date of birth sex m service medicine allergies no known allergies adverse drug reactions attending first name3 lf chief complaint new diagnosis of scc of base of tongue major surgical or invasive procedure egd w biopsy history of present illness yo man with h o cad heavy smoking and new diagnosis of scc of base of tongue with lymph node involvement pt was referred to dr last name stitle ent in for a rt neck mass at that time a cm rt cervical lymph node was palpated and fiberoptic laryngoscopy showed a cm rt base of tongue mass a ct and biopsy were recommended ...,141.0;507.0;196.0;293.0;519.09;786.30;286.9;427.89;790.29;276.52;414.01;338.3;280.0;272.0;412;V69.4;V15.82;V45.82;V66.7;E879.8;E932.0;31.42;25.01;42.23;43.11;96.6;38.93;99.25;38.93,2743,False
2,30204,172114,admission date discharge date date of birth sex f service medicine allergies etomidate norpace quinidine demerol penicillins lipitor attending doctor first name chief complaint cardiac tamponade s p pulmonary vein isolation major surgical or invasive procedure attempted pulmonary vein isolation pericardiocentesis history of present illness year old woman with a long history of paroxysmal atrial fibrillation refractory to mulitple pharmacologic interventions and multiple cardioversions who presents to the ccu with cardiac tamponade s p pulmonary vein isolation procedure past medical history...,427.31;998.2;423.3;423.9;573.0;276.6;E878.8;37.34;37.27;37.0;37.21,1764,False


**[Mutual Information](https://en.wikipedia.org/wiki/Mutual_information#)**

<img alt="Pictorial representation of simple neural network" width="400" src="info-gain.svg" caption="Pictorial representation of a simple neural network" id="img_simple_nn">

The mutual information of two jointly discrete random variables $T$ and $L$ is calculated as a double sum:

$$I(T;L) = \sum_{l \in \mathcal{L}} \sum_{t \in \mathcal{T}} P_{(T,L)}(t,l) \log \Bigg(\frac{P_{(T,L)}(t,l)}{P_T(t) P_L(l)} \Bigg)$$

where $P_{(T,L)}$ is the [joint probability mass function](https://en.wikipedia.org/wiki/Joint_distribution) of $T$ and $L$, and $P_T$ and $P_L$ are the [marginal probability mass functions](https://en.wikipedia.org/wiki/Marginal_probability) of $T$ and $L$ respectively. To compute $I$, the only quantity we need to compute is the joint pmf $P_{(T,L)}$, as the marginal pmfs can be computed from the joint pmf.

With regard to implementation, $P_{(T,L)}$ can be thought of as a 2x2 tensor as shown below:

In [None]:
p_TL = pd.DataFrame(0, columns=['t', 'not t'], index=['lbl', 'not lbl'])
p_TL

Unnamed: 0,t,not t
lbl,0,0
not lbl,0,0


...and we need to compute this $P_{(T,L)}$ for every token-label pair. In other words, we need to fill in the `joint_pmf` dataframe shown below. Note that each cell in `joint_pmf` dataframe can be thought of to be further subdivided into a 2x2 grid containing the corresponding `p_TL`.

In [None]:
bs, chnk_sz = 8, 200
boot = MutualInfoGain(df, bs=bs, chnk_sz=chnk_sz)

In [None]:
%%time
dsets = boot.onehotify()

CPU times: user 2.52 s, sys: 430 ms, total: 2.95 s
Wall time: 12.8 s


In [None]:
toks, lbs = dsets.vocab
L(toks), L(lbs), len(toks)*len(lbs)

((#21736) ['xxunk','xxpad','xxbos','xxeos','xxfld','xxrep','xxwrep','xxup','xxmaj','the'...],
 (#4602) ['005.81','007.4','008.45','008.5','008.69','008.8','009.0','009.1','011.90','018.03'...],
 100029072)

In [None]:
joint = pd.DataFrame(0, columns=range(len(lbs)), index=range(len(toks)))
joint.index.name = 'toks (T)'
joint.columns.name = 'lbs (L)'
joint

lbs (L),0,1,2,3,4,5,6,7,8,9,...,4592,4593,4594,4595,4596,4597,4598,4599,4600,4601
toks (T),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21731,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21732,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21733,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21734,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


We can perform tensorized computation if we think of `p_TL` as a 4 dim tensor of size `(len(toks), len(lbs), 2, 2)`. Next, to be able to estimate `p_TL` we just need to iterate over the dataset and for each data point and each token-label pair record the `p_TL` information in the last two dimension of the tensor `p_TL`. And, at the end divide by size of the dataset. 

Some more implementation details (skip this if not interested): 

- We are going to one-hot encode the dataset (both `text` and `labels` field in the `df`). This is done by `onehot_dsets` 
- For efficiency, in reality we are not going to iterate over the dataset one by one; instead we are going to use a dataloader and perform the `p_TL` computation on a mini-batch.
- Unless you are doing this in 2035 you probably do not have enough GPU-RAM to fit the entire `p_TL` tensor of dimension `(len(toks), len(lbs), 2, 2)`. So we are going to split the `lbs` dimension into chunks. (Why the `lbs` dimension and not the `toks`? Because in XML datasets `toks` are approximately 60000, but the number of `lbs` could be really large, of the order of millions.) With regard to implementation this would mean that instead of one dataloader we would roll with multiple dataloaders. And each dataloader would load the dataset in a way that mini-batches would contain the full one-hot encoding of the `text` field but only a certain `chunk` of the one-hot encoded `labels` field in `df`. Another way to think about this is that each datapoint, specifically its `labels`, is split across multiple dataloaders. This way, once we are done iterating over one such dataloader we would have filled a certain chunk of the `joint` dataframe shown above. And we would fill the entire `joint` only once we are done iterating over all the dataloaders. 

In [None]:
x, y = dsets[0]
test_eq(tensor(dsets.tfms[1][2].decode(y)), torch.where(y==1)[0])
test_eq(tensor(dsets.tfms[0][-1].decode(x)), torch.where(x==1)[0])

In [None]:
' '.join(L(toks)[torch.where(x==1)[0]])

'xxunk xxbos xxrep the and to of was with a on in for mg no patient is he blood at name or discharge s as day daily his last history were by had not be this admission pain date pt hospital normal an that p from there first has are have ct which medications but chest c hours well time given course stable after disease known x two continued days service m per hct prior un artery once been medical wbc past namepattern1 glucose instructions cardiac present acute exam physical i family inr plt edema iv transferred allergies likely due 3 rbc if then condition bid t hgb surgical during clear procedure k renal evidence mcv respiratory creat who rdw mch tube mchc head soft non failure diagnosis all rate bilaterally na placed found results birth lf abdominal ni l intact illness cl received social abdomen bowel secondary sex within following bilateral low floor urean initially hco3 neck name3 followup out namepattern4 disposition pneumonia bp ck units angap size back good moderate impression regu

In [None]:
lbs.map_ids(torch.where(y==1)[0])

(#27) ['070.44','276.0','276.2','285.1','286.7','303.91','38.95','38.97','39.95','427.5'...]

In [None]:
#| hide
# Split `df` with fastai's `ColSplitter` using its default arguments
# (presumably this keys on the `is_valid` column — TODO confirm).
splits = ColSplitter()(df)
# NOTE(review): a bare expression that is not the last statement of the cell displays nothing.
splits

# NOTE(review): `dls_lm_vocab_path` is not defined in the visible part of this notebook —
# confirm where it is set before running this cell on a fresh kernel.
lm_vocab = torch.load(dls_lm_vocab_path)

@Transform
def Cleanser(toks):
    "Keep only the tokens of `toks` that are present in `lm_vocab`, preserving their order"
    return list(filter(lambda tok: tok in lm_vocab, toks))

class MyNumericalize(Transform):
    "Numericalize tokens with `vocab`, dropping every token that is not part of it"
    def __init__(self, vocab=None, min_freq=3, max_vocab=60000, special_toks=None):
        store_attr('vocab,min_freq,max_vocab,special_toks')
        # `o2i` maps token -> index; it can only be built here when a vocab is supplied.
        if vocab is None: self.o2i = None
        else: self.o2i = defaultdict(int, {tok: idx for idx, tok in enumerate(vocab)})

    def setups(self, dsets):
        "Build `vocab` and `o2i` from `dsets` when no vocab was given at construction"
        if dsets is None or self.vocab is not None: return
        # Reuse a precomputed token counter when `dsets` carries one, otherwise count here.
        count = getattr(dsets, 'counter', None)
        if count is None: count = Counter(tok for item in dsets for tok in item)
        if self.special_toks is None and hasattr(dsets, 'special_toks'):
            self.special_toks = dsets.special_toks
        self.vocab = make_vocab(count, min_freq=self.min_freq, max_vocab=self.max_vocab, special_toks=self.special_toks)
        # 'xxfake' entries are left out of the lookup table.
        self.o2i = defaultdict(int, {tok: idx for idx, tok in enumerate(self.vocab) if tok != 'xxfake'})

    def encodes(self, o):
        "Map the in-vocab tokens of `o` to their indices"
        ids = [self.o2i[tok] for tok in o if tok in self.vocab]
        return TensorText(tensor(ids))

    def decodes(self, o):
        "Map indices back to their tokens"
        return L(self.vocab[idx] for idx in o)

# Fallback transform pipelines: resort to these if anything goes wrong below.
# `x_tfms` processes the `text` column, `y_tfms` the ';'-delimited `labels` column.
x_tfms = [Tokenizer.from_df('text', n_workers=num_cpus()), attrgetter("text"), Cleanser, MultiCategorize(vocab=lm_vocab), OneHotEncode()]
y_tfms = [ColReader('labels', label_delim=';'), MultiCategorize(), OneHotEncode()]
tfms = [x_tfms, y_tfms]

class Chunkifize(Transform):
    "Split a tensor into up to `num_chunks` pieces on encode; concatenate them back on decode"
    order = 4
    def __init__(self, num_chunks=3):
        store_attr('num_chunks')

    def encodes(self, o):
        "Return the chunks of `o` as a list"
        pieces = torch.chunk(o, self.num_chunks)
        return list(pieces)

    def decodes(self, o):
        "Concatenate the chunks in `o` into a single tensor"
        return torch.cat(o)

# Sanity-check `Chunkifize` with its default of 3 chunks on a 10-element tensor.
chnk_tfm = Chunkifize()
chnks = chnk_tfm(torch.arange(10))
test_eq(type(chnks), list)
test_eq(chnks, [tensor([0, 1, 2, 3]), tensor([4, 5, 6, 7]), tensor([8, 9])])
# test_fail(lambda: chnk_tfm.decode(chnks), tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))
# Decoding concatenates the chunks back into the original tensor.
test_eq(chnk_tfm.decode(chnks), tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))

# y_tfms = [ColReader('labels', label_delim=';'), MultiCategorize(), OneHotEncode(), Chunkifize()]
# tfmd_y = TfmdLists(df, tfms=y_tfms)
# tfmd_y.decode(tfmd_y[0])

In [None]:
dls = boot.lbs_chunked()

In [None]:
assert isinstance(dls[0], TfmdDL)
# One dataloader per label chunk: use the `chnk_sz` passed to `MutualInfoGain` instead of
# repeating its literal value, so the check stays valid when the chunk size is changed.
test_eq(len(dls),  np.ceil(len(lbs)/chnk_sz))
test_eq(len(dls[0]), np.ceil(len(dsets)/bs)) # drop_last is False
# test to prove that the labels for each data point is split across multiple dataloaders
lbs_0 = torch.cat([yb[0] for dl in dls for _,yb in itertools.islice(dl, 1)])
y = y.to(default_device())
test_eq(lbs_0, y)

Now let's compute the `joint_pmf` table we had seen earlier. 

In [None]:
%%time
p_TL = boot.joint_pmf()

CPU times: user 3min 26s, sys: 10.1 s, total: 3min 36s
Wall time: 3min 47s


In [None]:
test_eq(p_TL.shape, (boot.toksize, boot.lblsize, 2, 2))

Technicality: `p_TL` is not really the joint pmf (yes, I lied before!) but contains all the information needed to compute the joint pmf `p_TxL` and mutual info gain `I_TL`. This computation is done by `compute`:

In [None]:
%%time
p_T, p_L, p_TxL, H_T, H_L, I_TL = boot.compute()

CPU times: user 170 ms, sys: 280 ms, total: 450 ms
Wall time: 513 ms


All this while if you have been working with the sampled dataset you can continue to do so for the rest of this notebook. But if you want a real feel of how things look, at this point you can load the pregenerated `p_TL` and `(p_T, p_L, p_TxL, H_T, H_L, I_TL)` for the full dataset which `untar_xxx` downloaded:

In [None]:
L(source.glob("**/*.pkl"))

(#2) [Path('/home/deb/.xcube/data/mimic3_l2r/info.pkl'),Path('/home/deb/.xcube/data/mimic3_l2r/p_TL.pkl')]

In [None]:
source/'p_TL.pkl'

Path('/home/deb/.xcube/data/mimic3_l2r/p_TL.pkl')

In [None]:
%%time 
p_TL = torch.load(source/'p_TL.pkl', map_location=torch.device('cpu'))
p_T, p_L, p_TxL, H_T, H_L, I_TL = torch.load(source/'info.pkl', map_location=torch.device('cpu'))

CPU times: user 0 ns, sys: 4.48 s, total: 4.48 s
Wall time: 6.92 s


Make sure there aren't any of those pesky nans or negs:

In [None]:
# Report every tensor that contains a NaN or a negative entry.
for o in (p_TL, p_T, p_L, p_TxL, H_T, H_L, I_TL):
    try:
        # `.any()`, not `.all()`: a single NaN must fail the check, whereas `.all()` would
        # only fail when *every* element is NaN.
        assert not o.isnan().any() # check for nans
        test_eq(torch.where(o>=0, True, False).all(), True) # check for negs
    except AssertionError:
        print(f"{namestr(o, globals())[0]} failed")

I_TL failed


Theoretically, Mutual-Info as defined [here](https://en.wikipedia.org/wiki/Mutual_information) is supposed to be non-negative (can be proved by tossing in [Jensen](https://en.wikipedia.org/wiki/Jensen%27s_inequality)). But, practically, it turns out `I_TL` has some negs because we distorted the `p_TL` and `p_TxL`  with `eps` in the `I_TL` computation.

In [None]:
torch.topk(I_TL.flatten(), 10, largest=False)

torch.return_types.topk(
values=TensorMultiCategory([-1.9016e-07, -1.8314e-07, -1.8314e-07, -1.7385e-07,
                     -1.7277e-07, -1.7277e-07, -1.6798e-07, -1.6798e-07,
                     -1.6798e-07, -1.6767e-07]),
indices=TensorMultiCategory([22423614,  2735913,  2731838,  1911099,  6393113,  6389159,
                      6695355,  6695018,  6693073, 32253137]))

In [None]:
# Average of the negative entries of `I_TL`: count them, zero out everything else,
# then divide the remaining sum by the count.
howmany = (I_TL < 0).sum().item()
negs = torch.where(I_TL < 0, I_TL, torch.zeros_like(I_TL))
negs.sum()/howmany

TensorMultiCategory(-3.9054e-08)

Those negs on an avg are pretty close to zero. So we need not worry. Let's roll!

In [None]:
test_eq(p_TL.shape, (boot.toksize, boot.lblsize, 2, 2))
test_eq(p_T.shape, (boot.toksize, 2, 1))
test_eq(p_L.shape, (boot.lblsize, 1, 2))
test_eq(p_TxL.shape, (boot.toksize, boot.lblsize, 2, 2))
test_eq(H_T.shape, [boot.toksize])
test_eq(H_L.shape, [boot.lblsize])
test_eq(I_TL.shape, (boot.toksize, boot.lblsize))

In [None]:
#| hide
# r_t, r_l = random.randrange(0, len(toks)), random.randrange(0, len(lbs))
# toks[r_t], lbs[r_l]

# test_close(p_TL[r_t,r_l].sum(), 1, eps=1e-1)
# test_eq(p_T[r_t].sum(), 1)
# test_eq(p_L[r_l].sum(), 1)

# p_TL[r_t,r_l].sum(-1), p_TL[r_t, 400].sum(-1) 

# p_T[r_t], p_L[r_l]
# I_TL[r_t,r_l]

Let's save the `info`; we will use this to bootstrap the collab model:

In [None]:
# `eps` guards the divisions below against zero denominators.
eps = I_TL.new_empty(1).fill_(1e-15)
# Information gain normalised by the label entropy (broadcast over the token axis)...
info_lbl_entropy = I_TL/(H_L + eps)
# ...and by H_T + H_L - I_TL for every token-label pair (the "jaccard" variant).
info_jaccard = I_TL/(H_T.unsqueeze(-1) + H_L.unsqueeze(0) - I_TL + eps)
# `.any()`, not `.all()`: a single NaN anywhere must trip the assertion.
assert not info_lbl_entropy.isnan().any()
assert not info_jaccard.isnan().any()
# collab_bootstrap = {'toks': toks, 'lbs': lbs, 'mut_info_lbl_entropy': info_lbl_entropy, 'mutual_info_jaccard': info_jaccard}

In [None]:
# torch.save(collab_bootstrap, collab_bootst_path)
# assert collab_bootst_path.exists()

#### Save those Mutual Information Gain values

Let's take a look at the *Mutual Information Gain* (`I_TL`) for each of the labels:

In [None]:
# Count how often each label occurs in `df`.
f = ColReader('labels', label_delim=';')
lbs_frqs = Counter()
for o in df.itertuples(): lbs_frqs.update(f(o))
# Load the label -> description mapping. The file handle gets its own name so that it no
# longer rebinds `f` (the `ColReader` above) to a closed file object.
# NOTE(review): `path` is not defined in the visible part of this notebook (only `paths` is) — confirm.
# NOTE: `pickle.load` can execute arbitrary code; only load files from a trusted source.
with open(path.parent/'data'/'code_desc.pkl', 'rb') as desc_file: lbs_desc = pickle.load(desc_file)

In [None]:
def _gen(toks, lbs, lbs_frqs, lbs_desc, p_TL, p_T, p_L, info, H_T, H_L, k=5):
    sorted_by_tok, tok_idxs = torch.sort(info, dim=0, descending=True) 
    for i,o in enumerate(lbs):
        topk_tok_idxs = tok_idxs[:k, i].cpu()
        topk_toks = toks[topk_tok_idxs]
        topk_toks_probs = p_T.squeeze()[:,0][topk_tok_idxs].cpu().numpy()
        topk_info_gains = sorted_by_tok[:k, i].cpu().numpy()
        topk_jnt_probs = p_TL[topk_tok_idxs, [i]][:,0,0].cpu().numpy()
        lbl_entropy = H_L[i].cpu().numpy()
        topk_tok_entrops = H_T[topk_tok_idxs].cpu().numpy()
        yield (o, lbs_frqs[o], p_L[i][0,0].cpu().numpy(), lbl_entropy, lbs_desc.get(o, 'Not Found'), 
               array(list(zip(topk_toks, topk_toks_probs, topk_tok_entrops, topk_jnt_probs, topk_info_gains))))

In [None]:
def show_infogain(data, save_as=None):
    """Tabulate per-label information-gain rows and optionally save them as a feather file.

    `data` is an iterable of 6-tuples `(label, freq, prob, entropy, description, top_k)`.
    `prob` and `entropy` are cast to float and the `top_k` column to its string form.
    When `save_as` is given the table is written there with `DataFrame.to_feather`.
    Returns nothing.
    """
    topk_col = 'top-k (token, prob, entropy, joint, info)'
    df = pd.DataFrame(data, columns=['label', 'freq', 'prob', 'entropy', 'description', topk_col],)
    # `np.float` was removed in NumPy 1.24; the builtin `float` selects the same float64 dtype.
    df[['prob', 'entropy',]] = df[['prob', 'entropy']].astype(float)
    df[[topk_col]] = df[[topk_col]].astype(np.str_)
    if save_as is not None: df.to_feather(save_as)

In [None]:
%%time
# Both tables are read back in the next section, so generate both here; previously only the
# jaccard file was written and `mut_info_lbl_entropy.ft` had to exist from an earlier session.
eps = I_TL.new_empty(1).fill_(1e-15)

# Information gain normalised by the label entropy.
info = I_TL/(H_L + eps)
_data = _gen(array(toks), lbs, lbs_frqs, lbs_desc, p_TL, p_T, p_L, info, H_T, H_L, k=10)
show_infogain(_data, save_as='mut_info_lbl_entropy.ft')

# Information gain normalised by H_T + H_L - I_TL (the "jaccard" variant).
info = I_TL/(H_T.unsqueeze(-1) + H_L.unsqueeze(0) - I_TL + eps)
_data = _gen(array(toks), lbs, lbs_frqs, lbs_desc, p_TL, p_T, p_L, info, H_T, H_L, k=10)
show_infogain(_data, save_as='mut_info_jaccard.ft')

CPU times: user 14.5 s, sys: 593 ms, total: 15.1 s
Wall time: 17.3 s


#### Let's look at those Mutual-Information Gain values:

In [None]:
df_jc = pd.read_feather('mut_info_jaccard.ft')
df_le = pd.read_feather('mut_info_lbl_entropy.ft')

In [None]:
# df_lbs.sort_values(by='freq', ascending=False).head(20)

In [None]:
pd.options.display.max_colwidth = None
df_jc[df_jc.label == '032.9']

In [None]:
# Keep the labels whose frequency lies strictly between 50 and 150. The selection is made
# on `df_le` and the very same boolean mask is applied to both tables.
mask = df_le.freq.gt(50) & df_le.freq.lt(150)
_df_jc = df_jc[mask].reset_index(drop=True)
_df_le = df_le[mask].reset_index(drop=True)
len(_df_jc), len(_df_le)

(822, 822)

In [None]:
pd.set_option('display.max_colwidth', None)

In [None]:
_df_jc.head()

Unnamed: 0,label,freq,prob,entropy,description,"top-k (token, prob, entropy, joint, info)"
0,8.8,132,0.002504,0.017498,"Intestinal infection due to other organism, not elsewhere classified",[['gastroenteritis' '0.008327011' '0.048164062' '0.0018778453' '0.135108']\n ['gasteroenteritis' '0.000113808805' '0.0011472754' '7.587254e-05' '0.020989483']\n ['viral' '0.06646434' '0.24439576' '0.002124431' '0.018479552']\n ['norovirus' '0.0006259484' '0.0052429195' '0.000113808805' '0.017389983']\n ['watery' '0.013770865' '0.07268653' '0.00056904403' '0.01273447']\n ['monobasic' '5.6904402e-05' '0.00061311224' '3.793627e-05' '0.010679064']\n ['profuse' '0.006866465' '0.041045412' '0.00026555388' '0.008544236']\n ['gestures' '0.0008345979' '0.0067503336' '7.587254e-05' '0.00845181']\n ['virally' '0.000113808805' '0.0011472754' '3.793627e-05' '0.008393114']\n ['ksb' '0.000113808805' '0.0011472754' '3.793627e-05' '0.008393114']]
1,38.12,116,0.0022,0.015662,Not Found,[['carbacephems' '0.0061836117' '0.037613403' '0.0006638847' '0.040515352']\n ['carbapenems' '0.0064491658' '0.038956657' '0.0006638847' '0.03890332']\n ['staphylococci' '0.0067336876' '0.04038363' '0.0006828528' '0.038864423']\n ['combinations' '0.006904401' '0.041234046' '0.0006638847' '0.036400057']\n ['consultations' '0.0042109257' '0.027236082' '0.00041729896' '0.029816346']\n ['rifampin' '0.013694993' '0.072362244' '0.00092943857' '0.029683795']\n ['lactamase' '0.010944613' '0.060298413' '0.0006638847' '0.022703482']\n ['protochol' '0.001972686' '0.014257325' '0.00018968133' '0.018664824']\n ['dysthesia' '9.484067e-05' '0.0009733652' '5.6904402e-05' '0.017445711']\n ['fungi' '0.0027883158' '0.019186173' '0.00020864948' '0.016050713']]
2,38.19,148,0.002807,0.019298,Other staphylococcal septicemia,[['epidermidis' '0.0024279212' '0.01704282' '0.00024658575' '0.0187256']\n ['coagulase' '0.02348255' '0.111299396' '0.0011001518' '0.018020378']\n ['coag' '0.04516313' '0.18401921' '0.0015174507' '0.014196403']\n ['staph' '0.06320182' '0.23568806' '0.0018778453' '0.013870501']\n ['staphylococcus' '0.041388467' '0.17233191' '0.0011191199' '0.0092715']\n ['staphlococcus' '0.0003414264' '0.003066752' '5.6904402e-05' '0.008194677']\n ['surveillance' '0.021509863' '0.103858486' '0.0006259484' '0.007927067']\n ['mrse' '0.0008156298' '0.006615689' '7.587254e-05' '0.0076404638']\n ['rvg' '0.00013277694' '0.0013180688' '3.793627e-05' '0.007032873']\n ['oxacillin' '0.028414264' '0.12918602' '0.00070182094' '0.0067001204']]
3,38.2,85,0.001612,0.011978,Pneumococcal septicemia [Streptococcus pneumoniae septicemia],[['pneumococcus' '0.0013277694' '0.010122354' '0.00030349015' '0.060944773']\n ['streptococcal' '0.0020675266' '0.014845582' '0.00030349015' '0.043650616']\n ['pneumo' '0.0066767833' '0.040099256' '0.0006259484' '0.041905276']\n ['pneumococcal' '0.008421851' '0.048616834' '0.0006828528' '0.03765242']\n ['pneumoniae' '0.013808802' '0.0728485' '0.00091047044' '0.03456903']\n ['breakpoints' '0.00036039454' '0.0032176247' '9.484067e-05' '0.027405556']\n ['asplenia' '0.00037936267' '0.0033675581' '7.587254e-05' '0.019955589']\n ['streptococcus' '0.018209409' '0.09098615' '0.0007397572' '0.018613825']\n ['mus' '5.6904402e-05' '0.00061311224' '3.793627e-05' '0.016812751']\n ['mucousa' '5.6904402e-05' '0.00061311224' '3.793627e-05' '0.016812751']]
4,38.3,108,0.002049,0.014728,Septicemia due to anaerobes,[['septicum' '0.00085356605' '0.006884547' '0.000113808805' '0.017611679']\n ['perfringens' '0.00142261' '0.010747153' '0.00013277694' '0.015447679']\n ['megacolon' '0.0030728378' '0.020844972' '0.00018968133' '0.013581591']\n ['bacteroides' '0.0023141124' '0.01635513' '0.00013277694' '0.010470199']\n ['ulitmately' '0.000113808805' '0.0011472754' '3.793627e-05' '0.010368616']\n ['pancolitis' '0.0023520486' '0.016584992' '0.00013277694' '0.010324825']\n ['klebisella' '0.00013277694' '0.0013180688' '3.793627e-05' '0.009809668']\n ['clostridial' '0.00013277694' '0.0013180688' '3.793627e-05' '0.009809668']\n ['citracel' '0.00015174507' '0.0014860831' '3.793627e-05' '0.0093412995']\n ['culutures' '0.0001707132' '0.0016517199' '3.793627e-05' '0.008926498']]


In [None]:
_df_le.head()

Unnamed: 0,label,freq,prob,entropy,description,"top-k (token, prob, entropy, joint, info)"
0,8.8,132,0.002504,0.017498,"Intestinal infection due to other organism, not elsewhere classified",[['gastroenteritis' '0.008327011' '0.048164062' '0.0018778453' '0.44664875']\n ['viral' '0.06646434' '0.24439576' '0.002124431' '0.2715633']\n ['diarrhea' '0.23753795' '0.5482253' '0.0020295903' '0.10494876']\n ['vomiting' '0.31278452' '0.62131053' '0.002143399' '0.09137799']\n ['nausea' '0.3579097' '0.652206' '0.0021054628' '0.07120143']\n ['watery' '0.013770865' '0.07268653' '0.00056904403' '0.064807415']\n ['medicine' '0.47397572' '0.691792' '0.0022382399' '0.056979857']\n ['sick' '0.049734447' '0.19773257' '0.0008156298' '0.054789137']\n ['emesis' '0.06274659' '0.23445892' '0.0008725342' '0.051838394']\n ['ns' '0.12010623' '0.3671366' '0.0011191199' '0.047479752']]
1,38.12,116,0.0022,0.015662,Not Found,[['mrsa' '0.09195752' '0.30704355' '0.0019347497' '0.24584907']\n ['bacteremia' '0.068304256' '0.24923033' '0.0014984825' '0.17290637']\n ['rifampin' '0.013694993' '0.072362244' '0.00092943857' '0.16202216']\n ['vancomycin' '0.2591047' '0.5721184' '0.0020864948' '0.15388577']\n ['aureus' '0.05064492' '0.20040981' '0.001270865' '0.15034734']\n ['staph' '0.06320182' '0.23568806' '0.0013467375' '0.14798154']\n ['vegetations' '0.032720033' '0.14407371' '0.0011001518' '0.14625418']\n ['staphylococci' '0.0067336876' '0.04038363' '0.0006828528' '0.13387245']\n ['carbacephems' '0.0061836117' '0.037613403' '0.0006638847' '0.13245058']\n ['carbapenems' '0.0064491658' '0.038956657' '0.0006638847' '0.1305896']]
2,38.19,148,0.002807,0.019298,Other staphylococcal septicemia,[['staph' '0.06320182' '0.23568806' '0.0018778453' '0.18076809']\n ['coag' '0.04516313' '0.18401921' '0.0015174507' '0.14747754']\n ['vancomycin' '0.2591047' '0.5721184' '0.0024658574' '0.124441765']\n ['coagulase' '0.02348255' '0.111299396' '0.0011001518' '0.11979452']\n ['grew' '0.11272762' '0.3521803' '0.0016312596' '0.09334226']\n ['staphylococcus' '0.041388467' '0.17233191' '0.0011191199' '0.09122223']\n ['cultures' '0.28729135' '0.59970856' '0.0023141124' '0.09089726']\n ['line' '0.30273142' '0.61316085' '0.002143399' '0.06575568']\n ['bacteremia' '0.068304256' '0.24923033' '0.0010811837' '0.060057342']\n ['sepsis' '0.18167679' '0.4739266' '0.0016502277' '0.059517227']]
3,38.2,85,0.001612,0.011978,Pneumococcal septicemia [Streptococcus pneumoniae septicemia],[['pneumoniae' '0.013808802' '0.0728485' '0.00091047044' '0.23663042']\n ['strep' '0.048691202' '0.19464338' '0.0010811837' '0.19061059']\n ['pneumococcal' '0.008421851' '0.048616834' '0.0006828528' '0.1835643']\n ['pneumo' '0.0066767833' '0.040099256' '0.0006259484' '0.174864']\n ['ceftriaxone' '0.122742794' '0.3723544' '0.001270865' '0.1572411']\n ['streptococcus' '0.018209409' '0.09098615' '0.0007397572' '0.15708087']\n ['sepsis' '0.18167679' '0.4739266' '0.0012329287' '0.10871605']\n ['pneumococcus' '0.0013277694' '0.010122354' '0.00030349015' '0.1059879']\n ['pneumonia' '0.29294384' '0.60476655' '0.0014036419' '0.09822925']\n ['vancomycin' '0.2591047' '0.5721184' '0.0013277694' '0.09428377']]
4,38.3,108,0.002049,0.014728,Septicemia due to anaerobes,[['sepsis' '0.18167679' '0.4739266' '0.0014036419' '0.084921986']\n ['septic' '0.06329666' '0.23594368' '0.00092943857' '0.08369133']\n ['diff' '0.08988999' '0.3022832' '0.0010432474' '0.08100143']\n ['colitis' '0.05694234' '0.21847004' '0.00085356605' '0.07671062']\n ['flagyl' '0.14711685' '0.4176752' '0.0012329287' '0.07597934']\n ['clostridium' '0.034427166' '0.14980963' '0.0006828528' '0.07129074']\n ['difficile' '0.050929442' '0.20124286' '0.0007397572' '0.06350024']\n ['metronidazole' '0.051915783' '0.20411795' '0.00064491655' '0.048172895']\n ['vancomycin' '0.2591047' '0.5721184' '0.0013277694' '0.046320684']\n ['shock' '0.06874052' '0.25036877' '0.0006638847' '0.039911266']]


In [None]:
# Restore only the option this notebook changed; resetting 'all' would also discard any
# unrelated pandas options set elsewhere in the session.
pd.reset_option('display.max_colwidth')

In [None]:
# Export as .xlsx: pandas 2.0 removed the `xlwt` engine, so writing legacy `.xls` files
# raises an error on current pandas versions.
_df_jc.to_excel('jaccard.xlsx', index=False)
_df_le.to_excel('label-entropy.xlsx', index=False)