# Semi-supervision Feasibility
This notebook evaluates the data to see how much unlabeled blood data exists in the refine.bio human compendium. If enough exists, it will be important to evaluate whether semi-supervision helps model performance.

In [1]:
import collections
import json
import os
import pickle
from typing import Text, Dict, Union

import pandas as pd

from whistl import utils

In [2]:
# Location of the pickled sample-classification map, relative to this notebook
data_dir = '../../data/'
map_file = os.path.join(data_dir, 'sample_classifications.pkl')

# Project helper; later cells use the result as a mapping keyed by sample id
sample_to_label = utils.parse_map_file(map_file)

# The first element of the pickled object is kept as the label -> samples lookup.
# NOTE: unpickling can execute arbitrary code, so only load trusted files here.
with open(map_file, 'rb') as map_handle:
    pickled_map = pickle.load(map_handle)
label_to_sample = pickled_map[0]

In [3]:
metadata_path = os.path.join(data_dir, 'human_compendium/aggregated_metadata.json')
# Read the aggregated metadata as UTF-8 explicitly: without an encoding, open()
# falls back to the platform locale, which can fail or mis-decode non-ASCII
# annotation text on systems whose default is not UTF-8.
with open(metadata_path, encoding='utf-8') as json_file:
    metadata = json.load(json_file)

In [4]:
def get_tissue(sample_metadata: Dict, sample: Text) -> Union[Text, None]:
    '''Extract the tissue type for the given sample from the metadata

       Arguments:
       sample_metadata: A dictionary containing metadata about all samples in the dataset
       sample: The sample id

       Returns:
       A string containing the tissue, if that information is present: the
       lowercased, whitespace-stripped text that follows the 'tissue:' marker in
       the first characteristic containing it.
       Otherwise returns None
    '''
    marker = 'tissue:'

    try:
        characteristics = sample_metadata[sample]['refinebio_annotations'][0]['characteristics_ch1']
    # KeyError: the sample or one of the expected fields is not present.
    # IndexError: 'refinebio_annotations' is usually a length 1 list containing a
    # dictionary, but sometimes it's a length 0 list indicating there aren't annotations
    except (KeyError, IndexError):
        return None

    for characteristic in characteristics:
        if marker in characteristic:
            # Keep everything after the marker rather than the second ':'-separated
            # field, so values that contain a colon are not truncated and a
            # marker that is not the first key still yields the tissue value
            _, _, tissue = characteristic.partition(marker)
            return tissue.strip().lower()

    # Annotations exist but none of them describes a tissue
    return None

In [5]:
sample_metadata = metadata['samples']

# Look up the tissue annotation of every sample, then keep only the samples
# for which a tissue was actually recorded
tissue_lookups = (get_tissue(sample_metadata, sample_id) for sample_id in sample_metadata)
tissues = [found for found in tissue_lookups if found is not None]

In [6]:
# Tally how many samples carry each tissue label and show the five most frequent
tissue_counts = collections.Counter(tissues)
tissue_counts.most_common(5)

[('blood', 8532),
 ('whole blood', 5988),
 ('peripheral blood', 5862),
 ('bone marrow', 3469),
 ('post-mortem brain', 2001)]

In [7]:
# Every tissue label that mentions blood or PBMCs, listed alphabetically for manual review
keys = tissue_counts.keys()
blood_keys = [label for label in keys if 'blood' in label or 'pbmc' in label]
sorted(blood_keys)

['33% blood and 67% breast',
 '67% blood and 33% breast',
 'blood',
 'blood (buffy coat)',
 'blood (leukapheresis products)',
 'blood cells',
 'blood dendritic cells',
 'blood ds1-derived ips clone, expressing 4 reprogramming factors (klf4, c-myc, sox2, and oct4)',
 'blood leukocytes',
 'blood monocytes',
 'blood sample',
 'blood vessel',
 'blood vessels',
 'blood, isolated leukocytes',
 'bone marrow / peripheral blood',
 'bone marrow or peripheral blood',
 'bone marrow/blood >65% infiltration',
 'cd4+ t cells from pbmc',
 'cells from whole blood',
 'cord blood',
 'cord blood from newborn',
 'cultured peripheral blood mononuclear cells',
 'fresh venous blood anticoagulated with 50 g/ml thrombin-inhibitor lepirudin',
 'healthy human blood',
 'host peripheral blood',
 'human umbilical cord blood',
 'ipscs from human pbmc',
 'leukemic cells obtained from bone marrow or blood at diagnosis',
 'leukemic peripheral blood',
 'leukocytes from whole blood',
 'monocytes isolated from pbmc',
 'mon

In [8]:
# Hand-curated list of tissue labels (lowercased, as produced by get_tissue) that
# are counted as blood samples in the cells below.
# Keep whole blood and pbmcs, leave out samples containing a single cell type
# Also leave out umbilical cord blood because it's not quite the same thing
# https://pubmed.ncbi.nlm.nih.gov/12634410/
# Misspelled variants ('periphral blood', 'pheripheral blood') are listed on
# purpose: labels are matched by exact string equality.
# NOTE(review): 'blood monocytes' and 'monocytes isolated from pbmc' look like
# single-cell-type labels, which the rule above says to leave out — confirm
# whether they are intentionally included.
keys_to_keep = ['blood',
                'blood (buffy coat)',
                'blood cells',
                'blood monocytes',
                'blood sample',
                'cells from whole blood',
                'fresh venous blood anticoagulated with 50 g/ml thrombin-inhibitor lepirudin',
                'healthy human blood',
                'host peripheral blood',
                'leukemic peripheral blood',
                'monocytes isolated from pbmc',
                'normal peripheral blood cells',
                'pbmc',
                'pbmcs',
                'peripheral blood',
                'peripheral blood (pb)',
                'peripheral blood mononuclear cell',
                'peripheral blood mononuclear cell (pbmc)',
                'peripheral blood mononuclear cells',
                'peripheral blood mononuclear cells (pbmc)',
                'peripheral blood mononuclear cells (pbmcs)',
                'peripheral blood mononuclear cells (pbmcs) from healthy donors',
                'peripheral maternal blood',
                'peripheral whole blood',
                'periphral blood',
                'pheripheral blood',
                'whole blood',
                'whole blood (wb)',
                'whole blood, maternal peripheral',
                'whole venous blood'
               ]

In [9]:
# Per-label sample counts restricted to the curated blood labels
blood_counts = {label: tissue_counts[label] for label in keys_to_keep}

In [10]:
# Total number of samples across all retained blood labels
total_samples = sum(blood_counts[label] for label in blood_counts)
total_samples

24517

## Count unlabeled blood samples
~25k blood samples is around 3x as many samples as we have labeled. Let's find exactly how much overlap there is between these samples and our labeled samples.

In [11]:
# Start from every sample that appears in the classification map
labeled_samples = set(sample_to_label.keys())
print(len(labeled_samples))

# Samples filed under 'other' are those whose disease or healthy status
# could not be determined, so they are not treated as labeled
other_samples = set(label_to_sample['other'])
print(len(other_samples))

# Drop the undetermined samples from the labeled set
labeled_samples = labeled_samples.difference(other_samples)
print(len(labeled_samples))

10727
3237
7490


In [13]:
# Build the lookup once: membership in a set is a hash lookup, whereas testing
# against the keys_to_keep list rescans the whole list for every sample
blood_tissues = set(keys_to_keep)

# A sample is unlabeled blood when it has no usable label and its tissue
# annotation is one of the curated blood labels. The cheap label check runs
# first so get_tissue is only called for samples that are not already labeled.
unlabeled_samples = [
    sample for sample in sample_metadata
    if sample not in labeled_samples
    and get_tissue(sample_metadata, sample) in blood_tissues
]

print('{} samples unlabeled'.format(len(unlabeled_samples)))
print('{} samples labeled'.format(len(labeled_samples)))

19840 samples unlabeled
7490 samples labeled


## Conclusion
There is a large number of blood samples that don't have labels. These samples can be used for semi-supervised learning, and the number of unlabeled samples is large enough to make it worth trying.