# Dataset Statistics
This notebook keeps track of various statistics about the compendium dataset, such as how many samples are labeled and which diseases are present.

To skip straight to the stats, click [here](#stats)

In [1]:
from collections import Counter
import json
import os
import sys

# Notebooks have no __file__, so paths are anchored on the working directory instead.
# abspath('') resolves to the current working directory; note that curr_path is
# therefore the *parent* of the working directory, not the working directory itself.
working_dir = os.path.abspath('')
curr_path = os.path.dirname(working_dir)

# Make the project's `whistl` package directory importable so `utils` can be found
whistl_path = os.path.join(curr_path, os.pardir, 'whistl')
sys.path.append(whistl_path)
import utils

In [2]:
# Load the sample -> label mapping from the project's data directory.
# NOTE(review): the .pkl extension suggests utils.parse_map_file unpickles this
# file — confirm, and only point it at trusted data.
label_data_dir = os.path.join(curr_path, os.pardir, 'data')
map_file = os.path.join(label_data_dir, 'sample_classifications.pkl')
sample_to_label = utils.parse_map_file(map_file)

# Every key of the mapping is treated as a labeled sample id
sample_ids = sample_to_label.keys()

In [3]:
# Not all labeled samples show up in the compendium, which causes pandas to panic. To fix this we
# take the intersection of the accessions in sample_ids and the accessions in the compendium.
compendium_path = os.path.join(curr_path, os.pardir, 'data', 'human_compendium', 'HOMO_SAPIENS.tsv')

header_ids = None
with open(compendium_path) as in_file:
    # Only the first (header) line is read; the rest of the file is never loaded here
    header = in_file.readline()
    # Strip the line terminator before splitting; otherwise the last column id keeps a
    # trailing '\n' and can never match a labeled sample id
    header_ids = header.rstrip('\r\n').split('\t')

# Test membership against a set so the filter does not rescan the whole header per sample
header_id_set = set(header_ids)
valid_sample_ids = [id_ for id_ in sample_ids if id_ in header_id_set]

In [4]:
metadata_file = os.path.join(curr_path, os.pardir, 'data', 'all_metadata.json')
# Read inside a context manager so the file handle is closed once parsing finishes
with open(metadata_file) as metadata_in:
    metadata_json = json.load(metadata_in)
sample_metadata = metadata_json['samples']

experiments = metadata_json['experiments']

# Invert the experiment -> sample accessions listing into a sample -> study lookup.
# If an accession is listed under several experiments, the last one iterated wins.
sample_to_study = {}
for study, experiment in experiments.items():
    for accession in experiment['sample_accession_codes']:
        sample_to_study[accession] = study

In [5]:
# Keep a sample only if (1) its platform name does not mention 'beadchip' (the original
# note says beadchip data gets removed later on its own) and (2) its label is not 'other',
# since 'other' labels are left out of the analysis.
no_beadchip_ids = [
    sample
    for sample in valid_sample_ids
    if 'beadchip' not in sample_metadata[sample]['refinebio_platform'].lower()
    and sample_to_label[sample] != 'other'
]
valid_sample_ids = no_beadchip_ids

In [6]:
# Aggregates gathered in a single pass over the retained samples:
#   studies              - set of every study contributing at least one sample
#   platforms            - Counter of samples per (lower-cased) platform name
#   diseases             - Counter of samples per disease label
#   disease_study_counts - disease label -> list of distinct studies containing it
studies = set()
platforms = Counter()
diseases = Counter()
disease_study_counts = {}

for sample in valid_sample_ids:
    study = sample_to_study[sample]
    studies.add(study)

    platform = sample_metadata[sample]['refinebio_platform'].lower()
    platforms[platform] += 1

    disease = sample_to_label[sample]
    diseases[disease] += 1

    # Record each study at most once per disease, preserving first-seen order
    studies_for_disease = disease_study_counts.setdefault(disease, [])
    if study not in studies_for_disease:
        studies_for_disease.append(study)

<a id='stats'></a>
## Stats
All code beyond this point exists solely to print statistics about the data

In [7]:
# Headline totals for the samples that survived the filtering above
sample_total = len(valid_sample_ids)
study_total = len(studies)
print('Number of samples used in the analyses: {}'.format(sample_total))
print('Number of studies used in the analyses: {}'.format(study_total))

Number of samples used in the analyses: 6758
Number of studies used in the analyses: 74


In [8]:
# Tab-separated table of platforms, most frequently used first
print('Count\tPlatform:')
format_platform_row = '{}\t{}'.format
for platform, count in platforms.most_common():
    print(format_platform_row(count, platform))

Count	Platform:
3816	affymetrix human genome u133 plus 2.0 array (hgu133plus2)
964	affymetrix ht hg-u133+ pm array plate (hthgu133pluspm)
666	illumina hiseq 2000 (illuminahiseq2000)
279	affymetrix human gene 1.1 st array (hugene11st)
214	affymetrix human gene 1.0 st array (hugene10st)
208	illumina hiseq 2500 (illuminahiseq2500)
181	affymetrix human genome u133a array (hgu133a)
162	affymetrix human genome u133a 2.0 array (hgu133a2)
160	affymetrix human genome u219 array (hgu219)
69	illumina genome analyzer iix (illuminagenomeanalyzeriix)
27	affymetrix human gene 2.0 st array (hugene20st)
12	affymetrix human genome u95 version 2 array (hgu95av2)


In [9]:
# Tab-separated table of diseases, ordered by sample count (largest first);
# the middle column is the number of distinct studies containing that disease
print('Samples\tStudies\tDisease')
format_disease_row = '{}\t{}\t{}'.format
for disease, count in diseases.most_common():
    study_count = len(disease_study_counts[disease])
    print(format_disease_row(count, study_count, disease))

Samples	Studies	Disease
1705	57	healthy
1229	21	sepsis
967	4	ra
565	11	lupus
543	1	ms
454	4	tb
274	2	parkinsons
152	1	ipf
125	1	depression
111	1	sjia
75	1	uremia
69	3	scleroderma
41	1	gpd
41	1	pv
40	1	psoriasis
38	1	pneumonia
37	4	lps
34	1	ltb
27	1	dengue
27	1	sjogrens
27	1	huntingtons
25	1	stills
20	1	meningitis
19	1	malaria
19	1	et
16	1	hidradenitis
16	1	kawasaki
10	1	melas
10	1	narcolepsy
9	1	pmf
8	1	cad
8	1	msa
8	1	psp
6	1	drd
2	1	cbd
1	1	vascular_dementia
