# Dataset statistics

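This notebook computes summary statistics for each harmonized transcriptomics dataset: the number of entries, and the mean, minimum, and maximum gene count per entry.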

# Imports

In [1]:
import os
import json
import pandas as pd

from utils import DATA_DIR

# Load datasets

In [2]:
with open(os.path.join(DATA_DIR, 'transcriptomics', 'creed_harmonized_expression.json')) as f:
    creed = json.load(f)
    
with open(os.path.join(DATA_DIR, 'transcriptomics', 'geo_harmonized_expression.json')) as f:
    geo = json.load(f)
    
with open(os.path.join(DATA_DIR, 'transcriptomics', 'l1000_harmonized_expression.json')) as f:
    l1000 = json.load(f)
    
with open(os.path.join(DATA_DIR, 'transcriptomics', 'target_harmonized_expression.json')) as f:
    target = json.load(f)

In [3]:
# Loaded datasets keyed by the short name used to label them in the statistics
# table; the insertion order here is the order in which they are iterated.
MAP = dict(creed=creed, target=target, geo=geo, l1000=l1000)

# Statistical analysis

In [4]:
STAT_COLUMNS = [
    'dataset',
    'count',
    'mean_gene_count',
    'minimum_gene_count',
    'maximum_gene_count',
]


def _gene_count_summary(dataset_name, data):
    """Build the statistics record for one dataset.

    Parameters
    ----------
    dataset_name : str
        Label stored in the 'dataset' column.
    data : dict
        Mapping whose values are sized collections; the "gene count" of an
        entry is ``len`` of its value.

    Returns
    -------
    dict
        One value per column in ``STAT_COLUMNS``. For an empty dataset the
        mean/minimum/maximum are NaN instead of raising.
    """
    gene_counts = [len(genes) for genes in data.values()]

    if not gene_counts:
        # Nothing to aggregate: max()/min() would raise ValueError and the
        # mean would divide by zero, so report missing values instead.
        missing = float('nan')
        return {
            'dataset': dataset_name,
            'count': 0,
            'mean_gene_count': missing,
            'minimum_gene_count': missing,
            'maximum_gene_count': missing,
        }

    return {
        'dataset': dataset_name,
        'count': len(data),
        'mean_gene_count': sum(gene_counts) / len(gene_counts),
        'minimum_gene_count': min(gene_counts),
        'maximum_gene_count': max(gene_counts),
    }


# Build every record first and construct the frame once: this avoids growing
# a DataFrame with pd.concat inside a loop (quadratic copying, object-dtype
# columns, and a pandas FutureWarning when concatenating an empty frame).
# Rows follow the iteration order of MAP and are indexed 0..n-1.
df = pd.DataFrame(
    [_gene_count_summary(dataset_name, data) for dataset_name, data in MAP.items()],
    columns=STAT_COLUMNS,
)

In [5]:
# Show the per-dataset summary table; as the cell's last expression it is
# rendered with the notebook's rich DataFrame display.
df

Unnamed: 0,dataset,count,mean_gene_count,minimum_gene_count,maximum_gene_count
0,creed,39,556.974359,539,570
1,target,44,958.568182,1,3072
2,geo,18,6569.555556,26,18123
3,l1000,269,342.104089,3,2753
