# Download Publicly Available Hematopoietic Dataset

**Gregory Way, 2018**

Here, I download [GSE24759](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE24759), which is associated with [Novershtern et al. 2011](https://doi.org/10.1016/j.cell.2011.01.004).

This dataset includes 211 samples spanning 38 distinct hematopoietic states at various stages of differentiation.

We hypothesized that the feature we constructed with our interpret compression approach would show higher activation in monocytes.

In [1]:
import os
import csv
import pandas as pd
from sklearn import preprocessing

from scripts.utils import download_geo

In [2]:
# Local folder for the download and the GEO supplementary file to fetch
directory = 'download'
name = 'GSE24759_data.sort.txt.gz'

# FTP folder on NCBI GEO that hosts the supplementary files of GSE24759
base_url = 'ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE24nnn/GSE24759/suppl/'

In [3]:
# Fetch the supplementary file `name` from `base_url` into `directory`.
# NOTE(review): download_geo comes from scripts.utils; presumably it saves the
# file as <directory>/<name> - confirm, and check how it treats existing files.
download_geo(base_url, name, directory)

In [4]:
path = 'download/GSE24759_data.sort.txt.gz'
! sha256sum $path

98c248be560f0f422dbb07f24083306a58d208806eeb4d9b4f7da4f41de80905  download/GSE24759_data.sort.txt.gz


## Process the Data

In [5]:
# Read the downloaded tab-delimited table (gzip is inferred from the extension)
geo_df = pd.read_csv(path, sep='\t')

# Report the dimensions and preview the first two rows
print(geo_df.shape)
geo_df.head(2)

(8968, 213)


Unnamed: 0,A_Name,A_Desc,HSC1_1,HSC1_13,HSC1_14,HSC1_2,HSC1_3,HSC1_4,HSC1_6,HSC1_7,...,TCELLA7_6,TCELLA7_7,TCELLA7_8,TCELLA8_10,TCELLA8_11,TCELLA8_3,TCELLA8_5,TCELLA8_6,TCELLA8_7,TCELLA8_8
0,8563,THOC5,-0.230509,-0.233209,0.093491,-0.366109,-0.343909,0.081691,-0.156809,-0.364609,...,-0.309709,0.421291,0.114291,-0.085009,0.128091,-0.463309,0.287691,0.388191,-0.509209,0.097591
1,8434,RECK,0.768078,0.172978,0.331778,0.628078,0.742478,-0.181122,0.597078,0.497378,...,0.684478,0.335278,0.559278,0.647978,-0.390622,0.931678,0.648878,0.619878,0.614878,0.746678


## Update Gene Names

In [6]:
# Curated gene table, pinned to a specific commit of cognoma/genes
commit = '721204091a96e55de6dcad165d6d8265e67e2a48'
url = 'https://raw.githubusercontent.com/cognoma/genes/{}/data/genes.tsv'.format(commit)

# Read the table and restrict it to protein-coding genes
gene_df = pd.read_table(url).query("gene_type == 'protein-coding'")

# Lookup from gene symbol to Entrez gene id
symbol_to_entrez = dict(
    zip(gene_df['symbol'], gene_df['entrez_gene_id'])
)

In [7]:
# Add alternative symbols to entrez mapping dictionary

# Keep only genes that list synonyms and split the pipe-delimited string into
# a list. `.assign` builds a new frame instead of setting a column on the
# filtered one, which avoids pandas' chained-assignment warning.
gene_df = (
    gene_df
    .dropna(axis='rows', subset=['synonyms'])
    .assign(synonyms=lambda df: df.synonyms.str.split('|'))
)

# Expand every synonym list into one entry per synonym; the result keeps the
# gene's original row label so it can be joined back onto gene_df
all_syn = (
    gene_df.apply(lambda x: pd.Series(x.synonyms), axis=1)
    .stack()
    .reset_index(level=1, drop=True)
)

# Name the synonym series and join with rest of genes
all_syn.name = 'all_synonyms'
gene_with_syn_df = gene_df.join(all_syn)

# Remove rows that have redundant symbols in all_synonyms
gene_with_syn_df = (
    gene_with_syn_df

    # Drop synonyms that are duplicated - can't be sure of mapping
    .drop_duplicates(['all_synonyms'], keep=False)

    # Drop rows whose symbol occurs anywhere in the all_synonyms column
    .query('symbol not in all_synonyms')
)

In [8]:
# Create a synonym to entrez mapping and add to dictionary
synonym_to_entrez = dict(zip(gene_with_syn_df.all_synonyms,
                             gene_with_syn_df.entrez_gene_id))

# Symbols already in the dictionary take precedence: a synonym is added only
# when its key is not present yet. A plain `update` would let an alias of one
# gene overwrite the entry of a different gene whose symbol equals that alias.
for synonym, entrez_id in synonym_to_entrez.items():
    symbol_to_entrez.setdefault(synonym, entrez_id)

In [9]:
# Gene updater table, pinned to a specific commit of cognoma/genes
commit = '721204091a96e55de6dcad165d6d8265e67e2a48'
url = 'https://raw.githubusercontent.com/cognoma/genes/{}/data/updater.tsv'.format(commit)
updater_df = pd.read_table(url)

# Lookup from old Entrez gene id to its new Entrez gene id
old_to_new_entrez = dict(
    zip(updater_df['old_entrez_gene_id'], updater_df['new_entrez_gene_id'])
)

In [10]:
# Translate the symbols in A_Desc to Entrez gene ids, then replace old ids
# with their new counterparts
geo_map = (
    geo_df['A_Desc']
    .replace(symbol_to_entrez)
    .replace(old_to_new_entrez)
)

# Index the matrix by the mapped ids
geo_df.index = geo_map
geo_df.index.name = 'entrez_gene_id'

# Remove the annotation columns and keep only rows whose index is one of the
# Entrez ids present in the symbol mapping
geo_df = (
    geo_df
    .drop(['A_Name', 'A_Desc'], axis='columns')
    .loc[lambda df: df.index.isin(symbol_to_entrez.values()), :]
)

## Scale Data and Output to File

In [11]:
# Scale the expression data using zero-one (min-max) normalization.
# The matrix is transposed before scaling, so the scaler sees samples as rows
# and genes as columns.
geo_scaled_zeroone_df = preprocessing.MinMaxScaler().fit_transform(geo_df.transpose())
geo_scaled_zeroone_df = (
    pd.DataFrame(geo_scaled_zeroone_df,
                 columns=geo_df.index,
                 index=geo_df.columns)
    .sort_index(axis='columns')
    .sort_index(axis='rows')
)

# Use string gene ids as column labels and keep only the first column for any
# id that occurs more than once. (This step used to be repeated verbatim; the
# second pass was a no-op and has been removed.)
geo_scaled_zeroone_df.columns = geo_scaled_zeroone_df.columns.astype(str)
geo_scaled_zeroone_df = geo_scaled_zeroone_df.loc[:, ~geo_scaled_zeroone_df.columns.duplicated(keep='first')]

# Write the processed matrix as a gzipped, tab-separated file
os.makedirs('data', exist_ok=True)

file = os.path.join('data', 'GSE24759_processed_matrix.tsv.gz')
geo_scaled_zeroone_df.to_csv(file, sep='\t', compression='gzip')

geo_scaled_zeroone_df.head()

entrez_gene_id,2,9,14,16,18,19,20,22,23,25,...,100137049,100462977,100505741,100507436,100527963,100652748,102723547,105369230,107984923,107987479
BASO1_3,0.389491,0.342512,0.173372,0.196964,0.431818,0.405355,0.200679,0.580138,0.457497,0.475122,...,0.150598,0.297524,0.83302,0.41022,0.379413,0.271206,0.467457,0.842266,0.441286,0.384583
BASO1_4,0.308525,0.233696,0.281117,0.385034,0.540101,0.317294,0.262179,0.688354,0.453625,0.586759,...,0.268785,0.490944,0.77053,0.495008,0.660851,0.329928,0.36523,0.923228,0.487514,0.371348
BASO1_5,0.318499,0.30472,0.286889,0.198441,0.710227,0.337265,0.185538,0.729035,0.280381,0.273253,...,0.127869,0.139705,0.951369,0.641621,0.542492,0.176403,0.720273,0.784543,0.223465,0.468075
BASO1_6,0.478534,0.254761,0.094522,0.142994,0.649876,0.320514,0.205943,0.794756,0.427135,0.567184,...,0.177826,0.333096,0.883992,0.452337,0.352422,0.334565,0.417169,0.866243,0.286279,0.314256
BASO1_7,0.315282,0.447026,0.121143,0.196732,0.505682,0.303111,0.1691,0.562777,0.388825,0.266749,...,0.246057,0.343073,0.855247,0.627348,0.346892,0.298343,0.221455,0.626021,0.43116,0.387751


## Process Cell-Type Classification

Data acquired from Supplementary Table 1 of [Novershtern et al. 2011](https://doi.org/10.1016/j.cell.2011.01.004)

In [12]:
# Map every sample-label prefix to its cell-type classification.
# The groups are listed as (classification, labels) pairs and flattened into a
# label -> classification dictionary, preserving the listed order.
cell_class = {
    label: classification
    for classification, labels in [
        # Hematopoietic Stem Cells
        ('HSC', ['HSC1', 'HSC2', 'HSC3']),
        # Myeloid Progenitors
        ('Myeloid', ['CMP', 'MEP', 'GMP']),
        # Erythroid Populations
        ('Erythroid', ['ERY1', 'ERY2', 'ERY3', 'ERY4', 'ERY5']),
        # Megakaryocytic Populations
        ('Megakaryocytic', ['MEGA1', 'MEGA2']),
        # Granulocytic Populations
        ('Granulocytic', ['GRAN1', 'GRAN2', 'GRAN3']),
        # Monocyte Population
        ('Monocyte', ['MONO1', 'MONO2']),
        # Basophil Population
        ('Basophil', ['BASO1']),
        # Eosinophil Population
        ('Eosinophil', ['EOS2']),
        # B Lymphoid Progenitors
        ('B Lymphoid Progenitor', ['PRE_BCELL2', 'PRE_BCELL3']),
        # Naive Lymphoid Progenitors
        ('Naive Lymphoid', ['BCELLA1', 'TCELLA6', 'TCELLA2']),
        # Differentiated B Cells
        ('Differentiated B Cell', ['BCELLA2', 'BCELLA3', 'BCELLA4']),
        # Differentiated T Cells
        ('Differentiated T Cell', ['TCELLA7', 'TCELLA8', 'TCELLA1', 'TCELLA3', 'TCELLA4']),
        # Natural Killer Population
        ('NK Cell', ['NKA1', 'NKA2', 'NKA3', 'NKA4']),
        # Dendritic Cell
        ('Dendritic', ['DENDA1', 'DENDA2']),
    ]
    for label in labels
}

In [13]:
# Tabulate the label -> classification mapping, one row per label
cell_class_df = pd.DataFrame(
    list(cell_class.items()),
    columns=['label', 'classification'],
)

cell_class_df.head()

Unnamed: 0,label,classification
0,HSC1,HSC
1,HSC2,HSC
2,HSC3,HSC
3,CMP,Myeloid
4,MEP,Myeloid


In [14]:
# Make sure the output directory exists first; to_csv does not create
# missing parent directories and would fail on a fresh checkout
os.makedirs('results', exist_ok=True)

# Write the classification table as a tab-separated file without the row index
file = os.path.join('results', 'cell-type-classification.tsv')
cell_class_df.to_csv(file, sep='\t', index=False)