# Download Publicly Available Hematopoietic Dataset

**Gregory Way, 2018**

Here, I download [GSE24759](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE24759) which is associated with [Novershtern et al. 2011](https://doi.org/10.1016/j.cell.2011.01.004).

This dataset includes 211 samples consisting of 38 distinct hematopoietic states in various stages of differentiation.

We hypothesized that the constructed feature, identified through our interpret compression approach, would show higher activation in monocytes.

In [1]:
import os
import csv
import pandas as pd
from sklearn import preprocessing

from scripts.utils import download_geo

In [2]:
# Download settings: local target folder, supplementary file name, and the
# GEO FTP folder for GSE24759 that hosts it
directory = 'download'
name = 'GSE24759_data.sort.txt.gz'
base_url = 'ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE24nnn/GSE24759/suppl/'

In [3]:
# Fetch the supplementary file with the project helper from scripts.utils
# (presumably saves `name` from `base_url` into `directory` - confirm in the helper)
download_geo(base_url, name, directory)

In [4]:
path = 'download/GSE24759_data.sort.txt.gz'
! sha256sum $path

98c248be560f0f422dbb07f24083306a58d208806eeb4d9b4f7da4f41de80905  download/GSE24759_data.sort.txt.gz


## Process the Data

In [5]:
# Read the downloaded tab-delimited matrix into a DataFrame
geo_df = pd.read_csv(path, sep='\t')

# Report the dimensions, then preview the first two rows
print(geo_df.shape)
geo_df.head(2)

(8968, 213)


Unnamed: 0,A_Name,A_Desc,HSC1_1,HSC1_13,HSC1_14,HSC1_2,HSC1_3,HSC1_4,HSC1_6,HSC1_7,...,TCELLA7_6,TCELLA7_7,TCELLA7_8,TCELLA8_10,TCELLA8_11,TCELLA8_3,TCELLA8_5,TCELLA8_6,TCELLA8_7,TCELLA8_8
0,8563,THOC5,-0.230509,-0.233209,0.093491,-0.366109,-0.343909,0.081691,-0.156809,-0.364609,...,-0.309709,0.421291,0.114291,-0.085009,0.128091,-0.463309,0.287691,0.388191,-0.509209,0.097591
1,8434,RECK,0.768078,0.172978,0.331778,0.628078,0.742478,-0.181122,0.597078,0.497378,...,0.684478,0.335278,0.559278,0.647978,-0.390622,0.931678,0.648878,0.619878,0.614878,0.746678


## Update Gene Names

In [6]:
# Pin the cognoma/genes repository to a fixed commit so the table is stable
commit = '721204091a96e55de6dcad165d6d8265e67e2a48'
url = 'https://raw.githubusercontent.com/cognoma/genes/{}/data/updater.tsv'.format(commit)

# Build a lookup from each old Entrez gene ID to its new Entrez gene ID
updater_df = pd.read_csv(url, sep='\t')
old_to_new_entrez = dict(
    zip(updater_df['old_entrez_gene_id'], updater_df['new_entrez_gene_id'])
)

In [7]:
# Re-key the matrix by updated Entrez gene IDs and discard the two annotation
# columns. Written as one chain so the frame loaded above is not mutated in
# place before being rebound.
geo_df = (
    geo_df
    .assign(entrez_gene_id=lambda df: df['A_Name'].replace(old_to_new_entrez))
    .set_index('entrez_gene_id')
    .drop(['A_Name', 'A_Desc'], axis='columns')
)
geo_df.head(2)

Unnamed: 0_level_0,HSC1_1,HSC1_13,HSC1_14,HSC1_2,HSC1_3,HSC1_4,HSC1_6,HSC1_7,HSC1_8,HSC1_9,...,TCELLA7_6,TCELLA7_7,TCELLA7_8,TCELLA8_10,TCELLA8_11,TCELLA8_3,TCELLA8_5,TCELLA8_6,TCELLA8_7,TCELLA8_8
entrez_gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8563,-0.230509,-0.233209,0.093491,-0.366109,-0.343909,0.081691,-0.156809,-0.364609,-0.193709,-0.150609,...,-0.309709,0.421291,0.114291,-0.085009,0.128091,-0.463309,0.287691,0.388191,-0.509209,0.097591
8434,0.768078,0.172978,0.331778,0.628078,0.742478,-0.181122,0.597078,0.497378,0.664978,-0.354722,...,0.684478,0.335278,0.559278,0.647978,-0.390622,0.931678,0.648878,0.619878,0.614878,0.746678


## Scale Data and Output to File

In [8]:
# Scale each gene's expression values to the [0, 1] range across samples.
# The matrix is transposed first so rows are samples and columns are genes;
# MinMaxScaler rescales every column independently.
scaled_values = preprocessing.MinMaxScaler().fit_transform(geo_df.transpose())

# fit_transform returns a bare array, so reattach the sample and gene labels
geo_scaled_zeroone_df = pd.DataFrame(scaled_values,
                                     columns=geo_df.index,
                                     index=geo_df.columns)

# Write the scaled samples-by-genes matrix as a gzipped TSV
os.makedirs('data', exist_ok=True)

file = os.path.join('data', 'GSE24759_processed_matrix.tsv.gz')
geo_scaled_zeroone_df.to_csv(file, sep='\t', compression='gzip')

geo_scaled_zeroone_df.head()

entrez_gene_id,8563,8434,55099,4863,4628,1387,55854,79572,1785,2634,...,9125,2885,26276,3106,64757,9764,57819,192683,152559,2669
HSC1_1,0.312585,0.765356,0.294345,0.625128,0.608219,0.766648,0.621663,0.725602,0.670308,0.696596,...,0.291599,0.178822,0.59874,0.552248,0.295143,0.153165,0.963897,0.176392,0.923904,0.336856
HSC1_13,0.311769,0.561833,0.329615,0.401469,0.487599,0.637845,0.644225,0.713899,0.459758,0.667098,...,0.287931,0.248829,0.743992,0.565033,0.473721,0.156976,0.964679,0.307535,0.810295,0.314071
HSC1_14,0.410485,0.616142,0.17996,0.543215,0.530361,0.674898,0.666004,0.756731,0.157476,0.799837,...,0.372722,0.01768,0.507967,0.619049,0.269873,0.103857,0.928723,0.156098,0.863768,0.217736
HSC1_2,0.271612,0.717476,0.258946,0.446492,0.485524,0.652044,0.612616,0.69882,0.230008,0.74431,...,0.388237,0.268682,0.581883,0.443707,0.317715,0.201686,0.969813,0.192479,0.763137,0.499333
HSC1_3,0.27832,0.756601,0.234251,0.701579,0.549765,0.678577,0.657046,0.791054,0.276747,0.767851,...,0.45138,0.209612,0.466084,0.554301,0.279939,0.128677,0.83909,0.142195,0.829297,0.292691


## Process Cell-Type Classification

The cell-type classifications below were acquired from Supplementary Table 1 of [Novershtern et al. 2011](https://doi.org/10.1016/j.cell.2011.01.004).

In [9]:
# Sample-label prefixes grouped by their broader classification. The order of
# the groups, and of the labels inside each group, fixes the order of the
# flat mapping built from them.
_labels_by_class = [
    # Hematopoietic Stem Cells
    ('HSC', ['HSC1', 'HSC2', 'HSC3']),
    # Myeloid Progenitors
    ('Myeloid', ['CMP', 'MEP', 'GMP']),
    # Erythroid Populations
    ('Erythroid', ['ERY1', 'ERY2', 'ERY3', 'ERY4', 'ERY5']),
    # Megakaryocytic Populations
    ('Megakaryocytic', ['MEGA1', 'MEGA2']),
    # Granulocytic Populations
    ('Granulocytic', ['GRAN1', 'GRAN2', 'GRAN3']),
    # Monocyte Population
    ('Monocyte', ['MONO1', 'MONO2']),
    # Basophil Population
    ('Basophil', ['BASO1']),
    # Eosinophil Population
    ('Eosinophil', ['EOS2']),
    # B Lymphoid Progenitors
    ('B Lymphoid Progenitor', ['PRE_BCELL2', 'PRE_BCELL3']),
    # Naive Lymphoid Progenitors
    ('Naive Lymphoid', ['BCELLA1', 'TCELLA6', 'TCELLA2']),
    # Differentiated B Cells
    ('Differentiated B Cell', ['BCELLA2', 'BCELLA3', 'BCELLA4']),
    # Differentiated T Cells
    ('Differentiated T Cell',
     ['TCELLA7', 'TCELLA8', 'TCELLA1', 'TCELLA3', 'TCELLA4']),
    # Natural Killer Population
    ('NK Cell', ['NKA1', 'NKA2', 'NKA3', 'NKA4']),
    # Dendritic Cell
    ('Dendritic', ['DENDA1', 'DENDA2']),
]

# Flatten to a label -> classification lookup
cell_class = {
    label: classification
    for classification, labels in _labels_by_class
    for label in labels
}

In [10]:
# Two-column table: one row per label with its classification, in dict order
cell_class_df = pd.DataFrame(
    list(cell_class.items()),
    columns=['label', 'classification'],
)

cell_class_df.head()

Unnamed: 0,label,classification
0,HSC1,HSC
1,HSC2,HSC
2,HSC3,HSC
3,CMP,Myeloid
4,MEP,Myeloid


In [11]:
# Create the output folder first; without it to_csv fails on a fresh checkout
# (the 'data' folder is created the same way before the matrix is written)
os.makedirs('results', exist_ok=True)

# Save the label-to-classification table as a TSV without the row index
file = os.path.join('results', 'cell-type-classification.tsv')
cell_class_df.to_csv(file, sep='\t', index=False)