# Download Publicly Available Neutrophil Dataset

**Gregory Way, 2018**

Here, I download [GSE103706](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE103706) which is associated with [Rincon et al. 2018](https://doi.org/10.1186/s12864-018-4957-6).

This dataset includes two acute myeloid leukemia (AML) cell lines: PLB-985 and HL-60.
There are 14 samples total in this dataset.
The cell lines are exposed to two treatments - DMSO and DMSO+Nutridoma.
The treatments are demonstrated to induce neutrophil differentiation in these cell lines.

We hypothesized that the constructed feature identified through our interpret compression approach would show higher activation in the cell lines with induced neutrophil differentiation.

In [1]:
import os
import csv
import pandas as pd
from sklearn import preprocessing

from scripts.utils import download_geo

In [2]:
# Local folder and file name for the GSE103706 supplementary spreadsheet,
# followed by the GEO FTP directory that hosts it
directory = 'download'
name = 'GSE103706_Merged_data_FPKM_and_normalized.xlsx'
base_url = 'ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE103nnn/GSE103706/suppl/'

In [3]:
# Fetch the supplementary file using the project helper from scripts.utils.
# NOTE(review): presumably downloads base_url + name into `directory` -
# confirm against the download_geo implementation.
download_geo(base_url, name, directory)

In [4]:
# Relative path of the downloaded spreadsheet
path = 'download/GSE103706_Merged_data_FPKM_and_normalized.xlsx'
# IPython shell escape: print the SHA-256 digest of the file so the exact
# version of the download is recorded in the notebook output
! sha256sum $path

8a56404922fc9307eb15de097867a33d97ed68a400a75ab4cd09d84e67def523  download/GSE103706_Merged_data_FPKM_and_normalized.xlsx


## Process the Data

In [5]:
# Load Data
# Read the spreadsheet, skipping its first row and using the first column
# as the index
geo_df = pd.read_excel(path, index_col=0, skiprows=1)

# Remove rows whose symbol is the literal placeholder N\A (N, backslash, A).
# A raw string is required: '\A' is not a valid escape sequence in a normal
# string literal and triggers a warning on newer Python versions.
geo_df = geo_df[geo_df.symbol != r'N\A']

print(geo_df.shape)
geo_df.head(2)

(37466, 18)


Unnamed: 0_level_0,ens_gene_id,ncbi_gene_id,gene_short,symbol,"PLB-985, not differentiated, replicate 1","PLB-985, DMSO, day 6, replicate 1","PLB-985, DMSO+Nutridoma, day 4","PLB-985, DMSO+Nutridoma, day 6, replicate 1","HL-60, not differentiated, replicate 1","HL-60, DMSO, day 6, replicate 1","HL-60, DMSO+Nutridoma, day 6, replicate 1","PLB-985, not differentiated, replicate 2","PLB-985, DMSO, day 6, replicate 2","PLB-985, DMSO+Nutridoma, day 2","PLB-985, DMSO+Nutridoma, day 6, replicate 2","HL-60, not differentiated, replicate 2","HL-60, DMSO, day 6, replicate 2","HL-60, DMSO+Nutridoma, day 6, replicate 2"
tracking_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
ENSG00000000003.14,ENSG00000000003.14,7105,TSPAN6,TSPAN6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000000005.5,ENSG00000000005.5,64102,TNMD,TNMD,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Update Gene Names

In [6]:
# Load curated gene names from versioned resource 
# Pin the cognoma/genes resource to a fixed commit so the gene table is
# reproducible
commit = '721204091a96e55de6dcad165d6d8265e67e2a48'
url = 'https://raw.githubusercontent.com/cognoma/genes/{}/data/genes.tsv'.format(commit)
gene_df = pd.read_csv(url, sep='\t')

# Restrict the table to protein-coding genes
is_protein_coding = gene_df.gene_type == 'protein-coding'
gene_df = gene_df[is_protein_coding]

# Official symbol -> Entrez gene ID lookup
symbol_to_entrez = dict(zip(gene_df.symbol, gene_df.entrez_gene_id))

In [7]:
# Add alternative symbols to entrez mapping dictionary
# Keep only genes that list synonyms, then turn the pipe-delimited synonym
# string into a list per gene
gene_df = gene_df.dropna(subset=['synonyms'])
gene_df['synonyms'] = gene_df['synonyms'].str.split('|')

# Spread each synonym list across columns, then stack into one long series
# that keeps the original gene_df row label for every synonym
synonym_table = gene_df.apply(lambda row: pd.Series(row.synonyms), axis=1)
all_syn = synonym_table.stack().reset_index(level=1, drop=True)
all_syn.name = 'all_synonyms'

# One row per (gene, synonym) pair
gene_with_syn_df = gene_df.join(all_syn)

# Discard every synonym that occurs more than once - its mapping is ambiguous
gene_with_syn_df = gene_with_syn_df.drop_duplicates(['all_synonyms'], keep=False)

# Discard rows whose official symbol also occurs in the synonym column
symbol_is_synonym = gene_with_syn_df.symbol.isin(gene_with_syn_df.all_synonyms)
gene_with_syn_df = gene_with_syn_df[~symbol_is_synonym]

In [8]:
# Create a synonym to entrez mapping and add to dictionary
# Build the synonym -> Entrez gene ID lookup from the filtered synonym table
synonym_to_entrez = dict(
    zip(gene_with_syn_df.all_synonyms, gene_with_syn_df.entrez_gene_id)
)

# Merge the synonyms into the main lookup.
# NOTE(review): update() lets a synonym overwrite an existing official-symbol
# entry with the same key - confirm this precedence is intended.
symbol_to_entrez.update(synonym_to_entrez)

In [9]:
# Load gene updater
url = 'https://raw.githubusercontent.com/cognoma/genes/{}/data/updater.tsv'.format(commit)
updater_df = pd.read_table(url)
old_to_new_entrez = dict(zip(updater_df.old_entrez_gene_id,
                             updater_df.new_entrez_gene_id))

In [10]:
# Update the symbol column to entrez_gene_id
# Translate each symbol to an Entrez gene ID, then swap outdated IDs for
# their current replacements
geo_map = geo_df.symbol.replace(symbol_to_entrez).replace(old_to_new_entrez)

# Use the translated IDs as the row index
geo_df.index = geo_map
geo_df.index.name = 'entrez_gene_id'

# Remove the annotation columns so only the per-sample values remain
annotation_columns = ['ens_gene_id', 'ncbi_gene_id', 'gene_short', 'symbol']
geo_df = geo_df.drop(columns=annotation_columns)

# Keep only rows whose index is one of the Entrez IDs in the lookup;
# symbols that could not be translated are dropped here
has_known_entrez = geo_df.index.isin(symbol_to_entrez.values())
geo_df = geo_df.loc[has_known_entrez]

In [11]:
# Display the first rows of geo_df as the cell output
geo_df.head()

Unnamed: 0_level_0,"PLB-985, not differentiated, replicate 1","PLB-985, DMSO, day 6, replicate 1","PLB-985, DMSO+Nutridoma, day 4","PLB-985, DMSO+Nutridoma, day 6, replicate 1","HL-60, not differentiated, replicate 1","HL-60, DMSO, day 6, replicate 1","HL-60, DMSO+Nutridoma, day 6, replicate 1","PLB-985, not differentiated, replicate 2","PLB-985, DMSO, day 6, replicate 2","PLB-985, DMSO+Nutridoma, day 2","PLB-985, DMSO+Nutridoma, day 6, replicate 2","HL-60, not differentiated, replicate 2","HL-60, DMSO, day 6, replicate 2","HL-60, DMSO+Nutridoma, day 6, replicate 2"
entrez_gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
7105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
64102,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8813,73.3943,60.802,66.2544,61.972,65.6512,59.4705,51.3605,70.3526,55.0424,62.5207,55.9806,86.9173,49.9261,58.4312
6359,2.88954,5.40311,5.08473,5.84126,2.454,4.21288,4.56887,3.15789,5.20013,4.87725,5.39224,2.29803,4.33103,2.67404
55732,13.2372,2.11278,2.85447,3.48427,6.71764,1.63993,2.15884,12.6363,2.51963,3.72377,2.38798,9.21015,1.63526,2.29197


## Scale Data and Output to File

In [12]:
# Scale RNAseq data using zero-one normalization
# Zero-one scale the data: transpose so the geo_df columns become rows, then
# min-max scale each column of the transposed matrix
scaler = preprocessing.MinMaxScaler()
scaled_values = scaler.fit_transform(geo_df.T)

# Re-attach the labels and sort both axes; the sort happens while the gene
# IDs are still in their original dtype, before the string conversion below
geo_scaled_zeroone_df = pd.DataFrame(
    scaled_values,
    index=geo_df.columns,
    columns=geo_df.index,
)
geo_scaled_zeroone_df = geo_scaled_zeroone_df.sort_index(axis=1).sort_index(axis=0)

# Use string column labels and keep only the first column for any repeated
# gene ID
geo_scaled_zeroone_df.columns = geo_scaled_zeroone_df.columns.astype(str)
is_repeat = geo_scaled_zeroone_df.columns.duplicated(keep='first')
geo_scaled_zeroone_df = geo_scaled_zeroone_df.loc[:, ~is_repeat]

# Write the processed matrix as a gzip-compressed TSV
os.makedirs('data', exist_ok=True)
file = os.path.join('data', 'GSE103706_processed_matrix.tsv.gz')
geo_scaled_zeroone_df.to_csv(file, sep='\t', compression='gzip')

geo_scaled_zeroone_df.head()

entrez_gene_id,1,2,9,12,13,14,15,16,18,19,...,105371242,105373377,105375205,105377595,105379554,106821730,107282092,107984155,107984923,107987479
"HL-60, DMSO+Nutridoma, day 6, replicate 1",0.928066,1.0,0.131285,0.0,0.0,0.258895,0.0,0.09745,0.899518,1.0,...,0.0,0.0,0.456063,0.0,0.230915,0.0,0.161967,0.0,0.233555,0.280561
"HL-60, DMSO+Nutridoma, day 6, replicate 2",0.520178,0.782566,0.217753,0.0,0.0,0.0,1.0,0.016712,0.321479,0.622087,...,0.0,0.0,0.263193,0.0,1.5e-05,5.445048e-210,0.23185,0.0,0.266596,0.163936
"HL-60, DMSO, day 6, replicate 1",1.0,0.0,0.514627,0.0,0.0,0.554921,0.0,0.025734,1.0,0.073798,...,0.0,0.0,0.650158,0.0,0.853286,1.0,0.044424,0.0,0.128725,0.473598
"HL-60, DMSO, day 6, replicate 2",0.609842,0.018811,0.066794,0.0,0.0,0.067757,0.0,0.0,0.792303,0.083006,...,0.0,0.0,0.335346,0.0,1.0,0.1324108,0.019657,0.0,0.042261,0.665437
"HL-60, not differentiated, replicate 1",0.066542,0.0,0.001149,0.0,0.0,0.132155,0.0,0.692236,0.034753,0.371344,...,0.0,0.0,0.003063,0.0,0.482713,1.214138e-94,0.551368,0.0,0.864482,0.092826
