# Download and Process Neuroblastoma RNAseq Data

**Gregory Way 2019**

We are downloading the dataset associated with [Harenza et al. 2017](https://doi.org/10.1038/sdata.2017.33). The data profiles RNAseq data from 39 commonly used neuroblastoma (NBL) cell lines.

We are interested in the MYCN amplification status of these cell lines. We will test if the MYCN amplification score learned through the BioBombe signature approach applied to TARGET data generalizes to this cell line dataset.

MYCN Amplification refers to the number of copies of the _MYCN_ gene. MYCN amplification is used as a biomarker for poor prognosis in neuroblastoma patients ([Huang and Weiss 2013](https://doi.org/10.1101/cshperspect.a014415)).

In [1]:
import os
import requests
import pandas as pd
from urllib.request import urlretrieve

from sklearn import preprocessing

In [2]:
url = "https://ndownloader.figshare.com/files/14138792"
name = "2019-01-22-CellLineSTAR-fpkm-2pass_matrix.txt"
path = os.path.join("download", name)

In [3]:
os.makedirs("download", exist_ok=True)

In [4]:
urlretrieve(url, path)

('download/2019-01-22-CellLineSTAR-fpkm-2pass_matrix.txt',
 <http.client.HTTPMessage at 0x7f2d72ae0080>)

In [5]:
! md5sum "download/2019-01-22-CellLineSTAR-fpkm-2pass_matrix.txt"

55ea0255d1aa7708eba2ebd0113eeb3f  download/2019-01-22-CellLineSTAR-fpkm-2pass_matrix.txt


## Download Phenotype Data

In [6]:
url = "https://www.nature.com/articles/sdata201733/tables/4"
name = "nbl_cellline_phenotype.txt"
path = os.path.join("download", name)

In [7]:
if not os.path.isfile(path):
    html = requests.get(url).content

    pheno_df = pd.read_html(html)[0]
    pheno_df['Cell Line'] = pheno_df['Cell Line'].str.replace("-", "")

    pheno_df.to_csv(path, sep='\t', index=False)

else:
    pheno_df = pd.read_csv(path, sep="\t")

pheno_df.head()

Unnamed: 0,Cell Line,MYCN status,1p36 del,3p26 del,11q23 del,17q21-qter unbal gain,ALK mutation,p53 mutation
0,CHP134,Amplified,LOH p32.3-pter; Gain p34.3-p36.22; Loss p36.22...,Gain/AI p26.3,,Gain q12-qter,WT,WT
1,CHP212,Amplified,Loss p13.2-pter,Gain/AI p26.3,cnLOH 23.3,Gain q12-qter,WT,WT
2,COGN415,Amplified,Unknown,Unknown,Unknown,Unknown,F1174L,WT
3,COGN440,Amplified,Unknown,Unknown,Unknown,Unknown,WT,WT
4,COGN453,Amplified,Unknown,Unknown,Unknown,Unknown,F1174L,WT


In [8]:
! md5sum "download/nbl_cellline_phenotype.txt"

b08854900f32c037b0d1f516ab6b99b5  download/nbl_cellline_phenotype.txt


## Process RNAseq Data

In [9]:
raw_file = os.path.join("download", "2019-01-22-CellLineSTAR-fpkm-2pass_matrix.txt")

raw_df = pd.read_table(raw_file, sep='\t')
raw_df.head()

Unnamed: 0,GeneID,CHP134,CHP212,COGN415,COGN440,COGN453,COGN471,COGN496,COGN519,COGN534,...,RPE1,SHSY5Y,SKNAS,SKNBE2,SKNBE2C,SKNDZ,SKNFI,SKNSH,SMSKAN,SMSSAN
EAF1,EAF1,8.78321,6.863459,5.462931,10.104389,6.604241,4.866576,9.329809,4.401073,11.083115,...,4.363805,8.555519,6.260962,4.84232,4.028825,8.861676,19.39077,9.936771,6.871837,7.696442
SARNP,SARNP,12.752439,23.717824,22.162407,27.40176,19.256736,9.15738,23.000173,25.250549,23.622492,...,25.26471,31.705089,22.246654,24.115542,28.652249,32.9766,38.725041,15.290273,18.659111,31.636539
CXCR5,CXCR5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004727,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
KRTAP4-2,KRTAP4-2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MAPK7,MAPK7,2.778997,2.779451,2.718453,1.67738,3.709534,0.988977,1.575597,1.455281,2.519137,...,1.97122,5.174954,1.11536,1.67998,0.971465,3.33461,2.354082,2.788888,5.015511,3.019704


### Update Gene Names

In [10]:
# Load curated gene names from versioned resource 
commit = '721204091a96e55de6dcad165d6d8265e67e2a48'
url = 'https://raw.githubusercontent.com/cognoma/genes/{}/data/genes.tsv'.format(commit)
gene_df = pd.read_table(url)

# Only consider protein-coding genes
gene_df = (
    gene_df.query("gene_type == 'protein-coding'")
)

symbol_to_entrez = dict(zip(gene_df.symbol,
                            gene_df.entrez_gene_id))

In [11]:
# Add alternative symbols to entrez mapping dictionary
gene_df = gene_df.dropna(axis='rows', subset=['synonyms'])
gene_df.synonyms = gene_df.synonyms.str.split('|')

all_syn = (
    gene_df.apply(lambda x: pd.Series(x.synonyms), axis=1)
    .stack()
    .reset_index(level=1, drop=True)
)

# Name the synonym series and join with rest of genes
all_syn.name = 'all_synonyms'
gene_with_syn_df = gene_df.join(all_syn)

# Remove rows that have redundant symbols in all_synonyms
gene_with_syn_df = (
    gene_with_syn_df
    
    # Drop synonyms that are duplicated - can't be sure of mapping
    .drop_duplicates(['all_synonyms'], keep=False)

    # Drop rows in which the symbol appears in the list of synonyms
    .query('symbol not in all_synonyms')
)

In [12]:
# Create a synonym to entrez mapping and add to dictionary
synonym_to_entrez = dict(zip(gene_with_syn_df.all_synonyms,
                             gene_with_syn_df.entrez_gene_id))

symbol_to_entrez.update(synonym_to_entrez)

In [13]:
# Load gene updater
url = 'https://raw.githubusercontent.com/cognoma/genes/{}/data/updater.tsv'.format(commit)
updater_df = pd.read_table(url)
old_to_new_entrez = dict(zip(updater_df.old_entrez_gene_id,
                             updater_df.new_entrez_gene_id))

In [14]:
gene_map = raw_df.GeneID.replace(symbol_to_entrez)
gene_map = gene_map.replace(old_to_new_entrez)

In [15]:
raw_df.index = gene_map
raw_df.index.name = 'entrez_gene_id'
raw_df = raw_df.drop(['GeneID'], axis='columns')
raw_df = raw_df.loc[raw_df.index.isin(symbol_to_entrez.values()), :]

print(raw_df.shape)
raw_df.head()

(19287, 40)


Unnamed: 0_level_0,CHP134,CHP212,COGN415,COGN440,COGN453,COGN471,COGN496,COGN519,COGN534,COGN549,...,RPE1,SHSY5Y,SKNAS,SKNBE2,SKNBE2C,SKNDZ,SKNFI,SKNSH,SMSKAN,SMSSAN
entrez_gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10847,8.78321,6.863459,5.462931,10.104389,6.604241,4.866576,9.329809,4.401073,11.083115,5.18227,...,4.363805,8.555519,6.260962,4.84232,4.028825,8.861676,19.39077,9.936771,6.871837,7.696442
84324,12.752439,23.717824,22.162407,27.40176,19.256736,9.15738,23.000173,25.250549,23.622492,12.517533,...,25.26471,31.705089,22.246654,24.115542,28.652249,32.9766,38.725041,15.290273,18.659111,31.636539
643,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004727,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
85291,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5598,2.778997,2.779451,2.718453,1.67738,3.709534,0.988977,1.575597,1.455281,2.519137,1.208765,...,1.97122,5.174954,1.11536,1.67998,0.971465,3.33461,2.354082,2.788888,5.015511,3.019704


## Scale Data and Output

In [16]:
raw_scaled_df = preprocessing.MinMaxScaler().fit_transform(raw_df.transpose())
raw_scaled_df = (
    pd.DataFrame(raw_scaled_df,
                 columns=raw_df.index,
                 index=raw_df.columns)
    .sort_index(axis='columns')
    .sort_index(axis='rows')
)
raw_scaled_df.columns = raw_scaled_df.columns.astype(str)
raw_scaled_df = raw_scaled_df.loc[:, ~raw_scaled_df.columns.duplicated(keep='first')]

raw_scaled_df.head()

entrez_gene_id,1,2,9,12,13,14,15,16,18,19,...,105375787,106707243,106821730,106865373,107080638,107080644,107181291,107984155,107984923,107987479
CHP134,0.496142,0.0,0.031914,0.0,0.0,0.335754,1.0,0.527104,0.12302,0.101682,...,0.0,0.202001,0.35025,0.0,0.359538,0.0,0.0,0.0,0.398515,0.0
CHP212,0.130269,0.077607,0.384543,0.0,0.047631,0.483361,0.011138,0.680574,0.002162,0.136316,...,0.0,0.03031,0.673424,0.0,0.086474,0.0,0.0,0.0,0.556493,0.0
COGN415,0.356876,0.0,0.168576,0.0,0.0,0.303912,0.06808,0.1538,0.100531,0.033393,...,0.0,0.417738,0.199854,0.0,0.211183,0.0,0.0,0.0,0.726023,0.0
COGN440,0.512493,0.000593,1.0,0.001941,0.0,0.373208,0.102337,0.249516,0.217459,0.004284,...,0.0,0.230559,0.538564,0.0,0.228188,0.0,0.0,0.0,0.607477,0.0
COGN453,0.383474,0.0,0.086284,0.0,0.0,0.46489,0.113665,0.261963,0.125526,0.00172,...,0.0,0.301525,0.471939,0.0,0.272335,0.0,0.0,0.0,0.375054,0.0


In [17]:
os.makedirs('data', exist_ok=True)

file = os.path.join('data', 'nbl_celllines_processed_matrix.tsv.gz')
raw_scaled_df.to_csv(file, sep='\t', compression='gzip')