
Adding Separation Analysis Module #163

Merged · 19 commits · Feb 14, 2019
1,105 changes: 1,105 additions & 0 deletions 10.detect-separation/0.download-validation-data.ipynb

Large diffs are not rendered by default.

2,006 changes: 2,006 additions & 0 deletions 10.detect-separation/1.separate.ipynb


891 changes: 891 additions & 0 deletions 10.detect-separation/2.apply-mycn-signature.ipynb


2,509 changes: 2,509 additions & 0 deletions 10.detect-separation/3.visualize-separation.ipynb


31 changes: 31 additions & 0 deletions 10.detect-separation/results/mycn_nbl_scores.tsv
@@ -0,0 +1,31 @@
vae_111 Cell Line MYCN status 1p36 del 3p26 del 11q23 del 17q21-qter unbal gain ALK mutation p53 mutation
0 6.2535210389801605 CHP134 Amplified LOH p32.3-pter; Gain p34.3-p36.22; Loss p36.22-pter Gain/AI p26.3 None Gain q12-qter WT WT
1 3.720754971584685 CHP212 Amplified Loss p13.2-pter Gain/AI p26.3 cnLOH 23.3 Gain q12-qter WT WT
2 4.6585287640605415 COGN415 Amplified Unknown Unknown Unknown Unknown F1174L WT
3 5.322958181312064 COGN440 Amplified Unknown Unknown Unknown Unknown WT WT
4 5.996784654918624 COGN453 Amplified Unknown Unknown Unknown Unknown F1174L WT
5 2.9205779539437198 COGN496 Amplified Unknown Unknown Unknown Unknown Unknown Unknown
6 4.31451601644515 COGN519 Amplified Unknown Unknown Unknown Unknown Unknown Unknown
7 4.618327996252203 COGN534 Non-amplified Unknown Unknown Unknown Unknown Unknown Unknown
8 0.5026096029697165 COGN549 Non-amplified Unknown Unknown Unknown Unknown Unknown Unknown
9 6.521142958282991 COGN561 Amplified Unknown Unknown Unknown Unknown Unknown Unknown
10 2.788649940091749 COGN573 Amplified Unknown Unknown Unknown Unknown Unknown Unknown
11 2.8262726896993287 IMR32 Amplified Loss p32.3-pter Loss p12.3 cnLOH q23.1 Gain q21.2-qter WT WT
12 3.3322286924277096 KELLY Amplified LOH p21.3-pter; Loss p36.32; Gain p36.33 Loss p26.2 Loss q23.3-qter Gain q21.2-qter F1174L P177T
13 6.05176374296287 LAN5 Amplified Loss p33-pter None None Gain q21.2-qter R1275Q WT
14 2.066052610338665 LAN6 Non-amplified p36.12-pter; cnLOH p35.3-p36.11 Loss p14.3-pter Loss q13.4-qter Gain q12-qter WT WT
15 6.959814501080495 NB1 Amplified Loss p32.2-pter Gain p24.1-pter cnLOH q23.1 Gain q22-qter WT; amplified WT
16 4.888588114372714 NB69 Non-amplified Loss p13.3-pter None cnLOH q23.1, q23.3 Gain q12-qter WT WT
17 3.4507878865582886 NBLS Non-amplified None None Loss q14.1-qter None WT WT
18 5.249221496659604 NBSD Amplified Loss p21.3-pter Loss p13-pter cnLOH q23.2-23.3 Gain q12-qter F1174L C176F
19 7.733387072609603 NGP Amplified cnLOH p32.3-pter Gain p25.3-pter Loss q22.1-qter Gain q21.1-qter WT A159D, C141W
20 4.758437841441052 NLF Amplified Loss p32.2-pter Loss/AI p26.1; AI p26.2; Gain 26.3 Loss/AI q23.3-qter Gain q21.2-qter WT V203M
21 3.389675354926716 NMB Amplified cnLOH p34.2-pter None Loss q21-qter LOH/Gain q11.2-qter WT G245S
22 -0.6057913097742388 RPE1 Non-amplified Unknown Unknown Unknown Unknown WT WT
23 5.872549313309574 SHSY5Y Non-amplified None None Loss q22.1-q24.2 Gain q21.31-qter F1174L WT
24 1.9609609717231447 SKNAS Non-amplified Loss p36.22-36.32 Loss p14.2-pter q13.4-qter Gain q21.31-qter WT H168R
25 5.999025641441866 SKNDZ Amplified None Gain/AI p26.1-pter Loss q21-qter Gain q21.2-qter WT R110L
26 -0.966567052075833 SKNFI Non-amplified None None None Gain q21.31-qter WT M246R
27 6.629928018975049 SKNSH Non-amplified None None None Gain q21.31-qter F1174L WT
28 5.4238228846357766 SMSKAN Amplified Loss p13.3-pter None Loss q14.2-q23.3 Gain q11.1-qter WT WT
29 8.348093718130619 SMSSAN Amplified Loss p32.3-pter None Loss q25 Gain q21.2-qter F1174L WT
61,701 changes: 61,701 additions & 0 deletions 10.detect-separation/results/nbl_mycn_separation_target_t_test.tsv


61,701 changes: 61,701 additions & 0 deletions 10.detect-separation/results/sex_separation_gtex_t_test.tsv


61,701 changes: 61,701 additions & 0 deletions 10.detect-separation/results/sex_separation_tcga_t_test.tsv


61,701 changes: 61,701 additions & 0 deletions 10.detect-separation/results/sex_separation_tcga_test.tsv


202 changes: 202 additions & 0 deletions 10.detect-separation/scripts/nbconverted/0.download-validation-data.py
@@ -0,0 +1,202 @@

# coding: utf-8

# # Download and Process Neuroblastoma RNAseq Data
#
# **Gregory Way 2019**
#
# We are downloading the dataset associated with [Harenza et al. 2017](https://doi.org/10.1038/sdata.2017.33), which profiles 39 commonly used neuroblastoma (NBL) cell lines by RNAseq.
#
# We are interested in the MYCN amplification status of these cell lines. We will test if the MYCN amplification score learned through the BioBombe signature approach applied to TARGET data generalizes to this cell line dataset.

# In[1]:


import os
import requests
import pandas as pd
from urllib.request import urlretrieve

from sklearn import preprocessing


# In[2]:


url = "https://ndownloader.figshare.com/files/14138792"
name = "2019-01-22-CellLineSTAR-fpkm-2pass_matrix.txt"
path = os.path.join("download", name)


# In[3]:


os.makedirs("download", exist_ok=True)


# In[4]:


urlretrieve(url, path)


# In[5]:


get_ipython().system(' md5sum "download/2019-01-22-CellLineSTAR-fpkm-2pass_matrix.txt"')
Reviewer: What are you doing with the md5sum? I assume this is a QC step.

Author (Collaborator): Yes - and also helpful for someone else who may be running the code to confirm that the data they are using are the same version as described here.
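The author's reply can be made concrete: the `md5sum` shell call can be reproduced in portable Python, so a reader on any platform can confirm they have the same file version. A minimal sketch (the helper name and chunk size are illustrative, not part of the PR):

```python
import hashlib

def md5_of_file(path, chunk_size=1 << 20):
    """Stream the file in chunks so large expression matrices need not fit in memory."""
    digest = hashlib.md5()
    with open(path, "rb") as handle:
        for chunk in iter(lambda: handle.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()
```

Comparing the result against a checksum recorded in the repository would turn the manual QC step into an assertion.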



# ## Download Phenotype Data
Reviewer: Maybe this is not necessary, but I would clarify that by "phenotype" you mean MYCN status. Not sure if you want to include the specific criteria used to define "amplified".
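Following the reviewer's suggestion, the phenotype of interest could be made explicit by deriving a boolean amplification flag. A hypothetical post-processing step on a toy frame (the `mycn_amplified` column is an assumption; "MYCN status" matches the paper's table heading):

```python
import pandas as pd

# Toy stand-in for pheno_df; real values come from Harenza et al. Table 3.
pheno = pd.DataFrame({
    "Cell Line": ["CHP134", "COGN534"],
    "MYCN status": ["Amplified", "Non-amplified"],
})

# Encode amplification status as a boolean for downstream filtering.
pheno["mycn_amplified"] = pheno["MYCN status"].eq("Amplified")
```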


# In[6]:


url = "https://www.nature.com/articles/sdata201733/tables/3"
name = "nbl_cellline_phenotype.txt"
path = os.path.join("download", name)


# In[7]:


html = requests.get(url).content

pheno_df = pd.read_html(html)[0]
pheno_df['Cell Line'] = pheno_df['Cell Line'].str.replace("-", "")

pheno_df.to_csv(path, sep='\t', index=False)

pheno_df.head()


# In[8]:


get_ipython().system(' md5sum "download/nbl_cellline_phenotype.txt"')


# ## Process RNAseq Data

# In[9]:


raw_file = os.path.join("download", "2019-01-22-CellLineSTAR-fpkm-2pass_matrix.txt")

raw_df = pd.read_table(raw_file, sep='\t')
raw_df.head()


# ### Update Gene Names

# In[10]:


# Load curated gene names from versioned resource
commit = '721204091a96e55de6dcad165d6d8265e67e2a48'
url = 'https://raw.githubusercontent.com/cognoma/genes/{}/data/genes.tsv'.format(commit)
gene_df = pd.read_table(url)

# Only consider protein-coding genes
gene_df = (
gene_df.query("gene_type == 'protein-coding'")
)

symbol_to_entrez = dict(zip(gene_df.symbol,
gene_df.entrez_gene_id))


# In[11]:


# Add alternative symbols to entrez mapping dictionary
gene_df = gene_df.dropna(axis='rows', subset=['synonyms'])
gene_df.synonyms = gene_df.synonyms.str.split('|')

all_syn = (
gene_df.apply(lambda x: pd.Series(x.synonyms), axis=1)
.stack()
.reset_index(level=1, drop=True)
)

# Name the synonym series and join with rest of genes
all_syn.name = 'all_synonyms'
gene_with_syn_df = gene_df.join(all_syn)

# Remove rows that have redundant symbols in all_synonyms
gene_with_syn_df = (
gene_with_syn_df

# Drop synonyms that are duplicated - can't be sure of mapping
.drop_duplicates(['all_synonyms'], keep=False)

# Drop rows in which the symbol appears in the list of synonyms
.query('symbol not in all_synonyms')
)
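The `apply`/`stack`/`reset_index` pattern above produces one row per synonym while keeping the parent gene's columns; on recent pandas the same expansion can be sketched more directly with `Series.explode` (toy data; the synonym lists are illustrative):

```python
import pandas as pd

# Toy frame mirroring gene_df after splitting synonyms on '|'.
genes = pd.DataFrame({
    "symbol": ["TP53", "MYCN"],
    "entrez_gene_id": [7157, 4613],
    "synonyms": [["P53", "LFS1"], ["NMYC", "MODED"]],
})

# One row per synonym; each synonym keeps its gene's entrez_gene_id.
expanded = (
    genes.explode("synonyms")
         .rename(columns={"synonyms": "all_synonyms"})
)
```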


# In[12]:


# Create a synonym to entrez mapping and add to dictionary
Reviewer: Is there a reason you are replacing the IDs with Entrez ones?

Author (Collaborator): Entrez IDs are more stable than HUGO symbols, have better mappings to other databases, and all of my other data are in Entrez ID format.

synonym_to_entrez = dict(zip(gene_with_syn_df.all_synonyms,
gene_with_syn_df.entrez_gene_id))

symbol_to_entrez.update(synonym_to_entrez)


# In[13]:


# Load gene updater
url = 'https://raw.githubusercontent.com/cognoma/genes/{}/data/updater.tsv'.format(commit)
updater_df = pd.read_table(url)
old_to_new_entrez = dict(zip(updater_df.old_entrez_gene_id,
updater_df.new_entrez_gene_id))


# In[14]:


gene_map = raw_df.GeneID.replace(symbol_to_entrez)
gene_map = gene_map.replace(old_to_new_entrez)
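The two-stage mapping (symbol to Entrez ID, then retired to current Entrez ID) can be checked on a toy series. Note that `Series.replace` leaves unmapped symbols untouched, which is why the later `isin` filter is what actually removes unrecognized genes (all mapping values below are illustrative; the real dictionaries come from the cognoma/genes resource):

```python
import pandas as pd

# Illustrative mappings standing in for the cognoma/genes dictionaries.
symbol_to_entrez = {"MYCN": 4613, "TP53": 7157}
old_to_new_entrez = {7157: 7157}  # identity here; the real updater remaps retired IDs

gene_ids = pd.Series(["MYCN", "TP53", "FAKE1"])
mapped = gene_ids.replace(symbol_to_entrez).replace(old_to_new_entrez)

# "FAKE1" survives unmapped, so a final isin() filter against known
# Entrez IDs is needed to drop unrecognized genes.
keep = mapped[mapped.isin(symbol_to_entrez.values())]
```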


# In[15]:


raw_df.index = gene_map
raw_df.index.name = 'entrez_gene_id'
raw_df = raw_df.drop(['GeneID'], axis='columns')
raw_df = raw_df.loc[raw_df.index.isin(symbol_to_entrez.values()), :]

print(raw_df.shape)
raw_df.head()


# ## Scale Data and Output

# In[16]:


raw_scaled_df = preprocessing.MinMaxScaler().fit_transform(raw_df.transpose())
raw_scaled_df = (
pd.DataFrame(raw_scaled_df,
columns=raw_df.index,
index=raw_df.columns)
.sort_index(axis='columns')
.sort_index(axis='rows')
)
raw_scaled_df.columns = raw_scaled_df.columns.astype(str)
Reviewer: Why are you not keeping the original column names?

Author (Collaborator): All of my other matrices have strings as column names, and merging or subsetting by column name should assume string keys.

raw_scaled_df = raw_scaled_df.loc[:, ~raw_scaled_df.columns.duplicated(keep='first')]

raw_scaled_df.head()
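The scaling step can be sketched in isolation: after the transpose, rows are cell lines and columns are genes, and `MinMaxScaler` rescales each gene independently to the [0, 1] range (toy values):

```python
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Rows are samples (cell lines), columns are genes, matching raw_df.transpose().
expr = np.array([
    [1.0, 10.0],
    [3.0, 20.0],
    [5.0, 30.0],
])

scaled = MinMaxScaler().fit_transform(expr)
# Each gene (column) now spans exactly 0 to 1 across the samples.
```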


# In[17]:


os.makedirs('data', exist_ok=True)

file = os.path.join('data', 'nbl_celllines_processed_matrix.tsv.gz')
raw_scaled_df.to_csv(file, sep='\t', compression='gzip')
