In [12]:
import os
import re

import pandas as pd
import numpy as np
import scanpy as sc
from mgitools.os_helpers import listfiles

In [10]:
clinical = pd.read_csv('../data/clinical/clinical.txt', sep='\t', index_col=0)
clinical

Unnamed: 0_level_0,age,gender,survival,status,dataset
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SA518851,67.0,male,5.0,Dead,bailey
SA528766,61.0,female,8.0,Dead,bailey
SA528763,52.0,male,36.0,Dead,bailey
SA518854,55.0,male,5.0,Alive,bailey
SA518806,51.0,female,14.0,Dead,bailey
...,...,...,...,...,...
T_11_02_A148a,68.0,male,20.0,Dead,kirby
T_11_04_A026b,64.0,female,49.0,Alive,kirby
T_11_05_A018b,76.0,male,36.0,Dead,kirby
T_11_06_A011a,68.0,female,10.0,Dead,kirby


###### TCGA

In [40]:
sample_map = pd.read_csv('../data/bulk_rna_seq/tcga/filename_case_fpkm_1624798607352.txt', sep='\t')
sample_map = {f.split('.')[0]:c[:12] for f, c in zip(sample_map['file_name'], sample_map['cases'])}
sample_map

{'8301fe5c-c7c9-4def-ac1c-edeee5b8a424': 'TCGA-E6-A1M0',
 '80c96dc1-c0a0-4613-b03a-369f00904460': 'TCGA-B5-A1MY',
 '2613fbb7-cf79-48ea-9f0c-e7f6af0e6ad0': 'TCGA-AX-A06F',
 'fdc50973-2298-4b0a-ba8c-96bf01882f72': 'TCGA-D1-A176',
 '64b5cc13-137b-49a3-8783-11febcc3a961': 'TCGA-B5-A11V',
 '9be93e32-5549-49b7-9a9d-b425871a765d': 'TCGA-BG-A220',
 '05ac38d7-29f0-46ae-b4d2-a908a4a9599b': 'TCGA-A5-A0G2',
 'e227370b-332f-4fba-9a48-b894f88df55f': 'TCGA-AW-A1PO',
 'e01732d2-be57-4d3c-aa99-326d38ec2226': 'TCGA-AX-A3FT',
 'c9936adb-0c96-4651-b37f-1df1c92ae307': 'TCGA-AP-A1DM',
 '013dc5b4-cf78-4a1a-90f0-2bc3fe230a5a': 'TCGA-AJ-A8CT',
 '8f769163-f20b-4331-a415-3b49813c6a3a': 'TCGA-BG-A0MS',
 'f3b6c5d9-6d0c-41db-b369-2b60bc2f44a6': 'TCGA-D1-A16G',
 '8cd31f85-1d2e-4d90-bd8b-7ca7ebd9241b': 'TCGA-EY-A2OP',
 '302d3923-3985-4669-b68a-2d51d508bb8d': 'TCGA-DF-A2KZ',
 'ee5cf540-f7c4-4cbb-89f6-41bf2583981b': 'TCGA-A5-A0R9',
 '8ec3195f-156f-4b0a-bcdf-c27ff378c0be': 'TCGA-D1-A17N',
 'ffea1798-2d46-4425-b43d-823c4

In [41]:
fps = sorted(listfiles('../data/bulk_rna_seq/tcga/gdc_download_20190908_175124.372067/', regex=r'.gz'))
fps

['../data/bulk_rna_seq/tcga/gdc_download_20190908_175124.372067/008f1fdc-a5e4-4094-a987-bccc44d5a7c7/7cccb817-cdff-45c0-a4cd-9de57d7d8c48.htseq.counts.gz',
 '../data/bulk_rna_seq/tcga/gdc_download_20190908_175124.372067/02425961-7ed2-40c0-9b61-4d63b3a61f94/c71dcbec-be8f-4246-b009-25294c19ed66.htseq.counts.gz',
 '../data/bulk_rna_seq/tcga/gdc_download_20190908_175124.372067/07119960-1ac2-48b2-993c-616b0f15eab0/cde0617f-fcd6-4c58-b3cf-dcfabc6fcda7.htseq.counts.gz',
 '../data/bulk_rna_seq/tcga/gdc_download_20190908_175124.372067/071439e7-4436-4d19-b54b-9d897e7d802c/67fd4b38-5a38-487d-a235-064580064739.htseq.counts.gz',
 '../data/bulk_rna_seq/tcga/gdc_download_20190908_175124.372067/09bd94d6-a115-4c5e-a9ce-db6277c386e1/4b377b21-fed6-4684-bd4c-93850e8b5b8a.htseq.counts.gz',
 '../data/bulk_rna_seq/tcga/gdc_download_20190908_175124.372067/0ada222c-bd8b-40cf-9ba8-7ed485982705/0e556719-18fe-49e4-9c2e-454bb11c7e03.htseq.counts.gz',
 '../data/bulk_rna_seq/tcga/gdc_download_20190908_175124.372067/

In [42]:
case_to_fp = {}
for fp in fps:
    sid = fp.split('/')[-1].split('.')[0]
    case = sample_map[sid]
    case_to_fp[case] = fp

In [53]:
combined = None
for case, fp in case_to_fp.items():
    df = pd.read_csv(fp, sep='\t', header=None)
    df.columns = ['gene_id', 'counts']
    df['gene_id'] = [g.split('.')[0] for g in df['gene_id']]
    df = df.groupby('gene_id').mean().astype(int)
    df.columns = [case]
    
    if combined is None:
        combined = df
    else:
        combined = pd.merge(combined, df, left_index=True, right_index=True)
combined

Unnamed: 0_level_0,TCGA-OE-A75W,TCGA-2J-AABT,TCGA-IB-7886,TCGA-IB-AAUU,TCGA-2J-AAB6,TCGA-LB-A8F3,TCGA-HZ-A4BH,TCGA-HV-A5A3,TCGA-IB-7646,TCGA-2J-AAB9,...,TCGA-L1-A7W4,TCGA-IB-A5SO,TCGA-IB-AAUW,TCGA-2J-AABF,TCGA-H8-A6C1,TCGA-IB-AAUN,TCGA-FB-AAPU,TCGA-HZ-8001,TCGA-IB-7651,TCGA-3A-A9IJ
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000003,1427,660,3936,1322,1132,2878,2212,1365,3507,1009,...,5226,1692,2473,2220,1324,1660,2207,2581,2647,396
ENSG00000000005,4,3,1,0,0,0,39,0,0,0,...,1,5,13,3,2,2,2,30,1,0
ENSG00000000419,1339,840,2530,1528,2014,1817,1571,1155,2471,687,...,3128,1536,1595,1640,1090,1542,1725,1962,2035,1035
ENSG00000000457,443,556,1362,856,401,1581,1046,896,638,349,...,737,795,1342,1468,1089,644,1651,767,1423,618
ENSG00000000460,193,142,356,266,404,410,441,247,320,95,...,703,273,187,538,340,220,830,266,405,72
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
__alignment_not_unique,16284963,8083902,18333058,13249989,9985393,19819925,10371771,7824718,16361712,6856156,...,12351904,16948806,15041285,19055954,15247436,13461731,17672755,22771920,35733587,11899471
__ambiguous,2660509,1532683,2471802,2132442,1782266,2772100,1705470,1063813,1855009,1022345,...,2081764,2705672,2426524,2781904,1786691,1780648,2587344,3070631,3805095,2072115
__no_feature,2905403,2209046,3258514,3270954,1922891,1902841,3147423,2423441,1730120,952224,...,4307045,4267876,4088531,4629378,4072214,2777955,4859956,2254060,4112235,1987813
__not_aligned,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [51]:
import mygene
mg = mygene.MyGeneInfo()
genes = list(combined.index)

results = mg.querymany(genes, scopes='ensemblgene', fields='symbol', species='human')
gene_id_to_symbol = {d['query']: d['symbol'] for d in results if 'symbol' in d}

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-9000...done.
querying 9001-10000...done.
querying 10001-11000...done.
querying 11001-12000...done.
querying 12001-13000...done.
querying 13001-14000...done.
querying 14001-15000...done.
querying 15001-16000...done.
querying 16001-17000...done.
querying 17001-18000...done.
querying 18001-19000...done.
querying 19001-20000...done.
querying 20001-21000...done.
querying 21001-22000...done.
querying 22001-23000...done.
querying 23001-24000...done.
querying 24001-25000...done.
querying 25001-26000...done.
querying 26001-27000...done.
querying 27001-28000...done.
querying 28001-29000...done.
querying 29001-30000...done.
querying 30001-31000...done.
querying 31001-32000...done.
querying 32001-33000...done.
querying 33001-34000...done.
querying 34001-35000...done.
queryin

In [54]:
combined = combined.loc[[g for g in combined.index if g in gene_id_to_symbol]]
combined.index = [gene_id_to_symbol[x] for x in combined.index]
combined.index.name = 'Gene'
combined

Unnamed: 0_level_0,TCGA-OE-A75W,TCGA-2J-AABT,TCGA-IB-7886,TCGA-IB-AAUU,TCGA-2J-AAB6,TCGA-LB-A8F3,TCGA-HZ-A4BH,TCGA-HV-A5A3,TCGA-IB-7646,TCGA-2J-AAB9,...,TCGA-L1-A7W4,TCGA-IB-A5SO,TCGA-IB-AAUW,TCGA-2J-AABF,TCGA-H8-A6C1,TCGA-IB-AAUN,TCGA-FB-AAPU,TCGA-HZ-8001,TCGA-IB-7651,TCGA-3A-A9IJ
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TSPAN6,1427,660,3936,1322,1132,2878,2212,1365,3507,1009,...,5226,1692,2473,2220,1324,1660,2207,2581,2647,396
TNMD,4,3,1,0,0,0,39,0,0,0,...,1,5,13,3,2,2,2,30,1,0
DPM1,1339,840,2530,1528,2014,1817,1571,1155,2471,687,...,3128,1536,1595,1640,1090,1542,1725,1962,2035,1035
SCYL3,443,556,1362,856,401,1581,1046,896,638,349,...,737,795,1342,1468,1089,644,1651,767,1423,618
C1orf112,193,142,356,266,404,410,441,247,320,95,...,703,273,187,538,340,220,830,266,405,72
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GIMAP1-GIMAP5,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
LINC02246,1,7,18,5,12,2,5,8,6,2,...,51,13,4,13,7,4,16,1,17,6
HERC2P7,0,0,0,0,0,0,1,0,0,0,...,1,0,1,2,2,1,0,1,2,0
SNORA50A,0,0,1,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0


In [55]:
combined.to_csv('../data/bulk_rna_seq/tcga_counts.txt', sep='\t')

In [57]:
cpm = (combined / np.sum(combined.values, axis=0)) * 1000000
cpm

Unnamed: 0_level_0,TCGA-OE-A75W,TCGA-2J-AABT,TCGA-IB-7886,TCGA-IB-AAUU,TCGA-2J-AAB6,TCGA-LB-A8F3,TCGA-HZ-A4BH,TCGA-HV-A5A3,TCGA-IB-7646,TCGA-2J-AAB9,...,TCGA-L1-A7W4,TCGA-IB-A5SO,TCGA-IB-AAUW,TCGA-2J-AABF,TCGA-H8-A6C1,TCGA-IB-AAUN,TCGA-FB-AAPU,TCGA-HZ-8001,TCGA-IB-7651,TCGA-3A-A9IJ
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TSPAN6,28.673668,17.502197,64.992063,25.606964,27.225073,52.730224,48.006183,44.658003,72.466658,43.514206,...,99.658991,26.835207,43.917633,36.387745,29.145191,38.558152,33.977589,41.228084,31.783021,8.697942
TNMD,0.080375,0.079555,0.016512,0.000000,0.000000,0.000000,0.846402,0.000000,0.000000,0.000000,...,0.019070,0.079300,0.230865,0.049173,0.044026,0.046456,0.030791,0.479211,0.012007,0.000000
DPM1,26.905425,22.275523,41.775894,29.597157,48.437542,33.290763,34.094807,37.787541,51.059342,29.627611,...,59.650464,24.361039,28.325364,26.881037,23.994153,35.817271,26.557019,31.340373,24.434623,22.733257
SCYL3,8.901496,14.744275,22.489632,16.580606,9.644218,28.966812,22.700935,29.313971,13.183270,15.050999,...,14.054473,12.608741,23.832375,24.061806,23.972140,14.958705,25.417761,12.251817,17.086225,13.574060
C1orf112,3.878078,3.765624,5.878347,5.152385,9.716369,7.511950,9.570853,8.080972,6.612298,4.096977,...,13.406099,4.329794,3.320905,8.818291,7.484415,5.110117,12.778160,4.249001,4.862910,1.581444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GIMAP1-GIMAP5,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.015860,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
LINC02246,0.020094,0.185629,0.297220,0.096849,0.288605,0.036644,0.108513,0.261732,0.123981,0.086252,...,0.972562,0.206181,0.071035,0.213081,0.154091,0.092911,0.246326,0.015974,0.204122,0.131787
HERC2P7,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.021703,0.000000,0.000000,0.000000,...,0.019070,0.000000,0.017759,0.032782,0.044026,0.023228,0.000000,0.015974,0.024014,0.000000
SNORA50A,0.000000,0.000000,0.016512,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.015860,0.000000,0.016391,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [58]:
cpm.to_csv('../data/bulk_rna_seq/tcga_cpm.txt', sep='\t')

#### bailey

In [62]:
counts = pd.read_csv('../data/bulk_rna_seq/bailey/counts.txt', sep='\t', index_col=0)
counts.index = [x.split('.')[0] for x in counts.index]
counts['gene'] = counts.index.to_list()
counts = counts.groupby('gene').mean().astype(int)
counts.index.name = 'Gene'
counts

Unnamed: 0_level_0,SA518851,SA528771,SA528766,SA528763,SA518817,SA518765,SA518854,SA518806,SA528767,SA518750,...,SA412299,SA411841,SA411797,SA411769,SA411833,SA411430,SA411406,SA411923,SA411305,SA411682
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,52,15,4,13,33,34,24,17,25,27,...,141,3,46,39,37,27,56,56,44,4
A1BG-AS1,31,8,2,24,33,40,24,9,25,18,...,61,0,44,55,64,19,86,75,44,5
A1CF,0,27,7,74,84,67,27,2,92,142,...,92,0,218,794,108,140,79,49,45,2
A2M,6845,1179,211,3675,12026,30786,22958,4022,4467,20599,...,8786,0,7326,16420,28881,13771,28693,12701,12130,43
A2M-AS1,13,14,2,23,20,88,36,47,23,98,...,5,3,44,29,37,5,63,17,20,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZXDC,753,423,117,362,730,1327,1037,974,424,966,...,1487,265,1200,916,1230,718,1138,861,1278,277
ZYG11A,11,36,9,27,22,25,29,19,70,25,...,10,1,24,19,28,6,12,27,16,89
ZYG11B,2121,407,112,777,1971,2985,1927,1292,672,2826,...,1993,950,1678,1737,3281,1165,1780,1649,4017,1286
ZYX,2867,1112,204,2186,3566,4959,1977,1722,1496,4182,...,891,1525,2152,3434,2916,2141,3936,2262,6167,982


In [63]:
counts.to_csv('../data/bulk_rna_seq/bailey_counts.txt', sep='\t')

In [64]:
cpm = (counts / np.sum(counts.values, axis=0)) * 1000000
cpm

Unnamed: 0_level_0,SA518851,SA528771,SA528766,SA528763,SA518817,SA518765,SA518854,SA518806,SA528767,SA518750,...,SA412299,SA411841,SA411797,SA411769,SA411833,SA411430,SA411406,SA411923,SA411305,SA411682
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,1.874958,1.786762,1.856055,1.127469,1.269091,0.843667,0.964207,0.968267,2.398710,1.166017,...,3.791148,0.131849,1.541958,1.457170,0.883652,1.270806,1.876736,2.859541,1.305894,0.218215
A1BG-AS1,1.117763,0.952940,0.928028,2.081481,1.269091,0.992549,0.964207,0.512612,2.398710,0.777345,...,1.640142,0.000000,1.474916,2.054983,1.528479,0.894271,2.882131,3.829743,1.305894,0.272769
A1CF,0.000000,3.216171,3.248097,6.417899,3.230413,1.662519,1.084732,0.113914,8.827252,6.132388,...,2.473657,0.000000,7.307538,29.666485,2.579309,6.589362,2.647539,2.502098,1.335574,0.109108
A2M,246.809354,140.439484,97.906926,318.726717,462.487463,763.915288,922.343928,229.080486,428.601488,889.584891,...,236.234240,0.000000,245.573509,613.505895,689.750109,648.157901,961.592856,648.554129,360.011341,2.345812
A2M-AS1,0.468739,1.667644,0.928028,1.994752,0.769146,2.183608,1.446310,2.676972,2.206813,4.232211,...,0.134438,0.131849,1.474916,1.083537,0.883652,0.235334,2.111329,0.868075,0.593588,0.218215
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZXDC,27.150832,50.386685,54.289623,31.395666,28.073827,32.927811,41.661759,55.475980,40.682120,41.717511,...,39.981825,11.646682,40.224981,34.224811,29.375459,33.794014,38.137966,43.965444,37.930296,15.111397
ZYG11A,0.396626,4.288229,4.176125,2.341666,0.846061,0.620343,1.165083,1.082180,6.716388,1.079646,...,0.268876,0.043950,0.804500,0.709903,0.668710,0.282401,0.402158,1.378707,0.474871,4.855286
ZYG11B,76.476646,48.480806,51.969553,67.387934,75.799334,74.068964,77.417752,73.588262,64.477323,122.043153,...,53.586938,41.752255,56.247932,64.900106,78.358440,54.832906,59.653410,84.203272,119.222222,70.156159
ZYX,103.375079,132.458614,94.658829,189.588191,137.138724,123.051254,79.426516,98.079711,143.538801,180.603137,...,23.956830,67.023357,72.136799,128.305679,69.641332,100.770174,131.907764,115.505034,183.032971,53.571810


In [65]:
cpm.to_csv('../data/bulk_rna_seq/bailey_cpm.txt', sep='\t')

#### kirby

In [69]:
counts = pd.read_csv('../data/bulk_rna_seq/kirby/GSE79668_51_tumors_sharedgenecounts.txt', sep='\t', index_col=0)
counts.index = [x.split('_')[0].split('.')[0] for x in counts.index]
counts['gene'] = counts.index.to_list()
counts = counts.groupby('gene').mean().astype(int)
counts.index.name = 'Gene'
counts

Unnamed: 0_level_0,T_07_07_A082a,T_06_01_A033a,T_06_11_A168a,T_06_04_A296a,T_07_11_A090a,T_06_06_A349a,T_11_03_A138a,T_03_11_A244a,T_10_11_A059a,T_04_02_A274c,...,T_10_11_A100a,T_10_11_A128a,T_10_11_A130b,T_10_12_A057a,T_11_02_A148a,T_11_02_A171a_2,T_11_04_A026b,T_11_05_A018b,T_11_06_A011a,T_11_06_A104a
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5S,8,9,6,7,7,11,10,7,7,7,...,8,6,4,8,8,13,8,10,8,8
7SK,9,15,7,9,8,8,9,11,8,9,...,10,7,4,11,10,7,7,12,8,9
A1BG,559,273,408,932,483,364,518,108,491,308,...,555,601,584,380,314,395,826,436,1188,371
A1CF,302,541,411,114,138,41,221,322,11,2,...,204,286,32,30,408,44,473,600,101,52
A2LD1,161,188,66,118,123,74,121,222,160,44,...,93,78,21,263,84,127,146,193,151,112
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
snoZ5,56,56,31,40,28,44,63,43,13,30,...,32,51,29,109,28,31,80,39,44,41
snoZ6,0,0,1,0,0,2,2,0,1,0,...,1,0,1,0,1,0,0,3,0,0
snosnR60,1,0,2,1,4,0,2,1,2,1,...,2,2,1,2,2,1,1,2,1,2
snosnR66,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [70]:
counts.to_csv('../data/bulk_rna_seq/kirby_counts.txt', sep='\t')

In [71]:
cpm = (counts / np.sum(counts.values, axis=0)) * 1000000
cpm

Unnamed: 0_level_0,T_07_07_A082a,T_06_01_A033a,T_06_11_A168a,T_06_04_A296a,T_07_11_A090a,T_06_06_A349a,T_11_03_A138a,T_03_11_A244a,T_10_11_A059a,T_04_02_A274c,...,T_10_11_A100a,T_10_11_A128a,T_10_11_A130b,T_10_12_A057a,T_11_02_A148a,T_11_02_A171a_2,T_11_04_A026b,T_11_05_A018b,T_11_06_A011a,T_11_06_A104a
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5S,0.117682,0.098976,0.106685,0.099553,0.111954,0.140617,0.135385,0.089474,0.111408,0.130327,...,0.130082,0.081110,0.054195,0.093106,0.097879,0.139345,0.105437,0.125233,0.101331,0.121324
7SK,0.132392,0.164960,0.124465,0.127997,0.127947,0.102267,0.121847,0.140602,0.127323,0.167563,...,0.162603,0.094629,0.054195,0.128021,0.122349,0.075032,0.092257,0.150280,0.101331,0.136489
A1BG,8.222999,3.002265,7.254548,13.254772,7.724803,4.653144,7.012945,1.380460,7.814451,5.734384,...,9.024464,8.124560,7.912458,4.422558,3.841748,4.233948,10.886372,5.460158,15.047616,5.626383
A1CF,4.442479,5.949543,7.307890,1.621292,2.207087,0.524118,2.992009,4.115815,0.175069,0.037236,...,3.317100,3.866263,0.433559,0.349149,4.991826,0.471630,6.233963,7.513979,1.279301,0.788603
A2LD1,2.368341,2.067494,1.173530,1.678179,1.967186,0.945969,1.638159,2.837612,2.546460,0.819198,...,1.512207,1.054435,0.284523,3.060875,1.027729,1.361295,1.924225,2.416996,1.912618,1.698531
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
snoZ5,0.823771,0.615849,0.551203,0.568874,0.447815,0.562468,0.852926,0.549628,0.206900,0.558544,...,0.520329,0.689439,0.392913,1.268576,0.342576,0.332285,1.054370,0.488409,0.557319,0.621784
snoZ6,0.000000,0.000000,0.017781,0.000000,0.000000,0.025567,0.027077,0.000000,0.015915,0.000000,...,0.016260,0.000000,0.013549,0.000000,0.012235,0.000000,0.000000,0.037570,0.000000,0.000000
snosnR60,0.014710,0.000000,0.035562,0.014222,0.063974,0.000000,0.027077,0.012782,0.031831,0.018618,...,0.032521,0.027037,0.013549,0.023277,0.024470,0.010719,0.013180,0.025047,0.012666,0.030331
snosnR66,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [72]:
cpm.to_csv('../data/bulk_rna_seq/kirby_cpm.txt', sep='\t')

#### cptac

In [92]:
counts = pd.read_csv('/diskmnt/Projects/Users/estorrs/sc_pdac/data/cptac/read_counts.txt', sep='\t', index_col=0)
counts.index = [x.split('.')[0] for x in counts.index]
counts['gene'] = counts.index.to_list()
counts = counts.groupby('gene').mean().astype(int)
counts.index.name = 'Gene'
counts

Unnamed: 0_level_0,C3L-00017,C3L-00102,C3L-00189,C3L-00277,C3L-00401,C3L-00589,C3L-00598,C3L-00599,C3L-00622,C3L-00625,...,C3N-03780,C3N-03839,C3N-03840,C3N-03853,C3N-03884,C3N-04119,C3N-04126,C3N-04282,C3N-04283,C3N-04284
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5S_rRNA,0,0,0,0,0,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
5_8S_rRNA,0,0,0,0,0,0,0,39,0,0,...,0,0,0,0,0,0,0,0,0,0
7SK,218590,3137,171174,11427,201480,244936,11765,241532,195775,196375,...,55480,220094,147672,9756,57249,167732,171055,204093,4873,23586
A1BG,5,6,15,20,4,11,16,15,18,15,...,16,6,8,14,7,9,15,8,12,10
A1BG-AS1,96,87,216,101,110,101,56,135,193,158,...,88,95,116,43,56,179,171,128,105,71
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZZEF1,3920,3634,3834,5639,4243,4098,3732,4071,5003,3637,...,2342,2890,4482,3094,1601,3577,4100,3750,2693,5553
ZZZ3,3530,3375,2824,6274,3455,3838,2760,3307,3411,2942,...,2211,4217,3738,2952,1652,3801,4865,5332,5136,3367
hsa-mir-1253,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
hsa-mir-423,0,0,0,0,5,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [93]:
counts.to_csv('../data/bulk_rna_seq/cptac_counts.txt', sep='\t')

In [94]:
cpm = (counts / np.sum(counts.values, axis=0)) * 1000000
cpm

Unnamed: 0_level_0,C3L-00017,C3L-00102,C3L-00189,C3L-00277,C3L-00401,C3L-00589,C3L-00598,C3L-00599,C3L-00622,C3L-00625,...,C3N-03780,C3N-03839,C3N-03840,C3N-03853,C3N-03884,C3N-04119,C3N-04126,C3N-04282,C3N-04283,C3N-04284
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5S_rRNA,0.000000,0.000000,0.000000,0.000000,0.000000,0.024259,0.027337,0.000000,0.023713,0.030691,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5_8S_rRNA,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.127125,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7SK,5658.066616,84.468331,4786.898238,276.863514,6134.974556,5941.976311,321.625412,6980.428143,4642.359082,6027.019462,...,2919.755048,5537.587887,4204.952540,371.802918,3192.549213,4564.493513,4254.926342,4823.856203,123.586669,705.320583
A1BG,0.129422,0.161559,0.419477,0.484578,0.121798,0.266852,0.437400,0.433510,0.426829,0.460371,...,0.842035,0.150961,0.227800,0.533543,0.390362,0.244917,0.373119,0.189085,0.304338,0.299042
A1BG-AS1,2.484900,2.342603,6.040462,2.447118,3.349450,2.450189,1.530899,3.901586,4.576556,4.849238,...,4.631190,2.390210,3.303094,1.638738,3.122897,4.871130,4.253558,3.025354,2.662959,2.123199
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZZEF1,101.466769,97.850786,107.218198,136.626705,129.197424,99.414618,102.023463,117.654485,118.634772,111.624544,...,123.252818,72.712700,127.624717,117.912898,89.281407,97.340956,101.985899,88.633421,68.298564,166.058051
ZZZ3,91.371861,90.876831,78.973446,152.012049,105.203182,93.107200,75.451435,95.574400,80.884111,90.294036,...,116.358659,106.100158,106.439356,112.501252,92.125475,103.436672,121.014976,126.024907,130.256748,100.687459
hsa-mir-1253,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
hsa-mir-423,0.000000,0.000000,0.000000,0.000000,0.152248,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [95]:
cpm.to_csv('../data/bulk_rna_seq/cptac_cpm.txt', sep='\t')