# Process PBTA data

In [1]:
%load_ext autoreload
%autoreload 2

import os
import pandas as pd
import pickle

from ponyo import utils
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter

In [2]:
base_dir = os.path.abspath(os.path.join(os.getcwd(), "../"))

# Read in config variables
config_filename = os.path.abspath(
    os.path.join(base_dir, "configs", "config_human_general.tsv")
)

params = utils.read_config(config_filename)

local_dir = params["local_dir"]
processed_template_filename = params["processed_template_filename"]
pbta_dir = os.path.join(local_dir, "openPBTA")

## Load RDS objects

In [3]:
readRDS = ro.r["readRDS"]

In [4]:
polya_matrix = readRDS(
    os.path.join(pbta_dir, "pbta-gene-counts-rsem-expected_counts-collapsed.polya.rds")
)
ribo_matrix = readRDS(
    os.path.join(
        pbta_dir, "pbta-gene-counts-rsem-expected_counts-collapsed.stranded.rds"
    )
)

In [5]:
with localconverter(ro.default_converter + pandas2ri.converter):
    polya_matrix_values = ro.conversion.rpy2py(polya_matrix)
    ribo_matrix_values = ro.conversion.rpy2py(ribo_matrix)

In [6]:
polya_matrix_df = pd.DataFrame(
    data=polya_matrix_values,
    index=polya_matrix.rownames,
    columns=polya_matrix.colnames,
)
ribo_matrix_df = pd.DataFrame(
    data=ribo_matrix_values,
    index=ribo_matrix.rownames,
    columns=ribo_matrix.colnames,
)

In [7]:
print(polya_matrix_df.shape)
polya_matrix_df.head()

(46669, 58)


Unnamed: 0,BS_0VXZCRJS,BS_0ZA67BBC,BS_1N7MQZGR,BS_21ET39G7,BS_2JP7RBMB,BS_3AC3SRWH,BS_49CJNZ06,BS_4PPHAQXF,BS_4PWDGEB0,BS_58YXHGAJ,...,BS_XGDPK33A,BS_XM1AHBDJ,BS_XZM79E42,BS_YDEVMD24,BS_Z3RCA1T9,BS_ZD5HN296,BS_ZF6BSFNF,BS_ZQ76ZBEX,BS_ZQKGJD70,BS_S6Q7NKA3
MT-CO1,688399.97,1717344.69,1772806.69,3078167.31,2723878.47,6885942.15,680061.84,1042630.21,914337.23,617360.8,...,2203371.66,8427824.83,1845979.95,937137.43,1507132.42,617386.88,4215126.83,1007418.57,1039377.64,661645.64
GFAP,768375.44,1387179.44,1006019.19,5504771.92,1369694.01,695792.99,785787.39,249792.48,17581.45,477984.96,...,2666392.86,1882227.28,887470.48,1432621.67,825726.41,1703132.86,7106466.55,591081.83,330628.19,5580.13
MT-ND4,489755.9,1145123.9,924159.04,1938223.0,1450140.71,3068246.64,571805.0,651345.95,588803.0,398712.0,...,1488926.0,4068850.88,1981890.78,642831.75,916844.9,422183.97,2378307.94,649967.94,700372.95,422620.94
MT-RNR2,300844.97,671339.98,851424.99,1779758.08,1269937.9,7393594.16,648332.0,452683.97,659187.94,547278.97,...,703290.35,6781675.29,2695483.95,426025.95,587416.74,362185.98,2147389.0,1153760.15,566549.57,453366.94
MT-CO3,240854.75,496228.72,484837.67,812566.87,824195.97,2359968.34,218951.97,390247.42,349555.99,248921.83,...,746947.77,3584811.77,854314.13,258096.47,518438.62,203834.66,1240214.33,551228.39,424287.39,343456.62


In [8]:
print(ribo_matrix_df.shape)
ribo_matrix_df.head()

(53733, 978)


Unnamed: 0,BS_014EVM2D,BS_02NZT8CE,BS_03FT4S8B,BS_0448A413,BS_044XZ8ST,BS_052PZFMK,BS_063ERW0R,BS_06AMMCXR,BS_06XH7EVF,BS_07ANYSYQ,...,BS_ZV68CES9,BS_ZXYDSBM9,BS_8QB4S4VA,BS_FN07P04C,BS_HE0WJRW6,BS_SB12W1XT,BS_SHJA4MR0,BS_D7XRFE0R,BS_FXJY0MNH,BS_KABQQA0T
RN7SL1,3661396.92,3327174.16,5550046.25,1965923.79,6550357.98,2026325.55,3827095.63,5441769.72,3610145.11,5364004.95,...,6244881.31,6521883.72,911946.19,464069.57,1379029.11,848175.88,3028272.44,402058.39,482731.48,254950.07
RN7SL2,3431613.65,3851473.35,4320780.53,1310282.92,7419442.45,1921091.65,3338142.72,4397447.69,3421223.85,3583367.64,...,5128691.61,3780172.04,291970.09,47283.78,230960.76,206637.48,634336.3,145624.77,357861.53,66016.86
RN7SK,2206026.49,1290427.07,8088687.52,2358988.97,2420641.12,1290446.02,1776870.94,4407036.62,1101433.58,2208905.56,...,6886563.64,6707821.66,627252.31,398677.86,392958.33,306270.02,664259.99,359012.94,129729.0,292205.96
RPPH1,545758.22,398569.8,1723165.49,496045.92,1029613.26,356049.5,533014.13,1015454.4,474172.88,652301.49,...,884634.01,2675293.05,105108.93,73934.94,137787.02,60266.62,188434.02,47832.27,25258.24,29995.14
GFAP,1737227.0,546908.0,4639.0,536.0,1534147.0,28696.0,435881.0,206504.0,502.0,1466138.0,...,4774.0,86558.0,173166.0,5966.0,944.0,277575.0,837906.0,1086107.0,1544412.0,353106.0


## Get matching samples

In [9]:
# Load metadata that maps RNA sample ids in expression matrices above
# to patient sample id
patient_metadata_filename = "https://raw.githubusercontent.com/kgaonkar6/OpenPBTA-analysis/532c29ab743bc643e687044bdb3e90241925186a/analyses/tp53_nf1_score/results/tp53_altered_status.tsv"

patient_metadata = pd.read_csv(
    patient_metadata_filename, sep="\t", index_col=0, header=0
)

In [10]:
patient_metadata.head(10)

Unnamed: 0_level_0,Kids_First_Biospecimen_ID_DNA,Kids_First_Biospecimen_ID_RNA,cancer_predispositions,tp53_score,SNV_indel_counts,CNV_loss_counts,HGVSp_Short,CNV_loss_evidence,hotspot,activating,tp53_altered
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
7316-10,BS_1RFBH1SP,BS_GDHH6T5A,NF-1,0.233945,0,0,,,0,0,Other
7316-100,BS_SFZ3A07S,BS_BHR08WGW,None documented,0.408286,0,0,,,0,0,Other
7316-101,BS_4RS1SC48,BS_QV51J756,None documented,0.610211,0,0,,,0,0,Other
7316-1038,,BS_VPBMDMQX,None documented,0.779996,0,0,,,0,0,Other
7316-104,BS_MRX1SPFB,BS_Y08D5P8N,None documented,0.25191,0,0,,,0,0,Other
7316-1052,,BS_TGS109T3,None documented,0.852086,0,0,,,0,0,Other
7316-1055,,BS_H1ZS5FMZ,None documented,0.903618,0,0,,,0,0,Other
7316-1057,,BS_58YXHGAJ,None documented,0.575641,0,0,,,0,0,Other
7316-1059,,BS_C41DJZ1F,NF-1,0.442893,0,0,,,0,0,Other
7316-1060,,BS_Z8F3GM85,None documented,0.845412,0,0,,,0,0,Other


In [11]:
# Select those patient sample ids (`sample_id) with multiple measurements
patient_metadata_tmp = patient_metadata[patient_metadata.index.value_counts() > 1]

# Select those with RNA sample ids (`Kids_First_Biospecimen_ID_RNA`) available
patient_metadata_selected = patient_metadata_tmp[
    patient_metadata_tmp["Kids_First_Biospecimen_ID_RNA"].isnull() == False
]

  


In [12]:
# Create dictionary to map patient sample ids with those RNA ids
# for polyA-selection and ribo-depleted processing (column ids from gene expression matrices)
patient_sample_ids = list(patient_metadata_selected.index.unique())
polya_sample_ids = list(polya_matrix_df.columns)
ribo_sample_ids = list(ribo_matrix_df.columns)

patient_to_polya_id = {}
patient_to_ribo_id = {}
for patient_id in patient_sample_ids:
    rna_sample_ids = patient_metadata_selected.loc[
        patient_id, "Kids_First_Biospecimen_ID_RNA"
    ]
    for rna_sample_id in rna_sample_ids:
        if rna_sample_id in polya_sample_ids:
            patient_to_polya_id[patient_id] = rna_sample_id
        if rna_sample_id in ribo_sample_ids:
            patient_to_ribo_id[patient_id] = rna_sample_id

patient_to_polya_id

{'7316-1455': 'BS_HWGWYCY7',
 '7316-161': 'BS_X0XXN9BK',
 '7316-255': 'BS_W4H1D4Y6',
 '7316-3214': 'BS_F0JB4EAK',
 '7316-536': 'BS_QKT3TJVK',
 '7316-85': 'BS_BYCX6VK1',
 '7316-87': 'BS_JGA9BP3A'}

In [13]:
patient_to_ribo_id

{'7316-14': 'BS_Z7PKVY9J',
 '7316-1455': 'BS_HE0WJRW6',
 '7316-1463': 'BS_RR12T74P',
 '7316-158': 'BS_TV5B86ZD',
 '7316-161': 'BS_SHJA4MR0',
 '7316-1746': 'BS_R5SNFB5B',
 '7316-1763': 'BS_6WP1FHTE',
 '7316-1765': 'BS_M7QJ8VF0',
 '7316-1769': 'BS_40MP5BWR',
 '7316-178': 'BS_RSRPX3S8',
 '7316-1893': 'BS_M85CXHDV',
 '7316-195': 'BS_P9JP6JFA',
 '7316-2151': 'BS_GXTFW99H',
 '7316-2176': 'BS_5GNQC2FF',
 '7316-2189': 'BS_PGK832G2',
 '7316-255': 'BS_FN07P04C',
 '7316-3058': 'BS_BWBDH9GM',
 '7316-536': 'BS_8QB4S4VA',
 '7316-85': 'BS_QYPHA40N',
 '7316-87': 'BS_1HQ76V6D',
 '7316-913': 'BS_M8EA6R2A'}

In [14]:
# Select patient sample ids with both polyA-selected and ribo-depleted measurements
shared_patient_ids = list(
    set(patient_to_polya_id.keys()).intersection(patient_to_ribo_id.keys())
)

# Check that these patient ids were consistent with previous analysis comparing TP53 status across platform:
# https://github.com/AlexsLemonade/OpenPBTA-analysis/pull/930
shared_patient_ids

['7316-255', '7316-87', '7316-161', '7316-1455', '7316-85', '7316-536']

## Select expression data

In [15]:
select_polya_ids = [patient_to_polya_id[x] for x in shared_patient_ids]
select_ribo_ids = [patient_to_ribo_id[x] for x in shared_patient_ids]

In [16]:
select_polya_expression = polya_matrix_df[select_polya_ids].T
select_ribo_expression = ribo_matrix_df[select_ribo_ids].T

## Format data matrix

* include only those genes that were used in our analysis --> take intersection with our compendium
    - Note: gene ENSEMBL ids already mapped to HGNC ids: https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/collapse-rnaseq

* Create metadata dataframe with grouping information for DE analysis. Make sure the samples are ordered by pairs

- Do we need to do anything with expression values to run DE????

In [17]:
# Read template file
template_SOPHIE = pd.read_csv(
    os.path.join(base_dir, "human_general_analysis", processed_template_filename),
    sep="\t",
    index_col=0,
    header=0,
)

In [18]:
template_SOPHIE.head()

Unnamed: 0,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A4GALT,A4GNT,AAAS,AACS,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
SRR493937,244,396,6,144077,2089,7,946,7,941,752,...,214,161,202,569,1618,9,768,12758,5060,773
SRR493938,230,384,8,142807,2062,8,968,3,978,776,...,219,171,215,576,1655,7,808,12925,5061,783
SRR493939,308,396,11,77651,1064,4,321,2,1633,1518,...,345,372,198,387,1315,30,842,4339,3304,743
SRR493940,303,351,13,77739,1125,12,325,0,1637,1481,...,318,317,155,448,1322,36,795,4400,3308,714
SRR493941,203,327,0,235645,2438,6,945,15,1520,1027,...,191,71,240,533,2006,4,1331,14318,4776,936


In [19]:
# Get SOPHIE gene ids
SOPHIE_gene_ids = list(template_SOPHIE.columns)

In [20]:
# Get shared gene ids between polyA and ribo
shared_platform_gene_ids = set(select_polya_expression.columns).intersection(
    select_ribo_expression.columns
)
print(len(shared_platform_gene_ids))

shared_gene_ids = list(set(shared_platform_gene_ids).intersection(SOPHIE_gene_ids))
print(len(shared_gene_ids))

46560
17665


In [21]:
# Select shared genes
select_polya_expression = select_polya_expression[shared_gene_ids]
select_ribo_expression = select_ribo_expression[shared_gene_ids]

print(select_polya_expression.shape)
print(select_ribo_expression.shape)

(6, 17665)
(6, 17665)


In [22]:
select_polya_expression.head()

Unnamed: 0,ATP6V1G1,ONECUT2,ZNF333,FSIP2,GTPBP1,PNRC1,MPDZ,IP6K1,NACC1,BCKDHB,...,CAMSAP1,SPPL2A,HERC6,LEF1-AS1,P2RX6,AOC2,ARHGAP31,ZNF200,CAMK2B,MYL4
BS_W4H1D4Y6,6460.0,17.0,1839.2,39.0,5462.53,4370.0,10079.0,7006.0,18097.38,1465.0,...,6458.0,5637.0,779.0,550.19,319.0,20.0,8081.0,1309.66,2561.0,2.0
BS_JGA9BP3A,5978.0,2824.0,2048.8,108.12,3247.58,2821.0,5600.0,5884.0,4728.77,1278.0,...,12255.0,3002.0,169.0,13.47,85.0,58.0,2136.0,807.0,1722.0,12.0
BS_X0XXN9BK,5432.0,55.0,1600.9,13.21,5960.19,15966.0,4216.0,9104.0,8064.61,1365.0,...,5412.0,5874.0,633.0,128.76,507.0,54.0,4167.0,728.0,290.0,70.0
BS_HWGWYCY7,13435.0,92.0,432.1,25.0,1760.55,3915.0,7393.0,6330.0,6829.0,2362.0,...,2535.0,4527.0,1384.0,4.0,329.0,14.0,3071.0,453.0,437.0,2.0
BS_BYCX6VK1,4809.0,11.0,1080.42,33.31,4363.99,2918.0,6198.0,7098.0,11791.27,1982.0,...,4715.0,5191.0,555.0,390.65,358.0,18.0,7897.0,568.0,8226.0,5.0


In [23]:
select_ribo_expression.head()

Unnamed: 0,ATP6V1G1,ONECUT2,ZNF333,FSIP2,GTPBP1,PNRC1,MPDZ,IP6K1,NACC1,BCKDHB,...,CAMSAP1,SPPL2A,HERC6,LEF1-AS1,P2RX6,AOC2,ARHGAP31,ZNF200,CAMK2B,MYL4
BS_FN07P04C,1126.0,41.0,2150.0,7.0,3851.0,4685.0,9291.0,3605.0,8293.0,773.0,...,3950.0,1988.0,692.0,629.0,124.0,168.0,9789.0,817.0,1152.0,2.0
BS_1HQ76V6D,2165.0,2225.0,1011.0,18.0,1007.0,1956.0,2289.0,2286.0,1297.0,503.0,...,5948.0,1355.0,84.0,12.0,56.0,53.0,813.0,381.0,799.0,9.0
BS_SHJA4MR0,2474.0,151.0,1569.0,6.0,12928.0,19766.0,11650.0,12717.0,11205.0,1388.0,...,10373.0,7219.0,996.0,27.0,63.0,26.0,11738.0,910.0,655.0,14.0
BS_HE0WJRW6,3273.0,422.0,560.0,24.0,2031.0,3052.0,8273.0,5690.0,5955.0,1210.0,...,3786.0,2150.0,1223.0,10.0,719.0,121.0,4240.0,403.0,375.0,4.0
BS_QYPHA40N,2866.0,128.0,1522.0,5.0,2891.0,6625.0,6957.0,4955.0,7917.0,1319.0,...,5301.0,3298.0,213.46,11.0,208.85,67.0,8840.0,571.0,3004.0,3.0


In [24]:
# Concatenate expression data
select_expression = pd.concat([select_polya_expression, select_ribo_expression])

select_expression.head(12)

Unnamed: 0,ATP6V1G1,ONECUT2,ZNF333,FSIP2,GTPBP1,PNRC1,MPDZ,IP6K1,NACC1,BCKDHB,...,CAMSAP1,SPPL2A,HERC6,LEF1-AS1,P2RX6,AOC2,ARHGAP31,ZNF200,CAMK2B,MYL4
BS_W4H1D4Y6,6460.0,17.0,1839.2,39.0,5462.53,4370.0,10079.0,7006.0,18097.38,1465.0,...,6458.0,5637.0,779.0,550.19,319.0,20.0,8081.0,1309.66,2561.0,2.0
BS_JGA9BP3A,5978.0,2824.0,2048.8,108.12,3247.58,2821.0,5600.0,5884.0,4728.77,1278.0,...,12255.0,3002.0,169.0,13.47,85.0,58.0,2136.0,807.0,1722.0,12.0
BS_X0XXN9BK,5432.0,55.0,1600.9,13.21,5960.19,15966.0,4216.0,9104.0,8064.61,1365.0,...,5412.0,5874.0,633.0,128.76,507.0,54.0,4167.0,728.0,290.0,70.0
BS_HWGWYCY7,13435.0,92.0,432.1,25.0,1760.55,3915.0,7393.0,6330.0,6829.0,2362.0,...,2535.0,4527.0,1384.0,4.0,329.0,14.0,3071.0,453.0,437.0,2.0
BS_BYCX6VK1,4809.0,11.0,1080.42,33.31,4363.99,2918.0,6198.0,7098.0,11791.27,1982.0,...,4715.0,5191.0,555.0,390.65,358.0,18.0,7897.0,568.0,8226.0,5.0
BS_QKT3TJVK,5939.0,2520.0,2098.68,12.06,9789.14,6536.0,4343.0,8971.0,3269.51,1679.0,...,7091.0,5331.0,380.0,31.78,12.0,25.0,5276.0,905.0,715.0,0.0
BS_FN07P04C,1126.0,41.0,2150.0,7.0,3851.0,4685.0,9291.0,3605.0,8293.0,773.0,...,3950.0,1988.0,692.0,629.0,124.0,168.0,9789.0,817.0,1152.0,2.0
BS_1HQ76V6D,2165.0,2225.0,1011.0,18.0,1007.0,1956.0,2289.0,2286.0,1297.0,503.0,...,5948.0,1355.0,84.0,12.0,56.0,53.0,813.0,381.0,799.0,9.0
BS_SHJA4MR0,2474.0,151.0,1569.0,6.0,12928.0,19766.0,11650.0,12717.0,11205.0,1388.0,...,10373.0,7219.0,996.0,27.0,63.0,26.0,11738.0,910.0,655.0,14.0
BS_HE0WJRW6,3273.0,422.0,560.0,24.0,2031.0,3052.0,8273.0,5690.0,5955.0,1210.0,...,3786.0,2150.0,1223.0,10.0,719.0,121.0,4240.0,403.0,375.0,4.0


In [25]:
# Create metadata grouping matrix
polya_ids = list(select_polya_expression.index)
ribo_ids = list(select_ribo_expression.index)

sample_ids = polya_ids + ribo_ids
labels = [1] * 6 + [2] * 6

sample_grouping_metadata = pd.DataFrame(data={"Sample": sample_ids, "group": labels})

sample_grouping_metadata.set_index("Sample", inplace=True)
sample_grouping_metadata

Unnamed: 0_level_0,group
Sample,Unnamed: 1_level_1
BS_W4H1D4Y6,1
BS_JGA9BP3A,1
BS_X0XXN9BK,1
BS_HWGWYCY7,1
BS_BYCX6VK1,1
BS_QKT3TJVK,1
BS_FN07P04C,2
BS_1HQ76V6D,2
BS_SHJA4MR0,2
BS_HE0WJRW6,2


## Save
- Save metadata df and expression data df

In [26]:
expression_data_filename = "polya_ribo_expression.tsv"
sample_grouping_filename = "polya_ribo_sample_grouping.tsv"

select_expression.to_csv(expression_data_filename, sep="\t")
sample_grouping_metadata.to_csv(sample_grouping_filename, sep="\t")