In [12]:
%load_ext autoreload
%autoreload 2
from IPython.display import display
import functools
import polars as pl
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.stats import false_discovery_control
import pickle
import os
os.chdir("/zata/zippy/ramirezc/splice-model-benchmark/golden_standard")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
with open('intron_file_paths.pkl', 'rb') as f:
    paths = pickle.load(f)

In [None]:
paths_list = list(paths)
print(paths_list)
sample_dict = {
    'path': paths_list,
    'biosample': [path.split('/')[-2] for path in paths_list],
    'accession': [path.split('/')[-1].split('.')[0] for path in paths_list]
}
sample_df = pl.from_dict(sample_dict)

# id is reads_per_million
dataframes = []
for row in sample_df.iter_rows(named=True):
    id = row['biosample'] + '_' + row['accession']
    dataframes.append(
        pl.read_csv(
            row['path'], separator='\t'
        ).filter(
            pl.col('reads') > 18
        ).with_columns(
            np.log2(pl.col('reads_per_million')).alias('reads_per_million')
        )
        # .with_columns(
        #     ((pl.col('reads_per_million') - pl.col('reads_per_million').mean()) / pl.col('reads_per_million').std(ddof=1)).alias('reads_per_million')
        # )
        .select(
            'chrom', 'start', 'end', 'strand', 'gene_name', 'reads_per_million'
        ).rename(
            {'reads_per_million': id}
        ).to_pandas()
    )

['downloads/ENCBS481WHG/ENCFF564ONS.introns.filtered.tab', 'downloads/ENCBS914REQ/ENCFF502LAB.introns.filtered.tab', 'downloads/ENCBS756BZF/ENCFF623IBV.introns.filtered.tab', 'downloads/ENCBS075YII/ENCFF899RYN.introns.filtered.tab', 'downloads/ENCBS020DQC/ENCFF835KGV.introns.filtered.tab', 'downloads/ENCBS060KJN/ENCFF320GCF.introns.filtered.tab', 'downloads/ENCBS119CST/ENCFF045ZQI.introns.filtered.tab', 'downloads/ENCBS777OZT/ENCFF745DHX.introns.filtered.tab', 'downloads/ENCBS555BGE/ENCFF044LIA.introns.filtered.tab', 'downloads/ENCBS820SXP/ENCFF049QIP.introns.filtered.tab', 'downloads/ENCBS023JCN/ENCFF600UQU.introns.filtered.tab', 'downloads/ENCBS705XRW/ENCFF173JOL.introns.filtered.tab', 'downloads/ENCBS511ZVG/ENCFF100YYU.introns.filtered.tab', 'downloads/ENCBS021NLC/ENCFF100EPL.introns.filtered.tab', 'downloads/ENCBS112SEO/ENCFF007EDQ.introns.filtered.tab', 'downloads/ENCBS869GJQ/ENCFF259PEY.introns.filtered.tab', 'downloads/ENCBS996RLF/ENCFF903XBV.introns.filtered.tab', 'downloads/EN

In [4]:
# TYSM: https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns
full_dataframe = functools.reduce(lambda left, right: pd.merge(left, right, on=['chrom', 'start', 'end', 'strand', 'gene_name'], how='outer'), dataframes)

In [5]:
full_df_pl = pl.from_pandas(full_dataframe)
bed_columns = ['chrom', 'start', 'end', 'strand', 'gene_name']
sample_columns = [col for col in full_df_pl.columns if col not in bed_columns]
non_zero_count = full_df_pl.with_columns(
    (pl.sum_horizontal([pl.col(sample_columns) > 0]).alias('non_null_count'))
)
filtered_df = non_zero_count.filter(pl.col('non_null_count') >= 3).drop('non_null_count')
display(filtered_df)

chrom,start,end,strand,gene_name,ENCBS481WHG_ENCFF564ONS,ENCBS914REQ_ENCFF502LAB,ENCBS756BZF_ENCFF623IBV,ENCBS075YII_ENCFF899RYN,ENCBS020DQC_ENCFF835KGV,ENCBS060KJN_ENCFF320GCF,ENCBS119CST_ENCFF045ZQI,ENCBS777OZT_ENCFF745DHX,ENCBS555BGE_ENCFF044LIA,ENCBS820SXP_ENCFF049QIP,ENCBS023JCN_ENCFF600UQU,ENCBS705XRW_ENCFF173JOL,ENCBS511ZVG_ENCFF100YYU,ENCBS021NLC_ENCFF100EPL,ENCBS112SEO_ENCFF007EDQ,ENCBS869GJQ_ENCFF259PEY,ENCBS996RLF_ENCFF903XBV,ENCBS710VQU_ENCFF349CHO,ENCBS961BBX_ENCFF395SXY,ENCBS806IYJ_ENCFF563QXX,ENCBS206PUV_ENCFF322UJU,ENCBS648NSQ_ENCFF545PJV,ENCBS632QXG_ENCFF018PZX,ENCBS365LJQ_ENCFF814ABW,ENCBS538ZJX_ENCFF193WEX,ENCBS734JMT_ENCFF118JEI,ENCBS735OAR_ENCFF132YCF,ENCBS580WTV_ENCFF840OVC,ENCBS271WJL_ENCFF291EKY,ENCBS975OUN_ENCFF219UJG,ENCBS741WNS_ENCFF693YCD,ENCBS629ZGV_ENCFF791WUV,…,ENCBS422TMB_ENCFF406GQU,ENCBS819UHF_ENCFF773RAW,ENCBS105DDE_ENCFF279ABL,ENCBS660AUE_ENCFF378STM,ENCBS155WDP_ENCFF049QGQ,ENCBS155WDP_ENCFF846YHI,ENCBS932EVS_ENCFF117DUA,ENCBS932EVS_ENCFF772MSZ,ENCBS034WOR_ENCFF925MYC,ENCBS572IFJ_ENCFF558QPF,ENCBS892JRQ_ENCFF472TSL,ENCBS251SEQ_ENCFF973OML,ENCBS779FCS_ENCFF058HQU,ENCBS823URA_ENCFF074WRN,ENCBS756ETL_ENCFF583MSU,ENCBS832MSN_ENCFF803KIA,ENCBS825BIV_ENCFF509GHY,ENCBS713BMM_ENCFF292UIE,ENCBS654ADT_ENCFF738RAA,ENCBS759AII_ENCFF745HHL,ENCBS472VJA_ENCFF644PGG,ENCBS692CKU_ENCFF437SYY,ENCBS231BSF_ENCFF779VVX,ENCBS034QSD_ENCFF344KQF,ENCBS232CCX_ENCFF971JDY,ENCBS420JRY_ENCFF757LOZ,ENCBS725WGH_ENCFF809QBD,ENCBS911SOZ_ENCFF911RNV,ENCBS601OTT_ENCFF936VUF,ENCBS958FWY_ENCFF939EUU,ENCBS825BLU_ENCFF731THW,ENCBS200JOD_ENCFF319JFG,ENCBS417JEU_ENCFF901XCR,ENCBS789UUH_ENCFF305AFY,ENCBS411JUI_ENCFF222UTL,ENCBS239VAD_ENCFF100RGC,ENCBS239VAD_ENCFF927MKK
str,i64,i64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""chr1""",924948,925921,"""+""","""SAMD11""",,,,,,3.731355,,,,,,,,,,,,,,,,,,,,,,,4.949077,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""chr1""",926013,930154,"""+""","""SAMD11""",,,,,,3.908233,,,,,,,,,,,,,,,,,,,,,,,5.069371,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""chr1""",930336,931038,"""+""","""SAMD11""",,,,,,3.85165,,,,,,,,,,,,,,,,,,,,3.565954,,,5.332405,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""chr1""",931089,935771,"""+""","""SAMD11""",,,,,,3.908233,,,,,,,,,,,,,,,,,,,,,,,5.23287,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""chr1""",935896,939039,"""+""","""SAMD11""",,,,,,4.015148,,,,,,,,,,,,,,,,,,,,3.565954,,,5.23287,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chrY""",20579691,20582589,"""+""","""EIF1AY""",,,6.264525,,,,,,,5.641109,5.457032,6.615281,,,6.119373,6.322933,,,,,,,6.48223,,,,6.259985,6.761406,6.23287,,,,…,,,,,4.668331,,,5.269015,,,,,,,,,,,,,,,,,,4.786175,,,,,,,,,,4.855306,6.048332
"""chrY""",20582693,20584473,"""+""","""EIF1AY""",,,6.207942,,,,,,,5.618208,5.418557,6.615281,,,6.119373,6.228957,,,,,,,6.42083,,,,6.218565,6.74463,6.180402,,,,…,,,,,4.668331,4.635258,,5.105517,,,,,,,,,,,,,,,,,,4.786175,,,,,,,,,,4.784916,5.938707
"""chrY""",20584524,20588023,"""+""","""EIF1AY""",,,6.207942,,,,,,,5.594938,5.365608,6.615281,,,6.094711,6.186728,,,,,,,6.42083,,,,6.232504,6.74463,6.206875,,,,…,,,,,4.668331,4.635258,,4.985222,,,,,,,,,,,,,,,,,,4.786175,,,,,,,,,,4.784916,5.938707
"""chrY""",20588105,20589483,"""+""","""EIF1AY""",,,6.227051,,,,,,,5.641109,5.457032,6.637307,,,6.131548,6.322933,,,,,,,6.451857,,,,6.273531,6.777988,6.23287,,,,…,,,,,4.668331,4.635258,,5.216548,,,,,,,,,,,,,,,,,,4.786175,,,,,,,,,,4.784916,5.976182


In [6]:
metadata = pl.read_csv('biosample_matched_rna_seq_experiments.tsv', separator='\t')
biosample_to_term_name = metadata.select('Biosample accession', 'Biosample term name')
biosample_dict = dict(zip(
    biosample_to_term_name['Biosample accession'],
    biosample_to_term_name['Biosample term name']
))
biosample_dict

{'ENCBS481WHG': 'dorsolateral prefrontal cortex',
 'ENCBS660AUE': 'dorsolateral prefrontal cortex',
 'ENCBS422TMB': 'dorsolateral prefrontal cortex',
 'ENCBS644WSW': 'dorsolateral prefrontal cortex',
 'ENCBS105DDE': 'dorsolateral prefrontal cortex',
 'ENCBS819UHF': 'dorsolateral prefrontal cortex',
 'ENCBS914REQ': 'heart left ventricle',
 'ENCBS756BZF': 'heart left ventricle',
 'ENCBS075YII': 'Caco-2',
 'ENCBS020DQC': 'Caco-2',
 'ENCBS060KJN': 'A673',
 'ENCBS119CST': 'A673',
 'ENCBS777OZT': 'Panc1',
 'ENCBS555BGE': 'PC-3',
 'ENCBS820SXP': 'endothelial cell of umbilical vein',
 'ENCBS023JCN': 'endothelial cell of umbilical vein',
 'ENCBS705XRW': 'Right ventricle myocardium inferior',
 'ENCBS511ZVG': 'PC-9',
 'ENCBS021NLC': 'PC-9',
 'ENCBS112SEO': 'OCI-LY7',
 'ENCBS869GJQ': 'OCI-LY7',
 'ENCBS996RLF': 'mammary epithelial cell',
 'ENCBS710VQU': 'mammary epithelial cell',
 'ENCBS961BBX': 'Calu3',
 'ENCBS806IYJ': 'Calu3',
 'ENCBS206PUV': 'K562',
 'ENCBS648NSQ': 'IMR-90',
 'ENCBS632QXG': 'hea

In [None]:
biosample_groups = {}
for col in sample_columns:
    biosample_id = col.split('_')[0]
    biosample_term_name = biosample_dict.get(biosample_id).replace(' ', '_')
    biosample_groups.setdefault(biosample_term_name, []).append(col)
    
result_df = filtered_df.select(bed_columns).clone()

all_sample_data = filtered_df.select(sample_columns).to_numpy()
all_values = all_sample_data.flatten()
valid_mask = ~np.isnan(all_values)
global_mean = np.mean(all_values[valid_mask])
global_std = np.std(all_values[valid_mask], ddof=1)

non_nan_count = np.zeros(len(filtered_df))

for biosample_term_name, columns in biosample_groups.items():
    group_data = filtered_df.select(columns).to_numpy()
    group_means = np.nanmean(group_data, axis=1)
    z_scores = np.full(len(group_means), np.nan)
    p_values = np.full(len(group_means), np.nan)  
     
    valid_indices = ~np.isnan(group_means)
    if np.any(valid_indices) and global_std > 0:
        z_scores[valid_indices] = (group_means[valid_indices] - global_mean) / global_std
        p_values[valid_indices] = 2 * (1 - stats.norm.cdf(np.abs(z_scores[valid_indices])))
    
        adjusted_p_values = false_discovery_control(p_values[valid_indices], method='bh')
        
        full_adjusted_p_values = np.full_like(p_values, np.nan)
        full_adjusted_p_values[valid_indices] = adjusted_p_values
    
    non_nan_mask = ~np.isnan(group_data)
    row_has_value = np.any(non_nan_mask, axis=1)
    non_nan_count += row_has_value.astype(int) 
    
    write_df = result_df.with_columns([
        pl.Series(z_scores).alias("zscore"),
        pl.Series(p_values).alias("pvalue"),
        pl.Series(full_adjusted_p_values).alias("adj_pvalue"),
        pl.Series(non_nan_count).alias("n_samples")
    ]).sort("adj_pvalue")
    write_df.write_csv(f"sig_calcs/{biosample_term_name}.tsv", separator='\t')

  group_means = np.nanmean(group_data, axis=1)


In [None]:
# import subprocess

# genome_sizes = subprocess.run(["cut", "-f1,2", "GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.fai"],
#                                      capture_output=True, text=True)
# with open('genome.sizes', 'w') as f:
#     f.write(genome_sizes.stdout)