In [1]:
import numpy as np
import pandas as pd
import qnorm

In [2]:
ref_tf_expressions = pd.read_csv('reference_tf/reference_TPM_tf_expr.csv',header=0,sep='\t',index_col=[0])

ref = qnorm.quantile_normalize(ref_tf_expressions).mean(axis=1)

In [3]:
corgi_samples = np.load('/project/deeprna_data/pretraining_data_final2/tf_expression.npy')
corgi_samples = ((corgi_samples - 1) ** 10) - 0.1 # undo the log transform, convert to raw tpm
corgi_samples.shape

(600, 2891)

In [4]:
with open('/project/deeprna/data/tf_genes/trans_regulators_final_hgnc.txt', 'r') as f:
    tf_list = f.read().strip().split()

In [5]:
tf_intersection = {x:i for i,x in enumerate(tf_list) if x in ref_tf_expressions.index}
len(tf_intersection)

695

In [6]:
# Tissue 124 (brain)

tissue_124 = pd.DataFrame(index=ref_tf_expressions.index, columns=['1'])
tissue_124.loc[list(tf_intersection), '1'] = corgi_samples[124, list(tf_intersection.values())]
tissue_124 = tissue_124.fillna(0)
tissue_124 = qnorm.quantile_normalize(tissue_124, target=ref.values)
tissue_124.head()

Unnamed: 0,1
AFP,4.180882
AHR,3.719742
AIRE,0.540725
ALX1,10.196103
ALX3,2.19655


In [7]:
tissue_124.to_csv('/project/deeprna_data/epigept/corgi_benchmark/tissue_124_qn.csv')

In [8]:
# Tissue 192 (heart)

tissue_192 = pd.DataFrame(index=ref_tf_expressions.index, columns=['1'])
tissue_192.loc[list(tf_intersection), '1'] = corgi_samples[192, list(tf_intersection.values())]
tissue_192 = tissue_192.fillna(0)
tissue_192 = qnorm.quantile_normalize(tissue_192, target=ref.values)
tissue_192.head()

Unnamed: 0,1
AFP,3.436351
AHR,22.262956
AIRE,5.898566
ALX1,10.170887
ALX3,10.170887


In [9]:
tissue_192.to_csv('/project/deeprna_data/epigept/corgi_benchmark/tissue_192_qn.csv')

In [16]:
for tissue in [213, 277, 323, 59]:
    tissue_192 = pd.DataFrame(index=ref_tf_expressions.index, columns=['1'])
    tissue_192.loc[list(tf_intersection), '1'] = corgi_samples[tissue, list(tf_intersection.values())]
    tissue_192 = tissue_192.fillna(0)
    tissue_192 = qnorm.quantile_normalize(tissue_192, target=ref.values)
    print(tissue, tissue_192.head())
    tissue_192.to_csv(f'/project/deeprna_data/epigept/corgi_benchmark/tissue_{tissue}_qn.csv')
    

213               1
AFP    5.359495
AHR   28.667548
AIRE   5.780752
ALX1   2.523568
ALX3  12.416238
277               1
AFP    6.068110
AHR    7.931629
AIRE  19.447317
ALX1  19.447317
ALX3  19.447317
323               1
AFP    8.800755
AHR   13.248534
AIRE   6.121095
ALX1  11.298947
ALX3  10.290151
59               1
AFP    4.180882
AHR    5.646183
AIRE  14.729605
ALX1   5.646183
ALX3  14.729605


## Bed file for epigept input

In [10]:
def parse_bed_file_with_coords(bed_path):
    """
    Parse a BED file and return a list of tuples (chr, start, end).
    Assumes no header and at least three columns.
    """
    df = pd.read_csv(bed_path, sep='\t', header=None, names=["chr", "start", "end", "fold"])
    coords = []
    for _, row in df.iterrows():
        coords.append((row["chr"], int(row["start"]), int(row["end"])))
    return coords

def tile_region(chrom, start, end, sequence_length, stride, drop_last=True):
    """
    Given a region defined by (chrom, start, end),
    create tiles of length sequence_length with the given stride.
    Returns a list of tuples: (chrom, tile_start, tile_end)
    """
    tiles = []
    region_length = end - start
    if region_length < sequence_length:
        return tiles
    current_start = start
    while current_start + sequence_length <= end:
        tiles.append((chrom, str(current_start), str(current_start + sequence_length)))
        current_start += stride
    # Optionally, if drop_last is False, add a shifted tile.
    return tiles

def tile_regions(regions, sequence_length, stride, drop_last=True):
    """
    For a list of regions (each as (chr, start, end)), return all tiles.
    """
    tiled = []
    for region in regions:
        chrom, start, end = region
        tiled.extend(tile_region(chrom, start, end, sequence_length, stride, drop_last))
    return tiled

In [11]:
bed = parse_bed_file_with_coords('/project/deeprna/benchmark/fold3_notf_merged.bed')

tiles = tile_regions(bed, 128000, 128000)

len(tiles)

2224

In [12]:
with open('/project/deeprna_data/epigept/fold3_notf_epigept.bed', 'w') as f:
    f.write('\n'.join(['\t'.join(x) for x in tiles]))

In [13]:
tiles2 = tile_regions([(x,y,z) for (x,y,z) in bed if x == 'chr8'], 128000, 128000)
len(tiles2)

209

In [14]:
for i in range(4):
    with open(f'/project/deeprna_data/epigept/fold3_notf_epigept_chr8_{i}.bed', 'w') as f:
        f.write('\n'.join(['\t'.join(x) for x in tiles2[i*50:(i+1)*50]]))