In [49]:
import gzip
import h5py
import numpy as np

# Sample identifier; every path below is derived from it so that switching
# samples only requires changing this one value.
SAMPLE = "HSC"

# NOTE(review): absolute, machine-specific project root — adjust when running elsewhere.
PROJECT_DIR = "/work/aaa/projects/chrombpnet-devmult"

# Input: peak BED file for the sample (read line by line further down).
BED_PKS = f"{PROJECT_DIR}/pipeline/resources/peaks/{SAMPLE}/{SAMPLE}.no_blacklist.bed"
# Input: modisco HDF5 results holding the pos/neg pattern groups and their seqlets.
H5_SCORES = (
    f"{PROJECT_DIR}/pipeline/results/chrombpnet_nobias/pretrained_bias/"
    f"{SAMPLE}/mean/modisco/{SAMPLE}_mean.counts_scores.modisco.h5"
)
# Output: gzip-compressed BEDPE of seqlet/peak pairs. Derived from SAMPLE
# (previously the sample name was hard-coded in this path).
BEDPE_OUT = f"{PROJECT_DIR}/scratch/bed/{SAMPLE}_mean.seqlet_peak_pairs.bedpe.gz"

# Width of the window, centred on each peak's midpoint, that seqlet start/end
# offsets are applied to when converting them to genomic coordinates.
WINDOW_SIZE = 500

In [2]:
def explore_h5(name, obj):
    """Print a one-line description of a single HDF5 node.

    Intended as a callback for ``visititems``: groups are reported by name,
    datasets by name, shape and dtype (plus their attributes when any are
    set). Always returns ``None`` so the traversal is never cut short.
    """
    if isinstance(obj, h5py.Group):
        print(f"Group: {name}")
        return
    if not isinstance(obj, h5py.Dataset):
        # Neither a group nor a dataset: nothing to report.
        return
    print(f"Dataset: {name} | Shape: {obj.shape} | Type: {obj.dtype}")
    if obj.attrs:  # only mention attributes when at least one is present
        print(f"  Attributes: {dict(obj.attrs)}")


# Walk the whole results file (read-only) and describe every node in it.
with h5py.File(H5_SCORES, "r") as h5_file:
    h5_file.visititems(explore_h5)

Group: pos_patterns
Group: pos_patterns/pattern_0
Dataset: pos_patterns/pattern_0/contrib_scores | Shape: (30, 4) | Type: float64
Dataset: pos_patterns/pattern_0/hypothetical_contribs | Shape: (30, 4) | Type: float64
Group: pos_patterns/pattern_0/seqlets
Dataset: pos_patterns/pattern_0/seqlets/contrib_scores | Shape: (45009, 30, 4) | Type: float32
Dataset: pos_patterns/pattern_0/seqlets/end | Shape: (45009,) | Type: int64
Dataset: pos_patterns/pattern_0/seqlets/example_idx | Shape: (45009,) | Type: int64
Dataset: pos_patterns/pattern_0/seqlets/hypothetical_contribs | Shape: (45009, 30, 4) | Type: float32
Dataset: pos_patterns/pattern_0/seqlets/is_revcomp | Shape: (45009,) | Type: bool
Dataset: pos_patterns/pattern_0/seqlets/n_seqlets | Shape: (1,) | Type: int64
Dataset: pos_patterns/pattern_0/seqlets/sequence | Shape: (45009, 30, 4) | Type: float32
Dataset: pos_patterns/pattern_0/seqlets/start | Shape: (45009,) | Type: int64
Dataset: pos_patterns/pattern_0/sequence | Shape: (30, 4) | T

In [13]:
# Load the peak file as a list of raw text lines (one entry per peak, no
# trailing newline). Entries are split into fields later, on demand.
peak_rows = None  # cleared first so a failed read cannot leave rows from an earlier run
with open(BED_PKS) as bed_handle:
    peak_rows = bed_handle.read().splitlines()

In [39]:
# Preview: for the first positive pattern only, print a few summary
# statistics (sum of |x|, sum of x, sum of x^2) of the actual and the
# hypothetical contribution scores of its first six seqlets.
first_set = None
with h5py.File(H5_SCORES, "r") as grp:
    for pattern_name, datasets in grp['pos_patterns'].items():
        seqlets = datasets['seqlets']
        # At most six seqlets are shown, fewer if the pattern has fewer.
        preview_count = min(seqlets['start'].shape[0], 6)
        for idx in range(preview_count):
            seqlet_name = f'{pattern_name}.{idx}'
            row_num = seqlets['example_idx'][idx]
            contrib_score = seqlets['contrib_scores'][idx]
            hyp_contribs = seqlets['hypothetical_contribs'][idx]
            print(f'seqlet: {seqlet_name}')
            print(f'target peak: {row_num}')
            for scores in (contrib_score, hyp_contribs):
                print(np.sum(np.abs(scores)), np.sum(scores), np.sum(scores**2))

        # Stop after the first pattern; this cell is only a spot check.
        break

seqlet: pattern_0.0
target peak: 61788
1.854023 1.6346931 0.27259195
4.9486613 2.8651013 0.4445321
seqlet: pattern_0.1
target peak: 13795
1.9545879 1.8569584 0.26404402
4.9942217 2.6684313 0.44210005
seqlet: pattern_0.2
target peak: 69053
1.810647 1.6594934 0.2686778
4.750456 1.8176861 0.45345098
seqlet: pattern_0.3
target peak: 27606
1.6140528 1.4647436 0.16965793
4.2320867 3.4660268 0.28540647
seqlet: pattern_0.4
target peak: 43389
1.6914406 1.651722 0.19399147
4.369465 2.1167815 0.34056684
seqlet: pattern_0.5
target peak: 52791
1.6808987 1.6808987 0.23862782
4.6276927 2.4096303 0.4094422


In [40]:
# Show the first raw peak line to check the tab-separated column layout that
# the export cell relies on when it splits each row into fields.
peak_rows[0]

'chr1\t1615174\t1615675\tchr1:1615174-1615675\t958.759\t*\t.\t-1\t-1\t250'

In [51]:
# Export one tab-separated, gzip-compressed record per seqlet, pairing the
# seqlet's genomic interval with the peak it was found in.
#
# Columns written, in order:
#   chrom, seqlet_start, seqlet_end,
#   chrom, target_start, target_end,
#   seqlet_name, seqlet_score, seqlet_strand, target_strand, target_score
#
# Requires `peak_rows` (raw peak lines) to have been loaded already.

# Loop-invariant: distance from the peak midpoint back to the start of the
# window that the seqlet start/end offsets are applied to.
window_center_offset = WINDOW_SIZE // 2

# The input is opened before the output so that an unreadable HDF5 file does
# not create or truncate the output file.
with h5py.File(H5_SCORES, "r") as grp, gzip.open(BEDPE_OUT, 'wt') as f_out:
    for contribution_dir in ['pos', 'neg']:
        patterns_category = f'{contribution_dir}_patterns'
        if patterns_category not in grp:
            # Not every results file has both positive and negative patterns.
            continue
        for (pattern_name, datasets) in grp[patterns_category].items():
            print(f'{contribution_dir}_{pattern_name}')

            # Read each per-seqlet dataset once per pattern. Indexing an HDF5
            # dataset element by element inside the loop issues a separate
            # read for every seqlet and every field.
            seqlets = datasets['seqlets']
            example_idxs = seqlets['example_idx'][:]
            start_offsets = seqlets['start'][:]
            end_offsets = seqlets['end'][:]
            revcomp_flags = seqlets['is_revcomp'][:]
            contrib_scores = seqlets['contrib_scores'][:]

            # Process each seqlet within the pattern.
            for idx in range(start_offsets.shape[0]):
                seqlet_name = f'{contribution_dir}_patterns.{pattern_name}.{idx}'

                # The seqlet's example index selects the peak row it belongs to.
                target_idx = example_idxs[idx]
                peak_row = peak_rows[target_idx].split('\t')
                # chrom(s)
                chrom = peak_row[0]

                # positions
                target_start = int(peak_row[1])
                target_end = int(peak_row[2])

                target_center = (target_end + target_start) // 2
                window_start = target_center - window_center_offset
                # NOTE(review): the start is shifted by +1 while the end is not.
                # BEDPE starts are conventionally 0-based — confirm this shift
                # is intended for the downstream consumer.
                seqlet_start = window_start + (start_offsets[idx] + 1)
                seqlet_end = window_start + end_offsets[idx]

                # scores
                target_score = peak_row[4]
                # Seqlet score: sum of squared contribution scores over the seqlet.
                seqlet_score = np.sum(contrib_scores[idx] ** 2)

                # strand
                seqlet_strand = '-' if revcomp_flags[idx] else '+'
                target_strand = peak_row[5]

                row_vars = [chrom, seqlet_start, seqlet_end,
                            chrom, target_start, target_end,
                            seqlet_name, seqlet_score,
                            seqlet_strand, target_strand,
                            target_score]
                f_out.write("\t".join(map(str, row_vars)) + "\n")

pos_pattern_0
pos_pattern_1
pos_pattern_10
pos_pattern_11
pos_pattern_12
pos_pattern_13
pos_pattern_14
pos_pattern_15
pos_pattern_16
pos_pattern_17
pos_pattern_18
pos_pattern_19
pos_pattern_2
pos_pattern_20
pos_pattern_21
pos_pattern_22
pos_pattern_23
pos_pattern_24
pos_pattern_25
pos_pattern_26
pos_pattern_27
pos_pattern_28
pos_pattern_29
pos_pattern_3
pos_pattern_30
pos_pattern_31
pos_pattern_32
pos_pattern_33
pos_pattern_34
pos_pattern_35
pos_pattern_36
pos_pattern_37
pos_pattern_38
pos_pattern_39
pos_pattern_4
pos_pattern_40
pos_pattern_41
pos_pattern_42
pos_pattern_43
pos_pattern_44
pos_pattern_45
pos_pattern_46
pos_pattern_47
pos_pattern_48
pos_pattern_49
pos_pattern_5
pos_pattern_50
pos_pattern_51
pos_pattern_52
pos_pattern_53
pos_pattern_54
pos_pattern_55
pos_pattern_56
pos_pattern_57
pos_pattern_58
pos_pattern_59
pos_pattern_6
pos_pattern_60
pos_pattern_61
pos_pattern_62
pos_pattern_63
pos_pattern_64
pos_pattern_7
pos_pattern_8
pos_pattern_9
