# Window master file

In [1]:
import os
import numpy as np
import pandas as pd
import pybedtools
from pybedtools.featurefuncs import greater_than

In [2]:
# Scratch directory where pybedtools writes its intermediate files.
TMP_DIR = "/data7/deepro/starrseq/4_ml_classification_fragment_category/data/tmp"
# Create the directory up front so a fresh Restart & Run All does not fail when
# it is missing (pybedtools is expected to reject a non-existent temp dir).
os.makedirs(TMP_DIR, exist_ok=True)
pybedtools.helpers.set_tempdir(TMP_DIR)

In [3]:
def make_windows_faster(in_bed, out_bed, window_size=500, window_stride=50):
    """
    Tile the regions of a BED file with sliding windows and save them to disk.

    Parameters
    ----------
    in_bed : BED input handed to ``window_maker`` as ``b`` (the notebook passes a file path).
    out_bed : path the filtered windows are written to.
    window_size : window width passed to ``window_maker`` as ``w``.
    window_stride : step between window starts passed to ``window_maker`` as ``s``.

    Returns
    -------
    The BedTool returned by ``saveas(out_bed)``.
    """
    # Threshold handed to `greater_than`: windows must be longer than this to be kept
    # (presumably this drops the truncated windows at the end of each region).
    min_length = window_size - window_stride + 1
    tiled = pybedtools.BedTool().window_maker(b=in_bed, w=window_size, s=window_stride)
    long_enough = tiled.filter(greater_than, min_length)
    return long_enough.saveas(out_bed)

In [4]:
# Input regions, and the file that will hold their 1000 bp windows.
master_bed = "/data7/deepro/starrseq/4_ml_classification_fragment_category/data/master_filtered.bed"
master_window_bed = "/data7/deepro/starrseq/4_ml_classification_fragment_category/data/master_filtered_w1000.bed"

# NOTE: from here on `master_bed` is the windowed BedTool, no longer the input path.
master_bed = make_windows_faster(
    in_bed=master_bed,
    out_bed=master_window_bed,
    window_size=1000,
    window_stride=100,
)

# For each peak file, label a window as a peak when it covers at least 25% of some peak in that file (`F=0.25`)

In [5]:
# Library names; each is used as a sub-directory name and as a label column.
libs = [
    "CC",
    "ATF2",
    "CTCF",
    "FOXA1",
    "LEF1",
    "SCRT1",
    "TCF7L2",
    "16P12_1",
]

In [6]:
# Peak files are laid out as <LIB_DIR>/<library>/peaks.bed.gz
LIB_DIR = "/data7/deepro/starrseq/4_ml_classification_fragment_category/data"
lib_peak_files = [
    os.path.join(LIB_DIR, lib_name, "peaks.bed.gz")
    for lib_name in libs
]

In [7]:
# Wrap every peak file in a BedTool, in the same order as `libs`.
lib_peak_beds = list(map(pybedtools.BedTool, lib_peak_files))

In [8]:
# Show the BedTool handles to check which peak file each one wraps.
lib_peak_beds

[<BedTool(/data7/deepro/starrseq/4_ml_classification_fragment_category/data/CC/peaks.bed.gz)>,
 <BedTool(/data7/deepro/starrseq/4_ml_classification_fragment_category/data/ATF2/peaks.bed.gz)>,
 <BedTool(/data7/deepro/starrseq/4_ml_classification_fragment_category/data/CTCF/peaks.bed.gz)>,
 <BedTool(/data7/deepro/starrseq/4_ml_classification_fragment_category/data/FOXA1/peaks.bed.gz)>,
 <BedTool(/data7/deepro/starrseq/4_ml_classification_fragment_category/data/LEF1/peaks.bed.gz)>,
 <BedTool(/data7/deepro/starrseq/4_ml_classification_fragment_category/data/SCRT1/peaks.bed.gz)>,
 <BedTool(/data7/deepro/starrseq/4_ml_classification_fragment_category/data/TCF7L2/peaks.bed.gz)>,
 <BedTool(/data7/deepro/starrseq/4_ml_classification_fragment_category/data/16P12_1/peaks.bed.gz)>]

In [9]:
# Build one column of overlap counts per library, indexed by window coordinates.
# Frames are collected and concatenated once, rather than growing `label_df`
# inside the loop starting from an empty frame.
per_lib_counts = []
for lib, lpb in zip(libs, lib_peak_beds):
    # c=True reports, per window, how many peaks overlap it; F=0.25 sets the
    # minimum overlap as a fraction of the peak (the `b` file).
    intersect_df = master_bed.intersect(b=lpb, c=True, F=0.25).to_dataframe()
    # The count arrives in the column `to_dataframe` names "name"
    # (assumes the windows are 3-column BED — TODO confirm).
    intersect_df = intersect_df.rename(columns={"chrom": "chrm", "name": lib}).set_index(["chrm", "start", "end"])
    per_lib_counts.append(intersect_df)

# With no libraries there is nothing to concatenate; keep the empty frame the
# original loop would have produced.
label_df = pd.concat(per_lib_counts, axis=1) if per_lib_counts else pd.DataFrame()

In [10]:
# Cap every value at 1, then move chrm/start/end back from the index into
# ordinary columns.
label_df = (
    label_df
    .clip(upper=1)
    .reset_index()
)

In [11]:
# Preview the per-window label table (one column per library, values capped at 1).
label_df

Unnamed: 0,chrm,start,end,CC,ATF2,CTCF,FOXA1,LEF1,SCRT1,TCF7L2,16P12_1
0,chr1,910305,911305,0,0,0,0,0,0,0,0
1,chr1,910405,911405,0,0,0,0,0,0,0,0
2,chr1,910505,911505,0,0,0,0,0,0,0,0
3,chr1,910605,911605,0,0,0,0,0,0,0,0
4,chr1,910705,911705,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
49843,chrX,154370359,154371291,0,0,0,0,0,0,0,0
49844,chrX,154799022,154799934,0,0,0,0,0,0,0,0
49845,chrX,154800044,154801044,0,0,0,0,0,0,0,0
49846,chrX,154800144,154801144,0,0,0,0,0,0,0,0


In [12]:
# Randomly assign every row of `label_df` to train / valid / test.
SPLIT_SEED = 0  # fixed so Restart & Run All reproduces the same partition
split_ratio = (70, 15, 15)

# NOTE(review): the windows were built with size 1000 and stride 100, so
# neighbouring rows overlap heavily; a per-row random split can put overlapping
# windows in different splits — confirm this is acceptable for evaluation.
rng = np.random.default_rng(SPLIT_SEED)
a = rng.permutation(len(label_df))

# Cut points are cumulative fractions of the row count; the last share is the remainder.
train_end = int(split_ratio[0] / 100 * len(a))
valid_end = int((split_ratio[0] / 100 + split_ratio[1] / 100) * len(a))

# Fill by position so the assignment does not depend on `label_df` having a
# default 0..n-1 index.
split_labels = np.empty(len(label_df), dtype=object)
for arr_idx, split_val in zip(np.split(a, [train_end, valid_end]), ["train", "valid", "test"]):
    split_labels[arr_idx] = split_val
label_df["split"] = split_labels

In [13]:
# Keep only the rows whose window spans exactly 1000 bp.
label_df = label_df.loc[label_df["end"].sub(label_df["start"]).eq(1000)]

In [14]:
# Persist the labelled windows under the "samples" key of the HDF5 file.
save_file = "/data7/deepro/starrseq/4_ml_classification_fragment_category/data/MTL/resnet_1000.h5"
label_df.to_hdf(
    save_file,
    key="samples",
    index=False,
)

In [15]:
# Remove pybedtools temp files now that the results have been written.
# NOTE(review): remove_all=True presumably deletes every pybedtools temp file in
# the temp directory, not only this session's — confirm no other job shares it.
pybedtools.helpers.cleanup(verbose=False, remove_all=True)