In [1]:
%%javascript
require(["codemirror/keymap/sublime", "notebook/js/cell", "base/js/namespace"],
    function(sublime_keymap, cell, IPython) {
        cell.Cell.options_default.cm_config.keyMap = 'sublime';
        var cells = IPython.notebook.get_cells();
        for(var cl=0; cl< cells.length ; cl++){
            cells[cl].code_mirror.setOption('keyMap', 'sublime');
        }
    }
);

<IPython.core.display.Javascript object>

In [2]:
# Widen the notebook container to 85% of the browser window by injecting a CSS rule.
# `display` and `HTML` are taken from the public `IPython.display` module:
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:85% !important; }</style>"))

In [3]:
# Notebook-wide setup: autoreload, inline plotting, warning filter, project imports.
# autoreload mode 2: re-import edited modules automatically before running a cell.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline, with a white ("w") figure face colour.
%matplotlib inline
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
import warnings
# NOTE(review): this silences every warning category for the whole session.
warnings.filterwarnings('ignore')
from more_itertools import chunked
# import all relevant func and modules from a companion .py file
# NOTE(review): `bioframe`, `pd` and `autosomal_chroms` are used below without an
# explicit import - presumably they are provided by these star imports; confirm.
from supp_lib import *
# import all samples as dictionary ...
from samples import *

In [25]:
# Copy the interval file from the remote host `ghpcc` into the current directory
# (shell command via scp, so SSH access to that host is required).
!scp ghpcc:/nl/umw_job_dekker/users/av90w/projects/for-paper/fig2/G4-CTCF-dots/intersect-all-NT-CTCF-NO-G4-centered-RAD21 ./

intersect-all-NT-CTCF-NO-G4-centered-RAD21    100%  284KB   1.9MB/s   00:00    


In [29]:
# Read the first three columns (chrom, start, end) of the CTCF interval file,
# sort by position, keep only chromosomes listed in `autosomal_chroms`,
# and tag every row with type "ctcf".
ctcf_df = (
    bioframe.read_table(
        "intersect-all-NT-CTCF-NO-G4-centered-RAD21",
        names=["chrom", "start", "end"],
        usecols=[0, 1, 2],
    )
    .sort_values(["chrom", "start"])
    .loc[lambda peaks: peaks["chrom"].isin(autosomal_chroms)]
    .assign(type="ctcf")
    .reset_index(drop=True)
)

In [30]:
# Read the stranded intervals of the feature of interest into a DataFrame,
# sort by position, keep only chromosomes listed in `autosomal_chroms`,
# and tag every row with type "tss".
feature_of_interest = "ActiveTSS-NO-CTCF-5bp.bed"
df = (
    bioframe.read_table(
        feature_of_interest,
        names=["chrom", "start", "end", "strand"],
    )
    .sort_values(["chrom", "start"])
    .loc[lambda tss: tss["chrom"].isin(autosomal_chroms)]
    .assign(type="tss")
    .reset_index(drop=True)
)

```python
# Example prototype: how to build pairwise combinations that can then be split by orientation/type/etc.
# In this case we combine 2 lists of intervals to get the pairwise interactions between the two.
# Let's assume that we have 2 DataFrames - df and df1:
a = df.head().copy()
b = df1.head().copy()
a["type"] = "a"
b["type"] = "b"

xxx = pd.concat([a,b]).reset_index(drop=True)
print(xxx)

yyy = bioframe.pair_by_distance(xxx,min_sep=0,max_sep=100000000)
# select only interactions between different types ...
print(yyy[yyy["type_1"] != yyy["type_2"]].reset_index(drop=True))
```

In [31]:
# Stack the TSS and CTCF tables into one frame with a fresh 0..n-1 index,
# then build interval pairs with min_sep=50_000 and max_sep=400_000;
# the columns of the two partners are suffixed "1" and "2".
dfs_concat = pd.concat([df, ctcf_df], ignore_index=True)
print(dfs_concat.head())

bedpe = bioframe.pair_by_distance(
    dfs_concat,
    min_sep=50_000,
    max_sep=400_000,
    suffixes=("1", "2"),
)
print(bedpe.head())

  chrom   start     end strand type
0  chr1   29365   29370      -  tss
1  chr1  568060  568065      -  tss
2  chr1  762897  762902      -  tss
3  chr1  762970  762975      +  tss
4  chr1  894631  894636      -  tss
  chrom1  start1    end1 strand1 type1 chrom2  start2    end2 strand2 type2
0   chr1  568060  568065       -   tss   chr1  804937  805691     NaN  ctcf
1   chr1  568060  568065       -   tss   chr1  875596  875897     NaN  ctcf
2   chr1  568060  568065       -   tss   chr1  894631  894636       -   tss
3   chr1  568060  568065       -   tss   chr1  895963  895968       +   tss
4   chr1  568060  568065       -   tss   chr1  901861  901866       +   tss


In [38]:
# Coordinate columns written to the .bedpe files: chrom/start/end of partner 1, then partner 2.
BEDPE_cols = [
    f"{field}{side}"
    for side in ("1", "2")
    for field in ("chrom", "start", "end")
]

In [41]:
# Pairs whose first partner is a CTCF site and whose second partner is a "+" strand TSS;
# only the BEDPE coordinate columns are written, tab-separated, without the index.
(
    bedpe
    .loc[
        (
            lambda pairs: pairs["type1"].eq("ctcf")
            & pairs["type2"].eq("tss")
            & pairs["strand2"].eq("+")
        ),
        BEDPE_cols,
    ]
    .reset_index(drop=True)
    .to_csv("upstreamCTCF_pTSS.bedpe", index=False, sep="\t")
)

In [43]:
# Pairs whose first partner is a "-" strand TSS and whose second partner is a CTCF site;
# only the BEDPE coordinate columns are written, tab-separated, without the index.
(
    bedpe
    .loc[
        (
            lambda pairs: pairs["type2"].eq("ctcf")
            & pairs["type1"].eq("tss")
            & pairs["strand1"].eq("-")
        ),
        BEDPE_cols,
    ]
    .reset_index(drop=True)
    .to_csv("downstreamCTCF_mTSS.bedpe", index=False, sep="\t")
)

In [46]:
# Control set: first partner is a CTCF site, second partner is a "-" strand TSS;
# only the BEDPE coordinate columns are written, tab-separated, without the index.
(
    bedpe
    .loc[
        (
            lambda pairs: pairs["type1"].eq("ctcf")
            & pairs["type2"].eq("tss")
            & pairs["strand2"].eq("-")
        ),
        BEDPE_cols,
    ]
    .reset_index(drop=True)
    .to_csv("upstreamCTCF_mTSS_ctrl.bedpe", index=False, sep="\t")
)

In [47]:
# Control set: first partner is a "+" strand TSS, second partner is a CTCF site;
# only the BEDPE coordinate columns are written, tab-separated, without the index.
(
    bedpe
    .loc[
        (
            lambda pairs: pairs["type2"].eq("ctcf")
            & pairs["type1"].eq("tss")
            & pairs["strand1"].eq("+")
        ),
        BEDPE_cols,
    ]
    .reset_index(drop=True)
    .to_csv("downstreamCTCF_pTSS_ctrl.bedpe", index=False, sep="\t")
)

In [44]:
# Upload the exported pair file to the ALV2021 directory on the remote host `ghpcc` (scp, needs SSH access).
!scp upstreamCTCF_pTSS.bedpe ghpcc:/nl/umw_job_dekker/users/sv49w/ALV2021/

upstreamCTCF_pTSS.bedpe                       100%  352KB 381.6KB/s   00:00    


In [45]:
# Upload the exported pair file to the ALV2021 directory on the remote host `ghpcc` (scp, needs SSH access).
!scp downstreamCTCF_mTSS.bedpe ghpcc:/nl/umw_job_dekker/users/sv49w/ALV2021/

downstreamCTCF_mTSS.bedpe                     100%  330KB 457.5KB/s   00:00    


In [48]:
# Upload both control pair files to the ALV2021 directory on the remote host `ghpcc` (scp, needs SSH access).
!scp downstreamCTCF_pTSS_ctrl.bedpe ghpcc:/nl/umw_job_dekker/users/sv49w/ALV2021/
!scp upstreamCTCF_mTSS_ctrl.bedpe ghpcc:/nl/umw_job_dekker/users/sv49w/ALV2021/

downstreamCTCF_pTSS_ctrl.bedpe                100%  339KB 410.4KB/s   00:00    
upstreamCTCF_mTSS_ctrl.bedpe                  100%  336KB 472.3KB/s   00:00    


In [51]:
# Provenance: a very liberal list of CTCF motif calls in hg19, based on
# "MA0139.1.meme" from JASPAR, with a p-value cutoff of 1e-4. Fetched with:
# !scp ghpcc:/nl/umw_job_dekker/users/sv49w/ALV2021/CTCF_motif/CTCF.hist.txt ./CTCF.MA0139.1.txt
# The file needs `tr -s '\t'` applied first to remove duplicated tabs.
ctcf_motif = (
    bioframe.read_table(
        "CTCF.MA0139.1.txt",
        names=["name", "chrom", "start", "end", "strand", "score", "pval", "seq"],
        sep="\t",
        index_col=None,
    )
    .sort_values(["chrom", "start"])
    .loc[lambda motifs: motifs["chrom"].isin(autosomal_chroms)]
    .reset_index(drop=True)
)
# Disabled step, kept for reference:
# ctcf_motif = bf.merge(ctcf_motif, min_dist=merge_radius)
ctcf_motif.head()

Unnamed: 0,name,chrom,start,end,strand,score,pval,seq
0,MA0139.1,chr1,10471,10489,-,11.6393,1.48e-05,GGGCCGGCTGAGGGTACCG
1,MA0139.1,chr1,11164,11182,-,9.88525,3.3e-05,CTGCAAGCAAGGGGCGGTC
2,MA0139.1,chr1,11223,11241,-,26.4262,2.2e-10,TCGCCAGCAGGGGGCGCCC
3,MA0139.1,chr1,11281,11299,-,24.6885,1.97e-09,GCGCCAGCAGGGGGCGCTG
4,MA0139.1,chr1,11340,11358,-,14.377,3.72e-06,CTGCCAGCAGGCGGCGTGC


In [52]:
# Re-read the CTCF interval file (same steps as the earlier `ctcf_df` load cell):
# first three columns, sorted by position, restricted to `autosomal_chroms`,
# every row tagged with type "ctcf".
ctcf_df = (
    bioframe.read_table(
        "intersect-all-NT-CTCF-NO-G4-centered-RAD21",
        names=["chrom", "start", "end"],
        usecols=[0, 1, 2],
    )
    .sort_values(["chrom", "start"])
    .loc[lambda peaks: peaks["chrom"].isin(autosomal_chroms)]
    .assign(type="ctcf")
    .reset_index(drop=True)
)

In [69]:
# Overlap CTCF intervals with motif hits using how="left"; columns of the two
# inputs are suffixed "1" (intervals) and "2" (motifs).
_xxx = bioframe.overlap(ctcf_df, ctcf_motif, how="left", suffixes=("1", "2"))

# Keep rows where a motif was matched (non-null chrom2) and group the motif
# strand / p-value by interval. No aggregation is applied, so the value shown
# for this cell is the GroupBy object itself.
(
    _xxx
    .loc[
        (lambda hits: hits["chrom2"].notna()),
        ["chrom1", "start1", "end1", "strand2", "pval2"],
    ]
    .groupby(["chrom1", "start1", "end1"])
)  # .drop_duplicates()

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f45bd0d86a0>

In [55]:
# Inner overlap of CTCF intervals with motif hits: one row per interval/motif match,
# with the columns of the two inputs suffixed "1" and "2".
# No `.groupby()` is chained here: called without `by` or `level` it raises
# "TypeError: You have to supply one of 'by' and 'level'".
bioframe.overlap(ctcf_df, ctcf_motif, how="inner", suffixes=('1', '2'))

Unnamed: 0,chrom_1,start_1,end_1,type_1,name_2,chrom_2,start_2,end_2,strand_2,score_2,pval_2,seq_2
0,chr1,10151,10508,ctcf,MA0139.1,chr1,10471,10489,-,11.63930,1.480000e-05,GGGCCGGCTGAGGGTACCG
1,chr1,804937,805691,ctcf,MA0139.1,chr1,805098,805116,+,9.19672,4.450000e-05,aggtcgcaggaggGCACCC
2,chr1,804937,805691,ctcf,MA0139.1,chr1,805296,805314,-,11.44260,1.620000e-05,GCGCCCCCTGGTGGCAGCC
3,chr1,804937,805691,ctcf,MA0139.1,chr1,805298,805316,+,24.65570,2.050000e-09,CTGCCACCAGGGGGCGCGC
4,chr1,804937,805691,ctcf,MA0139.1,chr1,805415,805433,-,9.19672,4.450000e-05,GCCCCGCTGGAGGGCAGGG
...,...,...,...,...,...,...,...,...,...,...,...,...
17888,chr9,34403617,34404245,ctcf,MA0139.1,chr9,34403602,34403620,+,10.36070,2.670000e-05,TGGCCATCAGGTGACCCAG
17889,chr9,35095991,35097350,ctcf,MA0139.1,chr9,35095975,35095993,+,12.81970,8.320000e-06,tggacaacagagggagacg
17890,chr9,94422028,94422480,ctcf,MA0139.1,chr9,94422027,94422045,-,7.88525,7.710000e-05,CCACCTGCAGGAAGAACCA
17891,chr9,94901653,94903141,ctcf,MA0139.1,chr9,94901652,94901670,-,10.72130,2.260000e-05,CTTCCACCAGGGGCAGTTA
