# Identify STRs that are at splice sites
Following the ENCODE paper, we define a splice site as any position within 100 bp of an exon. Everything below uses GRCh37 coordinates (GENCODE v19 annotations), as in the paper.
Links to [GENCODE data](https://www.gencodegenes.org/human/release_37lift37.html)

In [2]:
import pyranges as pr
import numpy as np
import pandas as pd
from pyliftover import LiftOver
from tqdm.notebook import tqdm

# Register tqdm's pandas integration so DataFrame/Series gain `progress_apply`;
# every bar created through it is labelled "Progress".
tqdm.pandas(desc = "Progress")

Load the STR data

In [2]:
# Column names for the headerless MSDB table, listed in file order.
col_names = ["Chromosome", "Start", "End", "class",
            "length", "Strand", "num_units",
            "actual_repeat", "gene", "gene_start",
            "gene_stop", "gene_strand", "annotation",
            "promoter", "dist_to_tss"]
# Use the builtin `str` / `int` types: the `np.str` / `np.int` aliases were
# deprecated in NumPy 1.20 and removed in 1.24, so they raise AttributeError
# on current NumPy. The builtins select the same pandas dtypes.
# gene_start, gene_stop and dist_to_tss are deliberately left to pandas'
# type inference rather than forced to int.
dtype_dict = {"Chromosome": str, "Start": int, "End": int, "length": int,
             "Strand": str}
# The file has no header row, so `names` supplies the columns; index_col=False
# keeps the first column as data rather than as the index.
# NOTE(review): the truncated warning printed under this cell looks like a
# pandas DtypeWarning (mixed-type columns) — consider giving the remaining
# columns explicit dtypes.
str_df = pd.read_csv("data/msdb_data.tsv", sep = '\t', dtype = dtype_dict, names = col_names, index_col = False, na_values = "")
str_df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Chromosome,Start,End,class,length,Strand,num_units,actual_repeat,gene,gene_start,gene_stop,gene_strand,annotation,promoter,dist_to_tss
0,chr1,10000,10108,AACCCT,108,+,18,TAACCC,uc001aaa.3,11874,14409,+,Intergenic,Non-Promoter,-1766
1,chr1,10108,10149,AACCCT,41,+,6,AACCCT,uc001aaa.3,11874,14409,+,Intergenic,Non-Promoter,-1725
2,chr1,10147,10179,AACCCT,32,+,5,CCCTAA,uc001aaa.3,11874,14409,+,Intergenic,Non-Promoter,-1695
3,chr1,10172,10184,AACCT,12,+,2,CCTAA,uc001aaa.3,11874,14409,+,Intergenic,Non-Promoter,-1690
4,chr1,10177,10233,AACCCT,56,+,9,CCTAAC,uc001aaa.3,11874,14409,+,Intergenic,Non-Promoter,-1641


Convert MSDB data to GRCh37/hg19 (the assembly used by the eCLIP data)

In [3]:
# Coordinate converter from the hg38 assembly to hg19 (pyliftover).
# NOTE(review): the STR table pairs its coordinates with UCSC gene ids such as
# uc001aaa.3 at 11874-14409, which looks like an hg19-era annotation — confirm
# the input really is hg38 before lifting it over.
lo = LiftOver("hg38", "hg19")

In [7]:
# Small two-row frame (columns a, b, c) for trying out row-wise progress_apply.
a = pd.DataFrame({"a": [1,2], "b":[3,4], "c": [5,6]})
a.head()

def testf(x):
    return pd.Series({"a": x["a"]-5, "c": x["b"]-2})

# Apply testf to each row (axis = 1) with a progress bar; because testf returns
# a Series, the result is a DataFrame with columns "a" and "c".
a.progress_apply(testf, axis = 1)

Progress:   0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,a,c
0,-4,1
1,-3,2


In [4]:
from tqdm.notebook import trange, tqdm
from time import sleep

# Nested progress-bar demo: 3 outer iterations ("1st loop"), each driving a
# fresh 100-step inner bar ("2nd loop").
for i in trange(3, desc='1st loop'):
    for j in tqdm(range(100), desc='2nd loop'):
        sleep(0.01)  # 10 ms pause per inner step

1st loop:   0%|          | 0/3 [00:00<?, ?it/s]

2nd loop:   0%|          | 0/100 [00:00<?, ?it/s]

2nd loop:   0%|          | 0/100 [00:00<?, ?it/s]

2nd loop:   0%|          | 0/100 [00:00<?, ?it/s]

In [8]:
def convert_coords(df_row):
    """Lift one interval over to the target assembly using the global ``lo``.

    Parameters
    ----------
    df_row : row-like object with "Chromosome", "Start", "End" and "Strand"
        entries (e.g. one row of the STR DataFrame).

    Returns
    -------
    pd.Series
        Indexed by "Chromosome", "Start", "End", "Strand". When both endpoints
        convert, chromosome and strand are taken from the first hit for Start,
        and Start/End are the first hits' positions. Otherwise the input
        chromosome and strand are returned with Start and End set to None.
    """
    cols = ["Chromosome", "Start", "End", "Strand"]
    chrom = df_row["Chromosome"]
    strand = df_row["Strand"]

    new_start = lo.convert_coordinate(chrom, df_row["Start"], strand)
    new_end = lo.convert_coordinate(chrom, df_row["End"], strand)

    # convert_coordinate returns None for an unrecognised chromosome and an
    # empty list when the locus has no counterpart; a truthiness test covers
    # both, whereas len(None) would raise TypeError.
    if new_start and new_end:
        start_hit = new_start[0]
        end_hit = new_end[0]
        # NOTE(review): the two endpoints are converted independently, so they
        # are not guaranteed to land on the same chromosome or to keep
        # Start <= End (e.g. if the strand flips) — confirm downstream code
        # tolerates or filters such rows.
        data = [start_hit[0], start_hit[1], end_hit[1], start_hit[2]]
    else:
        data = [chrom, None, None, strand]
    return pd.Series(data, index=cols)


# Convert every STR row, one Python call per row, with a progress bar.
# The captured run below shows 4,396,272 rows and ends in KeyboardInterrupt,
# i.e. this row-wise pass was stopped before it finished.
new_df = str_df.progress_apply(convert_coords, axis = 1)
new_df.head()

Progress:   0%|          | 0/4396272 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [71]:
# Spot-check the converter on a single chr1 position, once per strand.
for strand in ('+', '-'):
    print(lo.convert_coordinate("chr1", 1004500, strand))

[('chr1', 939880, '+', 20849626768)]
[('chr1', 939880, '-', 20849626768)]


In [6]:
# Wrap the STR table as a PyRanges object. Note this uses the original
# `str_df` coordinates, not the lifted-over `new_df`.
str_pr = pr.PyRanges(str_df)
str_pr

Unnamed: 0,Chromosome,Start,End,class,length,Strand,num_units,actual_repeat,gene,gene_start,gene_stop,gene_strand,annotation,promoter,dist_to_tss
0,chr1,10000,10108,AACCCT,108,+,18,TAACCC,uc001aaa.3,11874,14409,+,Intergenic,Non-Promoter,-1766
1,chr1,10108,10149,AACCCT,41,+,6,AACCCT,uc001aaa.3,11874,14409,+,Intergenic,Non-Promoter,-1725
2,chr1,10147,10179,AACCCT,32,+,5,CCCTAA,uc001aaa.3,11874,14409,+,Intergenic,Non-Promoter,-1695
3,chr1,10172,10184,AACCT,12,+,2,CCTAA,uc001aaa.3,11874,14409,+,Intergenic,Non-Promoter,-1690
4,chr1,10177,10233,AACCCT,56,+,9,CCTAAC,uc001aaa.3,11874,14409,+,Intergenic,Non-Promoter,-1641
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4396267,chrY,59363486,59363498,ACACCC,12,-,2,GTGTGG,uc011ncc.1,59358329,59360854,-,Intergenic,Non-Promoter,-2632
4396268,chrY,59363501,59363513,ACACCC,12,-,2,GTGTGG,uc011ncc.1,59358329,59360854,-,Intergenic,Non-Promoter,-2647
4396269,chrY,59363514,59363543,ACACCC,29,-,4,GTGTGG,uc011ncc.1,59358329,59360854,-,Intergenic,Non-Promoter,-2660
4396270,chrY,59363540,59363552,ACACCC,12,-,2,GTGTGG,uc011ncc.1,59358329,59360854,-,Intergenic,Non-Promoter,-2686


Load the UCSC introns

In [7]:
# Column names for the 6-column intron BED file. This rebinds `col_names`,
# replacing the STR column list defined earlier in the notebook.
col_names = ["Chromosome", "Start", "End", "ucsc_id", "unk", "Strand"]
# skiprows = [0] drops the first line of the file
# (presumably a header/track line — TODO confirm).
intron_df = pd.read_csv("data/ucsc_introns.bed", sep = '\t', names = col_names, index_col = False, skiprows = [0])
intron_df.head()

Unnamed: 0,Chromosome,Start,End,ucsc_id,unk,Strand
0,chr1,12227,12612,uc001aaa.3_intron_0_0_chr1_12228_f,0,+
1,chr1,12721,13220,uc001aaa.3_intron_1_0_chr1_12722_f,0,+
2,chr1,12227,12645,uc010nxr.1_intron_0_0_chr1_12228_f,0,+
3,chr1,12697,13220,uc010nxr.1_intron_1_0_chr1_12698_f,0,+
4,chr1,12227,12594,uc010nxq.1_intron_0_0_chr1_12228_f,0,+


In [9]:
# Wrap the intron table as a PyRanges object for interval operations.
intron_pr = pr.PyRanges(intron_df)
intron_pr

Unnamed: 0,Chromosome,Start,End,ucsc_id,unk,Strand
0,chr1,12227,12612,uc001aaa.3_intron_0_0_chr1_12228_f,0,+
1,chr1,12721,13220,uc001aaa.3_intron_1_0_chr1_12722_f,0,+
2,chr1,12227,12645,uc010nxr.1_intron_0_0_chr1_12228_f,0,+
3,chr1,12697,13220,uc010nxr.1_intron_1_0_chr1_12698_f,0,+
4,chr1,12227,12594,uc010nxq.1_intron_0_0_chr1_12228_f,0,+
...,...,...,...,...,...,...
659322,chrY,27216988,27218792,uc011nbv.2_intron_1_0_chrY_27216989_r,0,-
659323,chrY,27218868,27245878,uc011nbv.2_intron_2_0_chrY_27218869_r,0,-
659324,chrY,27329895,27330860,uc004fwt.3_intron_0_0_chrY_27329896_r,0,-
659325,chrY,59359508,59360006,uc011ncc.1_intron_0_0_chrY_59359509_r,0,-


Next, get all the intervals of short tandem repeats that are located in introns. The new PyRanges Start, End will now reflect only the region of the repeat that overlapped the intron, not the length of the whole repeat (this information is retained in the `length` column)

In [10]:
# Intersect STR intervals with introns; strandedness= False means strand is
# ignored, so an STR matches introns on either strand.
str_intron_intersect = str_pr.intersect(intron_pr, strandedness= False)

In [11]:
# A bare expression that is not the last line of a cell is not displayed by
# default in a notebook; only the final expression below is rendered.
str_intron_intersect
# drop duplicate rows in the intersection PyRanges
# (presumably duplicates arise when an STR overlaps several transcripts'
# introns — TODO confirm)
str_intron_intersect_nodup = str_intron_intersect.drop_duplicate_positions()
str_intron_intersect_nodup

Unnamed: 0,Chromosome,Start,End,class,length,Strand,num_units,actual_repeat,gene,gene_start,gene_stop,gene_strand,annotation,promoter,dist_to_tss
0,chr1,15240,15255,AGGGCC,15,+,2,GGGCCA,uc009viv.2,14407,29370,-,Exon,Non-Promoter,14115
1,chr1,15383,15395,AGGCGC,12,+,2,GCAGGC,uc009viv.2,14407,29370,-,Exon,Non-Promoter,13975
2,chr1,17476,17488,AGCCG,12,+,2,CCGAG,uc009vjc.1,16858,17751,-,Exon,Promoter,263
3,chr1,17801,17814,ATCCC,13,+,2,CCATC,uc009vjd.2,15796,18061,-,Exon,Promoter,247
4,chr1,18453,18466,AGGCC,13,+,2,GCCAG,uc009vit.3,14362,19759,-,Intron,Non-Promoter,1293
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1973757,chrY,59342100,59342121,AC,21,-,10,TG,uc004fxm.1,59330252,59343488,+,Intron,Non-Promoter,11848
1973758,chrY,59354264,59354277,AGGCC,13,-,2,GGCCT,uc004fxo.1,59352973,59356131,+,Intron,Non-Promoter,1291
1973759,chrY,59355249,59355261,AGCCG,12,-,2,GGCTC,uc022cpg.1,59354985,59358336,+,Intron,Promoter,264
1973760,chrY,59357345,59357357,AGGCGC,12,-,2,GCCTGC,uc022cpg.1,59354985,59358336,+,Intron,Non-Promoter,2360


Now, get the STRs that are within 300kb of an intron. First, find the nearest intron to each STR:

In [13]:
# For each STR, attach its nearest intron. The result carries the intron's
# columns (Start_b, End_b, ucsc_id, unk, Strand_b) plus a Distance column;
# the displayed rows with Distance 0 are STRs that overlap an intron.
# No distance threshold is applied here.
nearest_intron_pr = str_pr.nearest(intron_pr)
nearest_intron_pr

Unnamed: 0,Chromosome,Start,End,class,length,Strand,num_units,actual_repeat,gene,gene_start,...,gene_strand,annotation,promoter,dist_to_tss,Start_b,End_b,ucsc_id,unk,Strand_b,Distance
0,chr1,15240,15255,AGGGCC,15,+,2,GGGCCA,uc009viv.2,14407,...,-,Exon,Non-Promoter,14115,14829,15795,uc009viq.3_intron_0_0_chr1_14830_r,0,-,0
1,chr1,15383,15395,AGGCGC,12,+,2,GCAGGC,uc009viv.2,14407,...,-,Exon,Non-Promoter,13975,14829,15795,uc009viq.3_intron_0_0_chr1_14830_r,0,-,0
2,chr1,17476,17488,AGCCG,12,+,2,CCGAG,uc009vjc.1,16858,...,-,Exon,Promoter,263,17055,17605,uc009viq.3_intron_3_0_chr1_17056_r,0,-,0
3,chr1,17801,17814,ATCCC,13,+,2,CCATC,uc009vjd.2,15796,...,-,Exon,Promoter,247,17368,17914,uc009vix.2_intron_3_0_chr1_17369_r,0,-,0
4,chr1,18453,18466,AGGCC,13,+,2,GCCAG,uc009vit.3,14362,...,-,Intron,Non-Promoter,1293,18061,24737,uc009vix.2_intron_4_0_chr1_18062_r,0,-,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4390671,chrY,59363486,59363498,ACACCC,12,-,2,GTGTGG,uc011ncc.1,59358329,...,-,Intergenic,Non-Promoter,-2632,59360115,59360500,uc011ncc.1_intron_1_0_chrY_59360116_r,0,-,2987
4390672,chrY,59363501,59363513,ACACCC,12,-,2,GTGTGG,uc011ncc.1,59358329,...,-,Intergenic,Non-Promoter,-2647,59360115,59360500,uc011ncc.1_intron_1_0_chrY_59360116_r,0,-,3002
4390673,chrY,59363514,59363543,ACACCC,29,-,4,GTGTGG,uc011ncc.1,59358329,...,-,Intergenic,Non-Promoter,-2660,59360115,59360500,uc011ncc.1_intron_1_0_chrY_59360116_r,0,-,3015
4390674,chrY,59363540,59363552,ACACCC,12,-,2,GTGTGG,uc011ncc.1,59358329,...,-,Intergenic,Non-Promoter,-2686,59360115,59360500,uc011ncc.1_intron_1_0_chrY_59360116_r,0,-,3041


Then, get everything 

Next, want to get the eCLIP peaks that are in these STRs. To start, let's try 