# Identify STRs that are at splice sites
Following the ENCODE paper, we define a splice site as any position within 100 bp of an exon. Everything below uses GRCh37 coordinates (GENCODE v19 annotations), as in the paper.
Links to [GENCODE data](https://www.gencodegenes.org/human/release_37lift37.html)

In [1]:
import pyranges as pr
import numpy as np
import pandas as pd

Load the STR data

In [21]:
# Column names for the MSDB STR table (the file is read without a header row,
# so these names are applied positionally).
col_names = [
    "Chromosome", "Start", "End", "class",
    "length", "Strand", "num_units",
    "actual_repeat", "gene", "gene_start",
    "gene_stop", "gene_strand", "annotation",
    "promoter", "dist_to_tss",
]
# low_memory=False makes pandas infer each column's dtype from the whole file
# at once instead of chunk by chunk. The original run of this cell emitted a
# warning (presumably pandas' mixed-type DtypeWarning, which recommends this
# option), so chunked inference could leave a column holding values of
# inconsistent types.
str_df = pd.read_csv(
    "data/msdb_data.tsv",
    sep="\t",
    names=col_names,
    index_col=False,  # do not use the first column as the index
    na_values="",
    low_memory=False,
)
# Last expression of the cell: shows the first rows as a sanity check.
str_df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Chromosome,Start,End,class,length,Strand,num_units,actual_repeat,gene,gene_start,gene_stop,gene_strand,annotation,promoter,dist_to_tss
0,chr1,10000,10108,AACCCT,108,+,18,TAACCC,uc001aaa.3,11874,14409,+,Intergenic,Non-Promoter,-1766
1,chr1,10108,10149,AACCCT,41,+,6,AACCCT,uc001aaa.3,11874,14409,+,Intergenic,Non-Promoter,-1725
2,chr1,10147,10179,AACCCT,32,+,5,CCCTAA,uc001aaa.3,11874,14409,+,Intergenic,Non-Promoter,-1695
3,chr1,10172,10184,AACCT,12,+,2,CCTAA,uc001aaa.3,11874,14409,+,Intergenic,Non-Promoter,-1690
4,chr1,10177,10233,AACCCT,56,+,9,CCTAAC,uc001aaa.3,11874,14409,+,Intergenic,Non-Promoter,-1641


In [20]:
# Wrap the STR DataFrame in a PyRanges object so interval operations
# (such as the intersection below) can be run on it.
str_pr = pr.PyRanges(str_df)
# Bare name as the cell's last expression: displays the PyRanges in the notebook.
str_pr

Unnamed: 0,Chromosome,Start,End,class,length,Strand,num_units,actual_repeat,gene,gene_start,gene_stop,gene_strand,annotation,promoter,dist_to_tss
0,chr1,10000,10108,AACCCT,108,+,18,TAACCC,uc001aaa.3,11874,14409,+,Intergenic,Non-Promoter,-1766
1,chr1,10108,10149,AACCCT,41,+,6,AACCCT,uc001aaa.3,11874,14409,+,Intergenic,Non-Promoter,-1725
2,chr1,10147,10179,AACCCT,32,+,5,CCCTAA,uc001aaa.3,11874,14409,+,Intergenic,Non-Promoter,-1695
3,chr1,10172,10184,AACCT,12,+,2,CCTAA,uc001aaa.3,11874,14409,+,Intergenic,Non-Promoter,-1690
4,chr1,10177,10233,AACCCT,56,+,9,CCTAAC,uc001aaa.3,11874,14409,+,Intergenic,Non-Promoter,-1641
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4396267,chrY,59363486,59363498,ACACCC,12,-,2,GTGTGG,uc011ncc.1,59358329,59360854,-,Intergenic,Non-Promoter,-2632
4396268,chrY,59363501,59363513,ACACCC,12,-,2,GTGTGG,uc011ncc.1,59358329,59360854,-,Intergenic,Non-Promoter,-2647
4396269,chrY,59363514,59363543,ACACCC,29,-,4,GTGTGG,uc011ncc.1,59358329,59360854,-,Intergenic,Non-Promoter,-2660
4396270,chrY,59363540,59363552,ACACCC,12,-,2,GTGTGG,uc011ncc.1,59358329,59360854,-,Intergenic,Non-Promoter,-2686


Load the UCSC introns

In [34]:
# Column names for the UCSC intron BED file.
# Note: this rebinds `col_names`, which the STR-loading cell also assigns.
col_names = [
    "Chromosome",
    "Start",
    "End",
    "ucsc_id",
    "unk",
    "Strand",
]
# The first line of the file is skipped and the names above are used instead.
intron_df = pd.read_csv(
    "data/ucsc_introns.bed",
    sep="\t",
    names=col_names,
    index_col=False,
    skiprows=[0],
)
# Last expression of the cell: shows the first rows as a sanity check.
intron_df.head()

Unnamed: 0,Chromosome,Start,End,ucsc_id,unk,Strand
0,chr1,12227,12612,uc001aaa.3_intron_0_0_chr1_12228_f,0,+
1,chr1,12721,13220,uc001aaa.3_intron_1_0_chr1_12722_f,0,+
2,chr1,12227,12645,uc010nxr.1_intron_0_0_chr1_12228_f,0,+
3,chr1,12697,13220,uc010nxr.1_intron_1_0_chr1_12698_f,0,+
4,chr1,12227,12594,uc010nxq.1_intron_0_0_chr1_12228_f,0,+


In [37]:
# Wrap the intron DataFrame in a PyRanges object so it can be intersected
# with the STR intervals.
intron_pr = pr.PyRanges(intron_df)
# Bare name as the cell's last expression: displays the PyRanges in the notebook.
intron_pr

Unnamed: 0,Chromosome,Start,End,ucsc_id,unk,Strand
0,chr1,12227,12612,uc001aaa.3_intron_0_0_chr1_12228_f,0,+
1,chr1,12721,13220,uc001aaa.3_intron_1_0_chr1_12722_f,0,+
2,chr1,12227,12645,uc010nxr.1_intron_0_0_chr1_12228_f,0,+
3,chr1,12697,13220,uc010nxr.1_intron_1_0_chr1_12698_f,0,+
4,chr1,12227,12594,uc010nxq.1_intron_0_0_chr1_12228_f,0,+
...,...,...,...,...,...,...
659322,chrY,27216988,27218792,uc011nbv.2_intron_1_0_chrY_27216989_r,0,-
659323,chrY,27218868,27245878,uc011nbv.2_intron_2_0_chrY_27218869_r,0,-
659324,chrY,27329895,27330860,uc004fwt.3_intron_0_0_chrY_27329896_r,0,-
659325,chrY,59359508,59360006,uc011ncc.1_intron_0_0_chrY_59359509_r,0,-


Next, get all the short tandem repeat intervals that are located in introns. The `Start` and `End` of the resulting PyRanges reflect only the portion of each repeat that overlaps an intron, not the whole repeat (the full repeat length is retained in the `length` column).

In [39]:
# Restrict the STR intervals to the parts that overlap an intron.
# strandedness=False: overlaps are taken without regard to strand
# (per the pyranges documentation — TODO confirm for the installed version).
str_intron_intersect = str_pr.intersect(intron_pr, strandedness= False)

In [41]:
# Keep a single interval per distinct genomic position in the intersection.
# Presumably an STR that overlaps introns from several transcripts is reported
# more than once by the intersection, which is what this removes.
# NOTE(review): de-duplication is by position, not by full row — confirm that
# is the intended behaviour.
str_intron_intersect_nodup = str_intron_intersect.drop_duplicate_positions()
# Bare name as the cell's last expression: displays the de-duplicated PyRanges.
str_intron_intersect_nodup

Unnamed: 0,Chromosome,Start,End,class,length,Strand,num_units,actual_repeat,gene,gene_start,gene_stop,gene_strand,annotation,promoter,dist_to_tss
0,chr1,15240,15255,AGGGCC,15,+,2,GGGCCA,uc009viv.2,14407,29370,-,Exon,Non-Promoter,14115
1,chr1,15383,15395,AGGCGC,12,+,2,GCAGGC,uc009viv.2,14407,29370,-,Exon,Non-Promoter,13975
2,chr1,17476,17488,AGCCG,12,+,2,CCGAG,uc009vjc.1,16858,17751,-,Exon,Promoter,263
3,chr1,17801,17814,ATCCC,13,+,2,CCATC,uc009vjd.2,15796,18061,-,Exon,Promoter,247
4,chr1,18453,18466,AGGCC,13,+,2,GCCAG,uc009vit.3,14362,19759,-,Intron,Non-Promoter,1293
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1973757,chrY,59342100,59342121,AC,21,-,10,TG,uc004fxm.1,59330252,59343488,+,Intron,Non-Promoter,11848
1973758,chrY,59354264,59354277,AGGCC,13,-,2,GGCCT,uc004fxo.1,59352973,59356131,+,Intron,Non-Promoter,1291
1973759,chrY,59355249,59355261,AGCCG,12,-,2,GGCTC,uc022cpg.1,59354985,59358336,+,Intron,Promoter,264
1973760,chrY,59357345,59357357,AGGCGC,12,-,2,GCCTGC,uc022cpg.1,59354985,59358336,+,Intron,Non-Promoter,2360


Next, we want to get the eCLIP peaks that fall within these STRs.