In [9]:
import pandas as pd
from functools import reduce

In [2]:
# Read Coding coordinates
gene_regions = pd.read_csv("/gymreklab-tscc/helia/ensembl/experiments/coding_regions/coding_regions_sorted_corrected.bed", 
                           sep="\t", header=None)
gene_regions.columns = ['chrom', 'start', 'end', 'name', 'x', 'strand']

# Read TR coordinates
TR_regions = pd.read_csv("/gymreklab-tscc/helia/ensembl/experiments/coding_regions/all_TR_coordinates.bed",
                         sep="\t", header=None)
TR_regions.columns = ['chrom', 'start', 'end']

In [3]:
def create_input(chrom):
    gene_chrom = gene_regions[gene_regions['chrom'] == chrom]
    gene_chrom = gene_chrom.drop_duplicates(subset = ['chrom', 'start', 'end'])
    
    TR_chrom = TR_regions[TR_regions['chrom'] == chrom]
    TR_chrom = TR_chrom.drop_duplicates(subset = ['chrom', 'start', 'end'])
    
    with open(f"inputs/input_{chrom}.txt", "w") as f:
        f.write(f"{len(gene_chrom)} {len(TR_chrom)}")
        f.write("\n")
        for index,row in gene_chrom.iterrows():
            f.write(f"{row['start']} {row['end']}")
            f.write("\n")
            
        for index,row in TR_chrom.iterrows():
            f.write(f"{row['start']} {row['end']}")
            f.write("\n")  

In [14]:
#### Writing TR and coding regions to input file ####
for chrom in range(1,23):
    create_input(f"chr{chrom}")   

In [15]:
#### calling overlap_finder.cpp to find overlap and write them to output ####
!g++ overlap_finder.cpp --std c++11
for chrom in range(1,23):
    input_addr = f"inputs/input_chr{chrom}.txt"
    output_addr = f"inputs/overlap_chr{chrom}.txt"
    !./a.out < $input_addr > $output_addr
    

In [4]:
#### Load overlaps and merge ####
overlaps = pd.DataFrame(columns = ['chrom','TR_start', 'TR_end', 'gene_start', 'gene_end'])
for chrom in range(1,23):
    overlaps_chrom = pd.read_csv(f"inputs/overlap_chr{chrom}.txt", sep = "\t", header = None)
    overlaps_chrom.columns = ['TR_start', 'TR_end', 'gene_start', 'gene_end']
    overlaps_chrom['chrom'] = "chr" + str(chrom)
    overlaps = pd.concat([overlaps, overlaps_chrom])
overlaps['id'] = "id"
overlaps = overlaps.drop_duplicates(subset = ['chrom','TR_start', 'TR_end'])
overlaps[['TR_start', 'TR_end', 'gene_start', 'gene_end']] = overlaps[['TR_start', 'TR_end', 'gene_start', 'gene_end']].astype(int)
overlaps

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  import sys


Unnamed: 0,TR_end,TR_start,chrom,gene_end,gene_start,id
0,960798,960770,chr1,960800,960693,id
1,970857,970841,chr1,971006,970685,id
2,976199,976178,chr1,976269,976171,id
3,1232644,1232633,chr1,1233268,1232278,id
4,1232818,1232807,chr1,1233268,1232278,id
...,...,...,...,...,...,...
157,50603361,50603316,chr22,50603498,50603222,id
158,50604160,50604149,chr22,50605064,50603840,id
159,50604573,50604544,chr22,50605064,50603840,id
160,50604557,50604546,chr22,50605064,50603840,id


In [5]:
#### Compare with previous overlaps found with bedtools where TR shouldn't be entirely inside coding region ####

old_overlaps = pd.read_csv("/gymreklab-tscc/helia/ensembl/experiments/coding_regions/mapping.txt", sep = "\t", header = None)

cnt = 0
old_overlaps_entire = []
for index, row in old_overlaps.iterrows():
    if row[1] >= row[4] and row[2] <= row[5]: # check for entirely being inside the gene
        old_overlaps_entire.append(row)
old_overlaps_entire = pd.DataFrame(old_overlaps_entire)
old_overlaps_entire.columns =  ['chrom','TR_start', 'TR_end','chrom2', 'gene_start', 'gene_end','Gene_name','y','z']
old_overlaps_entire = old_overlaps_entire.drop_duplicates(subset = ['chrom','TR_start', 'TR_end'])
old_overlaps_entire[['TR_start', 'TR_end', 'gene_start', 'gene_end']] = old_overlaps_entire[['TR_start', 'TR_end', 'gene_start', 'gene_end']].astype(int)
compare = pd.merge(old_overlaps_entire, overlaps, left_on = ['chrom', 'TR_start', 'TR_end'], right_on = ['chrom', 'TR_start', 'TR_end'], how = "outer")
print(f"#TRs found by cpp code: {len(overlaps)}, #TRs found by bedtools: {len(old_overlaps_entire)}, #TRs found by both: {len(compare)}")
      
      
      

#TRs found by cpp code: 6492, #TRs found by bedtools: 6492, #TRs found by both: 6492


In [7]:
old_overlaps_entire[['chrom','TR_start','TR_end']].to_csv("TR_intersect.txt", sep = "\t", header=False, index = False)

In [8]:
AFR_df = pd.DataFrame(columns = ["chrom","start","end","het-1"])
AMR_df = pd.DataFrame(columns = ["chrom","start","end","het-1"])
EAS_df = pd.DataFrame(columns = ["chrom","start","end","het-1"])
SAS_df = pd.DataFrame(columns = ["chrom","start","end","het-1"])
EUR_df = pd.DataFrame(columns = ["chrom","start","end","het-1"])

all_df = [EUR_df, SAS_df, EAS_df, AMR_df, AFR_df]


    

pops = ['EUR', 'SAS', 'EAS', 'AMR', 'AFR']

for i in range(5):
    for chrom in range(1,23):
        df = pd.read_csv(f"/gymreklab-tscc/helia/ensembl/experiments/coding_regions/heterozygosity/coding_het_{pops[i]}_{chrom}.tab",sep="\t")
        all_df[i] = pd.concat([all_df[i], df])
        
    all_df[i] = all_df[i].drop_duplicates(subset=["chrom","start","end"], keep='first')

In [16]:
df_all_pop = reduce(lambda  left,right: pd.merge(left,right,on=['chrom','start','end'],
                                            how='inner'), all_df)

df_all_pop.columns = ['chrom', 'start', 'end', 'EUR_het', 'SAS_het', 'EAS_het', 'AMR_het', 'AFR_het']
df_all_pop['end'] = df_all_pop['end'] - 1
df_all_pop = pd.merge(df_all_pop, old_overlaps_entire, left_on = ['chrom', 'start', 'end'], 
                      right_on = ['chrom', 'TR_start', 'TR_end'], how = "outer")
df_all_pop[df_all_pop['gene_start'].isnull()]

Unnamed: 0,chrom,start,end,EUR_het,SAS_het,EAS_het,AMR_het,AFR_het,TR_start,TR_end,chrom2,gene_start,gene_end,Gene_name,y,z
29,chr1,4125749,4125772,0.00000,0.00000,0.00171,0.00000,0.00000,,,,,,,,
30,chr1,4608759,4608777,0.48600,0.50500,0.50100,0.43900,0.45800,,,,,,,,
59,chr1,9077399,9077448,0.41700,0.55500,0.67900,0.57600,0.72100,,,,,,,,
64,chr1,10093692,10093712,0.00158,0.01160,0.00000,0.00815,0.00337,,,,,,,,
102,chr1,18354405,18354419,0.00000,0.00167,0.00000,0.00000,0.00000,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6189,chr19,52315319,52315335,0.01100,0.00831,0.00514,0.01820,0.13300,,,,,,,,
6356,chr20,43265573,43265584,0.01100,0.02140,0.00000,0.00204,0.00000,,,,,,,,
6399,chr20,60416325,60416335,0.00000,0.00167,0.00000,0.01420,0.00336,,,,,,,,
6400,chr20,61456022,61456057,0.00158,0.00000,0.00000,0.00000,0.00112,,,,,,,,


In [20]:
old_overlaps_entire[old_overlaps_entire['TR_start'] == 4125759]

Unnamed: 0,chrom,TR_start,TR_end,chrom2,gene_start,gene_end,Gene_name,y,z
