In [57]:
import pandas as pd
from functools import reduce
from cyvcf2 import VCF

In [2]:
# Read Coding coordinates
gene_regions = pd.read_csv("/gymreklab-tscc/helia/ensembl/experiments/coding_regions/coding_regions_sorted_corrected.bed", 
                           sep="\t", header=None)
gene_regions.columns = ['chrom', 'start', 'end', 'name', 'x', 'strand']

# Read TR coordinates
TR_regions = pd.read_csv("/gymreklab-tscc/helia/ensembl/experiments/coding_regions/all_TR_coordinates.bed",
                         sep="\t", header=None)
TR_regions.columns = ['chrom', 'start', 'end']

In [3]:
def create_input(chrom):
    gene_chrom = gene_regions[gene_regions['chrom'] == chrom]
    gene_chrom = gene_chrom.drop_duplicates(subset = ['chrom', 'start', 'end'])
    
    TR_chrom = TR_regions[TR_regions['chrom'] == chrom]
    TR_chrom = TR_chrom.drop_duplicates(subset = ['chrom', 'start', 'end'])
    
    with open(f"inputs/input_{chrom}.txt", "w") as f:
        f.write(f"{len(gene_chrom)} {len(TR_chrom)}")
        f.write("\n")
        for index,row in gene_chrom.iterrows():
            f.write(f"{row['start']} {row['end']}")
            f.write("\n")
            
        for index,row in TR_chrom.iterrows():
            f.write(f"{row['start']} {row['end']}")
            f.write("\n")  

In [14]:
#### Writing TR and coding regions to input file ####
for chrom in range(1,23):
    create_input(f"chr{chrom}")   

In [15]:
#### calling overlap_finder.cpp to find overlap and write them to output ####
!g++ overlap_finder.cpp --std c++11
for chrom in range(1,23):
    input_addr = f"inputs/input_chr{chrom}.txt"
    output_addr = f"inputs/overlap_chr{chrom}.txt"
    !./a.out < $input_addr > $output_addr
    

In [4]:
#### Load overlaps and merge ####
overlaps = pd.DataFrame(columns = ['chrom','TR_start', 'TR_end', 'gene_start', 'gene_end'])
for chrom in range(1,23):
    overlaps_chrom = pd.read_csv(f"inputs/overlap_chr{chrom}.txt", sep = "\t", header = None)
    overlaps_chrom.columns = ['TR_start', 'TR_end', 'gene_start', 'gene_end']
    overlaps_chrom['chrom'] = "chr" + str(chrom)
    overlaps = pd.concat([overlaps, overlaps_chrom])
overlaps['id'] = "id"
overlaps = overlaps.drop_duplicates(subset = ['chrom','TR_start', 'TR_end'])
overlaps[['TR_start', 'TR_end', 'gene_start', 'gene_end']] = overlaps[['TR_start', 'TR_end', 'gene_start', 'gene_end']].astype(int)
overlaps

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  import sys


Unnamed: 0,TR_end,TR_start,chrom,gene_end,gene_start,id
0,960798,960770,chr1,960800,960693,id
1,970857,970841,chr1,971006,970685,id
2,976199,976178,chr1,976269,976171,id
3,1232644,1232633,chr1,1233268,1232278,id
4,1232818,1232807,chr1,1233268,1232278,id
...,...,...,...,...,...,...
157,50603361,50603316,chr22,50603498,50603222,id
158,50604160,50604149,chr22,50605064,50603840,id
159,50604573,50604544,chr22,50605064,50603840,id
160,50604557,50604546,chr22,50605064,50603840,id


In [33]:
#### Compare with previous overlaps found with bedtools where TR shouldn't be entirely inside coding region ####

old_overlaps = pd.read_csv("/gymreklab-tscc/helia/ensembl/experiments/coding_regions/mapping.txt", sep = "\t", header = None)

cnt = 0
old_overlaps_entire = []
for index, row in old_overlaps.iterrows():
    if row[1] >= row[4] and row[2] <= row[5]: # check for entirely being inside the gene
        old_overlaps_entire.append(row)
old_overlaps_entire = pd.DataFrame(old_overlaps_entire)
old_overlaps_entire.columns =  ['chrom','TR_start', 'TR_end','chrom2', 'gene_start', 'gene_end','Gene_name','y','z']
old_overlaps_entire = old_overlaps_entire.drop_duplicates(subset = ['chrom','TR_start', 'TR_end'])
old_overlaps_entire[['TR_start', 'TR_end', 'gene_start', 'gene_end']] = old_overlaps_entire[['TR_start', 'TR_end', 'gene_start', 'gene_end']].astype(int)
compare = pd.merge(old_overlaps_entire, overlaps, left_on = ['chrom', 'TR_start', 'TR_end'], right_on = ['chrom', 'TR_start', 'TR_end'], how = "outer")
print(f"#TRs found by cpp code: {len(overlaps)}, #TRs found by bedtools: {len(old_overlaps_entire)}, #TRs found by both: {len(compare)}")
      
      
      

#TRs found by cpp code: 6492, #TRs found by bedtools: 6492, #TRs found by both: 6492


In [32]:
old_overlaps_entire[['chrom','TR_start','TR_end']].to_csv("TR_intersect.txt", sep = "\t", header=False, index = False)

In [51]:
#### Reading heterozygousity outputs ####

AFR_df = pd.DataFrame(columns = ["chrom","start","end","het-1"])
AMR_df = pd.DataFrame(columns = ["chrom","start","end","het-1"])
EAS_df = pd.DataFrame(columns = ["chrom","start","end","het-1"])
SAS_df = pd.DataFrame(columns = ["chrom","start","end","het-1"])
EUR_df = pd.DataFrame(columns = ["chrom","start","end","het-1"])

all_df = [EUR_df, SAS_df, EAS_df, AMR_df, AFR_df]


    

pops = ['EUR', 'SAS', 'EAS', 'AMR', 'AFR']

for i in range(5):
    for chrom in range(1,23):
        df = pd.read_csv(f"/gymreklab-tscc/helia/ensembl/experiments/coding_regions/heterozygosity/coding_het_{pops[i]}_{chrom}.tab",sep="\t")
        all_df[i] = pd.concat([all_df[i], df])
    
    all_df[i]['end'] = all_df[i]['end'] - 1
    all_df[i] = pd.merge(all_df[i], old_overlaps_entire, left_on = ['chrom', 'start', 'end'], 
                       right_on = ['chrom', 'TR_start', 'TR_end'], how = "inner")
    all_df[i] = all_df[i].drop_duplicates(subset = ["chrom", "start","end"])
    all_df[i] = all_df[i][['chrom', 'start', 'end', "Gene_name", 'het-1']]
    

In [55]:
### Merging all pops ####

df_all_pop = reduce(lambda  left,right: pd.merge(left,right,on=['chrom','start','end', 'Gene_name'],
                                            how='inner'), all_df)
df_all_pop.columns = ['chrom', 'start', 'end', 'gene_name', 'EUR_het', 'SAS_het', 'EAS_het', 'AMR_het', 'AFR_het']
display(df_all_pop)


Unnamed: 0,chrom,start,end,gene_name,EUR_het,SAS_het,EAS_het,AMR_het,AFR_het
0,chr1,960770,960798,ENST00000338591.8_cds_0_0_chr1_960694_f,0.00321,0.00000,0.00000,0.0,0.00233
1,chr1,970841,970857,ENST00000379409.6_cds_4_0_chr1_970686_f,0.00000,0.00000,0.00000,0.0,0.00000
2,chr1,976178,976199,ENST00000341290.6_cds_0_0_chr1_976172_r,0.00000,0.00332,0.00000,0.0,0.00000
3,chr1,1232633,1232644,ENST00000379198.5_cds_0_0_chr1_1232279_f,0.00000,0.00000,0.00000,0.0,0.00000
4,chr1,1232807,1232818,ENST00000379198.5_cds_0_0_chr1_1232279_f,0.00000,0.00000,0.00000,0.0,0.00000
...,...,...,...,...,...,...,...,...,...
6487,chr22,50603316,50603361,ENST00000329492.6_cds_2_0_chr22_50603223_f,0.00000,0.00000,0.00341,0.0,0.00447
6488,chr22,50604149,50604160,ENST00000329492.6_cds_4_0_chr22_50603841_f,0.00000,0.00000,0.00000,0.0,0.00000
6489,chr22,50604544,50604573,ENST00000329492.6_cds_4_0_chr22_50603841_f,0.00000,0.00000,0.00684,0.0,0.00000
6490,chr22,50604546,50604557,ENST00000329492.6_cds_4_0_chr22_50603841_f,0.00000,0.00000,0.00000,0.0,0.00000


In [60]:
#### Load Motif ####
motifs = []
for chrom in range(1,23):
    vcf = VCF(f"/gymreklab-tscc/helia/ensembl/experiments/coding_regions/heterozygosity/coding_calls_{chrom}.vcf.gz")
    for record in vcf:
        motifs.append([record.CHROM, record.POS, record.INFO['END'], record.INFO['RU']])
motifs = pd.DataFrame(motifs, columns = ['chrom', 'start', 'end', 'motif'])
motifs = motifs.drop_duplicates(subset = ["chrom", "start","end"])
df_all_pop = pd.merge(df_all_pop, motifs, on = ['chrom', 'start', 'end'])
df_all_pop.to_csv("Supplementary_Table_7.csv", index = False, sep = "\t")