This notebook makes BED files for different regions of the genome from a GFF annotation, histone-modification ChIP data, and methylation data. These BED files are then used to calculate mutation rate differences in my duplex-seq data.

In [None]:
import sys
import os
sys.path.append(os.getcwd() + '/../python_scripts') # this lets us import files in python_scripts (like gtools)
import gtools
# All data files live on the scratch filesystem. If the notebook was launched
# from somewhere else, move to the scratch directory whose name matches the
# second-to-last component of the current working directory.
if not os.getcwd().startswith('/scratch'):
    os.chdir('/scratch/cam02551/' + os.getcwd().split("/")[-2])

import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from tqdm import tqdm
import numpy as np
plt.rcParams['svg.fonttype'] = 'none'

In [None]:
# create the directory that holds the intermediate BED files (no error if it already exists)
!mkdir -p tmp/bed_regions

# Make BED files of gff element types

In [None]:
# Collect the representative gene models; only transcripts in this set are used downstream.
# (Confirmed separately that every gene in TAIR10 has exactly 1 representative transcript in the list.)
# Lines that do not start with 'AT' are ignored.
with open('data/ref/TAIR10_representative_gene_models.txt', 'r') as handle:
    rep_gene_models = {line.strip() for line in handle if line.startswith('AT')}
len(rep_gene_models)

In [None]:
# Read the GFF annotation, shift the start coordinates to 0-based,
# and drop the ChrC and ChrM records.
gff_cols = ['chrom', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'att']
df_gff = pd.read_table('data/ref/ref.gff', names=gff_cols)
df_gff['start'] = df_gff['start'] - 1
df_gff = df_gff[~df_gff['chrom'].isin(['ChrC', 'ChrM'])]
# sanity check: number of mRNA, exon, CDS, and five_prime_UTR records
print(*(sum(df_gff['type'] == element) for element in ('mRNA', 'exon', 'CDS', 'five_prime_UTR')))
df_gff

In [None]:
# Derive a model identifier for every GFF record from its attribute column.
# The ID attribute is used when present, otherwise the Parent attribute; the value is
# cut at the first ';', then the first ',', then the first '-'. Records tied to a
# specific transcript end up with a gene model (ATxxxxx.n), the others with a plain ID (ATxxxxx).
def _model_from_attributes(attributes):
    key = 'ID=' if 'ID=' in attributes else 'Parent='
    value = attributes.split(key, 1)[1]
    for delimiter in (';', ',', '-'):
        value = value.split(delimiter)[0]
    return value

models = [_model_from_attributes(att) for att in df_gff.att]
df_gff['model'] = models

In [None]:
# Keep only elements that belong to a representative transcript. Models without a '.'
# are not transcript-specific and are always kept.
# (Previously confirmed: no element type is completely removed by this filter.)
def _is_representative(model):
    return '.' not in model or model in rep_gene_models

df_gff['representative'] = df_gff['model'].map(_is_representative)
df_gff = df_gff[df_gff['representative']]
# same sanity-check counts as after loading, now post-filter
print(*(sum(df_gff['type'] == element) for element in ('mRNA', 'exon', 'CDS', 'five_prime_UTR')))

In [None]:
# # check amount of genome corresponding to each element type
# for t in set(df_gff.type):
#     df_tmp = df_gff[df_gff.type == t]
#     size = sum(df_tmp.end - df_tmp.start)
#     print(f'{t}\t{len(df_tmp)}\t{size / total_genome_size}')
#     print(df_tmp.iloc[0].att + '\n')

In [None]:
# Read the reference genome and tally its total length over all sequences.
# (gtools.load_genome is used as a mapping here: one sized entry per sequence name.)
genome = gtools.load_genome('data/ref/ref.fa')
total_genome_size = sum(map(len, genome.values()))

In [None]:
# Write one 6-column BED file per GFF element type, using the model ID as the BED name column
bed_columns = ['chrom', 'start', 'end', 'name', 'score', 'strand']
for element_type in set(df_gff.type):
    df_bed = (
        df_gff.loc[df_gff['type'] == element_type]
        .assign(name=lambda frame: frame['model'])
        .loc[:, bed_columns]
    )
    df_bed.to_csv(f'tmp/bed_regions/gff_{element_type}.bed', sep='\t', header=False, index=False)

# Load Keith Slotkin's TE annotation

In [None]:
# Read the TE annotation BED ('#' lines are treated as comments) and drop ChrC / ChrM entries
te_cols = [
    'chrom', 'start', 'end', 'teid', 'score', 'strand', 'sub_fam', 'fam', 'sup_fam', 'len',
    'len_cat', 'copy_num', 'copy_cat', 'dist_cent', 'pos_cat', 'dist_gene', 'rddm_silent',
    'rddm_active', 'exp', 't_start', 't_stop', 't_strand', 't_id',
]
df_tes = pd.read_table(
    'data/ref/AthalianaTETranscripts/Panda_AT-TEs_annotation_v1.0.bed',
    names=te_cols,
    comment='#',
)
df_tes = df_tes[~df_tes['chrom'].isin(['ChrC', 'ChrM'])]

In [None]:
# display the filtered TE annotation table for inspection
df_tes

In [None]:
# Export the TE annotation as a 6-column BED (chrom, start, end, TE id, score, strand)
df_bed = df_tes.loc[:, ['chrom', 'start', 'end', 'teid', 'score', 'strand']].copy()
df_bed.to_csv('tmp/bed_regions/panda_te.bed', sep='\t', header=False, index=False)

# Make 5 exclusive categories

In [None]:
# Commands for making final exon, intron, promoter, TE, and intergenic BEDs
# these should be run from the tmp/bed_regions directory
# note: TE genes have exons, but no mRNA
# Precedence when categories overlap: TE bases are removed from exons, introns, and promoters,
# so a base covered by a TE is always counted as TE.

# gene intervals restricted to the parts that overlap an mRNA record
bedtools intersect -a gff_gene.bed -b gff_mRNA.bed > protein_mRNA.bed

# TEs are done
cp panda_te.bed pure_te.bed

# only use exons in genes and not in TEs
bedtools intersect -a gff_exon.bed -b protein_mRNA.bed > mrna_exon.bed
bedtools subtract -a mrna_exon.bed -b panda_te.bed > pure_exon.bed

# subtract TEs and exons from mRNAs, introns now done
bedtools subtract -a protein_mRNA.bed -b panda_te.bed > non_te_mRNA.bed
bedtools subtract -a non_te_mRNA.bed -b gff_exon.bed > pure_intron.bed

# extend mRNAs 500bp upstream, subtract mRNAs and TEs, promoters now done
# (-s makes -l strand-aware, so the 500bp is added on the 5' side of each feature)
bedtools slop -l 500 -r 0 -s -g ../../data/ref/ref.fa.fai -i protein_mRNA.bed > mRNA_slop.bed
bedtools subtract -a mRNA_slop.bed -b protein_mRNA.bed > upstream_mRNA.bed
bedtools subtract -a upstream_mRNA.bed -b panda_te.bed > pure_promoter.bed

# subtract exons, introns, promoters, and TEs from everything, intergenic now done
# int1.bed / int2.bed are scratch files; int1.bed is overwritten in the third step
# (its earlier contents were already consumed to make int2.bed)
bedtools subtract -a gff_chromosome.bed -b pure_te.bed > int1.bed
bedtools subtract -a int1.bed -b pure_exon.bed > int2.bed
bedtools subtract -a int2.bed -b pure_intron.bed > int1.bed
bedtools subtract -a int1.bed -b pure_promoter.bed > pure_intergenic.bed

# sort and merge each bed with itself to get rid of overlapping elements (mostly only happens for promoters)
# the merged results are written to ../../data/region/ as <region>.bed
bedtools sort -i pure_te.bed | bedtools merge -i stdin > ../../data/region/te.bed
bedtools sort -i pure_exon.bed | bedtools merge -i stdin > ../../data/region/exon.bed
bedtools sort -i pure_intron.bed | bedtools merge -i stdin > ../../data/region/intron.bed
bedtools sort -i pure_promoter.bed | bedtools merge -i stdin > ../../data/region/promoter.bed
bedtools sort -i pure_intergenic.bed | bedtools merge -i stdin > ../../data/region/intergenic.bed

In [None]:
# Total size (bp) of each exclusive region class, summed from its merged BED file.
# The bedtools commands in this notebook write data/region/<region>.bed, whereas this cell
# only looked for data/region/final_<region>.bed. The final_ name is still preferred when it
# exists (unchanged behaviour); otherwise fall back to the name the commands actually produce.
sizes = dict()
for region in 'exon intron te promoter intergenic'.split():
    bed_path = f'data/region/final_{region}.bed'
    if not os.path.exists(bed_path):
        bed_path = f'data/region/{region}.bed'
    df_tmp = pd.read_table(bed_path, names='chrom start end'.split())
    # intervals are half-open, so each one contributes end - start bases
    sizes[region] = sum(df_tmp.end - df_tmp.start)

In [None]:
# Pie chart of the relative size of each exclusive region class, saved as SVG
region_names = sizes.keys()
region_sizes = sizes.values()
fig, ax = plt.subplots()
ax.pie(region_sizes, labels=region_names, autopct='%1.1f%%', textprops={'size': 14})
fig.savefig('figs/exclusive_bed_region_sizes.svg', bbox_inches='tight', dpi=300)

# Genes

In [None]:
# sort and merge the mRNA-overlapping gene intervals into one non-overlapping gene region BED
# (relative paths assume this is run from tmp/bed_regions, where protein_mRNA.bed is written)
bedtools sort -i protein_mRNA.bed | bedtools merge -i stdin > ../../data/region/protein_mRNA.bed

# ACRs

In [None]:
# ACRs = union of the peaks from the two ATAC peak files: concatenate both,
# then sort and merge so overlapping peaks collapse into single intervals
# (relative paths assume this is run from tmp/bed_regions)
cat ../../data/peak/zefu_atac_1_peaks.narrowPeak ../../data/peak/zefu_atac_2_peaks.narrowPeak > union_acrs.bed
bedtools sort -i union_acrs.bed | bedtools merge -i stdin > ../../data/region/acrs.bed

# Pericentromere

In [None]:
# centromere positions determined by center of CEN178 arrays from PMID 37198485
# NOTE(review): the comment above and the one on the active dict cite different papers — confirm the source
# centromere_pos = {'Chr1':16029005, 'Chr2':5731747, 'Chr3':14711236, 'Chr4':6595103, 'Chr5':13790251} # don't remember the source of this
centromere_pos = {
    'Chr1': 15256434,
    'Chr2': 3697368,
    'Chr3': 13599663,
    'Chr4': 3941890,
    'Chr5': 11880011,
}  # this is from the Col-CEN paper (34762468) table S1 and figure S3, center of the centromere
# one 1-bp BED interval per chromosome, starting at the centromere center
with open('tmp/bed_regions/centromeres.bed', 'w') as bed_out:
    bed_out.writelines(
        f'{chrom}\t{center}\t{center + 1}\n' for chrom, center in centromere_pos.items()
    )

In [None]:
# define the pericentromere as a 10Mb region centered on the centromere repeats
# (5 Mb is added on each side of the 1-bp centromere interval; run from tmp/bed_regions)
# NOTE(review): bedtools slop clips at chromosome boundaries, so for Chr2 and Chr4, whose
# centromere positions are less than 5 Mb from the chromosome start, the region will be shorter than 10 Mb
bedtools slop -b 5000000 -g ../../data/ref/ref.fa.fai -i centromeres.bed > ../../data/region/pericentromere.bed

# Methylated cytosines

In [None]:
# Load the per-cytosine methimpute table and shift positions down by one (to 0-based coordinates).
# NOTE(review): with names= given, header=1 makes pandas treat the second line of the file as the
# header row and discard everything up to and including it — confirm the file really has two
# leading non-data lines, otherwise the first data row is dropped.
met_cols = [
    'chrom', 'pos', 'strand', 'context', 'met', 'total',
    'post_max', 'post_met', 'post_unmet', 'status', 'meth_lvl',
]
df_met = pd.read_table('data/methyl/bewick_bisulfite_1_methimpute.tsv', names=met_cols, header=1)
df_met['pos'] = df_met['pos'] - 1


In [None]:
# Non-conversion rate estimated from ChrM: summed `met` counts over summed `total` counts
chrm_sites = df_met[df_met.chrom == 'ChrM']
nonconv_rate = sum(chrm_sites.met) / sum(chrm_sites.total)
print('nonconversion rate', nonconv_rate)

In [None]:
# Stacked histograms for Chr1 sites, split by methimpute status:
#   top panel    - distribution of the `total` count per site
#   bottom panel - distribution of met / total per site (log-scaled y axis)
df_tmp = df_met[df_met.chrom == 'Chr1']
fig, axs = plt.subplots(2)

coverage_bins = range(40)
ratio_bins = [x / 20 for x in range(22)]
coverage_base = None  # running stack height on the top panel
ratio_base = None  # running stack height on the bottom panel
for status in ('Unmethylated', 'Methylated', 'Intermediate'):
    rows = df_tmp[df_tmp.status == status]

    counts = axs[0].hist(rows.total, bins=coverage_bins, bottom=coverage_base)[0]
    coverage_base = counts if coverage_base is None else coverage_base + counts

    counts = axs[1].hist(rows.met / rows.total, bins=ratio_bins, bottom=ratio_base)[0]
    ratio_base = counts if ratio_base is None else ratio_base + counts
axs[1].set_yscale('log')

# legend entries follow the order in which the statuses were drawn
axs[0].legend(['unmethylated', 'methylated', 'intermediate'])
plt.show()


In [None]:
# inspect the rows of df_tmp whose position lies strictly inside a small window
window_start, window_end = 9585650, 9585750
df_tmp[(window_start < df_tmp.pos) & (df_tmp.pos < window_end)]

In [None]:
# OLD code where I tried to call cytosines as methylated myself

# # calculate the probability of at least j nonconversions out of i covering reads
# stat_table = np.zeros((200, 200))
# for i in range(100):
#     for j in range(i):
#         stat_table[j, i] = 1 - stats.binom.cdf(j - 1, i, nonconv_rate)

# import scipy.stats as stats
# calls = []
# for r in tqdm(df_met.itertuples(), total=len(df_met)):
#     # for abnormally high coverage, don't call
#     if r.c >= 200:
#         calls.append(None)
#         continue
    
#     # calculate the probability of observing at least this many nonconversions
#     p_nonconv = stat_table[r.mc, r.c]
    
#     if r.c < 3: # for low coverage, don't call
#         calls.append(None)
#     elif p_nonconv < frac_met: # if it's unlikely to get this many nonconversions, call it as methylated
#         calls.append(True)
#     else:
#         calls.append(False)
# df_met.methylated = calls

In [None]:
# Split cytosines into methylated / unmethylated BED files per sequence context (CG, CHG, CHH).
# Sites methimpute labels 'Methylated' go to the met_* files and all other retained sites go to
# the unmet_* files. Sites with status 'Intermediate' or with total == 0 are skipped entirely.
from contextlib import ExitStack

# (context, is_methylated) -> output path
bed_paths = {
    ('cg', True): 'data/region/met_cg.bed',
    ('cg', False): 'data/region/unmet_cg.bed',
    ('chg', True): 'data/region/met_chg.bed',
    ('chg', False): 'data/region/unmet_chg.bed',
    ('chh', True): 'data/region/met_chh.bed',
    ('chh', False): 'data/region/unmet_chh.bed',
}

# ExitStack closes every output file even if the loop raises part-way through
# (the previous open()/close() pairs leaked the handles in that case)
with ExitStack() as stack:
    bed_files = {key: stack.enter_context(open(path, 'w')) for key, path in bed_paths.items()}

    for r in tqdm(df_met.itertuples(), total=len(df_met)):
        if r.status == 'Intermediate' or r.total == 0:
            continue

        # classify by where the G sits: second base -> CG, third base -> CHG, otherwise CHH
        if r.context[1] == 'G':
            context = 'cg'
        elif r.context[2] == 'G':
            context = 'chg'
        else:
            context = 'chh'

        bed_files[context, r.status == 'Methylated'].write(f'{r.chrom}\t{r.pos}\t{r.pos + 1}\n')

In [None]:
# cg_met_gene = [False] * len(df_met)
# cg_unmet_gene = [False] * len(df_met)
# cn_met_nongene = [False] * len(df_met)
# cn_unmet_nongene = [False] * len(df_met)
# for i, row in tqdm(enumerate(df_met.itertuples()), total=len(df_met)):
#     if gene_arr[row.chrom][row.pos] == True and row.context[1] == 'G': # if CG in a gene
#         if (row.mc > 2) and (row.mc / row.c > 0.3):
#             cg_met_gene[i] = True
#         else:
#             cg_unmet_gene[i] = True
    
#     if gene_arr[row.chrom][row.pos] == False:
#         if (row.mc > 2) and (row.mc / row.c > 0.1):
#             cn_met_nongene[i] = True
#         else:
#             cn_unmet_nongene[i] = True

In [None]:
# bed_met = df_met.copy()
# bed_met['end'] = bed_met.pos + 1
# bed_met = bed_met['chrom pos end context mc strand'.split()]

In [None]:
# bed_met

In [None]:
# bed_met[cg_met_gene].to_csv('../data/regions/cg_met_gene.bed', index=False, header=False, sep='\t')
# bed_met[cg_unmet_gene].to_csv('../data/regions/cg_unmet_gene.bed', index=False, header=False, sep='\t')
# bed_met[cn_met_nongene].to_csv('../data/regions/cn_met_nongene.bed', index=False, header=False, sep='\t')
# bed_met[cn_unmet_nongene].to_csv('../data/regions/cn_unmet_nongene.bed', index=False, header=False, sep='\t')

In [None]:
# regions near a methylated cytosine
# all methylated sites (CG, CHG, CHH) are padded by 50 bp on both sides, then sorted and merged
# note: unlike the tmp/bed_regions commands, these paths are relative to the project root
cat data/region/met_cg.bed data/region/met_chg.bed data/region/met_chh.bed | bedtools slop -i stdin -g data/ref/ref.fa.fai -b 50 | bedtools sort -i stdin | bedtools merge -i stdin > data/region/near_met.bed


# Estimate RdDM targets

In [None]:
# Write every methylated non-CG cytosine (second context base is not G) to a 6-column BED file.
# A site is written when its `met` count is above 2 and met / total is above 0.3.
# Fix: df_met's columns are named `met` and `total`; the `row.mc` / `row.c` attributes used
# before do not exist on these rows and raised AttributeError.
with open('tmp/mCH_sites.bed', 'w') as f:
    # per-chromosome boolean array with one entry per base; it is not filled in by this cell
    met_arr = {chrom:np.zeros(len(genome[chrom]), dtype=bool) for chrom in genome}
    for row in tqdm(df_met.itertuples(), total=len(df_met)):
        # met > 2 is checked first, so the division is only evaluated when total is non-zero
        # (assuming met never exceeds total)
        if row.context[1] != 'G' and (row.met > 2) and (row.met / row.total > 0.3):
            f.write(f'{row.chrom}\t{row.pos}\t{row.pos + 1}\tmCH\t.\t{row.strand}\n')

In [None]:
# estimated RdDM targets = mCH peak regions with the H3K9me2 ChIP peak regions subtracted out
# NOTE(review): tmp/mCH_peaks.broadPeak is not produced by any visible cell — presumably from a
# peak caller run on tmp/mCH_sites.bed; confirm
# NOTE(review): the ../data/regions/ paths differ from the data/region/ and data/peak/ locations
# used by the other cells — confirm the intended working directory and directory names
bedtools subtract -a tmp/mCH_peaks.broadPeak -b ../data/regions/bewick_chip-h3k9me2_1_peaks.broadPeak > ../data/regions/est_rddm_targets.bed