In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
%matplotlib inline
import os
import pandas as pd
import numpy as np
import sys
from collections import Counter
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import gridspec
from matplotlib.patches import Rectangle
from matplotlib.lines import Line2D
import scipy.stats
from mpl_toolkits.axes_grid1 import make_axes_locatable

import Bio
from Bio.Alphabet import IUPAC
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast import NCBIXML
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import pairwise2
from Bio import SeqIO
from Bio.Graphics import GenomeDiagram
from Bio.SeqUtils import GC
from Bio.Align.Applications import MuscleCommandline
from StringIO import StringIO
from Bio import AlignIO
from Bio.Align import AlignInfo
from Bio.Seq import MutableSeq

#for exporting to Adobe Illustrator
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42

In [3]:
plt.style.use('ggplot')
plt.rcParams['lines.linewidth']=0
plt.rcParams['axes.facecolor']='1.0'
plt.rcParams['xtick.color']='black'
plt.rcParams['axes.grid']=False
plt.rcParams['axes.edgecolor']='black'
plt.rcParams['grid.color']= '1.0'
plt.rcParams.update({'font.size': 10})
plt.rc('font', family='serif')

####################################################################################################################################################################################

## [0] Load in Hs from *TopDis (spacer length = 4)* for INDELs, and get top 10 HTs & top 10 SSRs

####################################################################################################################################################################################

In [4]:
#load in INDELs
all_homoplasic_INDELs = pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/CSV_files/homoplasies_detected_in_global_lineages/INDEL homoplasies/homoplasy_count_across_lineages_spacer_4.pkl')

In [5]:
all_homoplasic_INDELs.head()

Unnamed: 0,pos,ref,alt,gene_category,gene_name,gene_id,gene_pos,ins_del,INDEL_type,codon_pos,...,1,2,3,4A,4B,4C,5,6,total,num_isolates
47371,4408100,GC,G,Antibiotic Resistance,gid,Rv3919c,103,del,frameshift,35,...,41.0,27.0,51.0,43.0,11.0,29.0,0.0,0.0,202.0,388
47365,4408087,CG,C,Antibiotic Resistance,gid,Rv3919c,116,del,frameshift,39,...,23.0,11.0,35.0,33.0,10.0,18.0,0.0,0.0,130.0,212
47313,4407851,GC,G,Antibiotic Resistance,gid,Rv3919c,352,del,frameshift,118,...,21.0,9.0,40.0,20.0,6.0,19.0,0.0,0.0,115.0,182
25151,2288850,A,ACC,Antibiotic Resistance,pncA,Rv2043c,392,ins,frameshift,131,...,1.0,14.0,6.0,3.0,6.0,2.0,0.0,0.0,32.0,48
47315,4407851,G,GC,Antibiotic Resistance,gid,Rv3919c,352,ins,frameshift,118,...,5.0,1.0,12.0,5.0,7.0,1.0,0.0,0.0,31.0,52


In [6]:
np.shape(all_homoplasic_INDELs)

(46306, 22)

Note - SNPs & INDELs with homoplasy score $ = 0$ were dropped from DataFrames

#### mark *mmpR* (Rv0678) as an Antibiotic Resistance gene

In [7]:
all_homoplasic_INDELs.loc[all_homoplasic_INDELs['gene_id'] == 'Rv0678', 'gene_category'] = 'Antibiotic Resistance'
all_homoplasic_INDELs.loc[all_homoplasic_INDELs['gene_id'] == 'Rv0678', 'gene_name'] = 'mmpR'

all_homoplasic_INDELs.loc[all_homoplasic_INDELs['gene_id'] == 'Rv3696c', 'gene_category'] = 'Antibiotic Resistance'

####################################################################################################################################################################################

## [1] Separate INDELs into those occcurring in HT regions, SSR regions and those not in either

####################################################################################################################################################################################

### [1.1] HT

In [8]:
homopolymeric_regions_from_Luca = pd.read_csv('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/CSV_files/comparison_with_homopolymeric_regions/homopolymer regions from Luca/list_homopolymers_7bpmore_pure.csv')

**Note 1:** Luca's scheme starts at -1bp relative to the HT track since that's where variants are usually called

**Note 2:** two HT regions in *PPE13* are located right next to eachother, change chromEnd position of the *TTTTTTTT* HT so we don't double count indels

In [9]:
homopolymeric_regions_from_Luca[homopolymeric_regions_from_Luca.gene_name == 'PPE13']

Unnamed: 0,Chromosome,chromStart,chromEnd,polyNT,locus_tag,gene_name
80,NC_000962.3,976889,976897,TTTTTTTT,Rv0878c,PPE13
103,NC_000962.3,976897,976906,GGGGGGGGG,Rv0878c,PPE13


In [10]:
homopolymeric_regions_from_Luca.loc[80,'chromEnd'] = 976896

create a set of all Reference Positions that occur within a Homopolymeric Tract

In [11]:
all_HT_ref_postions = []
for HT_i_start, HT_i_end in zip(homopolymeric_regions_from_Luca.chromStart, homopolymeric_regions_from_Luca.chromEnd):
    
    HT_i_ref_positions = range(HT_i_start, HT_i_end+1)
    all_HT_ref_postions = all_HT_ref_postions + HT_i_ref_positions
    
all_HT_ref_postions = set(all_HT_ref_postions)

In [12]:
len(all_HT_ref_postions)

1160

### [1.2] SSR

In [13]:
SSR_H37Rv_df = pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/CSV_files/simple_sequence_repeat_regions/SSR_H37Rv_2-6bp_at_least_3_repeats.pkl')

In [14]:
SSR_H37Rv_df.head()

Unnamed: 0,H37Rv_start,H37Rv_end,motif,repeats
0,561,569,CGC,3
1,562,570,GCC,3
2,643,648,AC,3
3,1121,1126,GC,3
4,1408,1413,CA,3


In [15]:
H37Rv_SSR_positions_list = []
for SSR_i_start, SSR_i_end in zip(SSR_H37Rv_df.H37Rv_start, SSR_H37Rv_df.H37Rv_end):
    
    SSR_i_H37Rv_coords = list(np.arange(SSR_i_start, SSR_i_end + 1))
    
    H37Rv_SSR_positions_list = H37Rv_SSR_positions_list + SSR_i_H37Rv_coords
    
H37Rv_SSR_positions_list = list(set(H37Rv_SSR_positions_list)) # remove duplicate positions from overlapping SSR regions

start at -1bp relative to the SSR track since that's where variants are usually called

In [16]:
SSR_H37Rv_df.loc[:, 'H37Rv_start'] = SSR_H37Rv_df.H37Rv_start - 1

In [17]:
SSR_H37Rv_df.head()

Unnamed: 0,H37Rv_start,H37Rv_end,motif,repeats
0,560,569,CGC,3
1,561,570,GCC,3
2,642,648,AC,3
3,1120,1126,GC,3
4,1407,1413,CA,3


**Note:** use this list when sorting INDEL variants that occur within an SSR region

In [18]:
H37Rv_SSR_positions_list = []
for SSR_i_start, SSR_i_end in zip(SSR_H37Rv_df.H37Rv_start, SSR_H37Rv_df.H37Rv_end):
    
    SSR_i_H37Rv_coords = list(np.arange(SSR_i_start, SSR_i_end + 1))
    
    H37Rv_SSR_positions_list = H37Rv_SSR_positions_list + SSR_i_H37Rv_coords
    
H37Rv_SSR_positions_list = list(set(H37Rv_SSR_positions_list)) # remove duplicate positions from overlapping SSR regions

In [19]:
len(H37Rv_SSR_positions_list)

112673

set of all Reference Positions that occur within a SSR region (including positions at -1bp relative to SSR region)

In [20]:
H37Rv_SSR_positions = set(H37Rv_SSR_positions_list)

In [21]:
len(H37Rv_SSR_positions)

112673

### [1.3] Annotate each INDEL based on whether it is in a SSR or HT or other

iterate through all INDELs and figure out which occur in SSR and those that don't

In [22]:
repeat_region_list = []
for indel_i_pos in all_homoplasic_INDELs.pos:
    
    if indel_i_pos in all_HT_ref_postions:
        repeat_region_list.append('HT')
    
    elif indel_i_pos in H37Rv_SSR_positions:
        repeat_region_list.append('SSR')
        
    else:
        repeat_region_list.append('no')
        
all_homoplasic_INDELs.loc[:,'repeat_region'] = repeat_region_list

In [23]:
all_homoplasic_INDELs.head()

Unnamed: 0,pos,ref,alt,gene_category,gene_name,gene_id,gene_pos,ins_del,INDEL_type,codon_pos,...,2,3,4A,4B,4C,5,6,total,num_isolates,repeat_region
47371,4408100,GC,G,Antibiotic Resistance,gid,Rv3919c,103,del,frameshift,35,...,27.0,51.0,43.0,11.0,29.0,0.0,0.0,202.0,388,no
47365,4408087,CG,C,Antibiotic Resistance,gid,Rv3919c,116,del,frameshift,39,...,11.0,35.0,33.0,10.0,18.0,0.0,0.0,130.0,212,no
47313,4407851,GC,G,Antibiotic Resistance,gid,Rv3919c,352,del,frameshift,118,...,9.0,40.0,20.0,6.0,19.0,0.0,0.0,115.0,182,SSR
25151,2288850,A,ACC,Antibiotic Resistance,pncA,Rv2043c,392,ins,frameshift,131,...,14.0,6.0,3.0,6.0,2.0,0.0,0.0,32.0,48,no
47315,4407851,G,GC,Antibiotic Resistance,gid,Rv3919c,352,ins,frameshift,118,...,1.0,12.0,5.0,7.0,1.0,0.0,0.0,31.0,52,SSR


####################################################################################################################################################################################

## [2] check HT & SSR regions against EBR scores

####################################################################################################################################################################################

### [2.1] load in EBR scores from Max

EBR (36 Isolates)

In [26]:
EBR_36_scores_npz = np.load('/n/data1/hms/dbmi/farhat/mm774/Snakemake_Pipelines/mtb-illumina-wgs-evaluation/Results/B_Extra_UsefulDataFiles/E_EBR_AggregateAndIndividual_Results/210112_EBR_V7_36CI.npz')
EBR_36_scores_array = EBR_36_scores_npz['arr_0']

In [27]:
len(EBR_36_scores_array)

4411532

In [28]:
EBR_36_scores_array[0:10]

array([0.91666667, 0.91666667, 0.91666667, 0.91666667, 0.91666667,
       0.91666667, 0.91666667, 0.91666667, 0.94444444, 0.94444444])

### [2.2] check HT regions against EBR scores

In [24]:
homopolymeric_regions_from_Luca.head()

Unnamed: 0,Chromosome,chromStart,chromEnd,polyNT,locus_tag,gene_name
0,NC_000962.3,36470,36477,CCCCCCC,Rv0032,bioF2
1,NC_000962.3,71453,71460,CCCCCCC,Rv0064,Rv0064
2,NC_000962.3,191391,191398,CCCCCCC,Rv0161,Rv0161
3,NC_000962.3,238807,238814,CCCCCCC,Rv0202c,mmpL11
4,NC_000962.3,587728,587735,CCCCCCC,Rv0497,Rv0497


In [52]:
# create list to flag if region falls below EBR threshold
HT_EBR_pass = []

for H37Rv_start,H37Rv_end in zip(homopolymeric_regions_from_Luca.chromStart, homopolymeric_regions_from_Luca.chromEnd):

    # pull array for EBR
    EBR_scores = EBR_36_scores_array[H37Rv_start:H37Rv_end]

    # check to see if any EBR scores for these positions < 0.90
    EBR_scores_HQ_bool = EBR_scores < 0.90
    
    if sum(EBR_scores_HQ_bool) >= 1: # at least 1 position had EBR score below threshold
        HT_EBR_pass.append(False)
    else:
        HT_EBR_pass.append(True)
        
# append col that indicates whether this region passed EBR to dataframe
homopolymeric_regions_from_Luca.loc[:,'EBR_pass'] = HT_EBR_pass

  # Remove the CWD from sys.path while we load stuff.


In [54]:
homopolymeric_regions_from_Luca.head()

Unnamed: 0,Chromosome,chromStart,chromEnd,polyNT,locus_tag,gene_name,EBR_pass
0,NC_000962.3,36470,36477,CCCCCCC,Rv0032,bioF2,True
1,NC_000962.3,71453,71460,CCCCCCC,Rv0064,Rv0064,True
2,NC_000962.3,191391,191398,CCCCCCC,Rv0161,Rv0161,True
3,NC_000962.3,238807,238814,CCCCCCC,Rv0202c,mmpL11,True
4,NC_000962.3,587728,587735,CCCCCCC,Rv0497,Rv0497,True


In [55]:
homopolymeric_regions_from_Luca[homopolymeric_regions_from_Luca.EBR_pass == True].shape

(121, 7)

In [56]:
homopolymeric_regions_from_Luca[homopolymeric_regions_from_Luca.EBR_pass == False].shape

(24, 7)

In [57]:
homopolymeric_regions_from_Luca[homopolymeric_regions_from_Luca.EBR_pass == False]

Unnamed: 0,Chromosome,chromStart,chromEnd,polyNT,locus_tag,gene_name,EBR_pass
11,NC_000962.3,929017,929024,CCCCCCC,Rv0834c,PE_PGRS14,False
12,NC_000962.3,969733,969740,CCCCCCC,Rv0872c,PE_PGRS15,False
14,NC_000962.3,1386010,1386017,CCCCCCC,Rv1243c,PE_PGRS23,False
15,NC_000962.3,1488557,1488564,CCCCCCC,Rv1325c,PE_PGRS24,False
18,NC_000962.3,1573160,1573167,CCCCCCC,Rv1396c,PE_PGRS25,False
19,NC_000962.3,1632349,1632356,CCCCCCC,Rv1450c,PE_PGRS27,False
23,NC_000962.3,1864037,1864044,CCCCCCC,Rv1651c,PE_PGRS30,False
26,NC_000962.3,1990551,1990558,CCCCCCC,Rv1759c,wag22,False
32,NC_000962.3,2339948,2339955,CCCCCCC,Rv2082,Rv2082,False
43,NC_000962.3,2804942,2804949,CCCCCCC,Rv2490c,PE_PGRS43,False


### [2.3] check SSR regions against EBR scores

In [63]:
SSR_H37Rv_df = pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/CSV_files/simple_sequence_repeat_regions/SSR_H37Rv_2-6bp_at_least_3_repeats.pkl')
SSR_H37Rv_df.head()

Unnamed: 0,H37Rv_start,H37Rv_end,motif,repeats
0,561,569,CGC,3
1,562,570,GCC,3
2,643,648,AC,3
3,1121,1126,GC,3
4,1408,1413,CA,3


In [64]:
SSR_H37Rv_df.shape

(18316, 4)

In [65]:
# create list to flag if region falls below EBR threshold
SSR_EBR_pass = []

for H37Rv_start,H37Rv_end in zip(SSR_H37Rv_df.H37Rv_start, SSR_H37Rv_df.H37Rv_end):

    # pull array for EBR
    EBR_scores = EBR_36_scores_array[H37Rv_start-1:H37Rv_end-1]

    # check to see if any EBR scores for these positions < 0.90
    EBR_scores_HQ_bool = EBR_scores < 0.90
    
    if sum(EBR_scores_HQ_bool) >= 1: # at least 1 position had EBR score below threshold
        SSR_EBR_pass.append(False)
    else:
        SSR_EBR_pass.append(True)
        
# append col that indicates whether this region passed EBR to dataframe
SSR_H37Rv_df.loc[:,'EBR_pass'] = SSR_EBR_pass

  # Remove the CWD from sys.path while we load stuff.


In [66]:
SSR_H37Rv_df.head()

Unnamed: 0,H37Rv_start,H37Rv_end,motif,repeats,EBR_pass
0,561,569,CGC,3,True
1,562,570,GCC,3,True
2,643,648,AC,3,True
3,1121,1126,GC,3,True
4,1408,1413,CA,3,True


In [67]:
SSR_H37Rv_df[SSR_H37Rv_df.EBR_pass == False].shape

(627, 5)

In [68]:
SSR_H37Rv_df[SSR_H37Rv_df.EBR_pass == True].shape

(17689, 5)