In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
import Bio

from Bio.Alphabet import IUPAC
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast import NCBIXML
from Bio.Seq import Seq
from Bio.Seq import MutableSeq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import pairwise2
from Bio import SeqIO
from Bio.Graphics import GenomeDiagram
from Bio.SeqUtils import GC
from Bio.Align.Applications import MuscleCommandline
from StringIO import StringIO
from Bio import AlignIO
from Bio.Align import AlignInfo
from Bio.Seq import MutableSeq

import os
import sys
from shutil import copy
import pickle
import shutil

#### Check INDELs in the HT upstream of *espA*

In [3]:
# Load the pickled table of homoplasic INDELs; the next cell filters it by its `pos` column.
# NOTE(review): unpickling can execute arbitrary code - only read pickles from a trusted location.
all_homoplasic_INDELs = pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/CSV_files/homoplasies_detected_in_global_lineages/INDEL homoplasies/homoplasy_count_across_lineages_spacer_4.pkl')

In [4]:
# Show the homoplasic INDELs whose `pos` lies in 4056478-4056489 (both ends inclusive).
all_homoplasic_INDELs[all_homoplasic_INDELs.pos.between(4056478, 4056489)]

Unnamed: 0,pos,ref,alt,gene_category,gene_name,gene_id,gene_pos,ins_del,INDEL_type,codon_pos,...,1,2,3,4A,4B,4C,5,6,total,num_isolates
42184,4056480,GT,G,,espA_ephA,Rv3616c_Rv3617,,del,frameshift,,...,18.0,7.0,4.0,24.0,13.0,14.0,0.0,0.0,80.0,361
42185,4056480,G,GT,,espA_ephA,Rv3616c_Rv3617,,ins,frameshift,,...,7.0,4.0,1.0,3.0,4.0,0.0,0.0,0.0,19.0,34
42187,4056487,T,TC,,espA_ephA,Rv3616c_Rv3617,,ins,frameshift,,...,0.0,4.0,0.0,0.0,1.0,3.0,0.0,0.0,8.0,9
42186,4056486,T,TC,,espA_ephA,Rv3616c_Rv3617,,ins,frameshift,,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1


These are the IDs for the **L5, L6 & L8 isolates** in Max's 36 complete assemblies

**N1272** L5

**N1176** L5

**N0091** L6

**N1177** L6

**N1202** L6

**RW-TB008** L8

#### Define function to get the upstream sequence of *espA* for each genome, along with details of whether it's the same HT sequence as in H37Rv

In [83]:
def get_esp_upstream_seq(Mtb_genome_tag, Lin):
    """Print the assembly sequence around the HT upstream of espA.

    Looks up the 'inter_Rv3616c_Rv3617' row in the H37Rv-to-assembly mapping
    table of the given assembly and prints, in order:

    * a ``<lineage> : <assembly tag>`` header,
    * the row's ``seq_diff_wrt_H37Rv`` entry,
    * the assembly sequence slice ``[assembly_chromStart-5 : assembly_chromEnd+6]``,
    * the mapped start coordinate (``assembly_chromStart``),
    * a separator line followed by two blank lines.

    If the locus is missing from the mapping table, or its assembly
    coordinates cannot be converted to integers (e.g. the region was not
    mapped in this assembly), a short note is printed in place of the
    sequence and the function returns instead of raising.

    Parameters
    ----------
    Mtb_genome_tag : str
        Assembly identifier; used to build the mapping-pickle and FASTA paths.
    Lin : str
        Lineage label; only used in the printed header.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the assembly FASTA contains no records.
    """
    locus_tag = 'inter_Rv3616c_Rv3617'
    footer_lines = ('-----------------------------------------', '', '')

    print(Lin + ' : ' + Mtb_genome_tag)

    # load the H37Rv-assembly mappings with the sequences to alter in each HT/SSR region
    # NOTE(review): unpickling can execute arbitrary code - only read trusted files.
    # The table is re-sorted by chromStart and re-indexed, so the index label printed
    # below reflects the row's rank in that order.
    repeat_region_mapping_df = (
        pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/HT_SSR_recall_sims/H37Rv_to_assembly_mappings_2/HT_SSR_'+Mtb_genome_tag+'_mappings.pkl')
        .sort_values(by='chromStart', ascending=True)
        .reset_index(drop=True)
    )

    # select the locus once and reuse the selection for every lookup below
    locus_rows = repeat_region_mapping_df[repeat_region_mapping_df.locus_tag == locus_tag]
    if len(locus_rows) == 0:
        print(locus_tag + ' is not present in the mapping table for this assembly')
        for footer_line in footer_lines:
            print(footer_line)
        return

    print(locus_rows.loc[:,['seq_diff_wrt_H37Rv']])

    # Missing assembly coordinates (None / NaN / non-numeric text) mean there is
    # nothing to slice, so report that rather than failing inside int().
    try:
        HT_start = int(locus_rows.assembly_chromStart.values[0])
        HT_end = int(locus_rows.assembly_chromEnd.values[0])
    except (TypeError, ValueError):
        print(locus_tag + ' has no usable assembly coordinates in this assembly')
        for footer_line in footer_lines:
            print(footer_line)
        return

    # load FASTA of complete assembly sequence
    # (path and parsed record use separate names so the path is not overwritten)
    assembly_fasta_path = '/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/HT_SSR_recall_sims/220829.36CI.ForRoger.Asms/'+Mtb_genome_tag+'.fna'
    assembly_record = None
    # if the FASTA holds several records, the last one parsed is the one used
    for assembly_record in SeqIO.parse(assembly_fasta_path, "fasta"):
        assembly_record.seq.alphabet = IUPAC.unambiguous_dna
    if assembly_record is None:
        raise ValueError('no FASTA records found in ' + assembly_fasta_path)

    print(assembly_record.seq[HT_start-5:HT_end+6])
    print(HT_start)
    for footer_line in footer_lines:
        print(footer_line)

In [84]:
# Print the espA-upstream HT region for each (assembly tag, lineage) pair.
# N0091 (L6) is not part of this list.
for Mtb_genome_tag, Lineage in [
    ('N1272', 'L5'),
    ('N1176', 'L5'),
    ('N1177', 'L6'),
    ('N1202', 'L6'),
    ('RW-TB008', 'L8'),
]:
    get_esp_upstream_seq(Mtb_genome_tag, Lineage)

L5 : N1272
   seq_diff_wrt_H37Rv
44               same
AACAGTTTTTTTCATGCT
4064440
-----------------------------------------


L5 : N1176
   seq_diff_wrt_H37Rv
44               same
AACAGTTTTTTTCATGCT
4069764
-----------------------------------------


L6 : N1177
   seq_diff_wrt_H37Rv
44               same
AACAGTTTTTTTCATGCT
4035449
-----------------------------------------


L6 : N1202
   seq_diff_wrt_H37Rv
44               same
AACAGTTTTTTTCATGCT
4041613
-----------------------------------------


L8 : RW-TB008
   seq_diff_wrt_H37Rv
44               same
AACAGTTTTTTTCATGCT
4023783
-----------------------------------------




#### Couldn't map this region in one of the genomes (N0091); investigate more closely

- check to see if we can find HT sequence in this genome assembly and at which coordinate it occurs

In [6]:
# Assembly under investigation; later cells reuse this tag to build the FASTA path.
Mtb_genome_tag = 'N0091'

# Load this assembly's HT/SSR mapping table, ordered by ascending chromStart
# and re-indexed from 0.
repeat_region_mapping_df = (
    pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/HT_SSR_recall_sims/H37Rv_to_assembly_mappings_2/HT_SSR_'+Mtb_genome_tag+'_mappings.pkl')
    .sort_values(by='chromStart', ascending=True)
    .reset_index(drop=True)
)

# Show the mapping row(s) for the region between Rv3616c and Rv3617.
repeat_region_mapping_df.loc[repeat_region_mapping_df.locus_tag.eq('inter_Rv3616c_Rv3617')]

Unnamed: 0,Chromosome,INDEL_homoplasy,chromEnd,chromStart,gene_name,locus_tag,num_isolates_with_INDEL,polyNT,repeats,type,assembly_chromStart,assembly_polyNT,assembly_chromEnd,seq_diff_wrt_H37Rv,add_seq_into_assembly,expected_variant_call
44,NC_000962.3,108,4056487,4056480,inter_espA_ephA,inter_Rv3616c_Rv3617,405,TTTTTTT,1,HT,,no match,,no match,no match,no match


Load the FASTA sequence for this assembly and search for string matches for different permutations of HT + flanking seqs

In [7]:
# load FASTA of complete assembly sequence
# `Mtb_genome` starts out as the path to the assembly FASTA for `Mtb_genome_tag` ...
Mtb_genome = '/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/HT_SSR_recall_sims/220829.36CI.ForRoger.Asms/'+Mtb_genome_tag+'.fna'
# ... and is then rebound by the loop to each parsed record in turn, so after the
# loop it refers to the last record in the file (later cells read `Mtb_genome.seq`).
for Mtb_genome in SeqIO.parse(Mtb_genome, "fasta"):
    # tag the record's sequence with the IUPAC unambiguous DNA alphabet
    Mtb_genome.seq.alphabet = IUPAC.unambiguous_dna

In [8]:
# display the record that was loaded for this assembly
Mtb_genome

SeqRecord(seq=Seq('TTGACCGATGACCCCGGTTCAGGCTTCACCACAGTGTGGAACGCGGTCGTCTCC...TCG', IUPACUnambiguousDNA()), id='gnl|XXX|KIOFIBPM_1', name='gnl|XXX|KIOFIBPM_1', description='gnl|XXX|KIOFIBPM_1', dbxrefs=[])

attempt 1 (**3bp flanking seq + 7T + 3bp flanking seq**): no match (this is the pattern that was systematically searched before, with 3bp of flanking sequence on each side of the 7T tract)

In [9]:
# Query: 3bp flank + 7T + 3bp flank
H37Rv_repeat_seq_and_flanks = 'CAGTTTTTTTCAT'
Mtb_genome_str = Mtb_genome.seq

# Collect every 0-based offset in the assembly at which the query starts.
x = []
for i in range(len(Mtb_genome_str)):
    if Mtb_genome_str.startswith(str(H37Rv_repeat_seq_and_flanks), i):
        x.append(i)
x

[]

attempt 2 (**2bp flanking seq + 7T + 2bp flanking seq**): again no match

In [10]:
# Query: 2bp flank + 7T + 2bp flank
H37Rv_repeat_seq_and_flanks = 'AGTTTTTTTCA'
Mtb_genome_str = Mtb_genome.seq

# Collect every 0-based offset in the assembly at which the query starts.
x = []
for i in range(len(Mtb_genome_str)):
    if Mtb_genome_str.startswith(str(H37Rv_repeat_seq_and_flanks), i):
        x.append(i)
x

[]

attempt 3 (**1bp flanking seq + 7T + 1bp flanking seq**): two matches, but neither is near the start coordinate of this HT in H37Rv (4056480)

In [11]:
# Query: 1bp flank + 7T + 1bp flank
# (`Mtb_genome_str` is not reassigned here; it must already be defined in the kernel)
H37Rv_repeat_seq_and_flanks = 'GTTTTTTTC'

# Collect every 0-based offset in the assembly at which the query starts.
x = []
for i in range(len(Mtb_genome_str)):
    if Mtb_genome_str.startswith(str(H37Rv_repeat_seq_and_flanks), i):
        x.append(i)
x

[1168843, 2548373]

attempt 4 (**2bp flanking seq + 6T + 3bp flanking seq**): one match, near the start coordinate of this HT in H37Rv; this is the HT with a 1bp T deletion

In [17]:
# Query: 2bp flank + 6T + 3bp flank (the tract shortened by one T)
H37Rv_repeat_seq_and_flanks = 'AGTTTTTTCAT'
Mtb_genome_str = Mtb_genome.seq

# Collect every 0-based offset in the assembly at which the query starts.
x = []
for i in range(len(Mtb_genome_str)):
    if Mtb_genome_str.startswith(str(H37Rv_repeat_seq_and_flanks), i):
        x.append(i)
x

[4030203]