In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
%matplotlib inline

Create a virtual environment to run **InSilicoSeq** (named *insilicoseq_virtualenv*, with Python 3.7) and install the following packages into it:
- pandas (conda install -c anaconda pandas)
- numpy (conda install -c anaconda numpy)
- biopython (conda install -c conda-forge biopython)
- jupyter (conda install -c anaconda jupyter)
- insilicoseq (conda install -c bioconda insilicoseq)

In [2]:
import pandas as pd
import numpy as np
import Bio

from Bio.Alphabet import IUPAC
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast import NCBIXML
from Bio.Seq import Seq
from Bio.Seq import MutableSeq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import pairwise2
from Bio import SeqIO
from Bio.Graphics import GenomeDiagram
from Bio.SeqUtils import GC
from Bio.Align.Applications import MuscleCommandline
from StringIO import StringIO
from Bio import AlignIO
from Bio.Align import AlignInfo
from Bio.Seq import MutableSeq

import os
import sys
from shutil import copy
import pickle
import shutil

#### Specify the tags of the genome assemblies to process

In [4]:
# Tags of the Mtb genome assemblies to process. Each tag is used to build the
# per-assembly file names (mappings pickle, assembly FASTA, output files).
genome_assemblies = """
    RW-TB008 N1274 N1272 N1202 N1177 N1176 N0155
    N0153 N0145 N0091 N0072 N0054 N0004 M0017522_5
    M0016737_0 M0016395_7 M0014888_3 M0011368_9 M0010874_7
    M0003941_3 DNA120 DNA091 DNA086 DNA075 DNA044 DNA020
    DNA019_Rose AZE_02_042 02_R1896 02_R1708 02_R1179 02_R0894 01_R1430
""".split()

In [5]:
# sanity check: display how many assembly tags are in the list
len(genome_assemblies)

33

In [6]:
# For every Mtb assembly:
#   1. load the H37Rv -> assembly HT/SSR mappings,
#   2. write the requested extra sequence into each HT/SSR region of the assembly,
#   3. keep the assembly coordinates of ALL HT/SSR regions in sync with the edited sequence,
#   4. save the altered assembly (FASTA) and the updated mappings (pickle).
##########################################################################################
##########################################################################################

# input / output locations (all under the same project directory)
HT_SSR_sims_dir = '/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/HT_SSR_recall_sims/'
mappings_in_dir = HT_SSR_sims_dir + 'H37Rv_to_assembly_mappings_2/'
mappings_out_dir = HT_SSR_sims_dir + 'H37Rv_to_assembly_mappings_3/'
assemblies_in_dir = HT_SSR_sims_dir + '220829.36CI.ForRoger.Asms/'
assembly_dir = HT_SSR_sims_dir + 'altered_assemblies/'

# make sure the output directories exist before anything is written to them
for output_dir in (assembly_dir, mappings_out_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)


def _shift_coordinate(coord, edit_start, shift):
    """Return coord moved by `shift` if it lies strictly downstream of `edit_start`, else unchanged."""
    if coord > edit_start:
        return coord + shift
    return coord


def _shift_mapping_coordinates(mapping_df, edit_start, shift):
    """Shift the assembly coordinates of every HT/SSR region after an edit at `edit_start`.

    Rewrites the 'assembly_chromStart' and 'assembly_chromEnd' columns of
    `mapping_df` in place. Three kinds of rows are handled:
      - list  : region mapped to several places in the assembly; every copy is shifted
      - NaN   : region that did not map; both coordinates stay NaN
      - scalar: region mapped once

    The same rule is applied to every coordinate (start or end, scalar or list
    element): it moves by `shift` when it is greater than `edit_start`. The end of
    the edited region itself therefore grows with the inserted sequence.
    """
    shifted_starts = []
    shifted_ends = []

    for region_start, region_end in zip(mapping_df['assembly_chromStart'], mapping_df['assembly_chromEnd']):

        # repeat region from H37Rv mapped to multiple places in assembly
        if isinstance(region_start, list):
            copies = list(zip(region_start, region_end))
            shifted_starts.append([_shift_coordinate(copy_start, edit_start, shift) for copy_start, _ in copies])
            shifted_ends.append([_shift_coordinate(copy_end, edit_start, shift) for _, copy_end in copies])

        # np.nan marks a region that was not mapped to the assembly
        elif np.isnan(region_start):
            shifted_starts.append(np.nan)
            shifted_ends.append(np.nan)

        # region mapped to a single place in the assembly
        else:
            shifted_starts.append(_shift_coordinate(region_start, edit_start, shift))
            shifted_ends.append(_shift_coordinate(region_end, edit_start, shift))

    # going through pd.Series keeps list entries intact as single (object) cells
    mapping_df['assembly_chromStart'] = pd.Series(shifted_starts, index=mapping_df.index)
    mapping_df['assembly_chromEnd'] = pd.Series(shifted_ends, index=mapping_df.index)


def _apply_region_edit(mutable_seq, mapping_df, edit_start, edit_end, replacement):
    """Replace mutable_seq[edit_start:edit_end] with `replacement` and re-sync the mappings.

    The coordinate shift is the real change in sequence length caused by the
    replacement; it equals len(added sequence) whenever the polyNT string spans
    the whole [edit_start, edit_end) interval.
    """
    edit_start = int(edit_start)
    edit_end = int(edit_end)

    mutable_seq[edit_start:edit_end] = replacement

    shift = len(replacement) - (edit_end - edit_start)
    _shift_mapping_coordinates(mapping_df, edit_start, shift)


# iterate through each Mtb assembly
for Mtb_genome_tag in genome_assemblies:

    # load the H37Rv-assembly mappings with the sequences to alter in each HT/SSR region
    # (pickle can execute arbitrary code on load - only read trusted project files)
    repeat_region_mapping_df = pd.read_pickle(mappings_in_dir + 'HT_SSR_' + Mtb_genome_tag + '_mappings.pkl')

    # re-sort to order HT/SSR regions by their H37Rv start coordinate
    repeat_region_mapping_df.sort_values(by='chromStart', ascending=True, inplace=True)
    repeat_region_mapping_df.reset_index(inplace=True, drop=True)

    # load FASTA of complete assembly sequence
    # NOTE: if the FASTA holds several records, the last one is the one that is used
    assembly_fasta_path = assemblies_in_dir + Mtb_genome_tag + '.fna'
    Mtb_genome = None
    for Mtb_genome in SeqIO.parse(assembly_fasta_path, "fasta"):
        Mtb_genome.seq.alphabet = IUPAC.unambiguous_dna
    if Mtb_genome is None:
        raise ValueError('no FASTA record found in ' + assembly_fasta_path)

    # FASTA WITH ALTERED ASSEMBLY GENOME

    # mutable copy of the assembly sequence
    assembly_with_variants = Mtb_genome.seq.tomutable()

    # iterate through HT/SSR regions and change assembly where indicated
    for repeat_region_i in repeat_region_mapping_df.index:

        # coordinates are read fresh on every iteration because earlier edits may have shifted them
        repeat_region_assembly_start = repeat_region_mapping_df.loc[repeat_region_i, 'assembly_chromStart']
        repeat_region_assembly_end = repeat_region_mapping_df.loc[repeat_region_i, 'assembly_chromEnd']
        polyNT = repeat_region_mapping_df.loc[repeat_region_i, 'assembly_polyNT']
        added_seq = repeat_region_mapping_df.loc[repeat_region_i, 'add_seq_into_assembly']

        # nothing to alter for this HT/SSR region
        if (added_seq == 'None') or (added_seq == 'no match'):
            continue

        # region sequence followed by the "insertion"
        altered_region_seq = polyNT + added_seq

        if isinstance(repeat_region_assembly_start, list):
            # "duplications": the H37Rv region mapped to several places in the assembly.
            # Edit the most downstream copy first: each edit shifts everything
            # downstream of it, so going backwards keeps the coordinates of the
            # copies that are still waiting to be edited valid.
            copies_downstream_first = sorted(zip(repeat_region_assembly_start, repeat_region_assembly_end), reverse=True)
            for copy_start, copy_end in copies_downstream_first:
                _apply_region_edit(assembly_with_variants, repeat_region_mapping_df, copy_start, copy_end, altered_region_seq)
        else:
            _apply_region_edit(assembly_with_variants, repeat_region_mapping_df, repeat_region_assembly_start, repeat_region_assembly_end, altered_region_seq)

    # convert mutable seq back to normal sequence
    assembly_with_variants = assembly_with_variants.toseq()

    # store in SeqRecord with strain ID
    assembly_with_variants = SeqRecord(assembly_with_variants, id=Mtb_genome_tag)

    # store altered assembly as a FASTA file
    SeqIO.write(assembly_with_variants, assembly_dir + Mtb_genome_tag + "_altered_seq.fasta", "fasta")

    # save updated mappings as a pickled file
    repeat_region_mapping_df.to_pickle(mappings_out_dir + 'HT_SSR_' + Mtb_genome_tag + '_mappings.pkl')
    ##########################################################################################
    ##########################################################################################

In [7]:
from slurmpy import Slurm
import os

In [8]:
# simulate reads from the altered assembly to generate FASTQ files
# (one InSilicoSeq job per assembly, submitted to the cluster through SLURM)
##########################################################################################
##########################################################################################

assembly_dir = '/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/HT_SSR_recall_sims/altered_assemblies/'

# root directory for the simulated FASTQ files (one sub-directory per assembly)
fastq_file_dir = '/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/HT_SSR_recall_sims/simulated_fastq_files_from_InSilicoSeq_for_altered_seq/'

# directory where the SLURM output + error files go; created up front because
# os.chdir() below raises if the directory does not exist yet
SLURM_log_dir = '/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/HT_SSR_recall_sims/simulated_fastq_files_from_InSilicoSeq_for_altered_seq/O2_SLURM_logs'
if not os.path.exists(SLURM_log_dir):
    os.makedirs(SLURM_log_dir)

# iterate through each Mtb assembly
for Mtb_genome_tag in genome_assemblies:

    #### prepare file and directory paths ####

    #input file path: FASTA of the assembly sequence with variants
    altered_assembly_fasta = assembly_dir + Mtb_genome_tag + "_altered_seq.fasta"

    #create directory to store fastq files
    if not os.path.exists(fastq_file_dir + Mtb_genome_tag):
        os.makedirs(fastq_file_dir + Mtb_genome_tag)

    #output prefix: <fastq dir>/<tag>/<tag>
    out_dir = fastq_file_dir + Mtb_genome_tag + '/' + Mtb_genome_tag

    #### construct job to submit to O2 ####

    #store all commands in a list
    commands_list = []

    #load virtual environment for InSilicoSeq
    #(shell strict mode is switched off while activating and back on afterwards)
    commands_list.append( 'set +eu' )
    commands_list.append( 'source activate insilicoseq_virtualenv' )
    commands_list.append( 'set -eu' )

    #run InSilicoSeq

    ## parameters of the earlier ART command (kept commented out for reference)
    ## -ss HS20 ; Illumina HiSeq
    ## -l 100 ; read length of 100
    ## -f 100 ; mean coverage of 100x
    ## -p ; paired end reads
    ## -m 200; 200 bp mean size of DNA fragments for paired-end simulations
    ## -s 25; 25 bp standard deviation of DNA fragment size for paired-end simulations

    ## art_command = ART + ' -ss HS20 -i ' + altered_assembly_fasta + ' -o ' + fastq_directory + ' -l 100 -f 100 -p -m 200 -s 25'

    ## number of reads for ~100x coverage
    ## C = LN/G; 100 = (125 x N)/(4411532), so N = (100 x 4411532) / 125 = 3,529,225.6 ~ 3.5 million reads
    commands_list.append( 'iss generate --n_reads 3.5M --cpus 4 --genomes {0} --model hiseq --output {1}'.format(altered_assembly_fasta, out_dir) )

    #### SUBMIT the job to O2 ####

    #all commands in a single string (one per line, each preceded by a newline) to be submitted as a job
    read_sim_job = '\n' + '\n'.join(commands_list)

    #directory where you want output + error files
    os.chdir(SLURM_log_dir)

    job_name = 'ISS_' + Mtb_genome_tag

    #set up the InSilicoSeq job for SLURM (a fresh options dict is built for every job)
    s = Slurm(job_name , {'partition':'short', 'account':'farhat', 'N':'1' , 'n':'1' , 'c':'4' , 't':'0-1:30:00' , 'mem':'32G' , 'mail-type':'FAIL' , 'mail-user':'roger_vargas@g.harvard.edu'})

    #submits the job
    job_id = s.run(read_sim_job)

    #single-argument print call: same output under Python 2 and Python 3
    print(job_name + ' : ' + str(job_id))

submitted: Submitted batch job 61896207
submitted: Submitted batch job 61896208
submitted: Submitted batch job 61896209


ISS_RW-TB008 : 61896207
ISS_N1274 : 61896208
ISS_N1272 : 61896209


submitted: Submitted batch job 61896210
submitted: Submitted batch job 61896211
submitted: Submitted batch job 61896212


ISS_N1202 : 61896210
ISS_N1177 : 61896211
ISS_N1176 : 61896212


submitted: Submitted batch job 61896213
submitted: Submitted batch job 61896214
submitted: Submitted batch job 61896215


ISS_N0155 : 61896213
ISS_N0153 : 61896214
ISS_N0145 : 61896215


submitted: Submitted batch job 61896216
submitted: Submitted batch job 61896217
submitted: Submitted batch job 61896218


ISS_N0091 : 61896216
ISS_N0072 : 61896217
ISS_N0054 : 61896218


submitted: Submitted batch job 61896219
submitted: Submitted batch job 61896220
submitted: Submitted batch job 61896221


ISS_N0004 : 61896219
ISS_M0017522_5 : 61896220
ISS_M0016737_0 : 61896221


submitted: Submitted batch job 61896222
submitted: Submitted batch job 61896223


ISS_M0016395_7 : 61896222
ISS_M0014888_3 : 61896223


submitted: Submitted batch job 61896224
submitted: Submitted batch job 61896225


ISS_M0011368_9 : 61896224
ISS_M0010874_7 : 61896225


submitted: Submitted batch job 61896226
submitted: Submitted batch job 61896227
submitted: Submitted batch job 61896228


ISS_M0003941_3 : 61896226
ISS_DNA120 : 61896227
ISS_DNA091 : 61896228


submitted: Submitted batch job 61896229
submitted: Submitted batch job 61896230
submitted: Submitted batch job 61896231


ISS_DNA086 : 61896229
ISS_DNA075 : 61896230
ISS_DNA044 : 61896231


submitted: Submitted batch job 61896232
submitted: Submitted batch job 61896233


ISS_DNA020 : 61896232
ISS_DNA019_Rose : 61896233


submitted: Submitted batch job 61896234
submitted: Submitted batch job 61896235
submitted: Submitted batch job 61896236


ISS_AZE_02_042 : 61896234
ISS_02_R1896 : 61896235
ISS_02_R1708 : 61896236
ISS_02_R1179 : 61896237
ISS_02_R0894 : 61896238
ISS_01_R1430 : 61896240


submitted: Submitted batch job 61896237
submitted: Submitted batch job 61896238
submitted: Submitted batch job 61896240
