In [1]:
#widen the notebook's page container to the full browser width (cosmetic only, injects a CSS rule)
#NOTE(review): newer IPython releases expose display/HTML through IPython.display and deprecate
#importing display from IPython.core.display - consider switching if the kernel is ever upgraded
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
#render matplotlib figures inline in the cell output
%matplotlib inline

#### This notebook collects all of the variant profiles for the longitudinal samples for use in simulating reads and *drops* those SNPs into the reference sequences used for simulating reads.

#######################################################################################################################################################################################################################################################################################

## [1] Create the Script to drop alternate alleles (SNVs) into the (complete) Reference Genomes before simulating reads from these genomes

#######################################################################################################################################################################################################################################################################################

In [2]:
#Choose a specific reference genome, pull sequence & annotation; compare to H37Rv (indexed: 0 - 53)
#ref_genome_i is used further below as a positional index into the 'Strain' and 'Feature_Table_filename'
#columns of the reference-genome table (ref_seqs_annot)
#NOTE(review): the submission loop in section [2] passes this index to a script as a command-line argument,
#so the script version presumably reads it from sys.argv instead of hard-coding it here - confirm
ref_genome_i = 5
import sys

In [3]:
#import necessary packages
import vcf
import os
import pandas as pd
import numpy as np

from itertools import compress
import ast
import itertools
import time

import Bio
from Bio.Alphabet import IUPAC
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast import NCBIXML
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import pairwise2
from Bio import SeqIO
from Bio.Graphics import GenomeDiagram
from Bio.SeqUtils import GC

from Bio.Align.Applications import MuscleCommandline
from StringIO import StringIO
from Bio import AlignIO
from Bio.Align import AlignInfo
from Bio.Seq import MutableSeq
from Bio import pairwise2
from shutil import copy
import subprocess
import pickle
import shutil

In [4]:
##########################################################################################
##########################################################################################

'''
This piece of code imports the SNPs (Base Changes) that we want to simulate. The SNPs are taken from 
the Longitudinal SNP analysis. Unique Filtered SNPs from significant genes are stored in a DataFrame after 
determining which genes have a significant amount of variation.
'''

#load the table of SNPs to simulate; the preview in the next cell shows its columns
#(ref_position, ref_allele, alt_allele, gene_id, ... - the later cells read the first four)
#NOTE: read_pickle un-pickles arbitrary Python objects, so only point it at a trusted, project-generated file
SNPs_from_all_patients_DF = pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/pickled_files/variant_calling/longitudinal_SNPs/longitudinal_SNPs_to_simulate.pkl')

##########################################################################################
##########################################################################################

In [7]:
#preview the first rows of the SNP table as a sanity check on the loaded columns
SNPs_from_all_patients_DF.head()

Unnamed: 0,ref_position,ref_allele,alt_allele,gene_id,genomic_coord,gene_category,gene_symbol,SNP_type,AA_change
0,761139,C,G,Rv0667,1333,Antibiotic Resistance,rpoB,N,H445D
1,861378,T,G,Rv0768,467,Non-Essential,aldA,N,I156S
2,1253207,A,G,Rv1129c,1328,Non-Essential,,N,I443T
3,2075138,T,A,Rv1830,298,Non-Essential,,N,S100T
4,2694727,A,G,Rv2397c,238,Essential,cysA1,N,F80L


In [68]:
##########################################################################################
##########################################################################################
'''
This piece of code constructs a mapping of the CDS regions of H37Rv to the CDS 
regions of the chosen reference sequence (ref_genome_i) using the 
feature annotation tables for both sequences.
'''

#annotation for reference sequences that we will use for our simulations
#(the 'Strain' and 'Feature_Table_filename' columns of this table are read further below)
ref_seqs_annot = pd.read_csv('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/simulated_reads/reference_sequences/strains_typed/Reference_Genomes_used_in_simulations.csv').set_index('Unnamed: 0')

#############################################################################
#PULL H37Rv sequence & annotation information

#H37Rv GENOME SEQUENCE
#the loop leaves the LAST record of the fasta file bound to H37Rv_seq
#NOTE(review): presumably the file holds exactly one record (the H37Rv chromosome) - confirm
H37Rv_genome = '/n/data1/hms/dbmi/farhat/bin/work-horse/bin/h37rv.fasta'
for H37Rv_seq in SeqIO.parse(H37Rv_genome, "fasta"):
    H37Rv_seq.seq.alphabet = IUPAC.unambiguous_dna

#H37Rv REFERENCE ANNOTATION
#the delimiter is passed by keyword: newer pandas releases no longer accept it positionally
H37Rv_annotation = pd.read_csv('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/H37Rv/h37rv_genome_summary.txt', sep = '\t').set_index('name')

#filter out annotation for t-RNAs and other misc RNAs
#a row counts as RNA when 'tRNA' is a whole (space-separated) word of its description or its name contains 'Rvn'
RNAs_filter = [ ('tRNA' in H37Rv_annotation.loc[CDS_i , 'description'].split(' ')) or ('Rvn' in CDS_i) for CDS_i in H37Rv_annotation.index ]
non_RNAs_filter = [not RNA for RNA in RNAs_filter]

#drop the RNA rows and the unneccessary columns; take an explicit copy so that the per-gene 'sequence'
#assignments below write into this frame itself rather than into a slice of H37Rv_annotation
H37Rv_annotation_no_RNA = H37Rv_annotation[non_RNAs_filter].loc[: , ['symbol' , 'length' , 'chromStart' , 'chromEnd' , 'strand' , 'description']].copy()


#STORE CORRESPONDING SEQUENCES FROM H37Rv IN GENOME ANNOTATION
for gene_id in H37Rv_annotation_no_RNA.index:

    gene_chromStart = H37Rv_annotation_no_RNA.loc[gene_id , 'chromStart']
    gene_chromEnd = H37Rv_annotation_no_RNA.loc[gene_id , 'chromEnd']

    #order the two coordinates so that the slice always runs from the smaller to the larger one
    #NOTE(review): chromStart is used directly as the slice start (no -1, unlike the reference-genome feature
    #table handled later in this cell) - this assumes chromStart already counts from 0; confirm against
    #h37rv_genome_summary.txt
    start_site = min(gene_chromStart , gene_chromEnd)
    end_site = max(gene_chromStart , gene_chromEnd)

    seq_i = H37Rv_seq.seq[start_site:end_site]

    #store sequence (the 'sequence' column is created by the first assignment)
    H37Rv_annotation_no_RNA.loc[gene_id , 'sequence'] = seq_i

#############################################################################
#PULL REFERENCE sequence & annotation information

#specify specific reference sequence (ref_genome_i is a positional index into the rows of ref_seqs_annot)
ref_genome_strain = list(ref_seqs_annot.Strain)[ref_genome_i]
ref_genome_strain_annotation_filename = list(ref_seqs_annot.Feature_Table_filename)[ref_genome_i]

#directory where the fasta files for the reference sequences are stored
ref_seq_directory = '/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/simulated_reads/reference_sequences/strains_typed/'
#directory where annotation files are stored
ref_strain_genome_annot_files_directory = '/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/simulated_reads/reference_sequences/strains_typed/reference_sequence_annotation_tables/'

#REFERENCE GENOME SEQUENCE
#get full path for the fasta file corrresponding to the reference sequence
ref_genome_fasta = ref_seq_directory + ref_genome_strain  + '.fasta'

#parse the fasta file corresponding to the reference genome
#(the loop leaves the LAST record of the file bound to reference_genome)
for reference_genome in SeqIO.parse(ref_genome_fasta, "fasta"):
    reference_genome.seq.alphabet = IUPAC.unambiguous_dna

#REFERENCE GENOME FEATURE TABLE
ref_genome_annot = pd.read_csv(ref_strain_genome_annot_files_directory + ref_genome_strain_annotation_filename , compression = 'gzip' , header = 0 , sep = '\t')

#the first two columns are addressed by POSITION with .iloc; the deprecated .ix indexer used previously
#fell back to exactly this positional lookup because the column labels read from the header are strings

#filter for rows that code for 'genes' (first column)
ref_genome_annot = ref_genome_annot[ref_genome_annot.iloc[: , 0] == 'gene']

#filter for rows that are 'protein_coding' or 'pseudogene' (second column)
protein_coding_or_pseudogene_filter = ref_genome_annot.iloc[: , 1].isin(['protein_coding' , 'pseudogene'])
ref_genome_annot = ref_genome_annot[protein_coding_or_pseudogene_filter]

#filter out for buggy first rows that contain the start/end of the whole Reference Genome Sequence
ref_genome_annot = ref_genome_annot[ (ref_genome_annot.end - ref_genome_annot.start) < 1000000 ]

#drop unnecessary columns, renumber the rows 0..n-1 (later code relies on this integer index) and take an
#explicit copy so that the per-gene 'sequence' assignments below write into this frame itself
ref_genome_annot = ref_genome_annot.loc[: , [ 'start' , 'end' , 'strand' , 'symbol' , 'feature_interval_length']].reset_index(drop = True).copy()

#STORE CORRESPONDING SEQUENCES FROM REFERENCE GENOME IN GENOME ANNOTATION
for CDS_i in ref_genome_annot.index:

    #'start' counts from 1 in the feature table, so shift it by one to get a 0-based slice start
    CDS_slice_start = ref_genome_annot.loc[CDS_i , 'start'] - 1
    CDS_slice_end = ref_genome_annot.loc[CDS_i , 'end']

    start_site = min(CDS_slice_start , CDS_slice_end)
    end_site = max(CDS_slice_start , CDS_slice_end)

    seq_i = reference_genome.seq[start_site:end_site]

    #store sequence (the 'sequence' column is created by the first assignment)
    ref_genome_annot.loc[CDS_i , 'sequence'] = seq_i

#############################################################################
#COMPARE REFERENCE GENOME WITH H37RV & MAP RELEVANT CODING REGIONS FOR NOVEL GENES
#
#For every H37Rv gene that carries a SNP to simulate:
# (1) score every window of (window_size + 1) consecutive reference-genome genes by the number of distinct
#     gene lengths it shares with the H37Rv neighborhood of the gene ('rough match'),
# (2) globally align the H37Rv gene against every gene inside the top-scoring window(s),
# (3) accept the best-aligning gene only if its alignment score is positive AND its length equals the H37Rv length.
#Result: H37Rv_RefGeome_gene_mapping, indexed by H37Rv gene id; an all-NaN row marks a gene without accepted match.

#define window size for CDS length comparison (has to be an even number)
window_size = 10

#number of neighbouring genes taken on each side of a gene
#(floor division keeps this an integer index under both Python 2 and Python 3)
half_window = window_size // 2

num_genes = len(H37Rv_annotation_no_RNA.index) #number of genes after excluding RNAs

#WRAP AROUND CHROMOSOME FOR FIRST/LAST FEW GENES
#in order to 'wrap around' chromosome, put the last few genes in front of the DF and the first few genes behind it
#so that every gene has a full neighborhood (.iloc = positional slicing; the index holds gene ids, not integers)
last_few_genes_annot = H37Rv_annotation_no_RNA.iloc[num_genes - half_window : , :] #last few genes
first_few_genes_annot = H37Rv_annotation_no_RNA.iloc[0 : half_window , :] #first few genes
H37Rv_annotation_no_RNA_fudged_for_circular_chrom = pd.concat([last_few_genes_annot , H37Rv_annotation_no_RNA , first_few_genes_annot])

#H37Rv-ref_genome coding mapping
#create dataframe to hold mapping
H37Rv_RefGeome_gene_mapping = pd.DataFrame(columns = ['symbol' , 'H37Rv_start' , 'H37Rv_end' , 'RefGenome_start' , 'RefGenome_end' , 'strand' , 'Highest_window_score' , 'second_Highest_window_score' , 'window_index'])

#only look for Significant Genes from within-host analysis & look for corresponding coding sequence in reference genome
genes_with_real_SNPs = list( set( SNPs_from_all_patients_DF.gene_id ) )

#gene order of the un-padded H37Rv annotation, used to locate the row of each gene
H37Rv_gene_order = list(H37Rv_annotation_no_RNA.index)

#window centres on the reference genome for which a full window fits
first_window_index = half_window
last_window_index = np.shape(ref_genome_annot)[0] - half_window

#the set of gene lengths inside each reference-genome window does not depend on the H37Rv gene: build each set once
#(.loc slices by LABEL and includes both end points, i.e. window_size + 1 consecutive rows of the integer index)
ref_genome_window_length_sets = {}
for gene_lengths_window_i in range(first_window_index , last_window_index):
    ref_genome_annot_neighborhood_lengths = ref_genome_annot.loc[gene_lengths_window_i - half_window : gene_lengths_window_i + half_window , 'feature_interval_length']
    ref_genome_window_length_sets[gene_lengths_window_i] = set(list(ref_genome_annot_neighborhood_lengths))

for gene_id in genes_with_real_SNPs:

    #row of the gene in the padded annotation (the padding puts half_window extra rows in front)
    #NOTE: list.index raises ValueError if the gene id is absent from the RNA-filtered H37Rv annotation
    row_index = H37Rv_gene_order.index(gene_id) + half_window

    #take the CDS regions before & after into account (half_window on each side plus the gene itself)
    H37Rv_annotation_no_RNA_neighborhood = H37Rv_annotation_no_RNA_fudged_for_circular_chrom.iloc[ row_index - half_window : row_index + half_window + 1 , :]

    #gene lengths of the consecutive genes around the gene of interest
    H37Rv_annotation_no_RNA_neighborhood_lengths = H37Rv_annotation_no_RNA_neighborhood.length
    H37Rv_neighborhood_length_set = set(list(H37Rv_annotation_no_RNA_neighborhood_lengths))

    #compare the neighborhood of gene lengths with every window of consecutive gene lengths in the reference genome
    ## key: window_index , value: number of distinct gene lengths shared by the two windows (rough match method)
    gene_length_match_scores = {}
    for gene_lengths_window_i in range(first_window_index , last_window_index):
        gene_length_match_scores[gene_lengths_window_i] = len( ref_genome_window_length_sets[gene_lengths_window_i].intersection(H37Rv_neighborhood_length_set) )

    #find the max match score and corresponding window index
    gene_length_match_scores = pd.Series(gene_length_match_scores)

    gene_length_match_scores_genome_order = gene_length_match_scores

    gene_length_match_scores = gene_length_match_scores.sort_values(ascending = False)

    window_index_top_match_score = list(gene_length_match_scores)[0] #top score for sliding window alignments

    #store window indice(s) that have the top match score
    window_indices_with_top_score = list(gene_length_match_scores[gene_length_match_scores == window_index_top_match_score].index)

    gene_alignment_scores_in_top_window = {} #key: index of a gene inside a top window , value: score from pairwise alignment

    #the H37Rv sequence of the gene is the same for every alignment below
    H37Rv_coding_seq = H37Rv_annotation_no_RNA.loc[gene_id , 'sequence']

    #iterate through all highest scoring windows (or neighborhoods)
    for window_match_index in window_indices_with_top_score:

        #align gene_i to every CDS within the 'sliding window' of the highest window scoring CDS region(s)
        first_chosen_window_index = window_match_index - half_window
        last_chosen_window_index = window_match_index + half_window

        for window_index in range(first_chosen_window_index , last_chosen_window_index + 1):

            #find the gene sequence corresponding to window index
            corresponding_gene_ref_genome = ref_genome_annot.loc[window_index , :]
            ref_genome_coding_seq = corresponding_gene_ref_genome.sequence

            #scoring passed to globalms: +2 per identical character, -1 per non-identical character,
            #-2 for opening a gap, -2 for extending it; only the score is returned
            alignment_score = pairwise2.align.globalms(H37Rv_coding_seq, ref_genome_coding_seq , 2 , -1 , -2 , -2 , score_only = True)

            #store in dictionary
            gene_alignment_scores_in_top_window[window_index] = alignment_score

    #choose window index with highest alignment score as match (this time from genes within top/chosen window)
    top_window_score_gene_alignments_results = pd.Series(gene_alignment_scores_in_top_window)
    top_window_score_gene_alignments_results = top_window_score_gene_alignments_results.sort_values(ascending = False)

    window_match_index = list(top_window_score_gene_alignments_results.index)[0] #take the index corresponding to the top alignment score
    window_match_score = list(top_window_score_gene_alignments_results)[0] #actual raw score from top alignment

    #store relevant information
    corresponding_gene_ref_genome = ref_genome_annot.loc[window_match_index , :]

    H37Rv_gene_length = H37Rv_annotation_no_RNA.loc[gene_id , 'length']
    RefGenome_gene_length = corresponding_gene_ref_genome.feature_interval_length

    #accept the match only if the alignment score is positive and the 'found' gene has the same length as in H37Rv
    if (window_match_score > 0) and (H37Rv_gene_length == RefGenome_gene_length):

        symbol = H37Rv_annotation_no_RNA.loc[gene_id , 'symbol']
        strand = H37Rv_annotation_no_RNA.loc[gene_id , 'strand']
        H37Rv_start = H37Rv_annotation_no_RNA.loc[gene_id , 'chromStart']
        H37Rv_end = H37Rv_annotation_no_RNA.loc[gene_id , 'chromEnd']
        RefGenome_start = corresponding_gene_ref_genome.start
        RefGenome_end = corresponding_gene_ref_genome.end

        Highest_window_score = list(gene_length_match_scores)[0]
        second_Highest_window_score = list(gene_length_match_scores)[1]

        #start counting from 0! (the reference-genome start is shifted by one before it is stored)
        H37Rv_RefGeome_gene_mapping.loc[gene_id , :] = [symbol , H37Rv_start , H37Rv_end , RefGenome_start - 1 , RefGenome_end , strand , Highest_window_score , second_Highest_window_score , window_match_index]

    else: #no acceptable gene/CDS analogue was found: store a row of NaN for this gene

        H37Rv_RefGeome_gene_mapping.loc[gene_id , :] = [np.nan]*9
    
##########################################################################################
##########################################################################################

In [72]:
##########################################################################################
##########################################################################################

'''
This piece of code uses the mapping of the CDS regions of H37Rv to the CDS 
regions of the chosen reference sequence (ref_genome_i) to find the relevant
reference positions of the SNPs called against H37Rv from paired isolates.

We drop any repeated SNPs (variants that have the same RefGenome position 
as another variant). Then introduce the variants into a copy of the Reference Sequence.

This piece of code also stores the Feature Annotation Table , Altered Variants Record , 
Original Reference Sequence & Altered Reference Sequence in the same directory.
'''

#Keep only the SNPs whose (H37Rv) gene was mapped to a CDS on the Reference Genome;
#SNPs in genes without a good mapping are not simulated.

#start from a clean 0..n-1 index
SNPs_from_all_patients_DF = SNPs_from_all_patients_DF.reset_index(drop = True)

#a gene counts as mapped unless every field of its row in the mapping table is null
gene_is_unmapped = H37Rv_RefGeome_gene_mapping.isnull().all(axis = 1)
mapped_genes = [mapped_gene_id for mapped_gene_id , unmapped in zip(H37Rv_RefGeome_gene_mapping.index , gene_is_unmapped) if not unmapped]

#one True/False per SNP (in row order): was the gene that the SNP sits on mapped?
SNP_has_mapped_region_in_RefGenome_filter = [SNP_gene_id in mapped_genes for SNP_gene_id in SNPs_from_all_patients_DF.gene_id]

#drop the SNPs that could not be mapped and renumber the remaining rows
SNPs_from_all_patients_DF = SNPs_from_all_patients_DF[SNP_has_mapped_region_in_RefGenome_filter].reset_index(drop = True)

#######################################################################################################################################
# USE MAPPING FROM H37RV -> REFERENCE GENOME TO IDENTIFY SNP SITES ON REFERENCE GENOME

#one slot per SNP for its position on the reference genome
RefGenome_SNP_location_series = pd.Series(index = SNPs_from_all_patients_DF.index)

for SNP_i in range(0 , len(SNPs_from_all_patients_DF)):

    #H37Rv site of the SNP and the gene it sits on
    H37Rv_SNP_location = SNPs_from_all_patients_DF.loc[SNP_i , 'ref_position']
    SNP_gene_id = SNPs_from_all_patients_DF.loc[SNP_i , 'gene_id']

    #start sites of that gene on H37Rv and on the Reference Genome
    gene_mapping = H37Rv_RefGeome_gene_mapping.loc[SNP_gene_id , :]
    H37Rv_gene_start = gene_mapping['H37Rv_start']
    RefGenome_gene_start = gene_mapping['RefGenome_start']

    #offset of the SNP from the start of its gene on H37Rv
    ## ref_position (from Pilon) counts from 1, H37Rv_start is treated as counting from 0
    genomic_coord = (H37Rv_SNP_location - 1) - H37Rv_gene_start

    #same offset applied to the start of the corresponding gene on the Reference Genome
    #NOTE(review): RefGenome_start is stored 0-based by the mapping cell, so this value is a 0-based index,
    #yet the allele-introduction code further below subtracts 1 again before indexing the sequence -
    #confirm which convention 'RefGenome_position' is meant to follow
    RefGenome_SNP_location_series[SNP_i] = RefGenome_gene_start + genomic_coord

#attach the Reference Genome positions to the rest of the SNP information
SNPs_from_all_patients_DF.loc[: , 'RefGenome_position'] = RefGenome_SNP_location_series.tolist()

#DROP repeated SNPs: when several variants land on the same RefGenome position only the first one is kept
SNP_RefGenome_duplicated_filter = ( ~SNPs_from_all_patients_DF.duplicated(subset = ['RefGenome_position'] ,  keep = 'first') ).tolist()

#apply the filter and renumber the remaining rows
SNPs_from_all_patients_DF = SNPs_from_all_patients_DF[SNP_RefGenome_duplicated_filter].reset_index(drop = True)

#######################################################################################################################################
#Introduce the variants into a copy of the Reference Genome sequence, recording for every SNP the base that
#was originally there and the base that was written in its place
## SNPs will be called against given Reference Genome

#editable copy of the reference sequence (the original reference_genome.seq is left untouched)
reference_genome_with_variants = reference_genome.seq.tomutable()

#per-SNP records: original base, introduced base, and start/end of the RefGenome CDS the variant lands on
real_alleles = pd.Series(index = SNPs_from_all_patients_DF.index)
introduced_alleles = pd.Series(index = SNPs_from_all_patients_DF.index)
RefGenome_CDS_start = pd.Series(index = SNPs_from_all_patients_DF.index)
RefGenome_CDS_end = pd.Series(index = SNPs_from_all_patients_DF.index)

for SNP_i in SNPs_from_all_patients_DF.index:

    SNP_i_info = SNPs_from_all_patients_DF.loc[SNP_i , :]

    SNP_alt_base = SNP_i_info.alt_allele #base that was different in at least 1 isolate-pair
    SNP_H37Rv_base = SNP_i_info.ref_allele #base that was on H37Rv

    #index into the sequence: the stored position minus one
    #NOTE(review): the stored position is computed from 0-based gene starts, so this looks like a second
    #1-based -> 0-based shift - confirm the intended convention
    SNP_RefGenome_position = int(SNP_i_info.RefGenome_position - 1.0)

    #CDS on the Reference Genome that the variant lands on
    gene_id = SNP_i_info.gene_id
    CDS_mapping = H37Rv_RefGeome_gene_mapping.loc[gene_id , :]
    RefGenome_CDS_start[SNP_i] = CDS_mapping['RefGenome_start']
    RefGenome_CDS_end[SNP_i] = CDS_mapping['RefGenome_end']

    #the base of the unaltered Reference Genome at this site
    real_alleles[SNP_i] = reference_genome.seq[SNP_RefGenome_position]

    #Reference Genome base equal to the H37Rv base -> write the alternate allele,
    #otherwise -> write the H37Rv base
    if reference_genome_with_variants[SNP_RefGenome_position] == SNP_H37Rv_base:
        base_to_introduce = SNP_alt_base
    else:
        base_to_introduce = SNP_H37Rv_base

    reference_genome_with_variants[SNP_RefGenome_position] = base_to_introduce
    introduced_alleles[SNP_i] = base_to_introduce


#TABLE WITH INTRODUCED VARIANTS
#original and introduced alleles, then start/end of the corresponding CDS on the RefGenome
SNPs_from_all_patients_DF.loc[: , 'RefGenome_real_alleles'] = real_alleles
SNPs_from_all_patients_DF.loc[: , 'RefGenome_introduced_alleles'] = introduced_alleles
SNPs_from_all_patients_DF.loc[: , 'RefGenome_CDS_start'] = RefGenome_CDS_start
SNPs_from_all_patients_DF.loc[: , 'RefGenome_CDS_end'] = RefGenome_CDS_end

#FASTA WITH ALTERED REFERENCE GENOME
#turn the editable sequence back into a normal sequence and wrap it in a SeqRecord carrying the strain ID
reference_genome_with_variants = SeqRecord(reference_genome_with_variants.toseq() , id = ref_genome_strain)

#######################################################################################################################################
#STORE VARIOUS FILES IN DIRECTORY FOR REFERENCE GENOME
reference_genome_directory = '/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/simulated_reads/reference_sequence_info_and_calls_against_H37Rv/'
#ID = strain name without its last 8 characters
reference_genome_ID = ref_genome_strain[:-8]

#start from an empty directory for this reference genome: wipe an existing one, then (re-)create it
RefGenome_output_directory = reference_genome_directory + reference_genome_ID
if os.path.exists(RefGenome_output_directory):
    shutil.rmtree(RefGenome_output_directory)
os.makedirs(RefGenome_output_directory)

#copy over the original Reference Sequence fasta and its Feature Annotation table
copy(ref_genome_fasta  , RefGenome_output_directory)
copy(ref_strain_genome_annot_files_directory + ref_genome_strain_annotation_filename  , RefGenome_output_directory)

#write the altered Reference Sequence (with introduced variants) as a FASTA file;
#the file name is relative, so the working directory is switched to the output directory first
os.chdir(RefGenome_output_directory)
SeqIO.write(reference_genome_with_variants, reference_genome_ID + "_with_altered_variants.fasta", "fasta")

#store record of changes made to the Reference Sequence
SNPs_from_all_patients_DF.to_csv(RefGenome_output_directory + '/' + reference_genome_ID + '_altered_variants_record.csv')

# The CSV file above stores the SNPs that we introduce into the Complete Reference Genome, i.e. SNPs in
# genes that were successfully mapped between the given Reference Genome & H37Rv. Therefore, the set of
# Gene IDs in that file is the set of Gene IDs that had good mappings between this Reference Genome & H37Rv.

##########################################################################################
##########################################################################################

In [74]:
##########################################################################################
##########################################################################################

'''
This piece of code takes the altered Reference Genome and uses ART to 
generate reads off of the ALTERED Reference Genome. The simulation 
parameters for the 'Altered' Reference Genome are going to be similar to 
the parameters in our real sequencing data. This will simulate how 
accurate we're able to call the introduced SNPs into the Reference Genome 
given our pipeline and comparable sequencing quality.
'''
#directory that holds the fasta file with the altered reference sequence
directory_with_files = reference_genome_directory + reference_genome_ID + '/'

#directory/filename of reference sequence with variants
RefGenome_with_variants = directory_with_files + reference_genome_ID + "_with_altered_variants.fasta"

#create directory to store fastq files
fastq_files_directory = directory_with_files + 'fastq_files_from_ART_for_ALT_RefGenome'
if not os.path.exists(fastq_files_directory):
    os.makedirs(fastq_files_directory)

#directory & prefix filename to store ART output
fastq_directory = fastq_files_directory + '/' + reference_genome_ID + '_ART_'

#command to run ART
ART = '/home/rv76/Bio_Pipelines/art_bin_MountRainier/art_illumina'

## parameters
## -ss HS10 ; Illumina HiSeq 1000
## -l 100 ; read length of 100
## -f 80 ; mean coverage of 80x
## -p ; paired end reads
## -m 200; 200 bp mean size of DNA fragments for paired-end simulations
## -s 25; 25 bp standard deviation of DNA fragment size for paired-end simulations

#the command is an argument list (no shell involved), so paths are passed to ART verbatim
art_command = [ART , '-ss' , 'HS10' , '-i' , RefGenome_with_variants , '-o' , fastq_directory , '-l' , '100' , '-f' , '80' , '-p' , '-m' , '200' , '-s' , '25']

#run ART to simulate reads; check_call raises CalledProcessError when ART exits with a non-zero status,
#so a failed simulation stops here instead of silently continuing with missing/partial fastq files
subprocess.check_call(art_command)

#delete aln files from ART output (keep only fastqs); a missing file is skipped, like 'rm -f' would
aln_file_1 = fastq_directory + '1.aln'
aln_file_2 = fastq_directory + '2.aln'

for aln_file in (aln_file_1 , aln_file_2):
    if os.path.exists(aln_file):
        os.remove(aln_file)

##########################################################################################
##########################################################################################

0

#######################################################################################################################################################################################################################################################################################

## [2] Submit Jobs for each separate Reference Genome

#######################################################################################################################################################################################################################################################################################

In [2]:
from slurmpy import Slurm
import os
import pandas as pd
import numpy as np

In [3]:
#load the table of Reference Genomes used in the simulations (one row per genome)
ref_seqs_annot = pd.read_csv('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/simulated_reads/reference_sequences/strains_typed/Reference_Genomes_used_in_simulations.csv').set_index('Unnamed: 0')

#number of rows = number of genomes to submit jobs for
number_of_Reference_Genomes = len(ref_seqs_annot)

print('There are ' + str(number_of_Reference_Genomes) + ' Reference Genomes that we will simulate SNPs in.')

There are 54 Reference Genomes that we will simulate SNPs in.


In [4]:
#Submit one O2 (Slurm) job per reference genome; every job runs the simulation script with the
#index of its reference genome (0 - 53) as the only command-line argument
for ref_genome_i in map(str , range(0 , number_of_Reference_Genomes)):

    #job name and command line for this reference genome
    job_name = 'RG_' + ref_genome_i
    run_script = "python /n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/python_scripts/RefGenome_with_variants_reads_sim_against_H37Rv_rough_window_match.py " + ref_genome_i

    #directory where you want output + error files
    os.chdir("/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/simulated_reads/O2_jobs_output/RefGenome_ART_reads_simulations")

    #job description: short partition, 1 task, 1.5 hour limit, 16G per cpu, mail on failure
    s = Slurm(
        job_name ,
        {'partition':'short' , 'n':'1' , 't':'0-01:30:00' , 'mem-per-cpu':'16G' , 'mail-type':'FAIL' , 'mail-user':'roger_vargas@g.harvard.edu'}
    )

    #submit the job and report the ID that came back
    job_id = s.run(run_script)

    print(job_name  + ' : ' +  str(job_id))

submitted: Submitted batch job 2969039


RG_0 : 2969039


submitted: Submitted batch job 2969040


RG_1 : 2969040


submitted: Submitted batch job 2969044


RG_2 : 2969044


submitted: Submitted batch job 2969046
submitted: Submitted batch job 2969047


RG_3 : 2969046
RG_4 : 2969047


submitted: Submitted batch job 2969048
submitted: Submitted batch job 2969049


RG_5 : 2969048
RG_6 : 2969049


submitted: Submitted batch job 2969050


RG_7 : 2969050


submitted: Submitted batch job 2969051


RG_8 : 2969051


submitted: Submitted batch job 2969052


RG_9 : 2969052


submitted: Submitted batch job 2969053


RG_10 : 2969053


submitted: Submitted batch job 2969054
submitted: Submitted batch job 2969055


RG_11 : 2969054
RG_12 : 2969055


submitted: Submitted batch job 2969056
submitted: Submitted batch job 2969057


RG_13 : 2969056
RG_14 : 2969057


submitted: Submitted batch job 2969058
submitted: Submitted batch job 2969059
submitted: Submitted batch job 2969060
submitted: Submitted batch job 2969061
submitted: Submitted batch job 2969062


RG_15 : 2969058
RG_16 : 2969059
RG_17 : 2969060
RG_18 : 2969061
RG_19 : 2969062


submitted: Submitted batch job 2969063


RG_20 : 2969063


submitted: Submitted batch job 2969064


RG_21 : 2969064


submitted: Submitted batch job 2969065
submitted: Submitted batch job 2969066


RG_22 : 2969065
RG_23 : 2969066


submitted: Submitted batch job 2969067
submitted: Submitted batch job 2969068
submitted: Submitted batch job 2969069
submitted: Submitted batch job 2969070
submitted: Submitted batch job 2969071


RG_24 : 2969067
RG_25 : 2969068
RG_26 : 2969069
RG_27 : 2969070
RG_28 : 2969071


submitted: Submitted batch job 2969072
submitted: Submitted batch job 2969073
submitted: Submitted batch job 2969074
submitted: Submitted batch job 2969075


RG_29 : 2969072
RG_30 : 2969073
RG_31 : 2969074
RG_32 : 2969075


submitted: Submitted batch job 2969076
submitted: Submitted batch job 2969077
submitted: Submitted batch job 2969078
submitted: Submitted batch job 2969079


RG_33 : 2969076
RG_34 : 2969077
RG_35 : 2969078
RG_36 : 2969079


submitted: Submitted batch job 2969080


RG_37 : 2969080


submitted: Submitted batch job 2969081
submitted: Submitted batch job 2969082
submitted: Submitted batch job 2969083
submitted: Submitted batch job 2969084
submitted: Submitted batch job 2969085


RG_38 : 2969081
RG_39 : 2969082
RG_40 : 2969083
RG_41 : 2969084
RG_42 : 2969085


submitted: Submitted batch job 2969086


RG_43 : 2969086


submitted: Submitted batch job 2969087
submitted: Submitted batch job 2969088
submitted: Submitted batch job 2969089
submitted: Submitted batch job 2969090


RG_44 : 2969087
RG_45 : 2969088
RG_46 : 2969089
RG_47 : 2969090


submitted: Submitted batch job 2969091
submitted: Submitted batch job 2969092
submitted: Submitted batch job 2969093
submitted: Submitted batch job 2969094
submitted: Submitted batch job 2969095


RG_48 : 2969091
RG_49 : 2969092
RG_50 : 2969093
RG_51 : 2969094
RG_52 : 2969095
RG_53 : 2969096


submitted: Submitted batch job 2969096
