In [1]:
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated in newer IPython.
from IPython.display import display, HTML

# Stretch the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

### This notebook was created to construct phylogenies of the Global Mtb lineages using Luca's pipeline for preparing data and calling IQTree

In [2]:
%matplotlib inline
import os
import pandas as pd
import numpy as np
import itertools
import gzip
import vcf
from slurmpy import Slurm

##################################################################################################################################################################################################################

## [1] Interactive Session to prepare data for IQTree

##################################################################################################################################################################################################################

#### Different isolate groups: '1', '2', '3', '4A', '4B', '4C', '5', '6'

### Inputs

In [20]:
# Isolate group to build a phylogeny for; it is compared against the `group` column of the
# isolate annotation table (groups used in this notebook: '1', '2', '3', '4A', '4B', '4C', '5', '6').
global_lineage = '6'

#### Load in the isolate annotation DF for Genotypes Matrix

In [21]:
# Load the isolate annotation table (its rows describe the columns of the Genotype Matrix).
# NOTE(review): unpickling can execute arbitrary code, so only load this file from a trusted location.
isolate_annotation_DF = pd.read_pickle('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/rolling_DB_scrape/Genotypes_Filtered_2/genotypes_isolate_annotation.pkl')

In [22]:
# Preview the first five annotation rows.
isolate_annotation_DF.head(n=5)

Unnamed: 0,lineage_1,lineage_2,lineage_3,lineage_4,lineage_5,lineage_6,lineage_7,lineage_8,lineage_9,lineage_10,lineage_11,isolate_ID,lineage_call,group
0,4,2,1,2,1.0,1.0,i3,1.0,,,,SAMEA3558733,4.2.1.2.1.1.i3.1,4B
1,4,2,1,2,2.0,1.0,1,,,,,SAMN03648641,4.2.1.2.2.1.1,4B
2,3,1,1,i1,,,,,,,,SAMN03647419,3.1.1.i1,3
3,4,2,1,2,1.0,1.0,i1,,,,,SAMEA3671418,4.2.1.2.1.1.i1,4B
4,1,1,1,2,,,,,,,,SAMN07659096,1.1.1.2,1


In [23]:
# (number of isolates, number of annotation columns)
isolate_annotation_DF.shape

(31428, 14)

#### Create a text file with a list of all of the isolates that belong to a particular global lineage

In [24]:
# Keep only the rows whose `group` label equals the selected global lineage.
isolates_belonging_to_global_lineage = isolate_annotation_DF.loc[isolate_annotation_DF['group'] == global_lineage]

In [25]:
# Preview the first five isolates of the selected lineage.
isolates_belonging_to_global_lineage.head(n=5)

Unnamed: 0,lineage_1,lineage_2,lineage_3,lineage_4,lineage_5,lineage_6,lineage_7,lineage_8,lineage_9,lineage_10,lineage_11,isolate_ID,lineage_call,group
14,6,,,,,,,,,,,SAMEA5366599,6,6
161,6,,,,,,,,,,,SAMEA2533713,6,6
1847,6,,,,,,,,,,,SAMN02231100,6,6
2173,6,,,,,,,,,,,SAMEA2533690,6,6
2513,6,,,,,,,,,,,SAMEA2535059,6,6


In [26]:
# (number of isolates in the selected lineage, number of annotation columns)
isolates_belonging_to_global_lineage.shape

(96, 14)

In [27]:
# Isolate IDs of the selected lineage as a plain Python list.
isolate_IDs_for_global_lineage = list(isolates_belonging_to_global_lineage['isolate_ID'])

# Write the IDs to the lineage-specific list file, one ID per line.
with open('/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/phylogenies/global_lineage_isolate_lists/isolate_list_lineage_' + global_lineage + '.txt', 'w') as isolate_list_file:
    isolate_list_file.writelines("%s\n" % isolate_ID for isolate_ID in isolate_IDs_for_global_lineage)

Number of isolates that have a lineage call at the sub-lineage level (**global lineage.sub lineage**)

In [28]:
# Number of isolates whose `lineage_2` field is populated, i.e. that carry a sub-lineage call.
# Missing values are detected with notnull() rather than by casting the column to float:
# other lineage columns in this table hold non-numeric labels such as 'i1', and a label like
# that in `lineage_2` would make astype(float) raise a ValueError.
isolates_belonging_to_global_lineage.lineage_2.notnull().sum()

0

#### [interactive session] BASH function to prepare data for iqtree

In [None]:
prep_data_for_iqtree(){
# Build the merged SNP VCF and the vcf2phylip alignment for one isolate group.
#
# Usage: prep_data_for_iqtree OUTDIR TAG LIST_ISOL
#   OUTDIR    - output directory; it is DELETED and recreated on every call
#   TAG       - basename of the merged VCF, written to ${OUTDIR}/${TAG}.vcf
#   LIST_ISOL - text file listing the isolate IDs (whitespace/newline separated)
#
# Returns non-zero if an argument is missing, the isolate list is unreadable or
# empty, the output directory cannot be entered, or the merge fails.
#
# Side effects on the calling shell: the function body is not a subshell, so the
# raised open-file limit and the change of directory into OUTDIR persist after
# the function returns.

# I initialize the variables
local OUTDIR=${1:-}
local TAG=${2:-}
local LIST_ISOL=${3:-}

# Fixed input locations used by the pipeline
local GENOMIC_DATA_DIR=/n/data1/hms/dbmi/farhat/rollingDB/genomic_data
local REGIONS_BED=/home/lf61/lf61/mic_assemblies/40-full-analysis/lin-sp-var-10k/results/data_cleaning/regions_to_include_final.bed
local CANETTII_VCF=/n/data1/hms/dbmi/farhat/lfreschi/mic_assemblies/40-full-analysis/lin-sp-var-10k/results/vcf_canettii/canettii.vcf

# Refuse to run with missing arguments: an empty OUTDIR would otherwise make the
# "rm -rf" / "cd" steps below operate on the caller's current directory.
if [[ -z "${OUTDIR}" || -z "${TAG}" || -z "${LIST_ISOL}" ]]; then
    echo "usage: prep_data_for_iqtree OUTDIR TAG LIST_ISOL" >&2
    return 1
fi

if [[ ! -r "${LIST_ISOL}" ]]; then
    echo "ERROR: cannot read isolate list: ${LIST_ISOL}" >&2
    return 1
fi

# Read the isolate IDs BEFORE touching OUTDIR, so a relative LIST_ISOL path (or a
# list stored inside OUTDIR) still works after the directory is wiped and entered.
local -a ISOLATES
ISOLATES=( $(cat "${LIST_ISOL}") )
if (( ${#ISOLATES[@]} == 0 )); then
    echo "ERROR: isolate list is empty: ${LIST_ISOL}" >&2
    return 1
fi

echo "OUTDIR: ${OUTDIR}"
echo "TAG: ${TAG}"
echo "LIST_ISOL: ${LIST_ISOL}"

# Load BCFtools
module load bcftools/1.9

## The default limit on open files is 1024, so I raise it ('ulimit -Hn' shows the
## hard limit). For lineage 4, find a login node whose hard limit is high enough.
## NOTE(review): presumably needed because the merge step opens all per-isolate BCFs together.
ulimit -n 50000

# I create the output directory (any previous content is removed)
rm -rf -- "${OUTDIR}"
mkdir -p -- "${OUTDIR}" || { echo "ERROR: cannot create ${OUTDIR}" >&2; return 1; }

# Change the current directory to the output directory; stop if that fails so the
# cleanup commands below never run in an unintended directory.
cd -- "${OUTDIR}" || { echo "ERROR: cannot cd into ${OUTDIR}" >&2; return 1; }

# From here on use the absolute path, so a relative OUTDIR argument keeps working.
OUTDIR=$(pwd)

# I clean the directory with the bcf data
rm -rf bcf/
mkdir -p bcf/

# I generate the bcf files, change the headers and index them.
# Per isolate: keep SNPs only -> index -> restrict to the included regions ->
# rename the sample to the isolate ID -> index the renamed file.
# A failing isolate is reported and skipped; the remaining isolates are still processed.
local i
for i in "${ISOLATES[@]}"; do
    bcftools view "${GENOMIC_DATA_DIR}/${i}/pilon/${i}.vcf" --types snps -O b -o "${i}_snps.bcf" \
        && bcftools index "${i}_snps.bcf" \
        && bcftools view "${i}_snps.bcf" --regions-file "${REGIONS_BED}" -o "${i}.bcf" -O b \
        && bcftools reheader -s <(echo "${i}") "${i}.bcf" -o "bcf/${i}_renamed.bcf" \
        && bcftools index "bcf/${i}_renamed.bcf" \
        || echo "WARNING: failed to prepare isolate ${i}; it may be missing or incomplete in the merged VCF" >&2
done

# I add canettii (same steps as for the isolates)
bcftools view "${CANETTII_VCF}" --types snps -O b -o canettii_snps.bcf \
    && bcftools index canettii_snps.bcf \
    && bcftools view canettii_snps.bcf --regions-file "${REGIONS_BED}" -o canettii.bcf -O b \
    && bcftools reheader -s <(echo "canettii") canettii.bcf -o bcf/canettii_renamed.bcf \
    && bcftools index bcf/canettii_renamed.bcf \
    || echo "WARNING: failed to prepare canettii; it may be missing or incomplete in the merged VCF" >&2

# cleaning: remove the intermediate files in OUTDIR (the renamed files in bcf/ are kept)
rm -rf -- *.bcf
rm -rf -- *.bcf.csi

# I merge the bcf files (only the *.bcf files, not their .csi indexes)
printf '%s\n' bcf/*.bcf > list_bcfs.txt
bcftools merge -l list_bcfs.txt -o "${OUTDIR}/${TAG}.vcf" -O v -0 \
    || { echo "ERROR: bcftools merge failed" >&2; return 1; }

## I use vcf2phylip (from luca's home directory)
/home/lf61/sw/vcf2phylip/1.5/vcf2phylip.py -i "${OUTDIR}/${TAG}.vcf" -f

}

#### Get variables for data preparation function & for iqtree

In [29]:
# Arguments for the data-preparation function, all derived from the selected lineage:
# output directory, file tag, and the isolate list written for this lineage.
OUTDIR = ''.join(['/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/phylogenies/tree_output_files/phylogeny_lineage_', global_lineage, '/'])
TAG = ''.join(['lineage_', global_lineage])
LIST_ISOL = ''.join(['/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/phylogenies/global_lineage_isolate_lists/isolate_list_lineage_', global_lineage, '.txt'])

In [30]:
# Echo the output directory as a visual check.
OUTDIR

'/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/phylogenies/tree_output_files/phylogeny_lineage_6/'

In [31]:
# Echo the file tag as a visual check.
TAG

'lineage_6'

In [32]:
# Echo the isolate-list path as a visual check.
LIST_ISOL

'/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/phylogenies/global_lineage_isolate_lists/isolate_list_lineage_6.txt'

### [interactive session] Call function above in an interactive session to prepare the data for iqtree

In [33]:
# Print the exact shell command to paste into the interactive session.
# The parenthesised single-argument form prints the same text under Python 2 and Python 3.
print(' '.join(['prep_data_for_iqtree', OUTDIR, TAG, LIST_ISOL]))

prep_data_for_iqtree /n/data1/hms/dbmi/farhat/Roger/homoplasy_project/phylogenies/tree_output_files/phylogeny_lineage_6/ lineage_6 /n/data1/hms/dbmi/farhat/Roger/homoplasy_project/phylogenies/global_lineage_isolate_lists/isolate_list_lineage_6.txt


##################################################################################################################################################################################################################

## [2] Submit jobs to O2 to create trees

##################################################################################################################################################################################################################

### Use code below to submit iqtree jobs for lineages 1, 2, 3, 4A, 4B & 4C (without using ModelFinder and selecting a GTR+F+I+R model)

In [4]:
## TAG = 'lineage_4C'
## OUTDIR = '/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/phylogenies/tree_output_files/phylogeny_lineage_4C/'

In [5]:
# NOTE: OUTDIR and TAG must already be set in the kernel for the lineage being submitted.

# Part 1: activate the environment that provides iqtree.
# `source activate` can hit "PS1: unbound variable" inside the batch script, so
# errexit/nounset are switched off around the activation and switched back on afterwards.
iqtree_job_p1 = 'set +eu\nVIRTUAL_ENV_DISABLE_PROMPT=true source activate iqtree_virtualenv\nset -eu'

# Part 2: the iqtree call itself (fixed GTR+F+I+R model, no ModelFinder).
# Input alignment: <OUTDIR><TAG>.min4.fasta ; output prefix: <OUTDIR>tree_<TAG>_iqtree_FINAL
iqtree_job_p2 = 'time iqtree -s {0}{1}.min4.fasta -mem 15G -pre {0}tree_{1}_iqtree_FINAL -m GTR+F+I+R -mset GTR -bb 1000 -alrt 1000 -nt AUTO -ntmax 8 -redo'.format(OUTDIR, TAG)

iqtree_job = '\n'.join([iqtree_job_p1, iqtree_job_p2])

# directory where you want output + error files
os.chdir(OUTDIR)

# e.g. TAG 'lineage_4C' -> job name 'iq_L4C'
job_name = 'iq_L' + TAG.split('_')[1]

slurm_options = {
    'partition': 'long',
    'N': '1',
    'c': '8',
    't': '30-00:00:00',
    'mem-per-cpu': '16G',
    'mail-type': 'ALL',
    'mail-user': 'roger_vargas@g.harvard.edu',
    'o': 'out_run_iqtree_{}.txt'.format(TAG),
}
s = Slurm(job_name, slurm_options)

# submits the job
job_id = s.run(iqtree_job)

# Parenthesised form prints the same text under Python 2 and Python 3.
print(job_name)

iq_L4C


submitted: Submitted batch job 7202091


### Use code below to submit iqtree jobs for lineages 5 & 6 (uses ModelFinder)

In [34]:
# NOTE: OUTDIR and TAG must already be set in the kernel for the lineage being submitted.

# Part 1: activate the environment that provides iqtree.
# `source activate` can hit "PS1: unbound variable" inside the batch script, so
# errexit/nounset are switched off around the activation and switched back on afterwards.
iqtree_job_p1 = 'set +eu\nVIRTUAL_ENV_DISABLE_PROMPT=true source activate iqtree_virtualenv\nset -eu'

# Part 2: the iqtree call itself (model chosen by ModelFinder via `-m MFP`).
# Input alignment: <OUTDIR><TAG>.min4.fasta ; output prefix: <OUTDIR>tree_<TAG>_iqtree_FINAL
iqtree_job_p2 = 'time iqtree -s {0}{1}.min4.fasta -mem 7G -pre {0}tree_{1}_iqtree_FINAL -m MFP -mset GTR -bb 1000 -alrt 1000 -nt AUTO -ntmax 8 -redo'.format(OUTDIR, TAG)

iqtree_job = '\n'.join([iqtree_job_p1, iqtree_job_p2])

# directory where you want output + error files
os.chdir(OUTDIR)

# e.g. TAG 'lineage_6' -> job name 'iq_L6'
job_name = 'iq_L' + TAG.split('_')[1]

slurm_options = {
    'partition': 'short',
    'N': '1',
    'c': '8',
    't': '0-12:00:00',
    'mem-per-cpu': '7G',
    'mail-type': 'ALL',
    'mail-user': 'roger_vargas@g.harvard.edu',
    'o': 'out_run_iqtree_{}.txt'.format(TAG),
}
s = Slurm(job_name, slurm_options)

# submits the job
job_id = s.run(iqtree_job)

# Parenthesised form prints the same text under Python 2 and Python 3.
print(job_name)

iq_L6


submitted: Submitted batch job 7173987


### Use code below to re-submit an iqtree job for lineage 2 (without using ModelFinder and selecting a GTR+F+I+R model; the original run started but was interrupted by a NODE FAIL)

In [3]:
# Point TAG / OUTDIR at lineage 2 for the re-submission in the next cell.
TAG = 'lineage_2'
OUTDIR = '/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/phylogenies/tree_output_files/phylogeny_lineage_2/'

In [4]:
# Part 1: activate the environment that provides iqtree.
# errexit/nounset are switched off around the activation to fix the
# "/home/rv76/anaconda2/etc/profile.d/conda.sh: line 55: PS1: unbound variable" ERROR.
iqtree_job_p1 = 'set +eu\nVIRTUAL_ENV_DISABLE_PROMPT=true source activate iqtree_virtualenv\nset -eu'

# Part 2: the iqtree call itself (fixed GTR+F+I+R model, no ModelFinder).
# Unlike the first submission, `-redo` is not passed here.
# NOTE(review): presumably so the interrupted run resumes instead of restarting - confirm.
iqtree_job_p2 = 'time iqtree -s {0}{1}.min4.fasta -mem 15G -pre {0}tree_{1}_iqtree_FINAL -m GTR+F+I+R -mset GTR -bb 1000 -alrt 1000 -nt AUTO -ntmax 8'.format(OUTDIR, TAG)

iqtree_job = '\n'.join([iqtree_job_p1, iqtree_job_p2])

# directory where you want output + error files
os.chdir(OUTDIR)

# TAG 'lineage_2' -> job name 'iq_L2'
job_name = 'iq_L' + TAG.split('_')[1]

slurm_options = {
    'partition': 'priority',
    'N': '1',
    'c': '8',
    't': '30-00:00:00',
    'mem-per-cpu': '16G',
    'mail-type': 'ALL',
    'mail-user': 'roger_vargas@g.harvard.edu',
    'o': 'out_run_iqtree_{}.txt'.format(TAG),
}
s = Slurm(job_name, slurm_options)

# submits the job
job_id = s.run(iqtree_job)

# Parenthesised form prints the same text under Python 2 and Python 3.
print(job_name)

iq_L2


submitted: Submitted batch job 12705994


In [None]:
# Input alignment used for the lineage 2 run:
# /n/data1/hms/dbmi/farhat/Roger/homoplasy_project/phylogenies/tree_output_files/phylogeny_lineage_2/lineage_2.min4.fasta

### Time it took to create the different phylogenies

- Lineage 1 (n = 2,815) - 2-01:29:06
- Lineage 2 (n = 8,090) - 51-08:39:33 + 12-00:12:50 = 63-08:52:23
- Lineage 3 (n = 3,398) - 11-19:57:11
- Lineage 4A (n = 5,839) - 6-10:58:27
- Lineage 4B (n = 6,958) - 6-17:40:45
- Lineage 4C (n = 4,134) - 2-17:58:37
- Lineage 5 (n = 98) - 00:04:11
- Lineage 6 (n = 96) - 00:02:35