# This notebook was created to prepare the data and construct a phylogeny for the sub-lineage 2.2.1.1.1.i3 isolate cluster.

In [1]:
# Widen the notebook container to the full browser width so wide tables/outputs are readable.
# `display` and `HTML` are imported from the public `IPython.display` module; importing
# `display` from the internal `IPython.core.display` is deprecated in newer IPython releases.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
%matplotlib inline
import os
import pandas as pd
import numpy as np
import itertools
import gzip
import vcf
from slurmpy import Slurm

### t-SNE

Import t-SNE embeddings

In [3]:
# Read the t-SNE embedding (derived from the pairwise SNP distance matrix, per the file name)
# from its plain-text table into a NumPy array.
tSNE_embeddings_path = '/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/rolling_DB_scrape/Genotypes_Filtered_2/pairwise_distance_matrix/tb_output/tSNE_embeddings_from_pairwise_SNP_dist.txt'
t_SNE_coords = np.loadtxt(tSNE_embeddings_path)

In [4]:
# Display the loaded embedding array (two t-SNE coordinates per row) as a quick sanity check.
t_SNE_coords

array([[ 15.77039433, -31.88985825],
       [ 47.15923691, -36.95353317],
       [ 12.14623451,  71.26150513],
       ...,
       [-15.47960186,  38.40433884],
       [ 16.71702766, -43.72228622],
       [ 68.94006348,  35.38974762]])

In [5]:
# (number of rows, number of embedding dimensions) of the t-SNE array
t_SNE_coords.shape

(31428, 2)

##################################################################################################################################################################################################################

# [1] Interactive Session to prepare data for IQTree

##################################################################################################################################################################################################################

#### The different isolate groups: '1', '2', '3', '4A', '4B', '4C', '5', '6'

### Inputs

In [6]:
# Label for the isolate set analysed in this notebook; it is spliced into the isolate-list
# file name, the tree output directory (OUTDIR) and the file-name tag (TAG) further down.
sub_lineage = '2.2.1.1.1.i3_cluster'

#### Load in the isolate annotation DF for Genotypes Matrix

In [7]:
# Isolate annotation table: one row per isolate (i.e. per column of the Genotype Matrix),
# holding the lineage calls and isolate IDs.
# NOTE: read_pickle unpickles the file, so only point this at trusted files.
isolate_annotation_path = '/n/data1/hms/dbmi/farhat/Roger/homoplasy_project/rolling_DB_scrape/Genotypes_Filtered_2/genotypes_isolate_annotation.pkl'
isolate_annotation_DF = pd.read_pickle(isolate_annotation_path)

In [8]:
# Preview the first rows of the annotation table (lineage levels, isolate_ID, lineage_call, group).
isolate_annotation_DF.head()

Unnamed: 0,lineage_1,lineage_2,lineage_3,lineage_4,lineage_5,lineage_6,lineage_7,lineage_8,lineage_9,lineage_10,lineage_11,isolate_ID,lineage_call,group
0,4,2,1,2,1.0,1.0,i3,1.0,,,,SAMEA3558733,4.2.1.2.1.1.i3.1,4B
1,4,2,1,2,2.0,1.0,1,,,,,SAMN03648641,4.2.1.2.2.1.1,4B
2,3,1,1,i1,,,,,,,,SAMN03647419,3.1.1.i1,3
3,4,2,1,2,1.0,1.0,i1,,,,,SAMEA3671418,4.2.1.2.1.1.i1,4B
4,1,1,1,2,,,,,,,,SAMN07659096,1.1.1.2,1


In [9]:
# (number of isolates, number of annotation columns)
isolate_annotation_DF.shape

(31428, 14)

#### Create a text file with a list of all of the isolates that belong to a particular lineage & filter based off of t-SNE coordinates

In [10]:
# Restrict sub-lineage 2.2.1.1.1.i3 to the cluster of isolates whose t-SNE coordinates fall in
# the box  -2 <= t-SNE_1 <= 8  and  26 <= t-SNE_2 <= 38.
# Rows of t_SNE_coords are matched to rows of isolate_annotation_DF purely by position.

# True where the lineage call starts with '2.2.1.1.1.i3' (author's note: n = 1887)
isolate_sublineage_filter = [
    lineage_call.startswith('2.2.1.1.1.i3')
    for lineage_call in isolate_annotation_DF.lineage_call
]

# True where the isolate's embedding lies inside the t-SNE box (author's note: n = 438)
isolate_cluster_filter = [
    ( -2 <= tSNE_1 <= 8 ) and ( 26 <= tSNE_2 <= 38 )
    for tSNE_1, tSNE_2 in t_SNE_coords
]

# True only where both conditions hold: isolate is in the sub-lineage AND inside the cluster (n = 438)
isolate_sublineage_cluster_filter = [
    (in_sublineage and in_cluster)
    for in_sublineage, in_cluster in zip(isolate_sublineage_filter, isolate_cluster_filter)
]

In [11]:
# Keep only the annotation rows flagged by the combined sub-lineage + cluster boolean mask.
isolates_belonging_to_sub_lineage = isolate_annotation_DF.loc[isolate_sublineage_cluster_filter]

In [12]:
# Preview the first rows of the filtered subset to confirm the lineage calls look right.
isolates_belonging_to_sub_lineage.head()

Unnamed: 0,lineage_1,lineage_2,lineage_3,lineage_4,lineage_5,lineage_6,lineage_7,lineage_8,lineage_9,lineage_10,lineage_11,isolate_ID,lineage_call,group
132,2,2,1,1,1,i3,,,,,,SAMEA1015996,2.2.1.1.1.i3,2
134,2,2,1,1,1,i3,,,,,,SAMEA1404001,2.2.1.1.1.i3,2
219,2,2,1,1,1,i3,,,,,,SAMN08708725,2.2.1.1.1.i3,2
269,2,2,1,1,1,i3,,,,,,SAMEA3443144,2.2.1.1.1.i3,2
338,2,2,1,1,1,i3,,,,,,SAMEA1403810,2.2.1.1.1.i3,2


In [13]:
# (number of isolates retained in the cluster, number of annotation columns)
isolates_belonging_to_sub_lineage.shape

(438, 14)

In [14]:
# Collect the isolate IDs of the selected cluster as a plain Python list.
isolate_IDs_for_sub_lineage = isolates_belonging_to_sub_lineage.isolate_ID.tolist()

# Write the IDs to a text file, one ID per line (this file is later passed as LIST_ISOL).
isolate_list_path = '/n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/phylogenies/sublineage_isolate_lists/isolate_list_lineage_' + sub_lineage + '.txt'
with open(isolate_list_path, 'w') as isolate_list_file:
    isolate_list_file.writelines("%s\n" % isolate_ID for isolate_ID in isolate_IDs_for_sub_lineage)

#### Create a text file with a list of the additional eis C-14T mutants with AG MICs (the 6 strains listed below) that belong to this sub-lineage cluster

In [15]:
# Extra strains (not part of the genotypes matrix annotation above) to add to this cluster's tree.
isolate_IDs_for_sub_lineage_extra_strains = [
    '622-19',
    'IT1070',
    'IT233',
    'IT634',
    'IT77',
    'IT947',
]

# Write the extra strain IDs to their own text file, one ID per line
# (this file is later passed as LIST_ISOL_EXTRA).
extra_strains_list_path = '/n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/phylogenies/sublineage_isolate_lists/isolate_list_lineage_' + '2.2.1.1.1.i3_cluster_extra_strains' + '.txt'
with open(extra_strains_list_path, 'w') as extra_strains_list_file:
    extra_strains_list_file.writelines("%s\n" % strain_ID for strain_ID in isolate_IDs_for_sub_lineage_extra_strains)

#### [interactive session] BASH function to prepare data for iqtree

In [None]:
# Helper for prep_data_for_iqtree: convert one sample's VCF into a SNP-only,
# region-filtered BCF whose header carries the given sample name.
#   $1 = sample name (used for file names and written into the BCF header)
#   $2 = path to the sample's input VCF
#   $3 = BED file with the regions to keep
# Writes <sample>_snps.bcf and <sample>.bcf (intermediates) into the current directory and
# bcf/<sample>_renamed.bcf (+ index) into ./bcf/. Returns non-zero as soon as a step fails.
_prep_data_for_iqtree_one_sample(){
    local SAMPLE=$1
    local VCF=$2
    local REGIONS=$3

    bcftools view "${VCF}" --types snps -O b -o "${SAMPLE}_snps.bcf" &&
    bcftools index "${SAMPLE}_snps.bcf" &&
    bcftools view "${SAMPLE}_snps.bcf" --regions-file "${REGIONS}" -o "${SAMPLE}.bcf" -O b &&
    bcftools reheader -s <(echo "${SAMPLE}") "${SAMPLE}.bcf" -o "bcf/${SAMPLE}_renamed.bcf" &&
    bcftools index "bcf/${SAMPLE}_renamed.bcf"
}

# Build the merged multi-sample VCF and the alignment consumed by iqtree.
# Usage: prep_data_for_iqtree OUTDIR TAG LIST_ISOL [LIST_ISOL_EXTRA]
#   OUTDIR          = output directory; it is DELETED and recreated on every call
#   TAG             = base name of the merged VCF (OUTDIR/TAG.vcf) and of the vcf2phylip output
#   LIST_ISOL       = text file with isolate IDs whose VCFs live under the rollingDB tree
#   LIST_ISOL_EXTRA = text file with extra isolate IDs whose VCFs live under the
#                     megapipe results_mmpR_eis tree (skipped when omitted)
# The canettii VCF is always added as an additional sample.
# NOTE: meant for an interactive session; it changes the shell's working directory to
# OUTDIR and raises the shell's open-file limit.
prep_data_for_iqtree(){

    # I initialize the variables
    local OUTDIR=$1
    local TAG=$2
    local LIST_ISOL=$3
    local LIST_ISOL_EXTRA=$4
    local REGIONS=/home/lf61/lf61/mic_assemblies/40-full-analysis/lin-sp-var-10k/results/data_cleaning/regions_to_include_final.bed
    local i

    # Refuse to run with missing arguments: with an empty OUTDIR the 'cd' below would
    # land in the home directory and the clean-up steps would delete files there.
    if [[ -z "${OUTDIR}" || -z "${TAG}" || -z "${LIST_ISOL}" ]]; then
        echo "usage: prep_data_for_iqtree OUTDIR TAG LIST_ISOL [LIST_ISOL_EXTRA]" >&2
        return 1
    fi

    # Make every path absolute, because the working directory changes to OUTDIR below.
    if [[ "${OUTDIR}" != /* ]]; then OUTDIR="${PWD}/${OUTDIR}"; fi
    if [[ "${LIST_ISOL}" != /* ]]; then LIST_ISOL="${PWD}/${LIST_ISOL}"; fi
    if [[ -n "${LIST_ISOL_EXTRA}" && "${LIST_ISOL_EXTRA}" != /* ]]; then LIST_ISOL_EXTRA="${PWD}/${LIST_ISOL_EXTRA}"; fi

    # Check the isolate lists before anything gets deleted.
    if [[ ! -r "${LIST_ISOL}" ]]; then
        echo "ERROR: cannot read isolate list: ${LIST_ISOL}" >&2
        return 1
    fi
    if [[ -n "${LIST_ISOL_EXTRA}" && ! -r "${LIST_ISOL_EXTRA}" ]]; then
        echo "ERROR: cannot read extra isolate list: ${LIST_ISOL_EXTRA}" >&2
        return 1
    fi

    echo "OUTDIR: ${OUTDIR}"
    echo "TAG: ${TAG}"
    echo "LIST_ISOL: ${LIST_ISOL}"
    echo "LIST_ISOL_EXTRA: ${LIST_ISOL_EXTRA}"

    # Load BCFtools
    module load bcftools/1.9

    ## The default open-file limit (ulimit -n) is 1024, so I increase it (presumably because
    ## the merge step opens all per-isolate BCFs at once). Use 'ulimit -Hn' to check the hard
    ## limit; for lineage 4, find a login node with a high ulimit capacity.
    ulimit -n 50000

    # I create the output directory from scratch
    rm -rf -- "${OUTDIR}"
    mkdir -p -- "${OUTDIR}"

    # Change the current directory to the output directory; stop if that fails so the
    # clean-up steps below can never run in some other directory.
    cd -- "${OUTDIR}" || { echo "ERROR: cannot cd into ${OUTDIR}" >&2; return 1; }

    # I clean the directory with the bcf data
    rm -rf bcf/
    mkdir -p bcf/

    # I generate the bcf files, change the headers and index them (rollingDB isolates)
    for i in $(cat "${LIST_ISOL}"); do
        _prep_data_for_iqtree_one_sample "${i}" "/n/data1/hms/dbmi/farhat/rollingDB/genomic_data/${i}/pilon/${i}.vcf" "${REGIONS}" \
            || echo "WARNING: could not prepare the BCF for isolate ${i}" >&2
    done

    # I generate the bcf files, change the headers and index them (extra strains)
    if [[ -n "${LIST_ISOL_EXTRA}" ]]; then
        for i in $(cat "${LIST_ISOL_EXTRA}"); do
            _prep_data_for_iqtree_one_sample "${i}" "/n/data1/hms/dbmi/farhat/lfreschi/repos/megapipe/megapipe_snakemake/results_mmpR_eis/${i}/pilon/${i}.vcf" "${REGIONS}" \
                || echo "WARNING: could not prepare the BCF for extra isolate ${i}" >&2
        done
    fi

    # I add canettii
    _prep_data_for_iqtree_one_sample "canettii" "/n/data1/hms/dbmi/farhat/lfreschi/mic_assemblies/40-full-analysis/lin-sp-var-10k/results/vcf_canettii/canettii.vcf" "${REGIONS}" \
        || echo "WARNING: could not prepare the BCF for canettii" >&2

    # cleaning: drop the intermediate files left in OUTDIR (the final ones live in bcf/)
    rm -rf ./*.bcf
    rm -rf ./*.bcf.csi

    # I merge the bcf files (list_bcfs.txt keeps the .bcf files only, not their indexes)
    ls bcf/*|grep "bcf$" > list_bcfs.txt
    bcftools merge -l list_bcfs.txt -o "${OUTDIR}/${TAG}.vcf" -O v -0

    ## I use vcf2phylip (from luca's home directory)
    /home/lf61/sw/vcf2phylip/1.5/vcf2phylip.py -i "${OUTDIR}/${TAG}.vcf" -f

}

#### Get variables for the data preparation function & for iqtree

In [16]:
# Arguments for prep_data_for_iqtree and for the iqtree job, built from `sub_lineage`.
# OUTDIR keeps its trailing '/' because later code concatenates file names directly onto it.
OUTDIR = '/n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/phylogenies/tree_output_files/phylogeny_lineage_{0}/'.format(sub_lineage)

# Base name shared by the merged VCF, the alignment and the tree output files
TAG = 'lineage_{0}'.format(sub_lineage)

# Isolate ID list written earlier in this notebook for the selected cluster
LIST_ISOL = '/n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/phylogenies/sublineage_isolate_lists/isolate_list_lineage_{0}.txt'.format(sub_lineage)

# ID list of the extra strains written earlier in this notebook (fixed file name)
LIST_ISOL_EXTRA = '/n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/phylogenies/sublineage_isolate_lists/isolate_list_lineage_2.2.1.1.1.i3_cluster_extra_strains.txt'

In [17]:
# Echo the output directory that will hold the tree files, to double-check the path.
OUTDIR

'/n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/phylogenies/tree_output_files/phylogeny_lineage_2.2.1.1.1.i3_cluster/'

In [18]:
# Echo the tag used as the base name of the output files.
TAG

'lineage_2.2.1.1.1.i3_cluster'

In [19]:
# Echo the path of the isolate ID list for this cluster.
LIST_ISOL

'/n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/phylogenies/sublineage_isolate_lists/isolate_list_lineage_2.2.1.1.1.i3_cluster.txt'

In [20]:
# Echo the path of the extra-strain ID list.
LIST_ISOL_EXTRA

'/n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/phylogenies/sublineage_isolate_lists/isolate_list_lineage_2.2.1.1.1.i3_cluster_extra_strains.txt'

### [interactive session] Call function above in an interactive session to prepare the data for iqtree

In [21]:
# Print the exact shell command to paste into the interactive session:
# the function name followed by its four space-separated arguments.
# (A single parenthesised argument prints identically under Python 2 and Python 3.)
print(' '.join(['prep_data_for_iqtree', OUTDIR, TAG, LIST_ISOL, LIST_ISOL_EXTRA]))

prep_data_for_iqtree /n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/phylogenies/tree_output_files/phylogeny_lineage_2.2.1.1.1.i3_cluster/ lineage_2.2.1.1.1.i3_cluster /n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/phylogenies/sublineage_isolate_lists/isolate_list_lineage_2.2.1.1.1.i3_cluster.txt /n/data1/hms/dbmi/farhat/Roger/mmpR_BDQ_mutant_project/phylogenies/sublineage_isolate_lists/isolate_list_lineage_2.2.1.1.1.i3_cluster_extra_strains.txt


##################################################################################################################################################################################################################

# [2] Submit job to O2 to create trees

##################################################################################################################################################################################################################

### Use the code below to submit an iqtree job (uses ModelFinder)

In [23]:
# Part 1 of the job script: activate the iqtree conda environment.
# A bare 'source activate' line failed with
# "/home/rv76/anaconda2/etc/profile.d/conda.sh: line 55: PS1: unbound variable",
# so strict mode is switched off (set +eu) around the activation and back on afterwards.
iqtree_job_p1 = '\n'.join([
    'set +eu',
    'VIRTUAL_ENV_DISABLE_PROMPT=true source activate iqtree_virtualenv',
    'set -eu',
])

# Part 2: the timed iqtree run on the alignment OUTDIR/TAG.min4.fasta, with output prefix
# OUTDIR/tree_TAG_iqtree_FINAL (OUTDIR already ends with '/').
iqtree_job_p2 = (
    'time iqtree -s {0}{1}.min4.fasta -mem 7G -pre {0}tree_{1}_iqtree_FINAL '
    '-m MFP -mset GTR -bb 1000 -alrt 1000 -nt AUTO -ntmax 8 -redo'
).format(OUTDIR, TAG)

iqtree_job = '\n'.join([iqtree_job_p1, iqtree_job_p2])

# Work from the output directory so the job's output + error files end up there
os.chdir(OUTDIR)

# Job name: 'iq_L' + the second '_'-separated piece of TAG
job_name = 'iq_L' + TAG.split('_')[1]

# Options handed to Slurm: 1 node, 8 cores, 12 hours, 8G per core, e-mail notifications
slurm_options = {'partition':'priority' , 'N':'1' , 'c':'8' , 't':'0-12:00:00' , 'mem-per-cpu':'8G' , 'mail-type':'ALL' , 'mail-user':'roger_vargas@g.harvard.edu' , 'o':'out_run_iqtree_{}.txt'.format(TAG)}
s = Slurm(job_name , slurm_options)

# Submit the job and keep the returned job id
job_id = s.run(iqtree_job)

print(job_name)

iq_L2.2.1.1.1.i3


submitted: Submitted batch job 26950900
