# Making phylogeny of 36 clinical isolates with complete circular assemblies (and >= 40X Illumina WGS median depth)


### Maximillian Marin
### mgmarin@g.harvard.edu

### Goal: Use BCFtools to merge all VCF files and create a phylogeny for clinical isolates with PacBio assemblies. 


In [1]:
import pandas as pd
from tqdm import tqdm

#### Pandas Viewing Settings

In [2]:
# Widen pandas' display limits so wide/long summary tables are shown in full
# in the notebook output.
for _option_name, _option_value in (
    ("display.max_rows", 500),
    ("display.max_columns", 500),
    ("display.width", 1000),
):
    pd.set_option(_option_name, _option_value)

## Define directories of interest:

In [3]:
# Locate and load the assembly-summary table for the 36-isolate analysis set,
# then derive the list of sample IDs and per-sample metadata lookups from it.
Repo_DataDir = "../../Data"

PMP_SM_ResultsSummary_Dir_210108 = f"{Repo_DataDir}/210108_PMP_SM_50CI_V7_ResultsSummary"

PMP_36CI_CicularOnly_F2Filtered_AtLeast40XIllDepth_AssemblySummary_TSV_PATH = f"{PMP_SM_ResultsSummary_Dir_210108}/210108_PMP_36CI_CircularOnly_F2Filtered_AtLeast40XMeanDepthIllumina_AssemblySummary_V7.tsv"

# The summary is a tab-separated file with (at least) the columns used below:
# SampleID, IlluminaWGSToH37rv_AvrgCov, PrimaryLineage_PB, Dataset_Tag.
PMP_36CI_CicularOnly_F2Filtered_AtLeast40XDepth_AssemblySummary = pd.read_csv(
    PMP_36CI_CicularOnly_F2Filtered_AtLeast40XIllDepth_AssemblySummary_TSV_PATH,
    sep="\t",
)

# Shorter alias used by the rest of the notebook (same DataFrame object).
PMP_36CI_AnalysisSet_AssemblySummary = PMP_36CI_CicularOnly_F2Filtered_AtLeast40XDepth_AssemblySummary

# Sample IDs in table order; this order is reused for every per-sample loop.
SampleIDs_36CI_SOI = list(PMP_36CI_AnalysisSet_AssemblySummary["SampleID"].values)

print(",".join(SampleIDs_36CI_SOI))


# Make sample to metadata mapping dicts
def _sample_id_to(column_name):
    """Return a dict mapping each SampleID to its value in `column_name`."""
    id_value_pairs = PMP_36CI_AnalysisSet_AssemblySummary[["SampleID", column_name]].values
    return dict(id_value_pairs)


ID_To_IlluminaAvrgCov_Dict = _sample_id_to("IlluminaWGSToH37rv_AvrgCov")
ID_To_Lineage_Dict = _sample_id_to("PrimaryLineage_PB")
ID_To_Dataset_Dict = _sample_id_to("Dataset_Tag")

M0011368_9,M0014888_3,M0016395_7,M0010874_7,01_R1430,02_R0894,02_R1708,02_R1896,M0016737_0,M0017522_5,01_R1134,M0003941_3,02_R1179,N1176,N0072,N0153,N0145,N0155,N0004,N1274,N0054,N1272,N0091,N1202,N1177,RW-TB008,DNA028,DNA075,DNA091,DNA044,DNA020,AZE_02_042,DNA019_Rose,DNA120,DNA188,DNA086


### Let's verify the shape of the metadata dataframe

In [4]:
# Sanity check: (rows, columns) of the analysis-set summary; one row per isolate is expected.
PMP_36CI_AnalysisSet_AssemblySummary.shape

(36, 26)

## Construct dictionary with PATHs to relevant files for all samples

### Define directories to PMP-SM (PacBio assembly and analysis pipeline)

In [5]:
### Define directories to PMP-SM (PacBio assembly and analysis pipeline)
### and to the variant calling pipeline outputs.

# Project root on the cluster file system.
PacBio_ProjectDir = "/n/data1/hms/dbmi/farhat/mm774/Projects/PacBio_Evaluation_Project"

# Parent directory holding the pipeline output runs.
PMP_SM_Outputs_Dir = f"{PacBio_ProjectDir}/PacmanPipe_SM_Outputs"

# Output directory of the pipeline run used by this notebook.
# NOTE(review): the variable name says "83CI_191203" while the directory is
# named "201201_..._R1_Output_V2"; the name is kept because later cells use it.
PMP_SM_83CI_191203_OutputDir = f"{PMP_SM_Outputs_Dir}/201201_PMP_SM_TB_Portals_R1_Output_V2"


In [6]:
# Confirm how many sample IDs were selected for the analysis.
print(len(SampleIDs_36CI_SOI))

36


In [7]:
# Build SampleTag_ToPaths_Dict: sample ID -> {label: path} for the per-sample
# BCFs (restricted to the union of SNP positions across all samples) produced
# by the PacBio and Illumina/Pilon branches of the pipeline.
listOfSample_Tags = SampleIDs_36CI_SOI

# M. canettii-to-H37Rv call sets. They are defined here but not referenced by
# any other cell visible in this notebook.
_Mcanettii_TreeBuilding_Dir = f"{PMP_SM_83CI_191203_OutputDir}/Mcanetti_VCFprocessing_TreeBuilding"
Mcanettii_ToH37rv_MM2_VCF_PilonPASS_UnionOfSNPs_WiMcanettii_PATH = f"{_Mcanettii_TreeBuilding_Dir}/Mcanettii_ToH37rv.mm2.OnlySNPpositionsIn.UnionOfAllSamples.PilonPASS.WiMcanettii.bcf.gz"
Mcanettii_ToH37rv_MM2_VCF_PBMM2_UnionOfSNPs_WiMcanettii_PATH = f"{_Mcanettii_TreeBuilding_Dir}/Mcanettii_ToH37rv.mm2.OnlySNPpositionsIn.UnionOfAllSamples.PBMM2.mpileup.WiMcanettii.bcf.gz"

SampleTag_ToPaths_Dict = {}

for Sample_Tag in listOfSample_Tags:

    # Per-sample root of the pipeline output.
    sample_PMP_OutputDir = f"{PMP_SM_83CI_191203_OutputDir}/{Sample_Tag}"

    # PacBio branch: assembly aligned to H37Rv with minimap2.
    variants_OutputDir = f"{sample_PMP_OutputDir}/pacbio_VariantCallingVersusH37Rv"
    MM2_AssemblyToRef_Output_Dir = f"{variants_OutputDir}/Minimap2_Flye_I3_PP_AlignTo_H37rv"
    PB_Minimap2_VCF_PATH = f"{MM2_AssemblyToRef_Output_Dir}/{Sample_Tag}_mm2_GC3_PP_AssemblyToH37rv.vcf"
    PB_Minimap2_paftools_UnionOfSNPs_BCF_GZ_PATH = f"{MM2_AssemblyToRef_Output_Dir}/{Sample_Tag}.mm2.Flye_I3_PP_AssemblyToH37rv.mpileup.call.SNPs.Union.AllSamples.bcf.gz"

    # Illumina branch: Pilon calls from paired-end reads aligned to H37Rv.
    sample_Pilon_OutputDir = f"{sample_PMP_OutputDir}/IlluminaWGS/Pilon_IlluminaPE_AlignedTo_H37rv_minMQ_1_minDP_5_Fix_All_Breaks"
    Ill_Pilon_VCF_PATH = f"{sample_Pilon_OutputDir}/{Sample_Tag}.IllPE.H37rv.vcf"
    Ill_Pilon_UnionOfSNPs_BCF_REDUCED_PATH = f"{sample_Pilon_OutputDir}/{Sample_Tag}.IllPE.H37rv.OnlySNPpositionsIn.UnionOfAllSamples.bcf.gz"

    # Only the two union-of-SNPs BCF paths are stored; the full VCF paths
    # computed above are not added to the dictionary.
    dictOfPaths_Temp = {
        "PB_Minimap2_mpileup_UnionOfSNPs_BCF_GZ_PATH": PB_Minimap2_paftools_UnionOfSNPs_BCF_GZ_PATH,
        "Ill_Pilon_UnionOfSNPs_BCF_REDUCED_PATH": Ill_Pilon_UnionOfSNPs_BCF_REDUCED_PATH,
    }

    SampleTag_ToPaths_Dict[Sample_Tag] = dictOfPaths_Temp
    
    

In [20]:
#!ls -alh $sample_Pilon_OutputDir

# Begin merging of BCFs and phylogeny construction 

In [21]:
#!ls -1 /n/data1/hms/dbmi/farhat/mm774/Projects/PacBio_Evaluation_Project

## A) Reformatting and filtering of VCFs to BCFs

In [10]:
# Name of this analysis; used as the output sub-directory name and as the
# file-name prefix of the merged VCF, alignment and tree files.
AnalysisName = "210109_Phylogeny_PMPSM_36CI.PacBio_MM2.V7.UnionOfSNPs.NoOutgroup"

# Parent directory (relative to the notebook) that collects analysis outputs.
PB_Vs_Illumina_DataAnalysis_Dir = "../../../210112_PBvsI_VCeval_AnalysisDir_V7_36CI"

Target_Output_Dir = "/".join((PB_Vs_Illumina_DataAnalysis_Dir, AnalysisName))

# Inputs of the phylogeny steps: per-sample path dictionaries and sample order.
input_SampleInfo_Dict, input_SampleNames = SampleTag_ToPaths_Dict, listOfSample_Tags

# Key into each sample's path dictionary that selects which BCF is used to
# build the phylogeny (here: the PacBio minimap2 union-of-SNPs BCF).
VCF_PATH_InDictName = "PB_Minimap2_mpileup_UnionOfSNPs_BCF_GZ_PATH"


In [11]:
!mkdir $Target_Output_Dir

In [12]:

Individual_BCFs_Dir = f"{Target_Output_Dir}/BCFs"

!mkdir $Individual_BCFs_Dir

for sampleID in tqdm(input_SampleNames):
    
    Reduced_VCF_PATH = input_SampleInfo_Dict[sampleID][VCF_PATH_InDictName]
    
    BCF_SNPs_PATH = f"{Individual_BCFs_Dir}/{sampleID}.snps.bcf"
    BCF_SNPs_Sorted_AndFiltered_PATH = f"{Individual_BCFs_Dir}/{sampleID}.snps.DPfiltered.bcf"

    BCF_SNPs_Renamed_PATH = f"{Individual_BCFs_Dir}/{sampleID}.snps.renamed.bcf"
    
    #!bcftools view $Reduced_VCF_PATH --types snps --include "MQ>=40 && BQ>=20" -f .,PASS -O b -o $BCF_SNPs_PATH
    #!bcftools view $Reduced_VCF_PATH --types snps -f .,PASS -O b -o $BCF_SNPs_PATH
    
    
    
    !bcftools view $Reduced_VCF_PATH -f .,PASS -O b -o $BCF_SNPs_PATH

    !bcftools index $BCF_SNPs_PATH 

    
    # Now sort BCF and remove all sites with DP != 1
    !bcftools sort $BCF_SNPs_PATH -Ou | bcftools view -e "DP!=1" -O b -o $BCF_SNPs_Sorted_AndFiltered_PATH 
    
    
    SampleTag_header_PATH = Individual_BCFs_Dir + "/" + sampleID + ".name.txt"
    !echo $sampleID > $SampleTag_header_PATH
    
    !bcftools reheader -s $SampleTag_header_PATH $BCF_SNPs_Sorted_AndFiltered_PATH -o $BCF_SNPs_Renamed_PATH
    
    !bcftools index $BCF_SNPs_Renamed_PATH

    #!rm $SampleTag_header_PATH $BCF_SNPs_PATH
    
    #break


  0%|          | 0/36 [00:00<?, ?it/s]

Writing to /tmp/bcftools-sort.ypmR91
Merging 1 temporary files
Cleaning
Done


  3%|▎         | 1/36 [00:01<00:35,  1.02s/it]

Writing to /tmp/bcftools-sort.xK4aF3
Merging 1 temporary files
Cleaning
Done


  6%|▌         | 2/36 [00:01<00:33,  1.00it/s]

Writing to /tmp/bcftools-sort.J9Gvs3
Merging 1 temporary files
Cleaning
Done


  8%|▊         | 3/36 [00:02<00:32,  1.02it/s]

Writing to /tmp/bcftools-sort.qXSZLW
Merging 1 temporary files
Cleaning
Done


 11%|█         | 4/36 [00:03<00:30,  1.04it/s]

Writing to /tmp/bcftools-sort.lNvbJ6
Merging 1 temporary files
Cleaning
Done


 14%|█▍        | 5/36 [00:04<00:29,  1.04it/s]

Writing to /tmp/bcftools-sort.CFcxj6
Merging 1 temporary files
Cleaning
Done


 17%|█▋        | 6/36 [00:05<00:28,  1.04it/s]

Writing to /tmp/bcftools-sort.cHRqo0
Merging 1 temporary files
Cleaning
Done


 19%|█▉        | 7/36 [00:06<00:27,  1.06it/s]

Writing to /tmp/bcftools-sort.kg0yqY
Merging 1 temporary files
Cleaning
Done


 22%|██▏       | 8/36 [00:07<00:26,  1.08it/s]

Writing to /tmp/bcftools-sort.iMaMVT
Merging 1 temporary files
Cleaning
Done


 25%|██▌       | 9/36 [00:08<00:24,  1.08it/s]

Writing to /tmp/bcftools-sort.HroCeP
Merging 1 temporary files
Cleaning
Done


 28%|██▊       | 10/36 [00:09<00:23,  1.10it/s]

Writing to /tmp/bcftools-sort.LeZSEG
Merging 1 temporary files
Cleaning
Done


 31%|███       | 11/36 [00:10<00:22,  1.10it/s]

Writing to /tmp/bcftools-sort.DzHjwF
Merging 1 temporary files
Cleaning
Done


 33%|███▎      | 12/36 [00:11<00:22,  1.08it/s]

Writing to /tmp/bcftools-sort.yP8hAB
Merging 1 temporary files
Cleaning
Done


 36%|███▌      | 13/36 [00:12<00:21,  1.09it/s]

Writing to /tmp/bcftools-sort.aqQNvA
Merging 1 temporary files
Cleaning
Done


 39%|███▉      | 14/36 [00:13<00:20,  1.08it/s]

Writing to /tmp/bcftools-sort.bvoYhu
Merging 1 temporary files
Cleaning
Done


 42%|████▏     | 15/36 [00:13<00:19,  1.08it/s]

Writing to /tmp/bcftools-sort.ZpV15t
Merging 1 temporary files
Cleaning
Done


 44%|████▍     | 16/36 [00:14<00:18,  1.06it/s]

Writing to /tmp/bcftools-sort.Awmgsu
Merging 1 temporary files
Cleaning
Done


 47%|████▋     | 17/36 [00:15<00:17,  1.07it/s]

Writing to /tmp/bcftools-sort.ppoV4q
Merging 1 temporary files
Cleaning
Done


 50%|█████     | 18/36 [00:16<00:16,  1.07it/s]

Writing to /tmp/bcftools-sort.2VmtGA
Merging 1 temporary files
Cleaning
Done


 53%|█████▎    | 19/36 [00:18<00:17,  1.02s/it]

Writing to /tmp/bcftools-sort.7bLrWJ
Merging 1 temporary files
Cleaning
Done


 56%|█████▌    | 20/36 [00:19<00:17,  1.10s/it]

Writing to /tmp/bcftools-sort.NgGNdb
Merging 1 temporary files
Cleaning
Done


 58%|█████▊    | 21/36 [00:20<00:16,  1.11s/it]

Writing to /tmp/bcftools-sort.xcBHGh
Merging 1 temporary files
Cleaning
Done


 61%|██████    | 22/36 [00:21<00:15,  1.09s/it]

Writing to /tmp/bcftools-sort.BXPDyu
Merging 1 temporary files
Cleaning
Done


 64%|██████▍   | 23/36 [00:22<00:14,  1.10s/it]

Writing to /tmp/bcftools-sort.UtOGpE
Merging 1 temporary files
Cleaning
Done


 67%|██████▋   | 24/36 [00:23<00:12,  1.08s/it]

Writing to /tmp/bcftools-sort.XjQDeJ
Merging 1 temporary files
Cleaning
Done


 69%|██████▉   | 25/36 [00:24<00:12,  1.09s/it]

Writing to /tmp/bcftools-sort.5G1LgN
Merging 1 temporary files
Cleaning
Done


 72%|███████▏  | 26/36 [00:25<00:11,  1.11s/it]

Writing to /tmp/bcftools-sort.bHDYLY
Merging 1 temporary files
Cleaning
Done


 75%|███████▌  | 27/36 [00:28<00:13,  1.48s/it]

Writing to /tmp/bcftools-sort.O9sW1p
Merging 1 temporary files
Cleaning
Done


 78%|███████▊  | 28/36 [00:29<00:10,  1.37s/it]

Writing to /tmp/bcftools-sort.8FsErD
Merging 1 temporary files
Cleaning
Done


 81%|████████  | 29/36 [00:30<00:09,  1.30s/it]

Writing to /tmp/bcftools-sort.vxFYdO
Merging 1 temporary files
Cleaning
Done


 83%|████████▎ | 30/36 [00:31<00:07,  1.25s/it]

Writing to /tmp/bcftools-sort.8FiOD1
Merging 1 temporary files
Cleaning
Done


 86%|████████▌ | 31/36 [00:32<00:06,  1.20s/it]

Writing to /tmp/bcftools-sort.55Q999
Merging 1 temporary files
Cleaning
Done


 89%|████████▉ | 32/36 [00:33<00:04,  1.18s/it]

Writing to /tmp/bcftools-sort.ZVfbfb
Merging 1 temporary files
Cleaning
Done


 92%|█████████▏| 33/36 [00:34<00:03,  1.16s/it]

Writing to /tmp/bcftools-sort.LYlbHn
Merging 1 temporary files
Cleaning
Done


 94%|█████████▍| 34/36 [00:36<00:02,  1.14s/it]

Writing to /tmp/bcftools-sort.BSdSzx
Merging 1 temporary files
Cleaning
Done


 97%|█████████▋| 35/36 [00:37<00:01,  1.14s/it]

Writing to /tmp/bcftools-sort.CjRDwJ
Merging 1 temporary files
Cleaning
Done


100%|██████████| 36/36 [00:38<00:00,  1.13s/it]


## B) Output reformatted SNP BCF paths to a single file (for merging)

In [13]:
# Write the path of every renamed per-sample BCF, one per line, to a text file
# that is later passed to `bcftools merge -l`.
PeruIsolates_BCF_List_PATH = f"{Target_Output_Dir}/ListOfAll_RenamedBCFs.txt"

with open(PeruIsolates_BCF_List_PATH, "w") as bcf_list_handle:
    for Sample_Tag in tqdm(input_SampleNames):
        # Must match the file name produced by the per-sample filtering loop.
        BCF_SNPs_Renamed_PATH = f"{Individual_BCFs_Dir}/{Sample_Tag}.snps.renamed.bcf"
        print(BCF_SNPs_Renamed_PATH, file=bcf_list_handle)

100%|██████████| 36/36 [00:00<00:00, 126039.19it/s]


## 3) Merge BCF files

In [14]:
# Merge all per-sample BCFs into a single multi-sample VCF.
Merged_VCF_PATH = Target_Output_Dir + f"/{AnalysisName}_MergedVCF_AtoRef.vcf"

# bcftools merge flags used below:
#   -i "-"  disable the default INFO-field merging rules
#   -l      read the list of input files (one path per line) from a file
#   -O v    write uncompressed VCF
# The trailing "#-0" is commented out for the shell: `-0` (--missing-to-ref)
# is NOT applied, so samples without a record at a site stay missing.
!bcftools merge -i "-" -l $PeruIsolates_BCF_List_PATH -o $Merged_VCF_PATH -O v #-0

## 4) Convert the Merged VCF to Phylip format

In [15]:
# Convert the merged VCF into alignment matrices with vcf2phylip.
#   -i  input VCF
#   -f  also write a FASTA alignment (a ".min36.fasta" appears next to the ".min36.phy")
#   -m  minimum number of samples required at a site; 36 equals the number of
#       samples, so sites with missing data in any sample are excluded
!time /n/data1/hms/dbmi/farhat/mm774/MGM_bin/vcf2phylip/vcf2phylip.py -i $Merged_VCF_PATH -f -m 36


Converting file ../../../210112_PBvsI_VCeval_AnalysisDir_V7_36CI/210109_Phylogeny_PMPSM_36CI.PacBio_MM2.V7.UnionOfSNPs.NoOutgroup/210109_Phylogeny_PMPSM_36CI.PacBio_MM2.V7.UnionOfSNPs.NoOutgroup_MergedVCF_AtoRef.vcf:

Number of samples in VCF: 36
Total of genotypes processed: 17995
Genotypes excluded because they exceeded the amount of missing data allowed: 2319
Genotypes that passed missing data filter but were excluded for not being SNPs: 3
SNPs that passed the filters: 15673

Sample 1 of 36, M0011368_9, added to the nucleotide matrix(ces).
Sample 2 of 36, M0014888_3, added to the nucleotide matrix(ces).
Sample 3 of 36, M0016395_7, added to the nucleotide matrix(ces).
Sample 4 of 36, M0010874_7, added to the nucleotide matrix(ces).
Sample 5 of 36, 01_R1430, added to the nucleotide matrix(ces).
Sample 6 of 36, 02_R0894, added to the nucleotide matrix(ces).
Sample 7 of 36, 02_R1708, added to the nucleotide matrix(ces).
Sample 8 of 36, 02_R1896, added to the nucleotide matrix(ces).
Sam

In [16]:
# List the output directory (long format, human-readable sizes) to check the files written so far.
!ls -lah $Target_Output_Dir

total 1.4M
drwxrwsr-x  3 mm774 farhat  388 Mar 26 13:22 .
drwxrwsr-x 10 mm774 farhat  472 Mar 26 13:21 ..
-rw-rw-r--  1 mm774 farhat 552K Mar 26 13:22 210109_Phylogeny_PMPSM_36CI.PacBio_MM2.V7.UnionOfSNPs.NoOutgroup_MergedVCF_AtoRef.min36.fasta
-rw-rw-r--  1 mm774 farhat 552K Mar 26 13:22 210109_Phylogeny_PMPSM_36CI.PacBio_MM2.V7.UnionOfSNPs.NoOutgroup_MergedVCF_AtoRef.min36.phy
-rw-rw-r--  1 mm774 farhat 7.7M Mar 26 13:22 210109_Phylogeny_PMPSM_36CI.PacBio_MM2.V7.UnionOfSNPs.NoOutgroup_MergedVCF_AtoRef.vcf
drwxrwsr-x  2 mm774 farhat 8.5K Mar 26 13:22 BCFs
-rw-rw-r--  1 mm774 farhat 5.1K Mar 26 13:22 ListOfAll_RenamedBCFs.txt


## 5) Generate a Phylogenetic Tree using FastTree

In [19]:
Input_FASTA_Aln = Target_Output_Dir + f"/{AnalysisName}_MergedVCF_AtoRef.min36.fasta"
PeruIsolates_TreeFile_PATH = Target_Output_Dir + f"/{AnalysisName}_MergedVCF.fasttree.newick"

!time FastTree -nt -gtr $Input_FASTA_Aln > $PeruIsolates_TreeFile_PATH

FastTree Version 2.1.10 SSE3
Alignment: ../../../210112_PBvsI_VCeval_AnalysisDir_V7_36CI/210109_Phylogeny_PMPSM_36CI.PacBio_MM2.V7.UnionOfSNPs.NoOutgroup/210109_Phylogeny_PMPSM_36CI.PacBio_MM2.V7.UnionOfSNPs.NoOutgroup_MergedVCF_AtoRef.min36.fasta
Nucleotide distances: Jukes-Cantor Joins: balanced Support: SH-like 1000
Search: Normal +NNI +SPR (2 rounds range 10) +ML-NNI opt-each=1
TopHits: 1.00*sqrtN close=default refresh=0.80
ML Model: Generalized Time-Reversible, CAT approximation with 20 rate categories
Initial topology in 0.37 seconds
Refining topology: 21 rounds ME-NNIs, 2 rounds ME-SPRs, 10 rounds ML-NNIs
Total branch-length 0.936 after 6.53 sec1, 1 of 33 splits   
ML-NNI round 1: LogLk = -105573.482 NNIs 0 max delta 0.00 Time 9.85
GTR Frequencies: 0.1408 0.3580 0.3587 0.1425ep 12 of 12   ptimizing GTR model, step 2 of 12   
GTR rates(ac ag at cg ct gt) 1.0321 2.9804 0.4430 0.5129 2.9014 1.0000
Switched to using 20 rate categories (CAT approximation)19 of 20   Site likelihoods w

### Let's look at the output directory:

In [18]:
# One file name per line: final contents of the analysis output directory.
!ls -1 $Target_Output_Dir

210109_Phylogeny_PMPSM_36CI.PacBio_MM2.V7.UnionOfSNPs.NoOutgroup_MergedVCF_AtoRef.min36.fasta
210109_Phylogeny_PMPSM_36CI.PacBio_MM2.V7.UnionOfSNPs.NoOutgroup_MergedVCF_AtoRef.min36.phy
210109_Phylogeny_PMPSM_36CI.PacBio_MM2.V7.UnionOfSNPs.NoOutgroup_MergedVCF_AtoRef.vcf
210109_Phylogeny_PMPSM_36CI.PacBio_MM2.V7.UnionOfSNPs.NoOutgroup_MergedVCF.fasttree.newick
BCFs
ListOfAll_RenamedBCFs.txt
