# Calculating relative coverage, and GC% across all 36 Mtb genome assemblies

### Maximillian Marin
### mgmarin@g.harvard.edu

### Goal: Calculate 1) relative coverage and 2) GC% <br>

Parsing this data will be used for any analysis of coverage bias across all 36 Mtb clinical isolates.

In [2]:
import numpy as np
import pandas as pd
import vcf
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

%matplotlib inline

In [2]:
from scipy.stats import spearmanr

In [3]:
from Bio import SeqIO

#### Pandas Viewing Settings

In [4]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Define functions for calculating GC% and entropy across a circular genome sequence

### Functions for calculating GC content

In [5]:
def calc_GCcontent_SlidingWindow (input_Seq, window_size):

    halfOf_WindowSize = window_size/2

    #print("Half of window size:", halfOf_WindowSize)
    
    gc_content = lambda s: 100.0*len([c for c in s if c in "GC"]) / len(s)

    GC_yy = []

    for i in  (  np.arange(0, len(input_Seq) - window_size, dtype=int) ):
        finalSeq = input_Seq[i:i + window_size]
        GC_yy.append(  gc_content(finalSeq) )

    GC_yy_FRONT_Circ = []

    for i in  np.arange(-halfOf_WindowSize, 0 , dtype=int):
        finalSeq = input_Seq[i:] + input_Seq[: window_size + i] # backSeq_IDX = window_size + i

        GC_yy_FRONT_Circ.append(  gc_content(finalSeq) )

    GC_yy_BACK_Circ = []

    for i in  np.arange( -window_size, -halfOf_WindowSize, dtype=int):    
        finalSeq = input_Seq[i:] + input_Seq[: window_size + i] # backSeq_IDX = window_size + i
        GC_yy_BACK_Circ.append(  gc_content(finalSeq) )

    GC_yy_FINAL = GC_yy_FRONT_Circ + GC_yy + GC_yy_BACK_Circ
    
    return np.array(GC_yy_FINAL)

## Parse Assembly and Sequence analysis results Sample Info

In [6]:
Repo_DataDir = "../../Data"

PMP_SM_ResultsSummary_Dir_210108 = Repo_DataDir + "/210108_PMP_SM_50CI_V7_ResultsSummary"


PMP_50CI_AssemblySummary_TSV_PATH = PMP_SM_ResultsSummary_Dir_210108 + "/210108_PMP_50CI_AssemblySummary_V7.tsv"

PMP_42CI_CicularOnly_AssemblySummary_TSV_PATH = PMP_SM_ResultsSummary_Dir_210108 + "/210108_PMP_42CI_CircularOnly_F2Filtered_AssemblySummary_V7.tsv" 

PMP_36CI_CicularOnly_F2Filtered_AtLeast40XIllDepth_AssemblySummary_TSV_PATH = PMP_SM_ResultsSummary_Dir_210108 + "/210108_PMP_36CI_CircularOnly_F2Filtered_AtLeast40XMeanDepthIllumina_AssemblySummary_V7.tsv"       


PMP_50CI_AssemblySummary = pd.read_csv(PMP_50CI_AssemblySummary_TSV_PATH, sep = "\t")

PMP_42CI_CicularOnly_AssemblySummary = pd.read_csv(PMP_42CI_CicularOnly_AssemblySummary_TSV_PATH, sep = "\t")

PMP_36CI_CicularOnly_F2Filtered_AtLeast40XDepth_AssemblySummary = pd.read_csv(PMP_36CI_CicularOnly_F2Filtered_AtLeast40XIllDepth_AssemblySummary_TSV_PATH, sep = "\t")

PMP_36CI_AnalysisSet_AssemblySummary = PMP_36CI_CicularOnly_F2Filtered_AtLeast40XDepth_AssemblySummary


SampleIDs_36CI_SOI = list( PMP_36CI_AnalysisSet_AssemblySummary["SampleID"].values )


print(','.join(SampleIDs_36CI_SOI) )

# Make sample to lineage mapping dict

ID_To_IlluminaAvrgCov_Dict = dict(PMP_50CI_AssemblySummary[['SampleID', 'IlluminaWGSToH37rv_AvrgCov']].values)                     
ID_To_Lineage_Dict = dict(PMP_50CI_AssemblySummary[['SampleID', 'PrimaryLineage_PB']].values)
ID_To_Dataset_Dict = dict(PMP_50CI_AssemblySummary[['SampleID', 'Dataset_Tag']].values)

M0011368_9,M0014888_3,M0016395_7,M0010874_7,01_R1430,02_R0894,02_R1708,02_R1896,M0016737_0,M0017522_5,01_R1134,M0003941_3,02_R1179,N1176,N0072,N0153,N0145,N0155,N0004,N1274,N0054,N1272,N0091,N1202,N1177,RW-TB008,DNA028,DNA075,DNA091,DNA044,DNA020,AZE_02_042,DNA019_Rose,DNA120,DNA188,DNA086


In [7]:
PMP_36CI_AnalysisSet_AssemblySummary[PMP_36CI_AnalysisSet_AssemblySummary["SampleID"] == "N0145"]

Unnamed: 0,SampleID,numContigs_Complete,circContig_Length,circContig_Cov,PacBio_Subread_Median_Length,LineageCall_Illumina,LineageCall_PacBio,F2_Illumina,F2_PacBio,ANI_I3,ANI_I3_PP,IlluminaWGSToH37rv_AvrgCov,PacBio_Subreads_H37Rv_AvrgCov,NumAnno_ORFs_PB_PilonPolished,NumAnno_ORFs_PB_DeNovo,GCcontent_PB_PP_GBK,NumChanges_PilonPolished,NumSNPs_PilonPolished,NumTotalInsertions_PilonPolished,Num1bpInsertion_PilonPolished,Num2bpInsertion_PilonPolished,NumTotalDeletions_PilonPolished,Num1bpDeletion_PilonPolished,PrimaryLineage_PB,PrimaryLineage_Ill,Dataset_Tag
16,N0145,1,4416863,344,2042.0,"lineage2,lineage2.2,lineage2.2.1,lineage2.2.1.1","lineage2,lineage2.2,lineage2.2.1,lineage2.2.1.1",0.008973,0.262688,99.8914,99.8941,89,344,4071,4073,65.604423,20,0,2,2,0,18,18,lineage2,lineage2,ChinerOms_2019


## Define Directories

In [8]:
PB_Vs_Illumina_DataAnalysis_Dir = "../../../210112_PBvsI_VCeval_AnalysisDir_V7_36CI"

CoverageBiasAnalysis_V2_Dir = PB_Vs_Illumina_DataAnalysis_Dir + "/210113_CoverageBiasAnalysis_GC_V2"  

!mkdir $CoverageBiasAnalysis_V2_Dir

for SampleID in SampleIDs_36CI_SOI:
    
    sample_CovBias_Dir = f"{CoverageBiasAnalysis_V2_Dir}/{SampleID}"
    
    !mkdir $sample_CovBias_Dir


mkdir: cannot create directory ‘../../../210112_PBvsI_VCeval_AnalysisDir_V7_36CI/210113_CoverageBiasAnalysis_GCandSH_V2’: File exists
mkdir: cannot create directory ‘../../../210112_PBvsI_VCeval_AnalysisDir_V7_36CI/210113_CoverageBiasAnalysis_GCandSH_V2/M0011368_9’: File exists
mkdir: cannot create directory ‘../../../210112_PBvsI_VCeval_AnalysisDir_V7_36CI/210113_CoverageBiasAnalysis_GCandSH_V2/M0014888_3’: File exists
mkdir: cannot create directory ‘../../../210112_PBvsI_VCeval_AnalysisDir_V7_36CI/210113_CoverageBiasAnalysis_GCandSH_V2/M0016395_7’: File exists
mkdir: cannot create directory ‘../../../210112_PBvsI_VCeval_AnalysisDir_V7_36CI/210113_CoverageBiasAnalysis_GCandSH_V2/M0010874_7’: File exists
mkdir: cannot create directory ‘../../../210112_PBvsI_VCeval_AnalysisDir_V7_36CI/210113_CoverageBiasAnalysis_GCandSH_V2/01_R1430’: File exists
mkdir: cannot create directory ‘../../../210112_PBvsI_VCeval_AnalysisDir_V7_36CI/210113_CoverageBiasAnalysis_GCandSH_V2/02_R0894’: File exists


In [9]:
!ls -1 $PB_Vs_Illumina_DataAnalysis_Dir

201016_FP_TP_FN_And_SV_DistributionAcrossH37Rv_Analysis_V3_PB_MM2_GT
201027_Genmap_Mappability_H37rv_V1
210112_EBR_H37rv_36CI_MM2vsPilon_V7
210113_CoverageBiasAnalysis_GCandSH_V2
210126_FalsePositivesAnalysis_V4
Happy_VC_Eval_ResultsDir_36CI


## Define directory that contains output of analysis pipeline

In [10]:
# Define varaint calling pipeline output directories

PacBio_ProjectDir = "/n/data1/hms/dbmi/farhat/mm774/Projects/PacBio_Evaluation_Project"

PMP_SM_Outputs_Dir = PacBio_ProjectDir + "/PacmanPipe_SM_Outputs"

PMP_SM_TB_Portals_R1_Illumina_WGS_201201_OutputDir = PMP_SM_Outputs_Dir + "/201201_PMP_SM_TB_Portals_R1_Output_V2"


## Save dictionary of paths to relevant files

In [11]:
dictOf_PATHs_36CI_Mtb = {}

for SampleID in SampleIDs_36CI_SOI: 

    Sample_Output_Dir = PMP_SM_TB_Portals_R1_Illumina_WGS_201201_OutputDir + "/" + SampleID
   
    # Define PATH to Assembly FASTA (filtered for contigs greater than 100 Kb)
    GC3_PP_OutputDir = f"{Sample_Output_Dir}/FlyeAssembly_I3_IlluminaPolishing/pilon_IllPE_Polishing_I3_Assembly_ChangeSNPsINDELsOnly"
    Sample_GC3_PP_Assembly_FA_PATH = f"{GC3_PP_OutputDir}/{SampleID}.Flye.I3Assembly.PilonPolished.fasta"
    
    
    CovAnalysis_Dir = f"{Sample_Output_Dir}/PBassembly_Flye_I3_PP_CoverageAnalysis"
    
    # Define PATH to PacBio read coverage across assembly sequence
    
    PB_CovAnalysis_Dir = f"{CovAnalysis_Dir}/PacBio_Subreads_AlignedTo_Flye_I3_PP_Minimap2"
    
    PB_Depth_TXT_PATH = f"{PB_CovAnalysis_Dir}/{SampleID}.pb.subreads.AlnTo.Flye_I3_PP.minimap2.bam.depth.txt"
    
    
    # Define PATH to Illumina read coverage across assembly sequence
    
    Illumina_CovAnalysis_Dir = f"{CovAnalysis_Dir}/IlluminaPE_AlignedTo_Flye_I3_PP_bwamem"
 
    Illumina_Depth_TXT_PATH = f"{Illumina_CovAnalysis_Dir}/{SampleID}.IllPE.AlnTo.Flye_I3_PP.duprem.bam.depth.txt"
    
    dictOf_PATHs_36CI_Mtb[SampleID] = {}

    dictOf_PATHs_36CI_Mtb[SampleID]["Sample_GC3_PP_Assembly_FA_PATH"] = Sample_GC3_PP_Assembly_FA_PATH
    dictOf_PATHs_36CI_Mtb[SampleID]["PB_Depth_TXT_PATH"] = PB_Depth_TXT_PATH
    dictOf_PATHs_36CI_Mtb[SampleID]["Illumina_Depth_TXT_PATH"] = Illumina_Depth_TXT_PATH


## Parse and save TSV for coverage and GC bias information

In [12]:
listOf_Sample_Cov_GC_DF = []

for SampleID in tqdm(SampleIDs_36CI_SOI):  #tqdm(SampleIDs_40CI_SOI): 

    ### 1) Parse Assembly sequence ###
    Assembly_FA = dictOf_PATHs_36CI_Mtb[SampleID]["Sample_GC3_PP_Assembly_FA_PATH"]
    records = list(SeqIO.parse(Assembly_FA, "fasta"))
    Mtb_Assembly_Seq = str( records[0].seq )


    ### 2) Calculate GC content ###
    #Mtb_Assembly_GC_50bp_SW_Array = calc_GCcontent_SlidingWindow (input_Seq = Mtb_Assembly_Seq, window_size = 50)
    Mtb_Assembly_GC_100bp_SW_Array = calc_GCcontent_SlidingWindow (input_Seq = Mtb_Assembly_Seq, window_size = 100)   

    
    ### 3) Parse PacBio sequencing depth ###
    PB_Depth_TXT_PATH = dictOf_PATHs_36CI_Mtb[SampleID]["PB_Depth_TXT_PATH"]
    Sample_PB_DP_Array = pd.read_csv(PB_Depth_TXT_PATH, sep= "\t", header = None )[2].values
    Sample_PB_DP_PerBaseNormalized = Sample_PB_DP_Array / Sample_PB_DP_Array.mean()


    ### 4) Parse Illumina sequencing depth ###

    Illumina_Depth_TXT_PATH = dictOf_PATHs_36CI_Mtb[SampleID]["Illumina_Depth_TXT_PATH"]
    Sample_Illumina_DP_Array = pd.read_csv(Illumina_Depth_TXT_PATH, sep= "\t", header = None )[2].values
    Sample_Illumina_DP_PerBaseNormalized = Sample_Illumina_DP_Array / Sample_Illumina_DP_Array.mean()

    
    ### 5) Merge all data into single dataframe

    Sample_CovBias_NParrays = np.array( [Sample_Illumina_DP_Array,
                                         Sample_Illumina_DP_PerBaseNormalized,
                                         Sample_PB_DP_Array, 
                                         Sample_PB_DP_PerBaseNormalized,
                                         Mtb_Assembly_GC_100bp_SW_Array,] ).T

    Sample_Cov_GC_DF = pd.DataFrame( Sample_CovBias_NParrays )
    Sample_Cov_GC_DF.columns = ["Illumina_DP", "Illumina_DP_PBN", "PacBio_DP", "PacBio_DP_PBN", "GC_100bp"]
    Sample_Cov_GC_DF["RefPos_0based"] = Sample_Cov_GC_DF.index
    Sample_Cov_GC_DF["SampleID"] = SampleID


    ### 6) Output DF to TSV
    
    sample_CovBias_Dir = f"{CoverageBiasAnalysis_V2_Dir}/{SampleID}"
    
    sample_CovBias_TSV_PATH = f"{sample_CovBias_Dir}/{SampleID}.Depth.GCcontent.tsv"

    Sample_Cov_GC_DF.to_csv(sample_CovBias_TSV_PATH, sep = "\t", index=False)
    
    #listOf_Sample_Cov_GC_DF.append()



100%|██████████| 36/36 [38:06<00:00, 63.06s/it]


In [13]:
!head $Illumina_Depth_TXT_PATH

DNA086_contig_1_pilon	1	24
DNA086_contig_1_pilon	2	25
DNA086_contig_1_pilon	3	25
DNA086_contig_1_pilon	4	25
DNA086_contig_1_pilon	5	27
DNA086_contig_1_pilon	6	27
DNA086_contig_1_pilon	7	27
DNA086_contig_1_pilon	8	27
DNA086_contig_1_pilon	9	28
DNA086_contig_1_pilon	10	28


# Merge and output all individual relative coverage TSVs for all 40 samples

In [14]:
# This step requires a lot of memory
listOf_Sample_Cov_GC_DF = []

for SampleID in tqdm(SampleIDs_36CI_SOI):
    
    #print(f"Sample: {SampleID}")
    
    sample_CovBias_Dir = f"{CoverageBiasAnalysis_V2_Dir}/{SampleID}"
    sample_CovBias_TSV_PATH = f"{sample_CovBias_Dir}/{SampleID}.Depth.GCcontent.tsv"
    Sample_Cov_GC_DF = pd.read_csv(sample_CovBias_TSV_PATH, sep = "\t")

    listOf_Sample_Cov_GC_DF.append(Sample_Cov_GC_DF)
    
#All_Cov_GC_DF = pd.concat(listOf_Sample_Cov_GC_DF)


100%|██████████| 36/36 [01:52<00:00,  3.06s/it]


## Output all samples DF to TSV


In [15]:
## Output all samples DF to TSV

All_Cov_GC_DF = pd.concat(listOf_Sample_Cov_GC_DF)


In [16]:
All_Cov_GC_DF.shape

(158808901, 7)

In [17]:
## Output all samples DF to TSV

Combined_CovBias_TSV_PATH = f"{CoverageBiasAnalysis_V2_Dir}/210113.CoverageBiasAnalysis.Mtb.36CI.V3.GC.SH.tsv"

All_Cov_GC_DF.to_csv(Combined_CovBias_TSV_PATH, sep = "\t", index=False)

### Look at output directory

In [18]:
!ls -la $CoverageBiasAnalysis_V2_Dir | grep ".tsv"

-rw-rw-r--  1 mm774 farhat 11082325171 Mar 15 17:56 210113.CoverageBiasAnalysis.Mtb.36CI.V3.GC.SH.tsv


In [19]:
!ls -lah $CoverageBiasAnalysis_V2_Dir | grep ".tsv"

-rw-rw-r--  1 mm774 farhat 11G Mar 15 17:56 210113.CoverageBiasAnalysis.Mtb.36CI.V3.GC.SH.tsv
