In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from scipy import stats
# import gffutils

%matplotlib inline

#### Pandas Viewing Settings

In [2]:
# Raise pandas' display limits so wide/long tables are shown in full in cell outputs.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

## Define Directories

In [3]:
PB_Vs_Illumina_DataAnalysis_Dir = "../../../210112_PBvsI_VCeval_AnalysisDir_V7_36CI"

FalsePositive_Analysis_V2_Dir = PB_Vs_Illumina_DataAnalysis_Dir + "/210126_FalsePositivesAnalysis_V4"  

PBvIll_EBR_Dir = PB_Vs_Illumina_DataAnalysis_Dir + "/210112_EBR_H37rv_36CI_MM2vsPilon_V7"         


!mkdir $FalsePositive_Analysis_V2_Dir

mkdir: cannot create directory ‘../../../210112_PBvsI_VCeval_AnalysisDir_V7_36CI/210126_FalsePositivesAnalysis_V4’: File exists


## Parse sample info from the assembly and sequence analysis results

In [4]:
# Locate the PMP-SM results summary inside the repository's data directory.
Repo_DataDir = "../../Data"
PMP_SM_ResultsSummary_Dir_210108 = f"{Repo_DataDir}/210108_PMP_SM_50CI_V7_ResultsSummary"

PMP_36CI_CicularOnly_F2Filtered_AtLeast40XIllDepth_AssemblySummary_TSV_PATH = (
    f"{PMP_SM_ResultsSummary_Dir_210108}"
    "/210108_PMP_36CI_CircularOnly_F2Filtered_AtLeast40XMeanDepthIllumina_AssemblySummary_V7.tsv"
)

# Load the assembly summary table (tab-separated, one row per sample).
PMP_36CI_CicularOnly_F2Filtered_AtLeast40XDepth_AssemblySummary = pd.read_csv(
    PMP_36CI_CicularOnly_F2Filtered_AtLeast40XIllDepth_AssemblySummary_TSV_PATH,
    sep="\t",
)

# Shorter alias for the same DataFrame; used from here on.
PMP_36CI_AnalysisSet_AssemblySummary = PMP_36CI_CicularOnly_F2Filtered_AtLeast40XDepth_AssemblySummary

# Sample IDs of interest, in table order.
SampleIDs_36CI_SOI = PMP_36CI_AnalysisSet_AssemblySummary["SampleID"].tolist()

print(",".join(SampleIDs_36CI_SOI))

# Sample -> metadata lookup dicts, keyed by SampleID.
ID_To_IlluminaAvrgCov_Dict = dict(zip(
    PMP_36CI_AnalysisSet_AssemblySummary["SampleID"],
    PMP_36CI_AnalysisSet_AssemblySummary["IlluminaWGSToH37rv_AvrgCov"],
))
ID_To_Lineage_Dict = dict(zip(
    PMP_36CI_AnalysisSet_AssemblySummary["SampleID"],
    PMP_36CI_AnalysisSet_AssemblySummary["PrimaryLineage_PB"],
))
ID_To_Dataset_Dict = dict(zip(
    PMP_36CI_AnalysisSet_AssemblySummary["SampleID"],
    PMP_36CI_AnalysisSet_AssemblySummary["Dataset_Tag"],
))

M0011368_9,M0014888_3,M0016395_7,M0010874_7,01_R1430,02_R0894,02_R1708,02_R1896,M0016737_0,M0017522_5,01_R1134,M0003941_3,02_R1179,N1176,N0072,N0153,N0145,N0155,N0004,N1274,N0054,N1272,N0091,N1202,N1177,RW-TB008,DNA028,DNA075,DNA091,DNA044,DNA020,AZE_02_042,DNA019_Rose,DNA120,DNA188,DNA086


## 1) Read back in EBR and Pmappability H37rv gene level analysis

In [5]:
# Directory with the H37Rv feature-level EBR / Pmap tables (inside the EBR results directory).
FeatureLevelAnalysis_Dir_O2 = f"{PBvIll_EBR_Dir}/210113_H37Rv_FeatureLevelAnalysis_EBR_Pmap"

# Paths of the three tables: all features, genes only, intergenic regions only.
H37Rv_FeatureLevelAnalysis_EBR_Pmap_TSV_O2_Repo = FeatureLevelAnalysis_Dir_O2 + "/H37Rv_FeatureLevelAnalysis.EBR_And_Pmap.tsv"
H37Rv_GeneLevelAnalysis_EBR_Pmap_TSV_O2_Repo = FeatureLevelAnalysis_Dir_O2 + "/H37Rv_FeatureLevelAnalysis.EBR_And_Pmap.Genes.tsv"
H37Rv_IntergenicLevelAnalysis_EBR_Pmap_TSV_O2_Repo = FeatureLevelAnalysis_Dir_O2 + "/H37Rv_FeatureLevelAnalysis.EBR_And_Pmap.IntergenicRegions.tsv"

# Only the all-features and gene-level tables are loaded in this cell.
FLA_DF = pd.read_csv(H37Rv_FeatureLevelAnalysis_EBR_Pmap_TSV_O2_Repo, sep="\t")
GLA_DF = pd.read_csv(H37Rv_GeneLevelAnalysis_EBR_Pmap_TSV_O2_Repo, sep="\t")

## Construct dictionary with PATHs to relevant files for all samples

### Define directories to PMP-SM (PacBio assembly and analysis pipeline)

In [6]:
### Directories of the PMP-SM (PacBio assembly and analysis pipeline) outputs,
### i.e. the variant calling pipeline output directories

PacBio_ProjectDir = "/n/data1/hms/dbmi/farhat/mm774/Projects/PacBio_Evaluation_Project"
PMP_SM_Outputs_Dir = f"{PacBio_ProjectDir}/PacmanPipe_SM_Outputs"
PMP_SM_TB_Portals_R1_Illumina_WGS_201201_OutputDir = f"{PMP_SM_Outputs_Dir}/201201_PMP_SM_TB_Portals_R1_Output_V2"


In [7]:
# Build SampleID_ToPaths_Dict: for every sample of interest, the paths of its
# NucDiff and Hap.py output files. Only path strings are assembled here; no file
# is opened or created.
#
# Layout of SampleID_ToPaths_Dict[SampleID]:
#   "NucDiff_SVs_Filtered_GFF"   -> path of the filtered NucDiff SVs GFF
#   "NucDiff_SNPsAndINDELs_VCF"  -> path of the NucDiff SNPs/INDELs VCF
#   "MM2vsPilon_NoneRemoved"     -> dict of Hap.py paths:
#        ["SNPs"]["Happy_ROC_PASS_CSV"], ["INDELs"]["Happy_ROC_PASS_CSV"],
#        "Happy_VCF_GZ_PATH", and the SNP-only / FP / TP / FN VCF and BCF paths
SampleID_ToPaths_Dict = {}

for SampleID in SampleIDs_36CI_SOI:

    SampleID_ToPaths_Dict[SampleID] = {}

    # Per-sample output directory of the PacBio pipeline
    sample_PMP_OutputDir = PMP_SM_TB_Portals_R1_Illumina_WGS_201201_OutputDir + "/" + SampleID

    ### NucDiff SVs and SNPs/INDELs PATHs ###
    NucDiff_OutDir = f"{sample_PMP_OutputDir}/VariantCallingVersusH37Rv/NucDiff_Analysis_{SampleID}"
    NucDiff_ResultsDir = f"{NucDiff_OutDir}/results"

    # NucDiff SVs GFF PATH #
    NucDiff_SVs_Filtered_GFF = f"{NucDiff_ResultsDir}/NucDiff_{SampleID}_ref_struct.Filtered.SVs.gff"
    SampleID_ToPaths_Dict[SampleID]["NucDiff_SVs_Filtered_GFF"] = NucDiff_SVs_Filtered_GFF

    # NucDiff SNPs AND INDELs VCF #
    NucDiff_SNPsAndINDELs_VCF = f"{NucDiff_ResultsDir}/NucDiff_{SampleID}_ref_snps.Reformated.SNPsAndINDELs.Lengths_1to15bp.vcf"
    # Stored next to the SVs GFF so the path can be looked up from the dict
    # (without this assignment the value would be computed and then discarded).
    SampleID_ToPaths_Dict[SampleID]["NucDiff_SNPsAndINDELs_VCF"] = NucDiff_SNPsAndINDELs_VCF

    ################# END #################

    ### Hap.py variant classification VCFs ###

    Happy_OutDir = f"{sample_PMP_OutputDir}/Hap.py_VariantCalling_EvalDir/PBMM2_Paftools_GroundTruthVCF_Evaluations_V3_minMQ_1_minDP_5_Fix_All_Breaks_AmbRegionsRemoved"

    # No Masking ("None")
    RegionFilteringScheme = "MM2vsPilon_NoneRemoved"
    SampleID_ToPaths_Dict[SampleID][RegionFilteringScheme] = {}
    SampleID_ToPaths_Dict[SampleID][RegionFilteringScheme]["SNPs"] = {}
    SampleID_ToPaths_Dict[SampleID][RegionFilteringScheme]["INDELs"] = {}

    Happy_Sample_Dir = f"{Happy_OutDir}/{SampleID}_Happy_VCeval_T_PB_G3PP_MM2_paftools_Vs_Q_Ill_Pilon_VCs_NoRegionsRemoved"

    # Hap.py ROC location tables (PASS only), one per variant type
    Happy_G3PP_Vs_Pilon_ROC_SNPs_PASS_CSV = f"{Happy_Sample_Dir}/Hap.py.{SampleID}.roc.Locations.SNP.PASS.csv.gz"
    Happy_G3PP_Vs_Pilon_ROC_INDELs_PASS_CSV = f"{Happy_Sample_Dir}/Hap.py.{SampleID}.roc.Locations.INDEL.PASS.csv.gz"

    SampleID_ToPaths_Dict[SampleID][RegionFilteringScheme]["SNPs"]["Happy_ROC_PASS_CSV"] = Happy_G3PP_Vs_Pilon_ROC_SNPs_PASS_CSV
    SampleID_ToPaths_Dict[SampleID][RegionFilteringScheme]["INDELs"]["Happy_ROC_PASS_CSV"] = Happy_G3PP_Vs_Pilon_ROC_INDELs_PASS_CSV

    # Hap.py annotated VCF and the SNP-only subsets derived from it
    Happy_VCF_GZ = f"{Happy_Sample_Dir}/Hap.py.{SampleID}.vcf.gz"
    Happy_SNPsOnly_VCF = f"{Happy_Sample_Dir}/Hap.py.{SampleID}.SNPs.vcf"
    Happy_SNPsOnly_FPsOnly_VCF = f"{Happy_Sample_Dir}/Hap.py.{SampleID}.SNPs.FPs.vcf"
    Happy_SNPsOnly_FPsOnly_MQ30_VCF = f"{Happy_Sample_Dir}/Hap.py.{SampleID}.SNPs.FPs.FiltMQ30.vcf"
    Happy_SNPsOnly_TPsOnly_VCF = f"{Happy_Sample_Dir}/Hap.py.{SampleID}.SNPs.TPs.vcf"
    Happy_SNPsOnly_FNsOnly_VCF = f"{Happy_Sample_Dir}/Hap.py.{SampleID}.SNPs.FNs.vcf"

    Happy_SNPsOnly_FPsOnly_BCF_GZ = f"{Happy_Sample_Dir}/Hap.py.{SampleID}.SNPs.FPs.bcf.gz"
    Happy_SNPsOnly_FPsOnly_MQ30_BCF_GZ = f"{Happy_Sample_Dir}/Hap.py.{SampleID}.SNPs.FPs.FiltMQ30.bcf.gz"

    Happy_SNPsOnly_TPsOnly_BCF_GZ = f"{Happy_Sample_Dir}/Hap.py.{SampleID}.SNPs.TPs.bcf.gz"
    Happy_SNPsOnly_FNsOnly_BCF_GZ = f"{Happy_Sample_Dir}/Hap.py.{SampleID}.SNPs.FNs.bcf.gz"

    SampleID_ToPaths_Dict[SampleID][RegionFilteringScheme]["Happy_VCF_GZ_PATH"] = Happy_VCF_GZ
    SampleID_ToPaths_Dict[SampleID][RegionFilteringScheme]["Happy_SNPsOnly_VCF"] = Happy_SNPsOnly_VCF
    SampleID_ToPaths_Dict[SampleID][RegionFilteringScheme]["Happy_SNPsOnly_FPsOnly_VCF"] = Happy_SNPsOnly_FPsOnly_VCF
    SampleID_ToPaths_Dict[SampleID][RegionFilteringScheme]["Happy_SNPsOnly_FPsOnly_BCF_GZ"] = Happy_SNPsOnly_FPsOnly_BCF_GZ
    SampleID_ToPaths_Dict[SampleID][RegionFilteringScheme]["Happy_SNPsOnly_FPsOnly_FiltMQ30_VCF"] = Happy_SNPsOnly_FPsOnly_MQ30_VCF
    SampleID_ToPaths_Dict[SampleID][RegionFilteringScheme]["Happy_SNPsOnly_FPsOnly_FiltMQ30_BCF_GZ"] = Happy_SNPsOnly_FPsOnly_MQ30_BCF_GZ

    SampleID_ToPaths_Dict[SampleID][RegionFilteringScheme]["Happy_SNPsOnly_TPsOnly_BCF_GZ"] = Happy_SNPsOnly_TPsOnly_BCF_GZ
    SampleID_ToPaths_Dict[SampleID][RegionFilteringScheme]["Happy_SNPsOnly_FNsOnly_BCF_GZ"] = Happy_SNPsOnly_FNsOnly_BCF_GZ
    SampleID_ToPaths_Dict[SampleID][RegionFilteringScheme]["Happy_SNPsOnly_TPsOnly_VCF"] = Happy_SNPsOnly_TPsOnly_VCF
    SampleID_ToPaths_Dict[SampleID][RegionFilteringScheme]["Happy_SNPsOnly_FNsOnly_VCF"] = Happy_SNPsOnly_FNsOnly_VCF
    ################# END #################


    
    
    

In [8]:
SampleID_ToPaths_Dict.keys()

dict_keys(['M0011368_9', 'M0014888_3', 'M0016395_7', 'M0010874_7', '01_R1430', '02_R0894', '02_R1708', '02_R1896', 'M0016737_0', 'M0017522_5', '01_R1134', 'M0003941_3', '02_R1179', 'N1176', 'N0072', 'N0153', 'N0145', 'N0155', 'N0004', 'N1274', 'N0054', 'N1272', 'N0091', 'N1202', 'N1177', 'RW-TB008', 'DNA028', 'DNA075', 'DNA091', 'DNA044', 'DNA020', 'AZE_02_042', 'DNA019_Rose', 'DNA120', 'DNA188', 'DNA086'])

# 1) Subset and filter Hap.py annotated VCFs

- Create VCFs for SNPs only and TPs, FPs, FNs

In [9]:
#!tabix -h 

In [10]:
#SampleIDs_28CI_WiCircA

In [11]:
SampleID_ToPaths_Dict.keys()

dict_keys(['M0011368_9', 'M0014888_3', 'M0016395_7', 'M0010874_7', '01_R1430', '02_R0894', '02_R1708', '02_R1896', 'M0016737_0', 'M0017522_5', '01_R1134', 'M0003941_3', '02_R1179', 'N1176', 'N0072', 'N0153', 'N0145', 'N0155', 'N0004', 'N1274', 'N0054', 'N1272', 'N0091', 'N1202', 'N1177', 'RW-TB008', 'DNA028', 'DNA075', 'DNA091', 'DNA044', 'DNA020', 'AZE_02_042', 'DNA019_Rose', 'DNA120', 'DNA188', 'DNA086'])

In [12]:

# For every sample, derive three uncompressed SNP-only VCFs from its Hap.py-annotated VCF:
#   1) all SNP records,
#   2) SNP records whose BD value in the sample at index 1 is "FP",
#   3) as (2), additionally requiring QQ >= 30 in that same sample.
# The loop variables stay defined after the loop (holding the last sample's paths).
for SampleID in tqdm(SampleIDs_36CI_SOI):

    # Input: Hap.py annotated VCF. Outputs: the SNP-only VCFs written below.
    i_Happy_VCF_GZ_PATH = SampleID_ToPaths_Dict[SampleID]["MM2vsPilon_NoneRemoved"]["Happy_VCF_GZ_PATH"]
    i_Happy_SNPsOnly_VCF_PATH = SampleID_ToPaths_Dict[SampleID]["MM2vsPilon_NoneRemoved"]["Happy_SNPsOnly_VCF"]
    
    i_Happy_SNPsOnly_FPsOnly_VCF_PATH = SampleID_ToPaths_Dict[SampleID]["MM2vsPilon_NoneRemoved"]["Happy_SNPsOnly_FPsOnly_VCF"]    
    i_Happy_SNPsOnly_FPsOnly_FiltMQ30_VCF_PATH = SampleID_ToPaths_Dict[SampleID]["MM2vsPilon_NoneRemoved"]["Happy_SNPsOnly_FPsOnly_FiltMQ30_VCF"]    

    # BCF paths are looked up, but no active command in this loop writes them.
    i_Happy_SNPsOnly_FPsOnly_BCF_GZ_PATH = SampleID_ToPaths_Dict[SampleID]["MM2vsPilon_NoneRemoved"]["Happy_SNPsOnly_FPsOnly_BCF_GZ"]    
    i_Happy_SNPsOnly_FPsOnly_BCF_FiltMQ30_GZ_PATH = SampleID_ToPaths_Dict[SampleID]["MM2vsPilon_NoneRemoved"]["Happy_SNPsOnly_FPsOnly_FiltMQ30_BCF_GZ"]    

    
    # Each command overwrites its output file via shell redirection.
    !bcftools view --types snps $i_Happy_VCF_GZ_PATH > $i_Happy_SNPsOnly_VCF_PATH
    !bcftools view --types snps -i '(FORMAT/BD[1]=="FP")' $i_Happy_VCF_GZ_PATH > $i_Happy_SNPsOnly_FPsOnly_VCF_PATH
    !bcftools view --types snps -i '(FORMAT/BD[1]=="FP") & (FORMAT/QQ[1]>=30)' $i_Happy_VCF_GZ_PATH > $i_Happy_SNPsOnly_FPsOnly_FiltMQ30_VCF_PATH

    # Disabled: BCF output of the QQ >= 30 FP subset.
    #!bcftools view --output-type b --types snps -i '(FORMAT/BD[1]=="FP") & (FORMAT/QQ[1]>=30)' $i_Happy_VCF_GZ_PATH > $i_Happy_SNPsOnly_FPsOnly_BCF_FiltMQ30_GZ_PATH

100%|██████████| 36/36 [00:15<00:00,  2.43it/s]


In [13]:
!bcftools view -H --types snps -i '(FORMAT/BD[1]=="FP") ' $i_Happy_VCF_GZ_PATH  | wc -l

6


In [14]:
!bcftools view -H --types snps -i '(FORMAT/BD[1]=="FP") ' $i_Happy_SNPsOnly_FPsOnly_VCF_PATH  | wc -l

6


In [15]:
!bcftools view -H --types snps -i '(FORMAT/BD[1]=="FP") ' $i_Happy_SNPsOnly_FPsOnly_FiltMQ30_VCF_PATH  | wc -l

6


In [16]:
!bcftools view -H --types snps -i '(FORMAT/BD[1]=="FP") ' $i_Happy_VCF_GZ_PATH

NC_000962.3	1982064	.	G	C	.	.	BS=1982064;Regions=CONF,TS_contained	GT:QQ:BD:BK:BI:BVT:BLT	.:.:.:.:.:NOCALL:nocall	1/1:53:FP:.:tv:SNP:homalt
NC_000962.3	1982067	.	G	C	.	.	BS=1982067;Regions=CONF,TS_contained	GT:QQ:BD:BK:BI:BVT:BLT	.:.:.:.:.:NOCALL:nocall	1/1:53:FP:.:tv:SNP:homalt
NC_000962.3	2074509	.	C	G	.	.	BS=2074509;Regions=CONF,TS_contained	GT:QQ:BD:BK:BI:BVT:BLT	.:.:.:.:.:NOCALL:nocall	1/1:52:FP:.:tv:SNP:homalt
NC_000962.3	2163790	.	A	C	.	.	BS=2163790;Regions=CONF,TS_contained	GT:QQ:BD:BK:BI:BVT:BLT	.:.:.:.:.:NOCALL:nocall	1/1:55:FP:.:tv:SNP:homalt
NC_000962.3	2531964	.	C	G	.	.	BS=2531964;Regions=CONF,TS_contained	GT:QQ:BD:BK:BI:BVT:BLT	.:.:.:.:.:NOCALL:nocall	1/1:51:FP:.:tv:SNP:homalt
NC_000962.3	3232815	.	A	G	.	.	BS=3232815;Regions=CONF,TS_contained	GT:QQ:BD:BK:BI:BVT:BLT	.:.:.:.:.:NOCALL:nocall	1/1:54:FP:.:ti:SNP:homalt


In [17]:
SampleID_ToPaths_Dict[SampleID]["MM2vsPilon_NoneRemoved"]["Happy_SNPsOnly_FPsOnly_FiltMQ30_VCF"]

'/n/data1/hms/dbmi/farhat/mm774/Projects/PacBio_Evaluation_Project/PacmanPipe_SM_Outputs/201201_PMP_SM_TB_Portals_R1_Output_V2/DNA086/Hap.py_VariantCalling_EvalDir/PBMM2_Paftools_GroundTruthVCF_Evaluations_V3_minMQ_1_minDP_5_Fix_All_Breaks_AmbRegionsRemoved/DNA086_Happy_VCeval_T_PB_G3PP_MM2_paftools_Vs_Q_Ill_Pilon_VCs_NoRegionsRemoved/Hap.py.DNA086.SNPs.FPs.FiltMQ30.vcf'

In [18]:
SampleID_ToPaths_Dict[SampleID]["MM2vsPilon_NoneRemoved"]["Happy_SNPsOnly_FPsOnly_VCF"]

'/n/data1/hms/dbmi/farhat/mm774/Projects/PacBio_Evaluation_Project/PacmanPipe_SM_Outputs/201201_PMP_SM_TB_Portals_R1_Output_V2/DNA086/Hap.py_VariantCalling_EvalDir/PBMM2_Paftools_GroundTruthVCF_Evaluations_V3_minMQ_1_minDP_5_Fix_All_Breaks_AmbRegionsRemoved/DNA086_Happy_VCeval_T_PB_G3PP_MM2_paftools_Vs_Q_Ill_Pilon_VCs_NoRegionsRemoved/Hap.py.DNA086.SNPs.FPs.vcf'

### Question: How many FPs per sample?
Answer: 696 false positives across 36 isolates (MQ > 1)
Answer: 548 false positives across 36 isolates (MQ ≥ 30)

## How many FPs (MQ > 1)?

In [19]:
listOf_NumFPs = []

for SampleID in (SampleIDs_36CI_SOI):
    i_Happy_SNPsOnly_FPsOnly_VCF_PATH = SampleID_ToPaths_Dict[SampleID]["MM2vsPilon_NoneRemoved"]["Happy_SNPsOnly_FPsOnly_VCF"]    
        
    #!wc -l $i_Happy_SNPsOnly_FPsOnly_VCF_PATH | cut -d ' ' -f 1
    
    print(SampleID, end=" ")
    !bcftools view  --no-header $i_Happy_SNPsOnly_FPsOnly_VCF_PATH | wc -l
    
    Num_FPs_Str = !bcftools view  --no-header $i_Happy_SNPsOnly_FPsOnly_VCF_PATH | wc -l | cut -d ' ' -f 1
    Num_FPs_Int = int(Num_FPs_Str[0])
    listOf_NumFPs.append(Num_FPs_Int)

print(sum(listOf_NumFPs))

M0011368_9 2
M0014888_3 8
M0016395_7 4
M0010874_7 3
01_R1430 15
02_R0894 26
02_R1708 20
02_R1896 9
M0016737_0 17
M0017522_5 8
01_R1134 4
M0003941_3 32
02_R1179 10
N1176 21
N0072 34
N0153 33
N0145 25
N0155 21
N0004 15
N1274 37
N0054 41
N1272 28
N0091 31
N1202 27
N1177 4
RW-TB008 19
DNA028 30
DNA075 26
DNA091 29
DNA044 20
DNA020 37
AZE_02_042 9
DNA019_Rose 21
DNA120 13
DNA188 11
DNA086 6
696


## How many FPs (MQ ≥ 30)?

In [20]:
listOf_NumFPs = []

for SampleID in (SampleIDs_36CI_SOI):
    i_Happy_SNPsOnly_FPsOnly_VCF_PATH = SampleID_ToPaths_Dict[SampleID]["MM2vsPilon_NoneRemoved"]["Happy_SNPsOnly_FPsOnly_FiltMQ30_VCF"]    
        
    #!wc -l $i_Happy_SNPsOnly_FPsOnly_VCF_PATH | cut -d ' ' -f 1
    
    print(SampleID, end=" ")
    !bcftools view  --no-header $i_Happy_SNPsOnly_FPsOnly_VCF_PATH | wc -l
    
    Num_FPs_Str = !bcftools view  --no-header $i_Happy_SNPsOnly_FPsOnly_VCF_PATH | wc -l | cut -d ' ' -f 1
    Num_FPs_Int = int(Num_FPs_Str[0])
    listOf_NumFPs.append(Num_FPs_Int)

print(sum(listOf_NumFPs))

M0011368_9 2
M0014888_3 7
M0016395_7 2
M0010874_7 2
01_R1430 12
02_R0894 20
02_R1708 20
02_R1896 9
M0016737_0 12
M0017522_5 8
01_R1134 3
M0003941_3 18
02_R1179 7
N1176 13
N0072 31
N0153 21
N0145 19
N0155 19
N0004 15
N1274 34
N0054 35
N1272 16
N0091 19
N1202 22
N1177 4
RW-TB008 19
DNA028 16
DNA075 18
DNA091 17
DNA044 19
DNA020 36
AZE_02_042 9
DNA019_Rose 21
DNA120 10
DNA188 7
DNA086 6
548


# 2) Merge all False Positive SNPs

In [21]:
# Output paths of the merged (all samples) FP SNP VCFs: FiltMQ30 subset and unfiltered.
All_FPs_SNPs_FiltMQ30_VCF_PATH = FalsePositive_Analysis_V2_Dir + "/PMP_36CI.SNPs.All.FPs.FiltMQ30.vcf"
All_FPs_SNPs_VCF_PATH = FalsePositive_Analysis_V2_Dir + "/PMP_36CI.SNPs.All.FPs.vcf"


In [22]:
# Per-sample FP-only SNP VCF paths (unfiltered, then FiltMQ30), in SampleIDs_36CI_SOI order.
listOf_All_SNPs_FPs_VCFs = [
    sample_paths["MM2vsPilon_NoneRemoved"]["Happy_SNPsOnly_FPsOnly_VCF"]
    for sample_paths in map(SampleID_ToPaths_Dict.__getitem__, SampleIDs_36CI_SOI)
]
listOf_All_SNPs_FPs_FiltMQ30_VCFs = [
    sample_paths["MM2vsPilon_NoneRemoved"]["Happy_SNPsOnly_FPsOnly_FiltMQ30_VCF"]
    for sample_paths in map(SampleID_ToPaths_Dict.__getitem__, SampleIDs_36CI_SOI)
]

# Space-separated forms, ready to be passed as the argument list of a shell command.
spaceSeperated_listOf_All_SNPs_FPs_VCFs = str.join(" ", listOf_All_SNPs_FPs_VCFs)
spaceSeperated_listOf_All_SNPs_FPs_FiltMQ30_VCFs = str.join(" ", listOf_All_SNPs_FPs_FiltMQ30_VCFs)


### Merge FP SNP VCFs w/ bcftools (MQ > 1)

In [23]:
!bcftools concat $spaceSeperated_listOf_All_SNPs_FPs_VCFs | bcftools sort > $All_FPs_SNPs_VCF_PATH

Writing to /tmp/bcftools-sort.onEXas
Checking the headers and starting positions of 36 files
Concatenating /n/data1/hms/dbmi/farhat/mm774/Projects/PacBio_Evaluation_Project/PacmanPipe_SM_Outputs/201201_PMP_SM_TB_Portals_R1_Output_V2/M0011368_9/Hap.py_VariantCalling_EvalDir/PBMM2_Paftools_GroundTruthVCF_Evaluations_V3_minMQ_1_minDP_5_Fix_All_Breaks_AmbRegionsRemoved/M0011368_9_Happy_VCeval_T_PB_G3PP_MM2_paftools_Vs_Q_Ill_Pilon_VCs_NoRegionsRemoved/Hap.py.M0011368_9.SNPs.FPs.vcf	0.000379 seconds
Concatenating /n/data1/hms/dbmi/farhat/mm774/Projects/PacBio_Evaluation_Project/PacmanPipe_SM_Outputs/201201_PMP_SM_TB_Portals_R1_Output_V2/M0014888_3/Hap.py_VariantCalling_EvalDir/PBMM2_Paftools_GroundTruthVCF_Evaluations_V3_minMQ_1_minDP_5_Fix_All_Breaks_AmbRegionsRemoved/M0014888_3_Happy_VCeval_T_PB_G3PP_MM2_paftools_Vs_Q_Ill_Pilon_VCs_NoRegionsRemoved/Hap.py.M0014888_3.SNPs.FPs.vcf	0.000394 seconds
Concatenating /n/data1/hms/dbmi/farhat/mm774/Projects/PacBio_Evaluation_Project/PacmanPipe_SM_O

### Merge FP SNP VCFs w/ bcftools (MQ ≥ 30)

In [24]:
!bcftools concat $spaceSeperated_listOf_All_SNPs_FPs_FiltMQ30_VCFs | bcftools sort > $All_FPs_SNPs_FiltMQ30_VCF_PATH

Writing to /tmp/bcftools-sort.loKzeH
Checking the headers and starting positions of 36 files
Concatenating /n/data1/hms/dbmi/farhat/mm774/Projects/PacBio_Evaluation_Project/PacmanPipe_SM_Outputs/201201_PMP_SM_TB_Portals_R1_Output_V2/M0011368_9/Hap.py_VariantCalling_EvalDir/PBMM2_Paftools_GroundTruthVCF_Evaluations_V3_minMQ_1_minDP_5_Fix_All_Breaks_AmbRegionsRemoved/M0011368_9_Happy_VCeval_T_PB_G3PP_MM2_paftools_Vs_Q_Ill_Pilon_VCs_NoRegionsRemoved/Hap.py.M0011368_9.SNPs.FPs.FiltMQ30.vcf	0.000359 seconds
Concatenating /n/data1/hms/dbmi/farhat/mm774/Projects/PacBio_Evaluation_Project/PacmanPipe_SM_Outputs/201201_PMP_SM_TB_Portals_R1_Output_V2/M0014888_3/Hap.py_VariantCalling_EvalDir/PBMM2_Paftools_GroundTruthVCF_Evaluations_V3_minMQ_1_minDP_5_Fix_All_Breaks_AmbRegionsRemoved/M0014888_3_Happy_VCeval_T_PB_G3PP_MM2_paftools_Vs_Q_Ill_Pilon_VCs_NoRegionsRemoved/Hap.py.M0014888_3.SNPs.FPs.FiltMQ30.vcf	0.000386 seconds
Concatenating /n/data1/hms/dbmi/farhat/mm774/Projects/PacBio_Evaluation_Proje

### Look at output files

In [25]:
!ls -alh $FalsePositive_Analysis_V2_Dir

total 984K
drwxrwsr-x  2 mm774 farhat  453 Feb  2 23:37 .
drwxrwsr-x 11 mm774 farhat  529 Mar 26 23:36 ..
-rw-rw-r--  1 mm774 farhat 637K Mar 26 23:54 200901_Mtb_H37rv_AllRegions_Info.SNPs.All.FPs.bed
-rw-rw-r--  1 mm774 farhat 637K Mar 26 23:54 200901_Mtb_H37rv_AllRegions_Info.SNPs.FPs.FiltMQ30.bed
-rw-rw-r--  1 mm774 farhat 2.6K Feb 19 21:26 210202_Mtb_H37rv.Top30SourcesOfFPs.FiltMQ30.bed
-rw-rw-r--  1 mm774 farhat 2.7K Feb 19 21:26 210202_Mtb_H37rv.Top30SourcesOfFPs.FiltMQ30.WithHeader.bed
-rw-rw-r--  1 mm774 farhat 665K Mar 26 23:54 H37Rv_AllRegions_GeneAndIntergenic_NumberOf_FPs.FiltMQ30.tsv
-rw-rw-r--  1 mm774 farhat  92K Mar 27 00:02 PMP_36CI.SNPs.All.FPs.FiltMQ30.vcf
-rw-rw-r--  1 mm774 farhat 112K Mar 27 00:02 PMP_36CI.SNPs.All.FPs.vcf


In [26]:
!md5sum $All_FPs_SNPs_VCF_PATH $All_FPs_SNPs_FiltMQ30_VCF_PATH

ad5679c448cb6f9f3ef418e956f62a80  ../../../210112_PBvsI_VCeval_AnalysisDir_V7_36CI/210126_FalsePositivesAnalysis_V4/PMP_36CI.SNPs.All.FPs.vcf
219c296932b95c976d74209a4b4781ec  ../../../210112_PBvsI_VCeval_AnalysisDir_V7_36CI/210126_FalsePositivesAnalysis_V4/PMP_36CI.SNPs.All.FPs.FiltMQ30.vcf


In [27]:
!bcftools view  --no-header $All_FPs_SNPs_VCF_PATH | wc -l

696


In [28]:
!bcftools view  --no-header $All_FPs_SNPs_FiltMQ30_VCF_PATH | wc -l

548


### Look at repeating False Positives

In [29]:
!bcftools view  --no-header $All_FPs_SNPs_VCF_PATH | cut -f 2 | sort | uniq -c | sort -k 1nr | head -n 10

     20 2163790
     20 3846851
     20 3846852
     20 3846853
     20 3846857
     20 3846860
     20 3846866
     20 3846897
     19 3846886
     12 2165503


In [30]:
!bcftools view  --no-header $All_FPs_SNPs_VCF_PATH | cut -f 2 | sort | wc -l

696


# 3.B) Parse processed NucDiff SVs

In [31]:
# Load the table of all NucDiff-detected SVs.
# NOTE: the `PMP_28CI_*` variable names are kept unchanged even though the paths
# they hold point to the 36CI directory and file.
Repo_DataDir = "../../Data"

PMP_28CI_NucDiff_SV_Analysis_Dir = f"{Repo_DataDir}/210126_PMP_36CI_NucDiff_SV_Analysis_Dir"
PMP_28CI_NucDiff_AllSVs_Detected_TSV = f"{PMP_28CI_NucDiff_SV_Analysis_Dir}/210126.PMP.36CI.NucDiff_AllSVs_Detected.V2.tsv"

NucDiff_SVs_36CI_DF = pd.read_csv(PMP_28CI_NucDiff_AllSVs_Detected_TSV, sep="\t")
NucDiff_SVs_36CI_DF.shape

(5484, 7)

### Filter for SVs >= 50 bp

In [32]:
# Keep only the SVs whose SV_Length is at least 50, then show the resulting shape.
NucDiff_SVs_50bp_OrMore_DF = NucDiff_SVs_36CI_DF.loc[NucDiff_SVs_36CI_DF["SV_Length"].ge(50)]
NucDiff_SVs_50bp_OrMore_DF.shape

(3620, 7)

In [33]:
NucDiff_SVs_36CI_DF.head()

Unnamed: 0,Chrom,start_0based,end_0based,SV_Type,SV_Length,SampleID,PrimaryLineage
0,NC_000962.3,71583,71586,duplication,2,M0011368_9,lineage4
1,NC_000962.3,71585,71586,insertion,35,M0011368_9,lineage4
2,NC_000962.3,150889,150903,duplication,13,M0011368_9,lineage4
3,NC_000962.3,150902,150903,insertion,167,M0011368_9,lineage4
4,NC_000962.3,335050,337913,tandem_duplication,2862,M0011368_9,lineage4


In [34]:
#NucDiff_SVs_36CI_DF["SV_Type"].value_counts()

In [35]:
#NucDiff_SVs_50bp_OrMore_DF["SV_Type"].value_counts()

# 4) Import/parse processed H37rv genome annotations

In [36]:
# Repository directory holding the processed H37Rv annotation tables.
RepoRef_Dir = "../../References"

AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir = RepoRef_Dir + "/201027_H37rv_AnnotatedGenes_And_IntergenicRegions"

# Annotation files: genes, intergenic regions, and both combined (TSV and BED).
H37Rv_GenomeAnnotations_Genes_TSV = AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir + "/H37Rv_GenomeAnnotations.Genes.tsv"
H37Rv_GenomeAnnotations_IntergenicRegions_TSV = AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir + "/H37Rv_GenomeAnnotations.IntergenicRegions.tsv"
H37Rv_GenomeAnnotations_Genes_And_IntergenicRegions_TSV = AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir + "/H37Rv_GenomeAnnotations.Genes.And.IntergenicRegions.tsv"
H37Rv_GenomeAnnotations_Genes_And_IntergenicRegions_BED = AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir + "/H37Rv_GenomeAnnotations.Genes.And.IntergenicRegions.bed"

# H37Rv gene annotations
H37Rv_GenomeAnno_Genes_DF = pd.read_csv(H37Rv_GenomeAnnotations_Genes_TSV, sep="\t")

# H37Rv intergenic regions
H37Rv_IntergenicRegions_DF = pd.read_csv(H37Rv_GenomeAnnotations_IntergenicRegions_TSV, sep="\t")

# H37Rv genes and intergenic regions combined
H37Rv_AllRegions_DF = pd.read_csv(H37Rv_GenomeAnnotations_Genes_And_IntergenicRegions_TSV, sep="\t")


In [37]:
#!ls -lah $AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir

In [38]:
H37Rv_AllRegions_DF.shape

(7151, 17)

In [39]:
H37Rv_IntergenicRegions_DF.shape

(3072, 9)

In [40]:
H37Rv_GenomeAnno_Genes_DF.shape

(4079, 12)

In [41]:
!head $H37Rv_GenomeAnnotations_Genes_And_IntergenicRegions_BED | head -n 1

NC_000962.3	0	1524	+	Rv0001	dnaA	NotExcluded	None	information pathways


In [42]:
!head $H37Rv_GenomeAnnotations_Genes_And_IntergenicRegions_BED | head -n 1 | cut -f 1-6

NC_000962.3	0	1524	+	Rv0001	dnaA


In [43]:
H37Rv_AllRegions_DF.head(3)

Unnamed: 0,Chrom,Start,End,Strand,H37rv_GeneID,Symbol,Feature,ExcludedGroup_Category,PEandPPE_Subfamily,Product,Functional_Category,Is_Pseudogene,IntergenicRegion_IsNextTo_PLC_Gene,Intergenic_GeneToTheLeft,Intergenic_GeneToTheLeft_FuncCategory,Intergenic_GeneToTheRight,Intergenic_GeneToTheRight_FuncCategory
0,NC_000962.3,0,1524,+,Rv0001,dnaA,CDS,NotExcluded,,Chromosomal replication initiator protein DnaA,information pathways,No,,,,,
1,NC_000962.3,1524,2051,,IntergenicRegion_1_Rv0001-Rv0002,,Intergenic,Intergenic,,,Intergenic,,False,Rv0001,NotExcluded,Rv0002,NotExcluded
2,NC_000962.3,2051,3260,+,Rv0002,dnaN,CDS,NotExcluded,,DNA polymerase III (beta chain) DnaN (DNA nucl...,information pathways,No,,,,,


### Let's check that the union of the two feature sets (genes + intergenic regions) covers all of H37Rv

In [44]:
!cat $H37Rv_GenomeAnnotations_Genes_And_IntergenicRegions_BED | cut -f 1,2,3 | sort -k 1,1 -k2,2n | bedtools merge     

NC_000962.3	0	4411532


In [45]:
!cat $H37Rv_GenomeAnnotations_Genes_And_IntergenicRegions_BED | cut -f 1,2,3 | sort -k 1,1 -k2,2n | bedtools merge > ./All_H37rv_Genome.bed

### Verify that all 696 FPs (MQ > 1) overlap with the entire genome

In [46]:
!bedtools annotate -counts -i All_H37rv_Genome.bed -files $All_FPs_SNPs_VCF_PATH 

NC_000962.3	0	4411532	696


### Verify that all 548 FPs (MQ ≥ 30) overlap with the entire genome

In [47]:
!bedtools annotate -counts -i All_H37rv_Genome.bed -files $All_FPs_SNPs_FiltMQ30_VCF_PATH 

NC_000962.3	0	4411532	548


In [48]:
!rm ./All_H37rv_Genome.bed

# 5) Overlap All regions (gene + intergenic) with all potential FPs (All 36 samples)

In [49]:

H37rv_AllRegions_AnnoBy_All_FPs_PATH = f"{FalsePositive_Analysis_V2_Dir}/200901_Mtb_H37rv_AllRegions_Info.SNPs.All.FPs.bed" 
H37rv_AllRegions_AnnoBy_All_FPs_FiltMQ30_PATH = f"{FalsePositive_Analysis_V2_Dir}/200901_Mtb_H37rv_AllRegions_Info.SNPs.FPs.FiltMQ30.bed" 



!bedtools annotate -counts -i $H37Rv_GenomeAnnotations_Genes_And_IntergenicRegions_BED -files $All_FPs_SNPs_VCF_PATH > $H37rv_AllRegions_AnnoBy_All_FPs_PATH  
!bedtools annotate -counts -i $H37Rv_GenomeAnnotations_Genes_And_IntergenicRegions_BED -files $All_FPs_SNPs_FiltMQ30_VCF_PATH > $H37rv_AllRegions_AnnoBy_All_FPs_FiltMQ30_PATH  

In [50]:
#!bedtools annotate -counts -i $H37rv_AllGenesAnnotated_BED_DF_PATH -files $All_FPs_SNPs_VCF_PATH > $H37rv_Genes_AnnoBy_All_FPs_PATH  

#!bedtools annotate -counts -i $H37rv_AllIntergenicRegions_BED_DF_PATH -files $All_FPs_SNPs_VCF_PATH > $H37rv_IntergenicRegions_AnnoBy_All_FPs_PATH 



In [51]:
!head $H37Rv_GenomeAnnotations_Genes_And_IntergenicRegions_BED

NC_000962.3	0	1524	+	Rv0001	dnaA	NotExcluded	None	information pathways
NC_000962.3	1524	2051		IntergenicRegion_1_Rv0001-Rv0002		Intergenic		Intergenic
NC_000962.3	2051	3260	+	Rv0002	dnaN	NotExcluded	None	information pathways
NC_000962.3	3260	3279		IntergenicRegion_2_Rv0002-Rv0003		Intergenic		Intergenic
NC_000962.3	3279	4437	+	Rv0003	recF	NotExcluded	None	information pathways
NC_000962.3	4433	4997	+	Rv0004	Rv0004	NotExcluded	None	conserved hypotheticals
NC_000962.3	4997	5239		IntergenicRegion_3_Rv0004-Rv0005		Intergenic		Intergenic
NC_000962.3	5239	7267	+	Rv0005	gyrB	NotExcluded	None	information pathways
NC_000962.3	7267	7301		IntergenicRegion_4_Rv0005-Rv0006		Intergenic		Intergenic
NC_000962.3	7301	9818	+	Rv0006	gyrA	NotExcluded	None	information pathways


In [52]:
!head $H37rv_AllRegions_AnnoBy_All_FPs_PATH


NC_000962.3	1048411	1050346	+	Rv0939	Rv0939	NotExcluded	None	intermediary metabolism and respiration	0
NC_000962.3	2096876	2097299	+	Rv1847	Rv1847	NotExcluded	None	conserved hypotheticals	0
NC_000962.3	3145170	3147873	-	Rv2839c	infB	NotExcluded	None	information pathways	0
NC_000962.3	4193390	4195373	-	Rv3743c	ctpJ	NotExcluded	None	cell wall and cell processes	0
NC_000962.3	130894	131104	-	Rv0108c	Rv0108c	NotExcluded	None	conserved hypotheticals	0
NC_000962.3	260923	262252	+	Rv0218	Rv0218	NotExcluded	None	cell wall and cell processes	0
NC_000962.3	392695	394045	-	Rv0327c	cyp135A1	NotExcluded	None	intermediary metabolism and respiration	0
NC_000962.3	522346	524533	-	Rv0435c	Rv0435c	NotExcluded	None	cell wall and cell processes	0
NC_000962.3	654923	655949	-	Rv0564c	gpdA1	NotExcluded	None	lipid metabolism	0
NC_000962.3	786148	786946	+	Rv0686	Rv0686	NotExcluded	None	cell wall and cell processes	0


In [53]:
# Read the headerless FP-annotated regions BED (FiltMQ30), name its ten columns
# (the last one is the appended FP count), order the regions by start coordinate
# and add each region's length.
AllRegions_AnnoByFPs_FiltMQ30_DF = (
    pd.read_csv(
        H37rv_AllRegions_AnnoBy_All_FPs_FiltMQ30_PATH,
        sep="\t",
        header=None,
        names=[
            "Chrom", "Start", "End",
            "Strand", "H37rv_GeneID", "Symbol",
            "ExcludedGroup_Category", "PEandPPE_Subfamily",
            "Functional_Category", "FP_Count",
        ],
    )
    .sort_values("Start")
    .assign(Length=lambda regions: regions["End"] - regions["Start"])
)
AllRegions_AnnoByFPs_FiltMQ30_DF.head()

Unnamed: 0,Chrom,Start,End,Strand,H37rv_GeneID,Symbol,ExcludedGroup_Category,PEandPPE_Subfamily,Functional_Category,FP_Count,Length
271,NC_000962.3,0,1524,+,Rv0001,dnaA,NotExcluded,,information pathways,0,1524
272,NC_000962.3,1524,2051,,IntergenicRegion_1_Rv0001-Rv0002,,Intergenic,,Intergenic,0,527
273,NC_000962.3,2051,3260,+,Rv0002,dnaN,NotExcluded,,information pathways,0,1209
274,NC_000962.3,3260,3279,,IntergenicRegion_2_Rv0002-Rv0003,,Intergenic,,Intergenic,0,19
275,NC_000962.3,3279,4437,+,Rv0003,recF,NotExcluded,,information pathways,0,1158


In [54]:
AllRegions_AnnoByFPs_FiltMQ30_DF["Length"].sum()

4427939

In [55]:
AllRegions_AnnoByFPs_FiltMQ30_DF.query("Symbol == 'dnaA' ")

Unnamed: 0,Chrom,Start,End,Strand,H37rv_GeneID,Symbol,ExcludedGroup_Category,PEandPPE_Subfamily,Functional_Category,FP_Count,Length
271,NC_000962.3,0,1524,+,Rv0001,dnaA,NotExcluded,,information pathways,0,1524


In [56]:
AllRegions_AnnoByFPs_FiltMQ30_DF.sort_values("FP_Count", ascending=False).head(30)

Unnamed: 0,Chrom,Start,End,Strand,H37rv_GeneID,Symbol,ExcludedGroup_Category,PEandPPE_Subfamily,Functional_Category,FP_Count,Length
6286,NC_000962.3,3845970,3847164,,IntergenicRegion_2683_Rv3428c-Rv3429,,Intergenic,,Intergenic,142,1194
2840,NC_000962.3,1636003,1638229,-,Rv1452c,PE_PGRS28,PE/PPEs,PE_V_PGRS,PE/PPE,59,2226
3645,NC_000962.3,2162931,2167311,-,Rv1917c,PPE34,PE/PPEs,PPE_SL-5_PPE-MPTR,PE/PPE,35,4380
138,NC_000962.3,1981613,1984775,-,Rv1753c,PPE24,PE/PPEs,PPE_SL-5_PPE-MPTR,PE/PPE,24,3162
6428,NC_000962.3,3941723,3944963,+,Rv3512,PE_PGRS56,PE/PPEs,PE_V_PGRS,PE/PPE,16,3240
5393,NC_000962.3,3232506,3232870,,IntergenicRegion_2285_Rv2920c-Rv2921c,,Intergenic,,Intergenic,16,364
30,NC_000962.3,3931004,3936710,+,Rv3508,PE_PGRS54,PE/PPEs,PE_V_PGRS,PE/PPE,16,5706
5574,NC_000962.3,3379375,3380452,-,Rv3021c,PPE47,PE/PPEs,,PE/PPE,14,1077
186,NC_000962.3,2867123,2867786,+,Rv2544,lppB,Coscolla Repetitive Genes,,cell wall and cell processes,14,663
245,NC_000962.3,3945793,3950263,+,Rv3514,PE_PGRS57,PE/PPEs,PE_V_PGRS,PE/PPE,14,4470


In [57]:
AllRegions_AnnoByFPs_FiltMQ30_DF.shape

(7151, 11)

In [58]:
# Number of FPs contained in the 20 regions with the highest FP counts.
AllRegions_AnnoByFPs_FiltMQ30_DF["FP_Count"].nlargest(20).sum()

439

In [59]:
# Fraction of all FPs that fall in the 20 regions with the highest FP counts.
AllRegions_AnnoByFPs_FiltMQ30_DF["FP_Count"].nlargest(20).sum() / AllRegions_AnnoByFPs_FiltMQ30_DF["FP_Count"].sum()


0.801094890510949

In [60]:
# Fraction of all FPs that fall in the 30 regions with the highest FP counts.
AllRegions_AnnoByFPs_FiltMQ30_DF["FP_Count"].nlargest(30).sum() / AllRegions_AnnoByFPs_FiltMQ30_DF["FP_Count"].sum()

0.8941605839416058

In [61]:
# Fraction of all FPs that fall in the 40 regions with the highest FP counts.
AllRegions_AnnoByFPs_FiltMQ30_DF["FP_Count"].nlargest(40).sum() / AllRegions_AnnoByFPs_FiltMQ30_DF["FP_Count"].sum()


0.9452554744525548

In [62]:
# Summed Length of the 20 rows ranked highest by FP_Count.
# NOTE(review): which rows occupy the last top-20 slots depends on how rows with
# equal FP_Count are ordered by sort_values; if a tie straddles rank 20, this sum
# may not be stable across runs — TODO confirm.
AllRegions_AnnoByFPs_FiltMQ30_DF.sort_values("FP_Count", ascending=False).head(20)["Length"].sum()

41987

In [63]:
AllRegions_AnnoByFPs_FiltMQ30_DF.sort_values("FP_Count", ascending=False)["Length"].sum()

4427939

In [64]:
AllRegions_AnnoByFPs_FiltMQ30_DF.head(3)

Unnamed: 0,Chrom,Start,End,Strand,H37rv_GeneID,Symbol,ExcludedGroup_Category,PEandPPE_Subfamily,Functional_Category,FP_Count,Length
271,NC_000962.3,0,1524,+,Rv0001,dnaA,NotExcluded,,information pathways,0,1524
272,NC_000962.3,1524,2051,,IntergenicRegion_1_Rv0001-Rv0002,,Intergenic,,Intergenic,0,527
273,NC_000962.3,2051,3260,+,Rv0002,dnaN,NotExcluded,,information pathways,0,1209


In [65]:
AllRegions_AnnoByFPs_FiltMQ30_DF.tail(3)

Unnamed: 0,Chrom,Start,End,Strand,H37rv_GeneID,Symbol,ExcludedGroup_Category,PEandPPE_Subfamily,Functional_Category,FP_Count,Length
7148,NC_000962.3,4410411,4410789,-,Rv3923c,rnpA,NotExcluded,,information pathways,0,378
7149,NC_000962.3,4410785,4410929,-,Rv3924c,rpmH,NotExcluded,,information pathways,0,144
7150,NC_000962.3,4410789,4411532,,IntergenicRegion_3072_Rv3923c-Rv0001,,Intergenic,,Intergenic,0,743


In [66]:
# Per-category totals of the numeric columns (Start, End, FP_Count, Length).
# numeric_only=True states explicitly that text columns (Chrom, Strand, Symbol, ...)
# are excluded, instead of relying on pandas dropping them implicitly, which
# newer pandas versions no longer do.
AllRegions_AnnoByFPs_FiltMQ30_DF.groupby("ExcludedGroup_Category").sum(numeric_only=True)

Unnamed: 0_level_0,Start,End,FP_Count,Length
ExcludedGroup_Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Coscolla Repetitive Genes,168204775,168285322,26,80547
InsertionSeqs_And_Phages,364622673,364731336,20,108663
Intergenic,6731198364,6731584086,217,385722
NotExcluded,8071642055,8075211718,14,3569663
PE/PPEs,375075922,375359266,271,283344


In [67]:
# Grand totals of the numeric columns, summed across all categories.
# numeric_only=True states explicitly that text columns are excluded from the
# per-category sums, instead of relying on pandas dropping them implicitly, which
# newer pandas versions no longer do.
AllRegions_AnnoByFPs_FiltMQ30_DF.groupby("ExcludedGroup_Category").sum(numeric_only=True).sum()

Start       15710743789
End         15715171728
FP_Count            548
Length          4427939
dtype: int64

# Outputting Supplementary tables for this analysis

## Make output directory

In [68]:

#FP_And_SV_DistributionAcrossH37Rv_Analysis_Dir = f"{PB_Vs_Illumina_DataAnalysis_Dir}/201016_FP_TP_FN_And_SV_DistributionAcrossH37Rv_Analysis_V3_PB_MM2_GT"
#!mkdir $FP_And_SV_DistributionAcrossH37Rv_Analysis_Dir

## A) FPs per region (gene + intergenic)

In [69]:
AllRegions_AnnoBy_FPs_FiltMQ30_TSV = f"{FalsePositive_Analysis_V2_Dir}/H37Rv_AllRegions_GeneAndIntergenic_NumberOf_FPs.FiltMQ30.tsv"           

In [70]:
AllRegions_AnnoByFPs_FiltMQ30_DF.to_csv(AllRegions_AnnoBy_FPs_FiltMQ30_TSV, sep = "\t", index = False)         


In [71]:
!head -n 4 $AllRegions_AnnoBy_FPs_FiltMQ30_TSV

Chrom	Start	End	Strand	H37rv_GeneID	Symbol	ExcludedGroup_Category	PEandPPE_Subfamily	Functional_Category	FP_Count	Length
NC_000962.3	0	1524	+	Rv0001	dnaA	NotExcluded	None	information pathways	0	1524
NC_000962.3	1524	2051		IntergenicRegion_1_Rv0001-Rv0002		Intergenic		Intergenic	0	527
NC_000962.3	2051	3260	+	Rv0002	dnaN	NotExcluded	None	information pathways	0	1209


In [72]:
AllRegions_AnnoByFPs_FiltMQ30_DF.head(3)

Unnamed: 0,Chrom,Start,End,Strand,H37rv_GeneID,Symbol,ExcludedGroup_Category,PEandPPE_Subfamily,Functional_Category,FP_Count,Length
271,NC_000962.3,0,1524,+,Rv0001,dnaA,NotExcluded,,information pathways,0,1524
272,NC_000962.3,1524,2051,,IntergenicRegion_1_Rv0001-Rv0002,,Intergenic,,Intergenic,0,527
273,NC_000962.3,2051,3260,+,Rv0002,dnaN,NotExcluded,,information pathways,0,1209
