# Exploring regions that with ambiguously defined ground truths relative to the H37Rv reference genome

### Goal: To summarize the variant caller annotations associated with poorly recalled genomic positions. 

In [1]:
import numpy as np
import pandas as pd
import vcf
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import json

%matplotlib inline

#### Pandas Viewing Settings

In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

## Parse Assembly and Sequence analysis results Sample Info

In [3]:
Repo_DataDir = "../../Data"

PMP_SM_ResultsSummary_Dir_210108 = Repo_DataDir + "/210108_PMP_SM_50CI_V7_ResultsSummary"

PMP_36CI_CicularOnly_F2Filtered_AtLeast40XIllDepth_AssemblySummary_TSV_PATH = PMP_SM_ResultsSummary_Dir_210108 + "/210108_PMP_36CI_CircularOnly_F2Filtered_AtLeast40XMeanDepthIllumina_AssemblySummary_V7.tsv"       

PMP_36CI_CicularOnly_F2Filtered_AtLeast40XDepth_AssemblySummary = pd.read_csv(PMP_36CI_CicularOnly_F2Filtered_AtLeast40XIllDepth_AssemblySummary_TSV_PATH, sep = "\t")

PMP_36CI_AnalysisSet_AssemblySummary = PMP_36CI_CicularOnly_F2Filtered_AtLeast40XDepth_AssemblySummary


SampleIDs_36CI_SOI = list( PMP_36CI_AnalysisSet_AssemblySummary["SampleID"].values )


print(','.join(SampleIDs_36CI_SOI) )

# Make sample to metadata mapping dicts

ID_To_IlluminaAvrgCov_Dict = dict(PMP_36CI_AnalysisSet_AssemblySummary[['SampleID', 'IlluminaWGSToH37rv_AvrgCov']].values)                     
ID_To_Lineage_Dict = dict(PMP_36CI_AnalysisSet_AssemblySummary[['SampleID', 'PrimaryLineage_PB']].values)
ID_To_Dataset_Dict = dict(PMP_36CI_AnalysisSet_AssemblySummary[['SampleID', 'Dataset_Tag']].values)

M0011368_9,M0014888_3,M0016395_7,M0010874_7,01_R1430,02_R0894,02_R1708,02_R1896,M0016737_0,M0017522_5,01_R1134,M0003941_3,02_R1179,N1176,N0072,N0153,N0145,N0155,N0004,N1274,N0054,N1272,N0091,N1202,N1177,RW-TB008,DNA028,DNA075,DNA091,DNA044,DNA020,AZE_02_042,DNA019_Rose,DNA120,DNA188,DNA086


# 0) Read in pickles of processed data

## Read back in NPY for EBR-36CI

In [4]:
PB_Vs_Illumina_DataAnalysis_Dir = "../../../210112_PBvsI_VCeval_AnalysisDir_V7_36CI"

# Define directory for EBR analysis data
PBvIll_EBR_Dir = PB_Vs_Illumina_DataAnalysis_Dir + "/210112_EBR_H37rv_36CI_MM2vsPilon_V7"         

PBvsIll_EBR_IndivSample_NPZs = f"{PBvIll_EBR_Dir}/210112_EBR_H37rv_IndividualSampleRecall_NPZs"


# Parse in aggregated EBR-36CI array
EBR_36CI_WGS40X_NPZ_PATH = f"{PBvIll_EBR_Dir}/210112_EBR_V7_36CI.npz"

EBR_36CI_Array_A4 = np.load(EBR_36CI_WGS40X_NPZ_PATH)["arr_0"]


In [5]:
#print(EBR_31CI_Array.shape)

In [6]:
print("# of total H37Rv positions:", EBR_36CI_Array_A4.shape[0]  )

# of total H37Rv positions: 4411532


In [7]:
print("# of H37Rv positions w/ EBR < 1:", EBR_36CI_Array_A4[ EBR_36CI_Array_A4 < 1].shape[0])    

# of H37Rv positions w/ EBR < 1: 267471


  """Entry point for launching an IPython kernel.


In [8]:
EBR_36CI_Array_A4[ EBR_36CI_Array_A4 < 1].shape[0] /  EBR_36CI_Array_A4.shape[0]

  """Entry point for launching an IPython kernel.


0.060629958028186125

In [9]:
EBR_36CI_Array_A4[ EBR_36CI_Array_A4 < 0.95].shape[0] /  EBR_36CI_Array_A4.shape[0]

  """Entry point for launching an IPython kernel.


0.03752596603628853

In [10]:
EBR_36CI_Array_A4[ EBR_36CI_Array_A4 < 0.9].shape[0] /  EBR_36CI_Array_A4.shape[0]

  """Entry point for launching an IPython kernel.


0.028594828281875775

### Define directories of PMP-SM pipeline

In [11]:
# Define varaint calling pipeline output directories

PacBio_ProjectDir = "/n/data1/hms/dbmi/farhat/mm774/Projects/PacBio_Evaluation_Project"
PMP_SM_Outputs_Dir = PacBio_ProjectDir + "/PacmanPipe_SM_Outputs"
PMP_SM_TB_Portals_R1_Illumina_WGS_201201_OutputDir = PMP_SM_Outputs_Dir + "/201201_PMP_SM_TB_Portals_R1_Output_V2"     



In [12]:
dictOf_EBR_IndivIsolate_DFs = {}

#dictOf_EBR_IndivIsolate_NPYs = {}

#dictOf_BED_DFs_IndivSample_EBRs = {}

dictOf_EBR_Breakdown_Dicts = {}

SumOf_EBR_Breakdown_Dicts = {"Outcome_5" : 0,
                             "Outcome_7" : 0}


sample_Counter = 1

for SampleID in tqdm(SampleIDs_36CI_SOI):
    
    Sample_Output_Dir = PMP_SM_TB_Portals_R1_Illumina_WGS_201201_OutputDir + "/" + SampleID
    
    EBR_Output_Dir = f"{Sample_Output_Dir}/EmpiricalBasePairRecall_Analysis_V7_PacBio_Vs_IlluminaPilon"
    
    # Let's read in EBR TSV
    i_EBR_Agreement_DF_TSV_PATH = f"{EBR_Output_Dir}/EBR.V7.IndivIsolate.{SampleID}.tsv"
        
    i_EBR_Outcome_Breakdown_Dict_JSON_PATH = f"{EBR_Output_Dir}/EBR.V7.IndivIsolate.{SampleID}.OutcomeBreakdown.json"        
    
    with open(i_EBR_Outcome_Breakdown_Dict_JSON_PATH) as json_file: i_count_EBR_Outcomes_Dict = json.load(json_file) 
    
    i_EBR_DF = pd.read_csv(i_EBR_Agreement_DF_TSV_PATH, sep = "\t" )

    #i_EBR_WithAmbFilled_Array = i_EBR_DF["Agreement"].fillna("Ambiguous").values
    
    #dictOf_BED_DFs_IndivSample_EBRs[SampleID] = convert_GenomeNParray_To_BED_DF(i_EBR_WithAmbFilled_Array)
    
    
    dictOf_EBR_IndivIsolate_DFs[SampleID] = i_EBR_DF

    #dictOf_EBR_IndivIsolate_NPYs[SampleID] = i_EBR_DF["Agreement"].values
        
    dictOf_EBR_Breakdown_Dicts[SampleID] = i_count_EBR_Outcomes_Dict

    
    SumOf_EBR_Breakdown_Dicts["Outcome_5"] += i_count_EBR_Outcomes_Dict["Outcome_5"]
    SumOf_EBR_Breakdown_Dicts["Outcome_7"] += i_count_EBR_Outcomes_Dict["Outcome_7"]

    
    
    

100%|██████████| 36/36 [02:19<00:00,  3.99s/it]


In [13]:
SumOf_EBR_Breakdown_Dicts

{'Outcome_5': 669, 'Outcome_7': 113848}

In [14]:
117074 + 591

117665

In [15]:
591 / 1829181

0.0003230954181133524

In [16]:
117682 / 1829181 

0.0643358967756608

In [17]:
dictOf_EBR_IndivIsolate_DFs["N0153"].head()

Unnamed: 0,Agreement,Ill_Pilon_Tag,Ill_Pilon_TD,Ill_Pilon_DP,Ill_Pilon_MQ
0,1.0,PASS,65.0,40.0,42.0
1,1.0,PASS,66.0,41.0,42.0
2,1.0,PASS,66.0,41.0,42.0
3,1.0,PASS,66.0,41.0,42.0
4,1.0,PASS,66.0,41.0,42.0


In [18]:
ListOf_lowEBRpos_DFs = []
pd.set_option('mode.chained_assignment', None)

for SampleID in ( SampleIDs_36CI_SOI ) :
    
    i_EBR_DF = dictOf_EBR_IndivIsolate_DFs[SampleID]
    
    i_EBR_DF_lowEBRpos = i_EBR_DF[ i_EBR_DF["Agreement"] == 0]
        
    i_EBR_DF_lowEBRpos["SampleID"] = SampleID
    
    ListOf_lowEBRpos_DFs.append(i_EBR_DF_lowEBRpos)
    
    #print(SampleID, i_EBR_DF_lowEBRpos.shape)
    
ALL_lowEBRpos_36CI_DF = pd.concat(ListOf_lowEBRpos_DFs)
ALL_lowEBRpos_36CI_DF.shape


(1825385, 6)

In [19]:
ALL_lowEBRpos_36CI_DF.head(3)

Unnamed: 0,Agreement,Ill_Pilon_Tag,Ill_Pilon_TD,Ill_Pilon_DP,Ill_Pilon_MQ,SampleID
71335,0.0,Del;LowCov,4.0,4.0,60.0,M0011368_9
71336,0.0,Del;LowCov,4.0,4.0,60.0,M0011368_9
71337,0.0,Del;LowCov,4.0,4.0,60.0,M0011368_9


In [20]:
ALL_lowEBRpos_36CI_DF["Ill_Pilon_Tag"].value_counts().sum()

1825385

In [21]:
ALL_lowEBRpos_36CI_DF["Ill_Pilon_Tag"].value_counts()

LowCov            1145079
Del;LowCov         302826
Del                257423
PASS               114517
Amb                  3773
Amb;LowCov            858
Del;Amb               719
Del;Amb;LowCov        190
Name: Ill_Pilon_Tag, dtype: int64

In [22]:
ALL_lowEBRpos_36CI_DF["Ill_Pilon_Tag"].value_counts(normalize=True)

LowCov            0.627308
Del;LowCov        0.165897
Del               0.141024
PASS              0.062736
Amb               0.002067
Amb;LowCov        0.000470
Del;Amb           0.000394
Del;Amb;LowCov    0.000104
Name: Ill_Pilon_Tag, dtype: float64

In [23]:

PilonTags_For_Del_Classification = ["Del", "Del;LowCov", "Del;Amb", "Del;Amb;LowCov"]

#PilonTags_For_LowCov_Classification = ["LowCov","Amb;LowCov",]

#PilonTags_For_LowCov_Classification = ["LowCov", "Del;LowCov", "Amb;LowCov", "Del;Amb;LowCov"]


PilonTags_For_LowCov_Classification = ["LowCov","Amb;LowCov",]


PilonTags_For_Amb_Classification = ["Amb"]

PilonTags_For_PASS_Classification = ["PASS"]



In [24]:
Del_Class_lowEBRpos_36CI_DF = ALL_lowEBRpos_36CI_DF[ ALL_lowEBRpos_36CI_DF["Ill_Pilon_Tag"].isin(PilonTags_For_Del_Classification) ]

LowCov_Class_lowEBRpos_36CI_DF = ALL_lowEBRpos_36CI_DF[ ALL_lowEBRpos_36CI_DF["Ill_Pilon_Tag"].isin(PilonTags_For_LowCov_Classification) ]

Amb_Class_lowEBRpos_36CI_DF = ALL_lowEBRpos_36CI_DF[ ALL_lowEBRpos_36CI_DF["Ill_Pilon_Tag"].isin(["Amb"]) ]

PASS_Class_lowEBRpos_36CI_DF = ALL_lowEBRpos_36CI_DF[ ALL_lowEBRpos_36CI_DF["Ill_Pilon_Tag"].isin(["PASS"]) ]


In [25]:
ALL_lowEBRpos_36CI_DF["Ill_Pilon_Tag"].value_counts().sum()

1825385

# What are the breakdown Pilon tags associated with positions of poor recall. 

### A) How many low recall positions were FALSE deletion calls (Del)?

In [26]:
Del_Class_lowEBRpos_36CI_DF.shape

(561158, 6)

In [27]:
Del_Class_lowEBRpos_36CI_DF.shape[0] / ALL_lowEBRpos_36CI_DF.shape[0]

0.30741898284471497

### B) How many low recall positions were tagged as LowCov (with no deletion call)?

In [28]:
LowCov_Class_lowEBRpos_36CI_DF.shape

(1145937, 6)

In [29]:
LowCov_Class_lowEBRpos_36CI_DF.shape[0] / ALL_lowEBRpos_36CI_DF.shape[0]

0.6277782495199643

### C) How many low recall positions were tagged as Amb (with no Del or LowCov tag)?

In [30]:
Amb_Class_lowEBRpos_36CI_DF.shape

(3773, 6)

In [31]:
Amb_Class_lowEBRpos_36CI_DF.shape[0] / ALL_lowEBRpos_36CI_DF.shape[0]

0.0020669612164009236

### D) How many low recall positions were FALSEY tagged as PASS?

In [32]:
PASS_Class_lowEBRpos_36CI_DF.shape

(114517, 6)

In [33]:
PASS_Class_lowEBRpos_36CI_DF.shape[0] / ALL_lowEBRpos_36CI_DF.shape[0]

0.06273580641891985

## Verify that total counts match up

In [34]:
ALL_lowEBRpos_36CI_DF["Ill_Pilon_Tag"].value_counts().sum()

1825385

In [35]:
Del_Class_lowEBRpos_36CI_DF.shape[0]+ LowCov_Class_lowEBRpos_36CI_DF.shape[0] + Amb_Class_lowEBRpos_36CI_DF.shape[0] + PASS_Class_lowEBRpos_36CI_DF.shape[0]

1825385

In [36]:

#TF_Array_WithLowCov = ( (ALL_lowEBRpos_36CI_DF["Ill_Pilon_Tag"].isin(listOf_PilonTags_For_LowCov_Outcome) ) )

#lowEBRpos_36CI_LowCov_DF = ALL_lowEBRpos_36CI_DF[TF_Array_WithLowCov]

lowEBRpos_36CI_LowCov_MQZero_DF = LowCov_Class_lowEBRpos_36CI_DF.query("(Ill_Pilon_TD >= 5) & (Ill_Pilon_DP < 5) & (Ill_Pilon_MQ == 0) ")
lowEBRpos_36CI_LowCov_InvalidPairs_DF = LowCov_Class_lowEBRpos_36CI_DF.query("(Ill_Pilon_TD >= 5) & (Ill_Pilon_DP < 5) & (Ill_Pilon_MQ > 0) ")      
lowEBRpos_36CI_LowCov_Insufficient_TotalCov_DF = LowCov_Class_lowEBRpos_36CI_DF.query("(Ill_Pilon_TD < 5)  ")



## How much total low-EBR positions have the LowCov tag?
Answer: 1,446,305

In [37]:
LowCov_Class_lowEBRpos_36CI_DF.shape

(1145937, 6)

## How much total low-EBR positions have the LowCov and had insufficient total depth ?
Answer: ____ (__% of all Low-EBR and LowCov positions)

In [38]:
lowEBRpos_36CI_LowCov_Insufficient_TotalCov_DF.shape

(525373, 6)

In [39]:
lowEBRpos_36CI_LowCov_Insufficient_TotalCov_DF.shape[0] / LowCov_Class_lowEBRpos_36CI_DF.shape[0]

0.45846586679721485

## How much total low-EBR positions have the LowCov and had insufficient depth due to MQ = 0 ?
Answer: 316,473 (27.6% of all Low-EBR and LowCov positions)

In [40]:
lowEBRpos_36CI_LowCov_MQZero_DF.shape

(316304, 6)

In [41]:
lowEBRpos_36CI_LowCov_MQZero_DF.shape[0] / LowCov_Class_lowEBRpos_36CI_DF.shape[0]

0.2760221547955952

## How much total low-EBR positions have the LowCov and had insufficient depth due to Invalid Pairs (MQ > 0)?
Answer: _____ (__% of all Low-EBR and LowCov positions)

In [42]:
lowEBRpos_36CI_LowCov_InvalidPairs_DF.shape

(304260, 6)

In [43]:
lowEBRpos_36CI_LowCov_InvalidPairs_DF.shape[0] / LowCov_Class_lowEBRpos_36CI_DF.shape[0]

0.26551197840718993

### Verify that total counts add up

In [44]:
LowCov_Class_lowEBRpos_36CI_DF.shape[0]

1145937

In [45]:
lowEBRpos_36CI_LowCov_Insufficient_TotalCov_DF.shape[0] + lowEBRpos_36CI_LowCov_InvalidPairs_DF.shape[0] + lowEBRpos_36CI_LowCov_MQZero_DF.shape[0]

1145937

In [46]:
i_EBR_DF_lowEBRpos["Ill_Pilon_Tag"].value_counts()

LowCov            28905
Del;LowCov         5936
Del                4560
PASS                906
Amb                  79
Amb;LowCov           23
Del;Amb              22
Del;Amb;LowCov        4
Name: Ill_Pilon_Tag, dtype: int64

In [47]:
i_EBR_DF_lowEBRpos["Ill_Pilon_Tag"].value_counts(normalize=True)

LowCov            0.714851
Del;LowCov        0.146804
Del               0.112774
PASS              0.022406
Amb               0.001954
Amb;LowCov        0.000569
Del;Amb           0.000544
Del;Amb;LowCov    0.000099
Name: Ill_Pilon_Tag, dtype: float64

In [48]:
i_EBR_DF_lowEBRpos.head()

Unnamed: 0,Agreement,Ill_Pilon_Tag,Ill_Pilon_TD,Ill_Pilon_DP,Ill_Pilon_MQ,SampleID
86769,0.0,LowCov,4.0,4.0,60.0,DNA086
86770,0.0,LowCov,4.0,4.0,60.0,DNA086
86771,0.0,LowCov,3.0,3.0,60.0,DNA086
86772,0.0,LowCov,3.0,3.0,60.0,DNA086
86773,0.0,LowCov,3.0,3.0,60.0,DNA086


In [49]:
i_EBR_DF_lowEBRpos[   i_EBR_DF_lowEBRpos["Ill_Pilon_Tag"] == "PASS"]

Unnamed: 0,Agreement,Ill_Pilon_Tag,Ill_Pilon_TD,Ill_Pilon_DP,Ill_Pilon_MQ,SampleID
154138,0.0,PASS,54.0,50.0,54.0,DNA086
154139,0.0,PASS,54.0,50.0,54.0,DNA086
154140,0.0,PASS,54.0,49.0,54.0,DNA086
154141,0.0,PASS,54.0,49.0,54.0,DNA086
154142,0.0,PASS,54.0,47.0,54.0,DNA086
...,...,...,...,...,...,...
3933629,0.0,PASS,11.0,5.0,40.0,DNA086
3933630,0.0,PASS,11.0,5.0,40.0,DNA086
3933631,0.0,PASS,11.0,5.0,40.0,DNA086
3933632,0.0,PASS,11.0,5.0,40.0,DNA086


In [50]:
i_EBR_DF_lowEBRpos["Ill_Pilon_Tag"].value_counts()

LowCov            28905
Del;LowCov         5936
Del                4560
PASS                906
Amb                  79
Amb;LowCov           23
Del;Amb              22
Del;Amb;LowCov        4
Name: Ill_Pilon_Tag, dtype: int64

In [59]:
i_EBR_DF_lowEBRpos["Ill_Pilon_Tag"].value_counts(normalize=True)

LowCov            0.714851
Del;LowCov        0.146804
Del               0.112774
PASS              0.022406
Amb               0.001954
Amb;LowCov        0.000569
Del;Amb           0.000544
Del;Amb;LowCov    0.000099
Name: Ill_Pilon_Tag, dtype: float64

In [52]:
ALL_lowEBRpos_36CI_DF["Ill_Pilon_Tag"].value_counts(normalize=True)

LowCov            0.627308
Del;LowCov        0.165897
Del               0.141024
PASS              0.062736
Amb               0.002067
Amb;LowCov        0.000470
Del;Amb           0.000394
Del;Amb;LowCov    0.000104
Name: Ill_Pilon_Tag, dtype: float64

In [53]:
ALL_lowEBRpos_36CI_DF.shape

(1825385, 6)

In [54]:
## 

In [55]:
ALL_lowEBRpos_36CI_DF.head(2)

Unnamed: 0,Agreement,Ill_Pilon_Tag,Ill_Pilon_TD,Ill_Pilon_DP,Ill_Pilon_MQ,SampleID
71335,0.0,Del;LowCov,4.0,4.0,60.0,M0011368_9
71336,0.0,Del;LowCov,4.0,4.0,60.0,M0011368_9


In [56]:

listOf_PilonTags_WithDel = ["Del", "Del;LowCov", "Del;Amb", "Del;Amb;LowCov"]
listOf_PilonTags_WithLowCov = ["LowCov", "Del;LowCov", "Amb;LowCov", "Del;Amb;LowCov"]
            
#listOf_PilonTags_WithLowCov = ["LowCov", "Del;LowCov", "Amb;LowCov", "Del;Amb;LowCov"]

#listOf_PilonTags_WithLowCov = ["LowCov","Amb;LowCov", "Del;Amb;LowCov"]

listOf_PilonTags_For_LowCov_Outcome = ["LowCov","Amb;LowCov",]

TF_Array_WithLowCov = ( (ALL_lowEBRpos_36CI_DF["Ill_Pilon_Tag"].isin(listOf_PilonTags_For_LowCov_Outcome) ) )

lowEBRpos_36CI_LowCov_DF = ALL_lowEBRpos_36CI_DF[TF_Array_WithLowCov]

lowEBRpos_36CI_LowCov_MQZero_DF = lowEBRpos_36CI_LowCov_DF.query("(Ill_Pilon_TD >= 5) & (Ill_Pilon_DP < 5) & (Ill_Pilon_MQ == 0) ")
lowEBRpos_36CI_LowCov_InvalidPairs_DF = lowEBRpos_36CI_LowCov_DF.query("(Ill_Pilon_TD >= 5) & (Ill_Pilon_DP < 5) & (Ill_Pilon_MQ > 0) ")      
lowEBRpos_36CI_LowCov_Insufficient_TotalCov_DF = lowEBRpos_36CI_LowCov_DF.query("(Ill_Pilon_TD < 5)  ")




In [64]:
lowEBRpos_36CI_LowCov_DF.shape

(1145937, 6)

In [65]:
lowEBRpos_36CI_LowCov_MQZero_DF.shape

(316304, 6)

In [66]:
lowEBRpos_36CI_LowCov_MQZero_DF.shape[0] / lowEBRpos_36CI_LowCov_DF.shape[0]

0.2760221547955952

In [67]:
lowEBRpos_36CI_LowCov_InvalidPairs_DF.shape

(304260, 6)

In [68]:
lowEBRpos_36CI_LowCov_InvalidPairs_DF.shape[0] / lowEBRpos_36CI_LowCov_DF.shape[0]

0.26551197840718993

In [69]:
lowEBRpos_36CI_LowCov_Insufficient_TotalCov_DF.shape

(525373, 6)

In [70]:
lowEBRpos_36CI_LowCov_Insufficient_TotalCov_DF.shape[0] / lowEBRpos_36CI_LowCov_DF.shape[0]

0.45846586679721485

In [57]:
STOP!!

SyntaxError: invalid syntax (<ipython-input-57-480d53eab50e>, line 1)


### Outcome 1) PB-MM2 and Pilon are both confident, and AGREE on nucleotide identity (EBR = 1)



### Outcome 2) PB-MM2 and Pilon BOTH AGREE that position is deleted (EBR = 1)


### Outcome 3) PB-MM2 supports deletion, but Pilon FAILS and does not support deletion (EBR = 0)

3.22%


### Outcome 4) PB-MM2 and Pilon don’t support position (With no Deletion) (EBR = NAN)

### Outcome 5) PB-MM2 and Pilon are both confident, and DISAGREE (EBR = 0)

### Outcome 6) PB-MM2 is confident, Pilon != PASS (EBR = 0)

### Outcome 7) PB-MM2 DP = 0, PB supports deletion, but Pilon = PASS (EBR = 0)

### Outcome 8) PB-MM2 DP = 0, PB does not support deletion but Pilon = PASS (EBR = NAN)

### Outcome 9) PB-MM2 DP > 1 (DUPLCIATION), and Pilon = PASS (EBR = NAN) 

### Outcome 10) PB-MM2 DP > 1 (DUPLCIATION), and Pilon != PASS (EBR = NAN) 


In [None]:
print("total low EBR positions:", All_LowEBR_Counter)

In [None]:
Total_Pos_HighAndLow_EBR = All_HighEBR_Counter + All_LowEBR_Counter

In [None]:


total = 0

for keyname in [ 'Outcome_3', 'Outcome_5', 'Outcome_6', 'Outcome_7', ]:        
    
    num_Outcome = EBR_Breakdown_Dict_SumAcrossAll28CI[keyname]
    
    propOf_LowEBR = num_Outcome / All_LowEBR_Counter
    propOf_TotalEBR = num_Outcome / Total_Pos_HighAndLow_EBR
    
    
    print(keyname, num_Outcome, "    ", propOf_LowEBR, propOf_TotalEBR)
    
    print(propOf_LowEBR * 100 )
    print()
    total += propOf_LowEBR
    
print(total)

In [None]:
ALL_lowEBRpos_28CI_DF["Ill_Pilon_Tag"].value_counts(normalize=True)

In [None]:
ALL_lowEBRpos_28CI_DF["Ill_Pilon_Tag"].value_counts()

In [None]:
84045 - 83587

In [None]:
83587 / 84045

### Outcome 3) PB-MM2 supports deletion, but Pilon FAILS and does not support deletion (EBR = 0)

4.37%

### Outcome 5) PB-MM2 and Pilon are both confident, and DISAGREE (EBR = 0) "Genotypes Disagree" 

"Illumina WGS erroneously supports the wrong nucleotide identity (SNP disagreement)"

0.029% (456 positions across 28 isolates, ~16.3 false positive genotypes per isolate for Illumina WGS)

### Outcome 6) PB-MM2 is confident, Pilon != PASS (EBR = 0)

"Illumina WGS does not confidently support the presence of a genomic position which PacBio confidently supports the presence of.

90.21%




### Outcome 7) PB-MM2 PB supports deletion, but Pilon = PASS (EBR = 0)

"Illumina WGS erroneously supports presence of "deleted" genomic position.

5.39%



In [None]:
ALL_lowEBRpos_28CI_DF["Ill_Pilon_Tag"].value_counts()

In [None]:
83587

In [None]:
SumOfAll_LowCov_O6 = 242855 + 242800 + 454905

In [None]:
242855 / SumOfAll_LowCov_O6

In [None]:
242800 / SumOfAll_LowCov_O6

In [None]:
454905 / SumOfAll_LowCov_O6

In [None]:
EBR_Breakdown_Dict_SumAcrossAll28CI.keys()

In [None]:

total = 0

for keyname in [ 'Outcome_3', 'Outcome_5', 'Outcome_6', 'Outcome_7', 'Outcome_6_SO_LowCov_Insufficient_ValidCoverage_LowMQ', 'Outcome_6_SO_LowCov_Insufficient_ValidCoverage_FlaggedPairs', 'Outcome_6_SO_LowCov_Insufficient_TotalCoverage', 'Outcome_6_SO_Amb', 'Outcome_6_SO_Del', 'Outcome_6_SO_Amb;LowCov', 'Outcome_6_SO_Del;Amb', 'Outcome_6_SO_Del;LowCov', 'Outcome_6_SO_Del;Amb;LowCov',]:        
    
    num_Outcome = EBR_Breakdown_Dict_SumAcrossAll28CI[keyname]
    
    propOf_LowEBR = num_Outcome / All_LowEBR_Counter
    propOf_TotalEBR = num_Outcome / Total_Pos_HighAndLow_EBR
    
    
    print(keyname, num_Outcome, "    ", propOf_LowEBR, propOf_TotalEBR)
    
    print(propOf_LowEBR * 100 )
    print()
    total += propOf_LowEBR
    
print(total)

Outcome_6_SO_LowCov_Insufficient_ValidCoverage_LowMQ          15.376957420547646  

Outcome_6_SO_LowCov_Insufficient_ValidCoverage_FlaggedPairs   15.3716241955719

Outcome_6_SO_LowCov_Insufficient_TotalCoverage                28.881127494552494

Outcome_6_SO_Del                                              12.930784898338574

Outcome_6_SO_Del;LowCov                                       17.56021464960712

In [None]:
# Proportion of EBR =0 cases where position was inferred to be DELETED
0.1756 + 0.1293

In [None]:
# Proportion of EBR = 0 cases where position was not confidently supported by Illumina-Pilon (LowCov, Amb)
90.39 - 30.49

In [None]:
30.5 % of poorly recalled positions were due to incorrectly inferred deletions

In [None]:
12.93 + 17.56 + 0.03 + 0.008

In [None]:
### Falsely inferring the deletion of 

In [None]:
15.37 + 15.37 + 28.88 + 12.93 + 17.56

In [None]:
### Outcome 3) PB-MM2 supports deletion, but Pilon FAILS and does not support deletion (EBR = 0)

4.27%

### Outcome 5) PB-MM2 and Pilon are both confident, and DISAGREE (EBR = 0) "Genotypes Disagree" 

# "Illumina WGS erroneously supports the wrong nucleotide identity (SNP disagreement)"

0.029% (456 positions across 28 isolates, ~16.3 false positive genotypes per isolate for Illumina WGS)




### Outcome 6) PB-MM2 is confident, Pilon != PASS (EBR = 0)

# "Illumina WGS does not confidently support the presence of a genomic position which PacBio confidently supports the presence of.

90.39%


#### Breakdown of Outcome 6)

### Outcome 6.1) PacBio supports position, Pilon calls LowCov OR Amb

59.9%


### Outcome 6.2) PacBio supports position, Pilon calls DEL

30.5%



#Outcome_6_SO_Del;LowCov


### Outcome 7) PB-MM2  supports deletion, but Pilon = PASS (EBR = 0)

# "Illumina WGS erroneously supports presence of PacBio supported "deleted" genomic position.

5.31%



In [None]:
We found that the most common reasons for a genomic position not being confidently recalled was due to:
    a) low or no sequencing depth,
    b) inability to uniquely align a read due to repetitive sequence content,
    c) invalid alignment orientation of paired-end reads due to reference bias (changes in the genome sequenced relative to the reference).
    
    
    

In [None]:
Illumina - Pilon erronously supports infering a deletion - 30.5%

Insufficient sequencing depth to support a variant call - 59.9% 


# Merge outcomes #3 and #7 
Illumina - Pilon erronously failed to detect the deletion of a genomic region - 9.6%
5.3%
4.3%

# Outcome #5
The remaining << 1% of poorly recalled positions were due to cases where the Illumina WGS inferred genotype disagreed with our ground truth


In [None]:
9.6 + 30.5 + 59.9

In [None]:

total = 0

for keyname in ['Outcome_1', 'Outcome_2', 'Outcome_3', 'Outcome_4', 'Outcome_5', 'Outcome_6', 'Outcome_7', 'Outcome_8', 'Outcome_9', 'Outcome_10', 'Outcome_11_Unknown', ]:        
    
    num_Outcome = EBR_Breakdown_Dict_SumAcrossAll28CI[keyname]
    
    propOf_LowEBR = num_Outcome / All_LowEBR_Counter
    propOf_TotalEBR = num_Outcome / Total_Pos_HighAndLow_EBR
    
    
    print(keyname, num_Outcome, "    ", propOf_LowEBR, propOf_TotalEBR)
    
    total += propOf_LowEBR
    
#print(total)

In [None]:
EBR_Breakdown_Dict_SumAcrossAll28CI["Outcome_4"] 

In [None]:
EBR_Breakdown_Dict_SumAcrossAll28CI["Outcome_4"] / 494707

In [None]:
EBR_Breakdown_Dict_SumAcrossAll28CI["Outcome_8"]

In [None]:
EBR_Breakdown_Dict_SumAcrossAll28CI["Outcome_8"] / 494707

In [None]:
EBR_Breakdown_Dict_SumAcrossAll28CI["Outcome_9"]

In [None]:
EBR_Breakdown_Dict_SumAcrossAll28CI["Outcome_9"] / 494707

In [None]:
EBR_Breakdown_Dict_SumAcrossAll28CI["Outcome_10"]

In [None]:
EBR_Breakdown_Dict_SumAcrossAll28CI["Outcome_10"] / 494707

### Breakdown of NAN counts:

Outcome_4:    (PacBio does not support deletion and )

Outcome_8:
Outcome_9:
Outcome_10:

In [None]:
EBR_Breakdown_Dict_SumAcrossAll28CI.keys()

In [None]:

total = 0

for keyname in [ 'Outcome_3', 'Outcome_4', 'Outcome_5', 'Outcome_6', 'Outcome_7', 'Outcome_8', 'Outcome_9', 'Outcome_10', 'Outcome_11_Unknown', ]:        
    
    num_Outcome = EBR_Breakdown_Dict_SumAcrossAll28CI[keyname]
    
    propOf_LowEBR = num_Outcome / All_LowEBR_Counter
    propOf_TotalEBR = num_Outcome / Total_Pos_HighAndLow_EBR
    
    
    print(keyname, num_Outcome, "    ", propOf_LowEBR)
    
    total += propOf_LowEBR
    
    
print(total)

In [None]:
1137411 / 1264540

In [None]:

total = 0

for keyname in ['Outcome_2', 'Outcome_3', 'Outcome_4', 'Outcome_5', 'Outcome_6', 'Outcome_7',]:
    
    num_Outcome = EBR_Breakdown_Dict_SumAcrossAll28CI[keyname]
    
    propOf_LowEBR = num_Outcome / All_LowEBR_Counter
    propOf_TotalEBR = num_Outcome / Total_Pos_HighAndLow_EBR
    
    
    print(keyname, num_Outcome, "    ", propOf_LowEBR, propOf_TotalEBR)
    
    total += propOf_LowEBR
    
    
print(total)

In [None]:
EBR_Breakdown_Dict_SumAcrossAll28CI

In [None]:
STOPPP!!!

In [None]:

EBR_Breakdown_Dict_SumAcrossAll28CI = {}

for keyname in dictOf_EBR_Breakdown_Dicts_28CI['N0153'].keys():
    
    EBR_Breakdown_Dict_SumAcrossAll28CI[keyname] = 0

All_HighEBR_Counter = 0

All_HighEBR_Outcome1_Counter = 0 
#All_HighEBR_Outcome2_Counter = 0


All_LowEBR_Counter = 0

All_LowEBR_Outcome2_Counter = 0

All_LowEBR_Outcome4_Counter = 0
All_LowEBR_Outcome4_LowCov_Invalid_Counter = 0
All_LowEBR_Outcome4_LowCov_Total_Counter = 0

All_LowEBR_Outcome4_LowCov_Invalid_LowMQ_Counter = 0
All_LowEBR_Outcome4_LowCov_Invalid_FlaggedPairs_Counter = 0


All_LowEBR_Outcome5_FalsePASS_Counter = 0 

All_HighEBR_Outcome6_DP2_NtAgree_Counter = 0 
All_LowEBR_Outcome6_DP2_NtDisagree_Counter = 0 


All_LowEBR_Outcome7_Counter = 0 

All_LowEBR_Outcome3_Counter = 0 


for SampleID in tqdm(SampleIDs_28CI_WiCircA): 
    
    
    i_count_EBR_Outcomes_Dict = dictOf_EBR_Breakdown_Dicts_28CI[SampleID]
    
    TotalPos_HighEBR = i_count_EBR_Outcomes_Dict["Outcome_1"]  
    TotalPos_LowEBR = i_count_EBR_Outcomes_Dict["Outcome_2"] + i_count_EBR_Outcomes_Dict["Outcome_3"] + i_count_EBR_Outcomes_Dict["Outcome_4"] + i_count_EBR_Outcomes_Dict["Outcome_5"] + i_count_EBR_Outcomes_Dict["Outcome_6_DP2_NtAgree"] + i_count_EBR_Outcomes_Dict["Outcome_6_DP2_NtDisagree"] + i_count_EBR_Outcomes_Dict["Outcome_7"]       
    
    
    
    All_LowEBR_Counter += TotalPos_LowEBR
    All_HighEBR_Counter += TotalPos_HighEBR
    
    TotalPos = TotalPos_HighEBR + TotalPos_LowEBR

    All_HighEBR_Outcome1_Counter += i_count_EBR_Outcomes_Dict["Outcome_1"]
    All_LowEBR_Outcome2_Counter += i_count_EBR_Outcomes_Dict["Outcome_2"]



    PropOfLowEBR_By_General_LowCov = i_count_EBR_Outcomes_Dict["Outcome_4"] / TotalPos_LowEBR
    All_LowEBR_Outcome4_Counter += i_count_EBR_Outcomes_Dict["Outcome_4"]
    All_LowEBR_Outcome5_FalsePASS_Counter += i_count_EBR_Outcomes_Dict["Outcome_5"]

    All_LowEBR_Outcome4_LowCov_Invalid_LowMQ_Counter += i_count_EBR_Outcomes_Dict["Outcome_4_SO_LowCov_Insufficient_ValidCoverage_LowMQ"]
    All_LowEBR_Outcome4_LowCov_Invalid_FlaggedPairs_Counter += i_count_EBR_Outcomes_Dict["Outcome_4_SO_LowCov_Insufficient_ValidCoverage_FlaggedPairs"]

    All_LowEBR_Outcome4_LowCov_Total_Counter += i_count_EBR_Outcomes_Dict["Outcome_4_SO_LowCov_Insufficient_TotalCoverage"]
    
    
    #All_LowEBR_Outcome6_Counter += i_count_EBR_Outcomes_Dict["Outcome_6"]
    All_HighEBR_Outcome6_DP2_NtAgree_Counter += i_count_EBR_Outcomes_Dict["Outcome_6_DP2_NtAgree"]    
    All_LowEBR_Outcome6_DP2_NtDisagree_Counter += i_count_EBR_Outcomes_Dict["Outcome_6_DP2_NtDisagree"]    
    
    
    All_LowEBR_Outcome7_Counter += i_count_EBR_Outcomes_Dict["Outcome_7"]

    
    All_LowEBR_Outcome3_Counter += i_count_EBR_Outcomes_Dict["Outcome_3"]

    
    PropOfLowEBR_By_Insufficient_Cov = i_count_EBR_Outcomes_Dict["Outcome_4_SO_LowCov_Insufficient_TotalCoverage"] / TotalPos_LowEBR    
    
    PropOfLowEBR_By_Insufficient_Valid_Cov_LowMQ = i_count_EBR_Outcomes_Dict["Outcome_4_SO_LowCov_Insufficient_ValidCoverage_LowMQ"] / TotalPos_LowEBR    
    PropOfLowEBR_By_Insufficient_Valid_Cov_FlaggedPairs = i_count_EBR_Outcomes_Dict["Outcome_4_SO_LowCov_Insufficient_ValidCoverage_FlaggedPairs"] / TotalPos_LowEBR    
    
    
    for keyname in EBR_Breakdown_Dict_SumAcrossAll28CI.keys():
    
        EBR_Breakdown_Dict_SumAcrossAll28CI[keyname] += i_count_EBR_Outcomes_Dict[keyname]
    


    print(SampleID,)
    print("Number of Cat-3 outcomes (Nucleotide disagreement) :", i_count_EBR_Outcomes_Dict["Outcome_3"])
    print("PropOfLowEBR_By_General_LowCov :", PropOfLowEBR_By_General_LowCov)

    print("PropOfLowEBR_By_Insufficient_Cov :", PropOfLowEBR_By_Insufficient_Cov)

    print("PropOfLowEBR_By_Insufficient_Valid_Cov B/c MQ = 0 :", PropOfLowEBR_By_Insufficient_Valid_Cov_LowMQ)
    print("PropOfLowEBR_By_Insufficient_Valid_Cov B/c Flagged Pairs :", PropOfLowEBR_By_Insufficient_Valid_Cov_FlaggedPairs)                

    print()


In [None]:
All_HighEBR_Counter

In [None]:
All_LowEBR_Counter

In [None]:
Total_Pos_HighAndLow_EBR = All_LowEBR_Counter + All_HighEBR_Counter

In [None]:
All_LowEBR_Counter / (All_LowEBR_Counter + All_HighEBR_Counter)

In [None]:
All_HighEBR_Counter

In [None]:
All_LowEBR_Counter

In [None]:
All_LowEBR_Outcome4_Counter / All_LowEBR_Counter

In [None]:
All_LowEBR_Outcome4_Counter / All_LowEBR_Counter

In [None]:
All_HighEBR_Outcome6_DP2_NtAgree_Counter

In [None]:
All_LowEBR_Outcome6_DP2_NtDisagree_Counter

In [None]:
All_LowEBR_Outcome5_FalsePASS_Counter 

In [None]:
All_LowEBR_Outcome5_FalsePASS_Counter / All_LowEBR_Counter

In [None]:
All_LowEBR_Outcome7_Counter / All_LowEBR_Counter

In [None]:
All_LowEBR_Outcome4_LowCov_Invalid_LowMQ_Counter / All_LowEBR_Counter

In [None]:
All_LowEBR_Outcome4_LowCov_Invalid_FlaggedPairs_Counter / All_LowEBR_Counter

In [None]:
All_LowEBR_Outcome4_LowCov_Invalid_LowMQ_Counter + All_LowEBR_Outcome4_LowCov_Invalid_FlaggedPairs_Counter 

In [None]:
689215 / All_LowEBR_Counter

### 27% of all zero EBR positions were due to insufficient "Valid covereage" by MQ = 0 (No unique reads)

In [None]:
All_LowEBR_Outcome4_LowCov_Invalid_LowMQ_Counter / All_LowEBR_Counter

### 20% of all zero EBR positions were due to insufficient "Valid covereage" by Pilon Hueristics

In [None]:
All_LowEBR_Outcome4_LowCov_Invalid_FlaggedPairs_Counter / All_LowEBR_Counter

### 36% of all zero EBR positions were due to insufficient "Total Coverage"

In [None]:
All_LowEBR_Outcome4_LowCov_Total_Counter / All_LowEBR_Counter

In [None]:
All_LowEBR_Outcome2_Counter / All_LowEBR_Counter

In [None]:
All_LowEBR_Outcome4_LowCov_Counter / All_LowEBR_Counter

In [None]:
27 + 20 + 36

In [None]:
NumAll_LowCov_LowEBR_Pos = (All_LowEBR_Outcome4_LowCov_Total_Counter + All_LowEBR_Outcome4_LowCov_Invalid_LowMQ_Counter + All_LowEBR_Outcome4_LowCov_Invalid_FlaggedPairs_Counter)

NumAll_LowCov_LowEBR_Pos / All_LowEBR_Counter

In [None]:
All_LowEBR_Outcome4_LowCov_Counter / All_LowEBR_Counter

### 14% of all zero EBR positions were due to false confidence by Pilon (PASS when PacBio does not support)

In [None]:
All_LowEBR_Outcome5_FalsePASS_Counter / All_LowEBR_Counter

### 0.03% of all zero EBR positions were due to GENOTYPE DISAGREEMENTS in DUPLICATed regions in the PacBio assembly and Pilon being PASS

In [None]:
All_HighEBR_Outcome6_DP2_NtAgree_Counter

In [None]:
All_LowEBR_Outcome6_DP2_NtDisagree_Counter

In [None]:
All_LowEBR_Outcome6_DP2_NtDisagree_Counter / All_LowEBR_Counter

In [None]:
All_LowEBR_Outcome6_DP2_NtDisagree_Counter / All_LowEBR_Counter

In [None]:
All_HighEBR_Counter

In [None]:
All_LowEBR_Counter

In [None]:
All_HighEBR_Outcome6_DP2_NtAgree_Counter / All_HighEBR_Counter

In [None]:
All_HighEBR_Outcome6_DP2_NtAgree_Counter / (All_HighEBR_Counter + All_LowEBR_Counter)

In [None]:
(All_HighEBR_Outcome6_DP2_NtAgree_Counter +  All_LowEBR_Outcome6_DP2_NtDisagree_Counter) / (All_HighEBR_Counter + All_LowEBR_Counter)

### 1% of all zero EBR positions were due to DUPLICATIONS in the PacBio assembly and Pilon being NOT PASS

In [None]:
All_LowEBR_Outcome7_Counter / All_LowEBR_Counter

### **<<< 1%** of all zero EBR positions were due to nucleotide disagreements between

In [None]:
All_LowEBR_Outcome3_Counter

In [None]:
All_LowEBR_Outcome3_Counter / All_LowEBR_Counter

In [None]:
All_LowEBR_Outcome3_Counter / All_LowEBR_Counter

In [None]:
All_LowEBR_Counter

In [None]:
44 + 34 + 13 + 8 + 1 

In [None]:
All_HighEBR_Counter

In [None]:
All_HighEBR_Outcome1_Counter 
All_HighEBR_Outcome2_Counter
    

In [None]:
All_HighEBR_Outcome1_Counter / All_HighEBR_Counter

In [None]:
All_HighEBR_Outcome2_Counter / All_HighEBR_Counter

In [None]:
All_HighEBR_Outcome6_DP2_NtAgree_Counter / All_HighEBR_Counter