# Exploring variant caller (Pilon) tags associated with poorly recalled genomic positions (Across all 36 isolates evaluated) 

### Goal: To summarize the variant caller annotations associated with poorly recalled genomic positions. 

In [1]:
import numpy as np
import pandas as pd
import vcf
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import json

%matplotlib inline

#### Pandas Viewing Settings

In [2]:
# Widen the pandas display limits so large tables are shown without truncation.
for _option_name, _option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_option_name, _option_value)

## Parse Assembly and Sequence analysis results Sample Info

In [3]:
# Locate the results-summary directory relative to this notebook.
Repo_DataDir = "../../Data"
PMP_SM_ResultsSummary_Dir_210108 = f"{Repo_DataDir}/210108_PMP_SM_50CI_V7_ResultsSummary"

# TSV with one row per isolate in the 36-isolate analysis set.
PMP_36CI_CicularOnly_F2Filtered_AtLeast40XIllDepth_AssemblySummary_TSV_PATH = (
    f"{PMP_SM_ResultsSummary_Dir_210108}"
    "/210108_PMP_36CI_CircularOnly_F2Filtered_AtLeast40XMeanDepthIllumina_AssemblySummary_V7.tsv"
)

PMP_36CI_CicularOnly_F2Filtered_AtLeast40XDepth_AssemblySummary = pd.read_csv(
    PMP_36CI_CicularOnly_F2Filtered_AtLeast40XIllDepth_AssemblySummary_TSV_PATH,
    sep="\t",
)

# Shorter alias used by the rest of the notebook (same DataFrame object, not a copy).
PMP_36CI_AnalysisSet_AssemblySummary = PMP_36CI_CicularOnly_F2Filtered_AtLeast40XDepth_AssemblySummary

# Sample IDs of interest, in the order they appear in the summary table.
SampleIDs_36CI_SOI = PMP_36CI_AnalysisSet_AssemblySummary["SampleID"].tolist()
print(",".join(SampleIDs_36CI_SOI))


def _sampleID_lookup(column_name):
    """Return a dict mapping each SampleID to its value in `column_name` of the assembly summary."""
    return dict(PMP_36CI_AnalysisSet_AssemblySummary[["SampleID", column_name]].values)


# Sample -> metadata mapping dicts
ID_To_IlluminaAvrgCov_Dict = _sampleID_lookup("IlluminaWGSToH37rv_AvrgCov")
ID_To_Lineage_Dict = _sampleID_lookup("PrimaryLineage_PB")
ID_To_Dataset_Dict = _sampleID_lookup("Dataset_Tag")

M0011368_9,M0014888_3,M0016395_7,M0010874_7,01_R1430,02_R0894,02_R1708,02_R1896,M0016737_0,M0017522_5,01_R1134,M0003941_3,02_R1179,N1176,N0072,N0153,N0145,N0155,N0004,N1274,N0054,N1272,N0091,N1202,N1177,RW-TB008,DNA028,DNA075,DNA091,DNA044,DNA020,AZE_02_042,DNA019_Rose,DNA120,DNA188,DNA086


# 0) Read in pickles of processed data

## Read back in NPY for EBR-36CI

In [4]:
PB_Vs_Illumina_DataAnalysis_Dir = "../../../210112_PBvsI_VCeval_AnalysisDir_V7_36CI"

# Directory holding the EBR analysis data
PBvIll_EBR_Dir = PB_Vs_Illumina_DataAnalysis_Dir + "/210112_EBR_H37rv_36CI_MM2vsPilon_V7"

# Directory of per-sample recall NPZ files (defined here; not read in this cell)
PBvsIll_EBR_IndivSample_NPZs = f"{PBvIll_EBR_Dir}/210112_EBR_H37rv_IndividualSampleRecall_NPZs"

# Aggregated EBR-36CI array
EBR_36CI_WGS40X_NPZ_PATH = f"{PBvIll_EBR_Dir}/210112_EBR_V7_36CI.npz"

# np.load on an .npz returns an NpzFile that keeps the archive open until it is
# closed. Using it as a context manager releases the file handle as soon as the
# array has been read into memory.
with np.load(EBR_36CI_WGS40X_NPZ_PATH) as _ebr_npz:
    EBR_36CI_Array_A4 = _ebr_npz["arr_0"]


In [5]:
#print(EBR_31CI_Array.shape)

In [6]:
# Length of the aggregated EBR array along its first axis
print(f"# of total H37Rv positions: {EBR_36CI_Array_A4.shape[0]}")

# of total H37Rv positions: 4411532


In [7]:
# Count the positions whose EBR is strictly below 1 (counts the True entries of the mask directly)
print(f"# of H37Rv positions w/ EBR < 1: {np.count_nonzero(EBR_36CI_Array_A4 < 1)}")

# of H37Rv positions w/ EBR < 1: 267471


  """Entry point for launching an IPython kernel.


In [8]:
# Fraction of positions with EBR < 1
np.count_nonzero(EBR_36CI_Array_A4 < 1) / EBR_36CI_Array_A4.shape[0]

  """Entry point for launching an IPython kernel.


0.060629958028186125

In [9]:
# Fraction of positions with EBR < 0.95
np.count_nonzero(EBR_36CI_Array_A4 < 0.95) / EBR_36CI_Array_A4.shape[0]

  """Entry point for launching an IPython kernel.


0.03752596603628853

In [10]:
# Fraction of positions with EBR < 0.9
np.count_nonzero(EBR_36CI_Array_A4 < 0.9) / EBR_36CI_Array_A4.shape[0]

  """Entry point for launching an IPython kernel.


0.028594828281875775

### Define directories of PMP-SM pipeline

In [11]:
# Variant calling pipeline output directories.
# NOTE(review): absolute path to a specific filesystem; the notebook will only
# run where this location exists.
PacBio_ProjectDir = "/n/data1/hms/dbmi/farhat/mm774/Projects/PacBio_Evaluation_Project"
PMP_SM_Outputs_Dir = f"{PacBio_ProjectDir}/PacmanPipe_SM_Outputs"
PMP_SM_TB_Portals_R1_Illumina_WGS_201201_OutputDir = f"{PMP_SM_Outputs_Dir}/201201_PMP_SM_TB_Portals_R1_Output_V2"



In [12]:
# Per-isolate containers, keyed by SampleID:
#   dictOf_EBR_IndivIsolate_DFs -> the isolate's EBR table (one DataFrame each)
#   dictOf_EBR_Breakdown_Dicts  -> the isolate's outcome-count dict loaded from JSON
dictOf_EBR_IndivIsolate_DFs = {}
dictOf_EBR_Breakdown_Dicts = {}

# Running totals, across all isolates, of the two outcome counts tracked here
SumOf_EBR_Breakdown_Dicts = {"Outcome_5" : 0,
                             "Outcome_7" : 0}

sample_Counter = 1  # not incremented or read anywhere in this cell

for SampleID in tqdm(SampleIDs_36CI_SOI):

    # Each isolate has its own output directory inside the pipeline output folder
    Sample_Output_Dir = f"{PMP_SM_TB_Portals_R1_Illumina_WGS_201201_OutputDir}/{SampleID}"
    EBR_Output_Dir = f"{Sample_Output_Dir}/EmpiricalBasePairRecall_Analysis_V7_PacBio_Vs_IlluminaPilon"

    # Inputs for this isolate: the EBR table (TSV) and its outcome breakdown (JSON)
    i_EBR_Agreement_DF_TSV_PATH = f"{EBR_Output_Dir}/EBR.V7.IndivIsolate.{SampleID}.tsv"
    i_EBR_Outcome_Breakdown_Dict_JSON_PATH = f"{EBR_Output_Dir}/EBR.V7.IndivIsolate.{SampleID}.OutcomeBreakdown.json"

    with open(i_EBR_Outcome_Breakdown_Dict_JSON_PATH) as json_file:
        i_count_EBR_Outcomes_Dict = json.load(json_file)

    i_EBR_DF = pd.read_csv(i_EBR_Agreement_DF_TSV_PATH, sep="\t")

    dictOf_EBR_IndivIsolate_DFs[SampleID] = i_EBR_DF
    dictOf_EBR_Breakdown_Dicts[SampleID] = i_count_EBR_Outcomes_Dict

    # Add this isolate's counts to the running totals
    for _outcome_key in ("Outcome_5", "Outcome_7"):
        SumOf_EBR_Breakdown_Dicts[_outcome_key] += i_count_EBR_Outcomes_Dict[_outcome_key]

    
    
    

100%|██████████| 36/36 [02:22<00:00,  3.72s/it]


In [17]:
dictOf_EBR_IndivIsolate_DFs["N0153"].head()

Unnamed: 0,Agreement,Ill_Pilon_Tag,Ill_Pilon_TD,Ill_Pilon_DP,Ill_Pilon_MQ
0,1.0,PASS,65.0,40.0,42.0
1,1.0,PASS,66.0,41.0,42.0
2,1.0,PASS,66.0,41.0,42.0
3,1.0,PASS,66.0,41.0,42.0
4,1.0,PASS,66.0,41.0,42.0


In [18]:
# For every isolate, keep only the rows with Agreement == 0 (the positions this
# notebook calls "low-EBR") and label them with the isolate's SampleID, then
# stack all isolates into one table.
ListOf_lowEBRpos_DFs = []

for SampleID in SampleIDs_36CI_SOI:

    i_EBR_DF = dictOf_EBR_IndivIsolate_DFs[SampleID]

    # .assign returns a new DataFrame with the extra column, so nothing is
    # written onto a filtered slice of the stored per-isolate table. There is
    # no SettingWithCopyWarning to silence, and no need to switch off
    # pandas' 'mode.chained_assignment' check for the rest of the session.
    i_EBR_DF_lowEBRpos = i_EBR_DF[i_EBR_DF["Agreement"] == 0].assign(SampleID=SampleID)

    ListOf_lowEBRpos_DFs.append(i_EBR_DF_lowEBRpos)

ALL_lowEBRpos_36CI_DF = pd.concat(ListOf_lowEBRpos_DFs)
ALL_lowEBRpos_36CI_DF.shape


(1825385, 6)

In [19]:
ALL_lowEBRpos_36CI_DF.head(3)

Unnamed: 0,Agreement,Ill_Pilon_Tag,Ill_Pilon_TD,Ill_Pilon_DP,Ill_Pilon_MQ,SampleID
71335,0.0,Del;LowCov,4.0,4.0,60.0,M0011368_9
71336,0.0,Del;LowCov,4.0,4.0,60.0,M0011368_9
71337,0.0,Del;LowCov,4.0,4.0,60.0,M0011368_9


In [20]:
ALL_lowEBRpos_36CI_DF["Ill_Pilon_Tag"].value_counts().sum()

1825385

In [21]:
ALL_lowEBRpos_36CI_DF["Ill_Pilon_Tag"].value_counts()

LowCov            1145079
Del;LowCov         302826
Del                257423
PASS               114517
Amb                  3773
Amb;LowCov            858
Del;Amb               719
Del;Amb;LowCov        190
Name: Ill_Pilon_Tag, dtype: int64

In [22]:
ALL_lowEBRpos_36CI_DF["Ill_Pilon_Tag"].value_counts(normalize=True)

LowCov            0.627308
Del;LowCov        0.165897
Del               0.141024
PASS              0.062736
Amb               0.002067
Amb;LowCov        0.000470
Del;Amb           0.000394
Del;Amb;LowCov    0.000104
Name: Ill_Pilon_Tag, dtype: float64

In [23]:

# Groupings of Pilon tag strings used to place each low-EBR position in one class.

# Every tag combination that includes "Del" is grouped as a deletion call.
PilonTags_For_Del_Classification = [
    "Del",
    "Del;LowCov",
    "Del;Amb",
    "Del;Amb;LowCov",
]

# LowCov tags that do NOT include "Del". A broader grouping that also listed
# "Del;LowCov" and "Del;Amb;LowCov" was tried here previously and is not used.
PilonTags_For_LowCov_Classification = [
    "LowCov",
    "Amb;LowCov",
]

# Single-tag classes
PilonTags_For_Amb_Classification = ["Amb"]
PilonTags_For_PASS_Classification = ["PASS"]



In [24]:
# Split the low-EBR positions into the four tag classes. All four subsets use
# the PilonTags_For_*_Classification lists, so changing a list changes the
# corresponding subset (the Amb and PASS subsets previously used inline
# literals that duplicated those lists).
_lowEBR_PilonTag = ALL_lowEBRpos_36CI_DF["Ill_Pilon_Tag"]

Del_Class_lowEBRpos_36CI_DF = ALL_lowEBRpos_36CI_DF[ _lowEBR_PilonTag.isin(PilonTags_For_Del_Classification) ]

LowCov_Class_lowEBRpos_36CI_DF = ALL_lowEBRpos_36CI_DF[ _lowEBR_PilonTag.isin(PilonTags_For_LowCov_Classification) ]

Amb_Class_lowEBRpos_36CI_DF = ALL_lowEBRpos_36CI_DF[ _lowEBR_PilonTag.isin(PilonTags_For_Amb_Classification) ]

PASS_Class_lowEBRpos_36CI_DF = ALL_lowEBRpos_36CI_DF[ _lowEBR_PilonTag.isin(PilonTags_For_PASS_Classification) ]


In [25]:
ALL_lowEBRpos_36CI_DF["Ill_Pilon_Tag"].value_counts().sum()

1825385

# What is the breakdown of Pilon tags associated with positions of poor recall?

### A) How many low recall positions were FALSE deletion calls (Del)?

In [26]:
Del_Class_lowEBRpos_36CI_DF.shape

(561158, 6)

In [27]:
Del_Class_lowEBRpos_36CI_DF.shape[0] / ALL_lowEBRpos_36CI_DF.shape[0]

0.30741898284471497

### B) How many low recall positions were tagged as LowCov (with no deletion call)?

In [28]:
LowCov_Class_lowEBRpos_36CI_DF.shape

(1145937, 6)

In [29]:
LowCov_Class_lowEBRpos_36CI_DF.shape[0] / ALL_lowEBRpos_36CI_DF.shape[0]

0.6277782495199643

### C) How many low recall positions were tagged as Amb (with no Del or LowCov tag)?

In [30]:
Amb_Class_lowEBRpos_36CI_DF.shape

(3773, 6)

In [31]:
Amb_Class_lowEBRpos_36CI_DF.shape[0] / ALL_lowEBRpos_36CI_DF.shape[0]

0.0020669612164009236

### D) How many low recall positions were FALSELY tagged as PASS?

In [32]:
PASS_Class_lowEBRpos_36CI_DF.shape

(114517, 6)

In [33]:
PASS_Class_lowEBRpos_36CI_DF.shape[0] / ALL_lowEBRpos_36CI_DF.shape[0]

0.06273580641891985

In [13]:
SumOf_EBR_Breakdown_Dicts

{'Outcome_5': 669, 'Outcome_7': 113848}

In [63]:
# Sum of the two aggregated outcome counts, read from the totals dict rather
# than retyped by hand, so the value tracks the data if the inputs change.
SumOf_EBR_Breakdown_Dicts["Outcome_5"] + SumOf_EBR_Breakdown_Dicts["Outcome_7"]

114517

In [64]:
# Outcome_5 positions as a fraction of all low-EBR positions (count read from the totals dict, not hard-coded)
SumOf_EBR_Breakdown_Dicts["Outcome_5"] / ALL_lowEBRpos_36CI_DF.shape[0]

0.0003664980264437365

In [65]:
# Outcome_7 positions as a fraction of all low-EBR positions (count read from the totals dict, not hard-coded)
SumOf_EBR_Breakdown_Dicts["Outcome_7"] / ALL_lowEBRpos_36CI_DF.shape[0]

0.062369308392476105

## Verify that total counts match up

In [49]:
ALL_lowEBRpos_36CI_DF["Ill_Pilon_Tag"].value_counts().sum()

1825385

In [50]:
# The four tag classes should account for every low-EBR position exactly once.
# Assert it, so a mismatch stops the notebook instead of depending on someone
# comparing two printed numbers by eye; the total is still displayed.
_classified_lowEBRpos_total = (
    Del_Class_lowEBRpos_36CI_DF.shape[0]
    + LowCov_Class_lowEBRpos_36CI_DF.shape[0]
    + Amb_Class_lowEBRpos_36CI_DF.shape[0]
    + PASS_Class_lowEBRpos_36CI_DF.shape[0]
)
assert _classified_lowEBRpos_total == ALL_lowEBRpos_36CI_DF.shape[0], (
    "Del/LowCov/Amb/PASS classes do not add up to the number of low-EBR positions"
)
_classified_lowEBRpos_total

1825385

In [51]:
ALL_lowEBRpos_36CI_DF.head()

Unnamed: 0,Agreement,Ill_Pilon_Tag,Ill_Pilon_TD,Ill_Pilon_DP,Ill_Pilon_MQ,SampleID
71335,0.0,Del;LowCov,4.0,4.0,60.0,M0011368_9
71336,0.0,Del;LowCov,4.0,4.0,60.0,M0011368_9
71337,0.0,Del;LowCov,4.0,4.0,60.0,M0011368_9
71338,0.0,Del;LowCov,4.0,4.0,60.0,M0011368_9
71339,0.0,Del;LowCov,4.0,4.0,60.0,M0011368_9


In [52]:

# All tag combinations that include "LowCov", with or without Del/Amb
AllPilonTags_Containing_LowCov_Classification = ["LowCov", "Del;LowCov", "Amb;LowCov", "Del;Amb;LowCov"]

_has_any_LowCov_tag = ALL_lowEBRpos_36CI_DF["Ill_Pilon_Tag"].isin(AllPilonTags_Containing_LowCov_Classification)
lowEBRpos_36CI_AllPositionsWithAnyLowCovTag_DF = ALL_lowEBRpos_36CI_DF[_has_any_LowCov_tag]

# Break the LowCov class (no deletion call) into three groups using the Pilon
# TD / DP / MQ columns. The cut-off of 5 is hard-coded here.
# NOTE(review): presumably 5 is the depth threshold behind the LowCov tag - confirm.
_TD_at_least_5 = LowCov_Class_lowEBRpos_36CI_DF["Ill_Pilon_TD"] >= 5
_DP_below_5 = LowCov_Class_lowEBRpos_36CI_DF["Ill_Pilon_DP"] < 5
_lowcov_MQ = LowCov_Class_lowEBRpos_36CI_DF["Ill_Pilon_MQ"]

# TD >= 5 but DP < 5, with MQ == 0
lowEBRpos_36CI_LowCov_MQZero_DF = LowCov_Class_lowEBRpos_36CI_DF[_TD_at_least_5 & _DP_below_5 & (_lowcov_MQ == 0)]

# TD >= 5 but DP < 5, with MQ > 0 (labelled "Invalid Pairs" in this notebook)
lowEBRpos_36CI_LowCov_InvalidPairs_DF = LowCov_Class_lowEBRpos_36CI_DF[_TD_at_least_5 & _DP_below_5 & (_lowcov_MQ > 0)]

# TD < 5
lowEBRpos_36CI_LowCov_Insufficient_TotalCov_DF = LowCov_Class_lowEBRpos_36CI_DF[LowCov_Class_lowEBRpos_36CI_DF["Ill_Pilon_TD"] < 5]



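## How many low-EBR positions have the LowCov tag?
Answer: 1,145,937 positions have a LowCov tag with no deletion call (the set shown below). Including the Del-tagged positions that also carry LowCov, the total is 1,448,953.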

In [53]:
LowCov_Class_lowEBRpos_36CI_DF.shape

(1145937, 6)

## How many low-EBR positions have the LowCov tag and had insufficient total depth?
Answer: 525,373 (45.8% of all Low-EBR and LowCov positions)

In [54]:
lowEBRpos_36CI_LowCov_Insufficient_TotalCov_DF.shape

(525373, 6)

In [55]:
lowEBRpos_36CI_LowCov_Insufficient_TotalCov_DF.shape[0] / LowCov_Class_lowEBRpos_36CI_DF.shape[0]

0.45846586679721485

## How many low-EBR positions have the LowCov tag and had insufficient depth due to MQ = 0?
Answer: 316,304 (27.6% of all Low-EBR and LowCov positions)

In [56]:
lowEBRpos_36CI_LowCov_MQZero_DF.shape

(316304, 6)

In [57]:
lowEBRpos_36CI_LowCov_MQZero_DF.shape[0] / LowCov_Class_lowEBRpos_36CI_DF.shape[0]

0.2760221547955952

## How many low-EBR positions have the LowCov tag and had insufficient depth due to Invalid Pairs (MQ > 0)?
Answer: 304,260 (26.6% of all Low-EBR and LowCov positions)

In [58]:
lowEBRpos_36CI_LowCov_InvalidPairs_DF.shape

(304260, 6)

In [59]:
lowEBRpos_36CI_LowCov_InvalidPairs_DF.shape[0] / LowCov_Class_lowEBRpos_36CI_DF.shape[0]

0.26551197840718993

### Verify that total counts add up (For LowCov - LowEBR positions)

In [60]:
LowCov_Class_lowEBRpos_36CI_DF.shape[0]

1145937

In [61]:
# The three LowCov sub-groups should partition the LowCov class. Assert it,
# so a mismatch stops the notebook instead of depending on someone comparing
# two printed numbers by eye; the total is still displayed.
_LowCov_subgroup_total = (
    lowEBRpos_36CI_LowCov_Insufficient_TotalCov_DF.shape[0]
    + lowEBRpos_36CI_LowCov_InvalidPairs_DF.shape[0]
    + lowEBRpos_36CI_LowCov_MQZero_DF.shape[0]
)
assert _LowCov_subgroup_total == LowCov_Class_lowEBRpos_36CI_DF.shape[0], (
    "LowCov sub-groups do not add up to the number of LowCov-class low-EBR positions"
)
_LowCov_subgroup_total

1145937