# Parsing EBR calculations from EBR TSVs

### Maximillian Marin
### mgmarin@g.harvard.edu

### Goal: To write code that will read EBR results from Disc and produce same data structures as before (in memory).



In [1]:
import numpy as np
import pandas as pd
import vcf
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import json

%matplotlib inline

#### Pandas Viewing Settings

In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [3]:
!bedtools --version

bedtools v2.29.2


## Define function for output of a BED file from a NP array with values for each basepair position

In [70]:
# BED format specifications: https://useast.ensembl.org/info/website/upload/bed.html

def convert_EBR_GenomeNParray_To_BED_DF(input_GenomeNParray, genomeChrom = "NC_000962.3"):
    """ """
     
    #input_GenomeNParray = pd.Series(input_GenomeNParray).fillna("Ambiguous")
    #input_GenomeNParray = pd.Series(input_GenomeNParray).fillna(0)
    input_GenomeNParray = pd.Series(input_GenomeNParray).fillna(-1)
    
    last_Score = input_GenomeNParray[0]

    startOfRegion = 0
    listOfBED_Tuples = []
    RegionCounter = 1

    for RefPos_0based in (np.arange(len(input_GenomeNParray))):

        EBR_Score = input_GenomeNParray[RefPos_0based]

        if EBR_Score != last_Score:

            endOfRegion = RefPos_0based
            lengthOfRegion = endOfRegion - startOfRegion 

            BED_EntryTuple = (genomeChrom, startOfRegion, endOfRegion, f"Region{RegionCounter}_Length_{lengthOfRegion}_bp", last_Score,)
            listOfBED_Tuples.append(BED_EntryTuple)

            RegionCounter += 1

            #print(f"{H37rv_ChrName}, {startOfRegion}, {RefPos_0based}, {lengthOfRegion}_bp, {last_Score}, .")

            startOfRegion = RefPos_0based 

            #1 Output the last range
            #2 Store the new score    

        last_Score = EBR_Score #2 Store the new score   

        
        
    endOfRegion = RefPos_0based + 1
    lengthOfRegion = endOfRegion - startOfRegion 

    BED_EntryTuple = (genomeChrom, startOfRegion, endOfRegion, f"Region{RegionCounter}_Length_{lengthOfRegion}_bp", last_Score)
    listOfBED_Tuples.append(BED_EntryTuple)       

    BED_DF = pd.DataFrame(listOfBED_Tuples)
    
    BED_DF.columns = ["chrom", "chromStart", "chromEnd", "name", "score" ]
    
    
    return BED_DF

In [5]:
def convert_EBR_Array_To_TSV(input_EBR_Array, EBR_Output_TSV_PATH):
    
    input_EBR_TSV_DF = pd.DataFrame(input_EBR_Array)

    input_EBR_TSV_DF.columns = ["EmpiricalBasePairRecall"]
    input_EBR_TSV_DF["H37rv_RefPos_0based"] = input_EBR_TSV_DF.index
    input_EBR_TSV_DF["H37rv_RefPos_1based"] = input_EBR_TSV_DF["H37rv_RefPos_0based"] + 1

    input_EBR_TSV_DF = input_EBR_TSV_DF[["H37rv_RefPos_0based", "H37rv_RefPos_1based", "EmpiricalBasePairRecall"]  ]     

    input_EBR_TSV_DF.to_csv(EBR_Output_TSV_PATH, sep = "\t", index = False)
    
    print("TSV output to:", EBR_Output_TSV_PATH)
    
    return input_EBR_TSV_DF

# Parse Assembly and Sequence analysis results Sample Info

In [6]:
Repo_DataDir = "../../Data"

PMP_SM_ResultsSummary_Dir_210108 = Repo_DataDir + "/210108_PMP_SM_50CI_V7_ResultsSummary"

PMP_36CI_CicularOnly_F2Filtered_AtLeast40XIllDepth_AssemblySummary_TSV_PATH = PMP_SM_ResultsSummary_Dir_210108 + "/210108_PMP_36CI_CircularOnly_F2Filtered_AtLeast40XMeanDepthIllumina_AssemblySummary_V7.tsv"       

PMP_36CI_CicularOnly_F2Filtered_AtLeast40XDepth_AssemblySummary = pd.read_csv(PMP_36CI_CicularOnly_F2Filtered_AtLeast40XIllDepth_AssemblySummary_TSV_PATH, sep = "\t")

PMP_36CI_AnalysisSet_AssemblySummary = PMP_36CI_CicularOnly_F2Filtered_AtLeast40XDepth_AssemblySummary


SampleIDs_36CI_SOI = list( PMP_36CI_AnalysisSet_AssemblySummary["SampleID"].values )


print(','.join(SampleIDs_36CI_SOI) )

# Make sample to metadata mapping dicts

ID_To_IlluminaAvrgCov_Dict = dict(PMP_36CI_AnalysisSet_AssemblySummary[['SampleID', 'IlluminaWGSToH37rv_AvrgCov']].values)                     
ID_To_Lineage_Dict = dict(PMP_36CI_AnalysisSet_AssemblySummary[['SampleID', 'PrimaryLineage_PB']].values)
ID_To_Dataset_Dict = dict(PMP_36CI_AnalysisSet_AssemblySummary[['SampleID', 'Dataset_Tag']].values)

M0011368_9,M0014888_3,M0016395_7,M0010874_7,01_R1430,02_R0894,02_R1708,02_R1896,M0016737_0,M0017522_5,01_R1134,M0003941_3,02_R1179,N1176,N0072,N0153,N0145,N0155,N0004,N1274,N0054,N1272,N0091,N1202,N1177,RW-TB008,DNA028,DNA075,DNA091,DNA044,DNA020,AZE_02_042,DNA019_Rose,DNA120,DNA188,DNA086


#### What is the distribution of samples from each dataset?

In [7]:
PMP_36CI_AnalysisSet_AssemblySummary["Dataset_Tag"].value_counts()

Farhat_Peru_2019        13
ChinerOms_2019          12
TB_Portals_15CI_R1      10
Ngabonziza_Lin8_2020     1
Name: Dataset_Tag, dtype: int64

In [8]:
len(SampleIDs_36CI_SOI)

36

# Define directory for output of EBR results

In [9]:
PB_Vs_Illumina_DataAnalysis_Dir = "../../../210112_PBvsI_VCeval_AnalysisDir_V7_36CI"

PBvIll_EBR_Dir = PB_Vs_Illumina_DataAnalysis_Dir + "/210112_EBR_H37rv_36CI_MM2vsPilon_V7"     

PBvsIll_EBR_IndivSample_NPZs = f"{PBvIll_EBR_Dir}/210112_EBR_H37rv_IndividualSampleRecall_NPZs"

!mkdir $PB_Vs_Illumina_DataAnalysis_Dir
!mkdir $PBvIll_EBR_Dir
!mkdir $PBvsIll_EBR_IndivSample_NPZs

mkdir: cannot create directory ‘../../../210112_PBvsI_VCeval_AnalysisDir_V7_36CI’: File exists
mkdir: cannot create directory ‘../../../210112_PBvsI_VCeval_AnalysisDir_V7_36CI/210112_EBR_H37rv_36CI_MM2vsPilon_V7’: File exists
mkdir: cannot create directory ‘../../../210112_PBvsI_VCeval_AnalysisDir_V7_36CI/210112_EBR_H37rv_36CI_MM2vsPilon_V7/210112_EBR_H37rv_IndividualSampleRecall_NPZs’: File exists


In [10]:
#!ls -lah $PB_Vs_Illumina_DataAnalysis_Dir

In [11]:
#!ls -lah $PBvIll_IndivEBR_Results_Dir | head 

# Parse EBR results from all individual isolates 

### Define directories of PMP-SM pipeline

In [12]:
# Define varaint calling pipeline output directories

PacBio_ProjectDir = "/n/data1/hms/dbmi/farhat/mm774/Projects/PacBio_Evaluation_Project"
PMP_SM_Outputs_Dir = PacBio_ProjectDir + "/PacmanPipe_SM_Outputs"
PMP_SM_TB_Portals_R1_Illumina_WGS_201201_OutputDir = PMP_SM_Outputs_Dir + "/201201_PMP_SM_TB_Portals_R1_Output_V2"


In [13]:
dictOf_EBR_IndivIsolate_DFs = {}

dictOf_EBR_IndivIsolate_NPYs = {}

dictOf_BED_DFs_IndivSample_EBRs = {}

dictOf_EBR_Breakdown_Dicts = {}

sample_Counter = 1

for SampleID in tqdm(SampleIDs_36CI_SOI):
    
    Sample_Output_Dir = PMP_SM_TB_Portals_R1_Illumina_WGS_201201_OutputDir + "/" + SampleID
    
    EBR_Output_Dir = f"{Sample_Output_Dir}/EmpiricalBasePairRecall_Analysis_V7_PacBio_Vs_IlluminaPilon"
    
    # Let's read in EBR TSV
    i_EBR_Agreement_DF_TSV_PATH = f"{EBR_Output_Dir}/EBR.V7.IndivIsolate.{SampleID}.tsv"
        
    i_EBR_Outcome_Breakdown_Dict_JSON_PATH = f"{EBR_Output_Dir}/EBR.V7.IndivIsolate.{SampleID}.OutcomeBreakdown.json"        

    
    
    
    with open(i_EBR_Outcome_Breakdown_Dict_JSON_PATH) as json_file: i_count_EBR_Outcomes_Dict = json.load(json_file) 
    
    i_EBR_DF = pd.read_csv(i_EBR_Agreement_DF_TSV_PATH, sep = "\t" )

    i_EBR_Array_WiNA = i_EBR_DF["Agreement"].values

    i_EBR_WithAmbFilled_Array = i_EBR_DF["Agreement"].fillna("Ambiguous").values

    
    
    
    output_i_EBR_Agreement_NPZ_PATH = f"{PBvsIll_EBR_IndivSample_NPZs}/EBR.V7.IndivIsolate.{SampleID}.npz"
    np.savez_compressed(output_i_EBR_Agreement_NPZ_PATH, i_EBR_Array_WiNA )

    
    dictOf_BED_DFs_IndivSample_EBRs[SampleID] = convert_EBR_GenomeNParray_To_BED_DF(i_EBR_WithAmbFilled_Array)
    
    
    dictOf_EBR_IndivIsolate_DFs[SampleID] = i_EBR_DF

    dictOf_EBR_IndivIsolate_NPYs[SampleID] = i_EBR_Array_WiNA
        
    dictOf_EBR_Breakdown_Dicts[SampleID] = i_count_EBR_Outcomes_Dict


100%|██████████| 36/36 [08:52<00:00, 14.67s/it]


In [14]:
SampleIDs_36CI_SOI[:2]

['M0011368_9', 'M0014888_3']

In [15]:
dictOf_BED_DFs_IndivSample_EBRs["M0011368_9"].head()

Unnamed: 0,chrom,chromStart,chromEnd,name,score
0,NC_000962.3,0,71335,Region1_Length_71335_bp,1
1,NC_000962.3,71335,71584,Region2_Length_249_bp,0
2,NC_000962.3,71584,79504,Region3_Length_7920_bp,1
3,NC_000962.3,79504,79513,Region4_Length_9_bp,0
4,NC_000962.3,79513,91786,Region5_Length_12273_bp,1


In [16]:
#dictOf_BED_DFs_IndivSample_EBRs["M0011368_9"]["score"].value_counts()

# Saving parsed data (as Python Pickle)

## Lets save ("Pickle") the dictionary of EBR np arrays on O2

### NOTE: Commented out b/c .pickle file is > 1 Gb in size

In [45]:

# Pickle_PATH_dictOf_EBR_IndivIsolate_NPYs = PBvIll_EBR_Dir + "/210112_dictOf_EBR_V7_IndivIsolate_NPYs_36CI.pickle"         

#with open(Pickle_PATH_dictOf_EBR_IndivIsolate_NPYs, 'wb') as outputFile:
    #pickle.dump(dictOf_EBR_IndivIsolate_NPYs, outputFile)

# Create Empirical Base Pair Recall Arrays

## EBR_36CI_WGS40X_Array: EBR for all 36 Mtb CI that have >= 40X Illumina WGS coverage


# Finalizing EBR Calculation

https://numpy.org/doc/stable/reference/generated/numpy.nanmean.html

### Aggregated EBR will be calculated the following way:
    
Aggregated-EBR = the mean EBR score of all VALID (0 or 1) EBR scores at a position of interest.

IF there are <= 25% (10 or more) NAN values at a position
only average EBR between VALID EBR scores.

IF there are more than 25% NAN values at a position, set EBR-36CI to "AMB" to signify that position has issues.



In [71]:
len(SampleIDs_36CI_SOI)

36

In [72]:
0.25 * 36

9.0

In [73]:
EBR_36CI_Array_A4 = np.zeros( (4411532,) )

Num_NonNAN_Values_Array_A4 = np.zeros( (4411532,) )

NumTotalSamples = len(SampleIDs_36CI_SOI)


for SampleID in ( SampleIDs_36CI_SOI ) :
    
    i_EBR_Array = dictOf_EBR_IndivIsolate_DFs[SampleID]["Agreement"].values
    
    #This numpy array represents the counts of VALID EBR scores at a given position.
    Num_NonNAN_Values_Array_A4 += (dictOf_EBR_IndivIsolate_DFs[SampleID]["Agreement"].fillna(-1).values != -1).astype(int)
    
    i_EBR_Array = np.nan_to_num(i_EBR_Array, nan=0 ) 
    
    EBR_36CI_Array_A4 += i_EBR_Array
    
# 

NumNANs_H37Rv_Pos_36CI_NPY = (NumTotalSamples - Num_NonNAN_Values_Array_A4)
    
Pos_WithGreaterThan_9NANs_36CI = (NumNANs_H37Rv_Pos_36CI_NPY > 10)
    
    
print(EBR_36CI_Array_A4.shape, np.mean(EBR_36CI_Array_A4) )

EBR_36CI_Array_A4 = EBR_36CI_Array_A4 / Num_NonNAN_Values_Array_A4 


print(EBR_36CI_Array_A4.shape, np.mean(EBR_36CI_Array_A4) )

EBR_36CI_Array_A4[np.isposinf(EBR_36CI_Array_A4)] = np.nan


EBR_36CI_Array_A4 = EBR_36CI_Array_A4.astype('float')


EBR_36CI_Array_A4[Pos_WithGreaterThan_9NANs_36CI] = np.nan


print(EBR_36CI_Array_A4.shape, np.mean(EBR_36CI_Array_A4) )
print(EBR_36CI_Array_A4.shape, np.nanmean(EBR_36CI_Array_A4) )


(4411532,) 35.4379576074706
(4411532,) nan
(4411532,) nan
(4411532,) 0.9886257598437438




#### Some QC of Aggregated EBR-36CI array

In [74]:
np.isnan(EBR_36CI_Array_A4).sum()

15813

In [75]:
(EBR_36CI_Array_A4 == 0).sum()

2619

In [76]:
(EBR_36CI_Array_A4 == 1).sum() / EBR_36CI_Array_A4.shape[0]

0.9357855729030187

In [77]:
(EBR_36CI_Array_A4 == 1).sum() + (EBR_36CI_Array_A4 < 1).sum() + np.isnan(EBR_36CI_Array_A4).sum()

  """Entry point for launching an IPython kernel.


4411532

In [78]:
EBR_36CI_Array_A4.shape

(4411532,)

In [79]:
np.isnan(EBR_36CI_Array_A4).sum()

15813

In [80]:
np.mean( EBR_36CI_Array_A4[ ~np.isnan(EBR_36CI_Array_A4) ] )

0.9886257598437431

In [81]:
np.nanmean( EBR_36CI_Array_A4 )

0.9886257598437438

#### END QC 

In [82]:
np.nan == np.nan

False

# Output EBR-36CI data into various formats (TSV, BED, BEDGRAPH etc)

## Let's output the Numpy array of Aggregated Empirical Basepair Recall (EBR-36CI)

In [83]:
## Let's output the arrays as .npy files 

EBR_36CI_WGS40X_NPY_PATH = f"{PBvIll_EBR_Dir}/210112_EBR_V7_36CI.npy"
EBR_36CI_WGS40X_NPZ_PATH = f"{PBvIll_EBR_Dir}/210112_EBR_V7_36CI.npz"

#np.save(EBR_36CI_WGS40X_NPY_PATH, EBR_36CI_Array_A4 )
np.savez_compressed(EBR_36CI_WGS40X_NPZ_PATH, EBR_36CI_Array_A4 )


## Let's output the Empirical Basepair Recall (EBR) BED & BEDGRAPH files

In [84]:

EBR_36CI_WGS40X_BED_PATH = f"{PBvIll_EBR_Dir}/210112_EBR_V7_36CI.bed"
EBR_36CI_WGS40X_BEDGRAPH_PATH = f"{PBvIll_EBR_Dir}/210112_EBR_V7_36CI.bedgraph"       


In [85]:
EBR_36CI_WGS40X_BED_DF = convert_EBR_GenomeNParray_To_BED_DF(EBR_36CI_Array_A4)

EBR_36CI_WGS40X_BED_DF.columns = ["chrom", "chromStart", "chromEnd", "name", "EBR_Score"]

EBR_36CI_WGS40X_BED_DF.to_csv(EBR_36CI_WGS40X_BED_PATH,
                           sep = "\t",
                           index = False,
                           header = False)

!cut -f 1,2,3,5 $EBR_36CI_WGS40X_BED_PATH > $EBR_36CI_WGS40X_BEDGRAPH_PATH

## Let's output the Empirical Basepair Recall (EBR) per position TSV file

In [86]:
EBR_36CI_TSV_PATH = f"{PBvIll_EBR_Dir}/210112_EBR_V7_36CI.tsv" 

EBR_36CI_TSV_DF = convert_EBR_Array_To_TSV(EBR_36CI_Array_A4, EBR_36CI_TSV_PATH)


TSV output to: ../../../210112_PBvsI_VCeval_AnalysisDir_V7_36CI/210112_EBR_H37rv_36CI_MM2vsPilon_V7/210112_EBR_V7_36CI.tsv


In [87]:
EBR_36CI_TSV_DF.shape

(4411532, 3)

In [88]:
EBR_36CI_TSV_DF.head(3)

Unnamed: 0,H37rv_RefPos_0based,H37rv_RefPos_1based,EmpiricalBasePairRecall
0,0,1,0.916667
1,1,2,0.916667
2,2,3,0.916667


#### Let's look at the output files

In [89]:
!echo $PBvIll_EBR_Dir

../../../210112_PBvsI_VCeval_AnalysisDir_V7_36CI/210112_EBR_H37rv_36CI_MM2vsPilon_V7


In [90]:
!ls -lah $PBvIll_EBR_Dir

total 26M
drwxrwsr-x  5 mm774 farhat  329 Mar 26 22:53 .
drwxrwsr-x 11 mm774 farhat  529 Mar 26 23:36 ..
drwxrwsr-x  2 mm774 farhat 1.8K Mar 26 14:44 210112_EBR_H37rv_IndividualSampleRecall_NPZs
-rw-rw-r--  1 mm774 farhat 1.4M Mar 29 17:48 210112_EBR_V7_36CI.bed
-rw-rw-r--  1 mm774 farhat 915K Mar 29 17:48 210112_EBR_V7_36CI.bedgraph
-rw-rw-r--  1 mm774 farhat 113K Mar 29 17:48 210112_EBR_V7_36CI.npz
-rw-rw-r--  1 mm774 farhat  86M Mar 29 17:48 210112_EBR_V7_36CI.tsv
drwxrwsr-x  2 mm774 farhat  207 Mar 26 23:17 210113_H37Rv_FeatureLevelAnalysis_EBR_Pmap
drwxrwsr-x  2 mm774 farhat  188 Mar 26 14:49 EBR_36CI_AmbigousRegions


# Output regions below certain EBR thresholds (For masking of future analyses)

In [91]:
EBR_36CI_WGS40X_BED_DF.head()

Unnamed: 0,chrom,chromStart,chromEnd,name,EBR_Score
0,NC_000962.3,0,8,Region1_Length_8_bp,0.916667
1,NC_000962.3,8,26,Region2_Length_18_bp,0.944444
2,NC_000962.3,26,33,Region3_Length_7_bp,0.972222
3,NC_000962.3,33,174,Region4_Length_141_bp,1.0
4,NC_000962.3,174,185,Region5_Length_11_bp,0.972222


In [92]:
EBR_36CI_WGS40X_BED_DF.shape

(21613, 5)

In [93]:
EBR_36CI_WGS40X_BED_DF["EBR_Score"].value_counts().head()

0.972222    3666
1.000000    3379
0.944444    1328
0.916667    1115
0.888889    1009
Name: EBR_Score, dtype: int64

## Filter for regions with EBR < 1 OR AMBIGUOUS 

In [97]:
EBR_36CI_WGS40X_BED_DF_Below1 = EBR_36CI_WGS40X_BED_DF[ (EBR_36CI_WGS40X_BED_DF["EBR_Score"] < 1) ]  
EBR_36CI_WGS40X_BED_DF_Below1.shape

(18234, 5)

In [98]:
EBR_36CI_BED_Below_1_AllPos_PATH = f"{PBvIll_EBR_Dir}/EBR_V7_36CI.Below_1.And.Ambiguous.AllPositions.bed"
EBR_36CI_BED_Below_1_Regions_PATH = f"{PBvIll_EBR_Dir}/EBR_V7_36CI.Below_1.And.Ambiguous.Regions.bed"


EBR_36CI_WGS40X_BED_DF_Below1.to_csv(EBR_36CI_BED_Below_1_AllPos_PATH,
                           sep = "\t",
                           index = False,
                           header = False)


# Merge/condense adjacent basepairs that are below the defined threshold
!bedtools merge -i $EBR_36CI_BED_Below_1_AllPos_PATH > $EBR_36CI_BED_Below_1_Regions_PATH

## Filter for regions with EBR < 0.9 OR AMBIGUOUS 

In [99]:
EBR_36CI_WGS40X_BED_DF_Below09 = EBR_36CI_WGS40X_BED_DF[ (EBR_36CI_WGS40X_BED_DF["EBR_Score"] < 0.9) | (EBR_36CI_WGS40X_BED_DF["EBR_Score"] == "Ambiguous")]  
EBR_36CI_WGS40X_BED_DF_Below09.shape

(11672, 5)

In [100]:
EBR_36CI_BED_Below_09_AllPos_PATH = f"{PBvIll_EBR_Dir}/EBR_V7_36CI.Below_0.9.And.Ambiguous.AllPositions.bed"
EBR_36CI_BED_Below_09_Regions_PATH = f"{PBvIll_EBR_Dir}/EBR_V7_36CI.Below_0.9.And.Ambiguous.Regions.bed"


EBR_36CI_WGS40X_BED_DF_Below09.to_csv(EBR_36CI_BED_Below_09_AllPos_PATH,
                           sep = "\t",
                           index = False,
                           header = False)


# Merge/condense adjacent basepairs that are below the defined threshold
!bedtools merge -i $EBR_36CI_BED_Below_09_AllPos_PATH > $EBR_36CI_BED_Below_09_Regions_PATH

## Filter for regions with EBR < 0.8 OR AMBIGUOUS 

In [101]:
EBR_36CI_WGS40X_BED_DF_Below08 = EBR_36CI_WGS40X_BED_DF[ ( (EBR_36CI_WGS40X_BED_DF["EBR_Score"] == "Ambiguous") | EBR_36CI_WGS40X_BED_DF["EBR_Score"] < 0.8)]  
EBR_36CI_WGS40X_BED_DF_Below08.shape

(60, 5)

In [102]:
EBR_36CI_BED_Below_08_AllPos_PATH = f"{PBvIll_EBR_Dir}/EBR_V7_36CI.Below_0.8.And.Ambiguous.AllPositions.bed"
EBR_36CI_BED_Below_08_Regions_PATH = f"{PBvIll_EBR_Dir}/EBR_V7_36CI.Below_0.8.And.Ambiguous.Regions.bed"


EBR_36CI_WGS40X_BED_DF_Below09.to_csv(EBR_36CI_BED_Below_08_AllPos_PATH,
                           sep = "\t",
                           index = False,
                           header = False)


# Merge/condense adjacent basepairs that are below the defined threshold
!bedtools merge -i $EBR_36CI_BED_Below_08_AllPos_PATH > $EBR_36CI_BED_Below_08_Regions_PATH

In [103]:
#STOP!!!

# Try parsing back in data generated in this notebook

In [104]:
#!ls -1 ../../../

In [105]:
PB_Vs_Illumina_DataAnalysis_Dir = "../../../210112_PBvsI_VCeval_AnalysisDir_V7_36CI"

PBvIll_EBR_Dir = PB_Vs_Illumina_DataAnalysis_Dir + "/210112_EBR_H37rv_36CI_MM2vsPilon_V7"         

PBvsIll_EBR_IndivSample_NPZs = f"{PBvIll_EBR_Dir}/210112_EBR_H37rv_IndividualSampleRecall_NPZs"


## Read back in NPYs to recreate the "dictOf_EBR_IndivIsolate_NPYs"

In [106]:
dictOf_EBR_IndivIsolate_NPYs = {}

for SampleID in tqdm(SampleIDs_36CI_SOI):
    
    i_EBR_Agreement_NPZ_PATH = f"{PBvsIll_EBR_IndivSample_NPZs}/EBR.V7.IndivIsolate.{SampleID}.npz"

    i_EBR_NP = np.load(i_EBR_Agreement_NPZ_PATH)["arr_0"]

    dictOf_EBR_IndivIsolate_NPYs[SampleID] = i_EBR_NP


100%|██████████| 36/36 [00:02<00:00, 13.35it/s]


## Read back in NP array of "EBR_36CI_WGS40X_NPY"

In [107]:
EBR_36CI_WGS40X_NPY_PATH = f"{PBvIll_EBR_Dir}/210112_EBR_V7_36CI.npy"

#EBR_36CI_Array_A4 = np.load(EBR_36CI_WGS40X_NPY_PATH)


In [108]:
np.isnan(EBR_36CI_Array_A4).sum()

15813

In [109]:
np.mean( EBR_36CI_Array_A4[ ~np.isnan(EBR_36CI_Array_A4) ] )

0.9886257598437431

In [110]:
np.nanmean( EBR_36CI_Array_A4 )

0.9886257598437438

## Read back in NP array of "EBR_36CI_WGS40X_NPZ"

In [111]:
EBR_36CI_WGS40X_NPZ_PATH = f"{PBvIll_EBR_Dir}/210112_EBR_V7_36CI.npz"

EBR_36CI_Array_A4 = np.load(EBR_36CI_WGS40X_NPZ_PATH)["arr_0"]


In [112]:
EBR_36CI_Array_A4

array([0.91666667, 0.91666667, 0.91666667, ..., 1.        , 1.        ,
       1.        ])

In [113]:
np.isnan(EBR_36CI_Array_A4).sum()

15813

In [114]:
np.mean( EBR_36CI_Array_A4[ ~np.isnan(EBR_36CI_Array_A4) ] )

0.9886257598437431

In [115]:
np.nanmean( EBR_36CI_Array_A4 )

0.9886257598437438

### Look at size of output files

In [116]:
!du -sh $PBvIll_EBR_Dir/*

1.8M	../../../210112_PBvsI_VCeval_AnalysisDir_V7_36CI/210112_EBR_H37rv_36CI_MM2vsPilon_V7/210112_EBR_H37rv_IndividualSampleRecall_NPZs
344K	../../../210112_PBvsI_VCeval_AnalysisDir_V7_36CI/210112_EBR_H37rv_36CI_MM2vsPilon_V7/210112_EBR_V7_36CI.bed
248K	../../../210112_PBvsI_VCeval_AnalysisDir_V7_36CI/210112_EBR_H37rv_36CI_MM2vsPilon_V7/210112_EBR_V7_36CI.bedgraph
240K	../../../210112_PBvsI_VCeval_AnalysisDir_V7_36CI/210112_EBR_H37rv_36CI_MM2vsPilon_V7/210112_EBR_V7_36CI.npz
25M	../../../210112_PBvsI_VCeval_AnalysisDir_V7_36CI/210112_EBR_H37rv_36CI_MM2vsPilon_V7/210112_EBR_V7_36CI.tsv
2.9M	../../../210112_PBvsI_VCeval_AnalysisDir_V7_36CI/210112_EBR_H37rv_36CI_MM2vsPilon_V7/210113_H37Rv_FeatureLevelAnalysis_EBR_Pmap
200K	../../../210112_PBvsI_VCeval_AnalysisDir_V7_36CI/210112_EBR_H37rv_36CI_MM2vsPilon_V7/EBR_36CI_AmbigousRegions
816K	../../../210112_PBvsI_VCeval_AnalysisDir_V7_36CI/210112_EBR_H37rv_36CI_MM2vsPilon_V7/EBR_V7_36CI.Below_0.8.And.Ambiguous.AllPositions.bed
24K	../../../21011