# Parsing and merging data for gene level exploration of PB vs Illumina agreement and mappability

### Maximillian Marin
### mgmarin@g.harvard.edu
### 20/07/23

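Goal: Compute feature-level (gene and intergenic region) summaries of PB vs Illumina agreement (EBR), pileup mappability, and GC content across the H37Rv annotation, and save the merged tables as TSV files.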

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
# import gffutils

%matplotlib inline

In [2]:
from Bio import SeqIO

#### Pandas Viewing Settings

In [3]:
# Raise pandas' display limits so wide/long annotation tables are shown in full.
for _display_Option, _display_Value in {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.width': 1000,
}.items():
    pd.set_option(_display_Option, _display_Value)

## Define functions

In [4]:
def gc_content(s):
    """Return the GC content of sequence `s` as a percentage (0.0 - 100.0).

    Only the upper-case characters "G" and "C" are counted, so `s` is expected
    to be an upper-case nucleotide string.

    Returns NaN for an empty sequence (e.g. a zero-length region) instead of
    raising ZeroDivisionError.
    """
    seq_Length = len(s)
    if seq_Length == 0:
        return float("nan")
    return 100.0 * sum(1 for c in s if c in "GC") / seq_Length

## Parse the H37rv Genbank (GBK) file with BioPython
H37rv genome is stored as a SeqIO record

In [5]:
# Location of the reference files and of the H37Rv GenBank annotation
Mtb_RefDir = "/n/data1/hms/dbmi/farhat/mm774/References"
H37rv_Ref_GBK_PATH = f"{Mtb_RefDir}/GCF_000195955.2_ASM19595v2_genomic.gbk"

# Read every record of the GenBank file; the first record is used as the H37Rv genome
records = [gbk_Record for gbk_Record in SeqIO.parse(H37rv_Ref_GBK_PATH, "genbank")]
Mtb_H37rv_SeqIO_Record = records[0]

# Keep a plain-string copy of the genome sequence (convenient for slicing by coordinate)
Mtb_H37rv_Sequence = str(Mtb_H37rv_SeqIO_Record.seq)

# Genome length in bp
print(len(Mtb_H37rv_Sequence))

4411532


## Parse sample info from the assembly and sequence analysis results

In [6]:
# Directory (relative to this notebook) holding the 210108 results-summary tables
Repo_DataDir = "../../Data"
PMP_SM_ResultsSummary_Dir_210108 = Repo_DataDir + "/210108_PMP_SM_50CI_V7_ResultsSummary"

# Assembly-summary TSVs: the full 50CI set, the 42CI circular-only subset,
# and the 36CI subset additionally requiring >= 40X mean Illumina depth
PMP_50CI_AssemblySummary_TSV_PATH = PMP_SM_ResultsSummary_Dir_210108 + "/210108_PMP_50CI_AssemblySummary_V7.tsv"
PMP_42CI_CicularOnly_AssemblySummary_TSV_PATH = PMP_SM_ResultsSummary_Dir_210108 + "/210108_PMP_42CI_CircularOnly_F2Filtered_AssemblySummary_V7.tsv"
PMP_36CI_CicularOnly_F2Filtered_AtLeast40XIllDepth_AssemblySummary_TSV_PATH = PMP_SM_ResultsSummary_Dir_210108 + "/210108_PMP_36CI_CircularOnly_F2Filtered_AtLeast40XMeanDepthIllumina_AssemblySummary_V7.tsv"

# Read the three tab-separated summaries in one pass
(PMP_50CI_AssemblySummary,
 PMP_42CI_CicularOnly_AssemblySummary,
 PMP_36CI_CicularOnly_F2Filtered_AtLeast40XDepth_AssemblySummary) = [
    pd.read_csv(summary_TSV_PATH, sep = "\t")
    for summary_TSV_PATH in (
        PMP_50CI_AssemblySummary_TSV_PATH,
        PMP_42CI_CicularOnly_AssemblySummary_TSV_PATH,
        PMP_36CI_CicularOnly_F2Filtered_AtLeast40XIllDepth_AssemblySummary_TSV_PATH,
    )
]

# The 36CI table is the analysis set used in the rest of the notebook
PMP_36CI_AnalysisSet_AssemblySummary = PMP_36CI_CicularOnly_F2Filtered_AtLeast40XDepth_AssemblySummary

# Sample IDs of the analysis set
SampleIDs_36CI_SOI = PMP_36CI_AnalysisSet_AssemblySummary["SampleID"].tolist()

print(','.join(SampleIDs_36CI_SOI) )

# SampleID -> Illumina average coverage / primary lineage / dataset tag lookups (built from the 50CI table)
ID_To_IlluminaAvrgCov_Dict, ID_To_Lineage_Dict, ID_To_Dataset_Dict = [
    dict(PMP_50CI_AssemblySummary[['SampleID', value_Column]].values)
    for value_Column in ('IlluminaWGSToH37rv_AvrgCov', 'PrimaryLineage_PB', 'Dataset_Tag')
]

M0011368_9,M0014888_3,M0016395_7,M0010874_7,01_R1430,02_R0894,02_R1708,02_R1896,M0016737_0,M0017522_5,01_R1134,M0003941_3,02_R1179,N1176,N0072,N0153,N0145,N0155,N0004,N1274,N0054,N1272,N0091,N1202,N1177,RW-TB008,DNA028,DNA075,DNA091,DNA044,DNA020,AZE_02_042,DNA019_Rose,DNA120,DNA188,DNA086


#### What is the distribution of samples from each dataset?

In [7]:
# Number of analysis-set (36CI) samples contributed by each source dataset
PMP_36CI_AnalysisSet_AssemblySummary["Dataset_Tag"].value_counts()

Farhat_Peru_2019        13
ChinerOms_2019          12
TB_Portals_15CI_R1      10
Ngabonziza_Lin8_2020     1
Name: Dataset_Tag, dtype: int64

In [8]:
# Sanity check: number of samples in the analysis set
len(SampleIDs_36CI_SOI)

36

# 0) Read in pickles of processed data

## Read back in .npy and .pickle files of EBR scores 

In [9]:
# Root directory of the PB-vs-Illumina analysis outputs (36CI set, V7), relative to this notebook
PB_Vs_Illumina_DataAnalysis_Dir = "../../../210112_PBvsI_VCeval_AnalysisDir_V7_36CI"

# Define directory for EBR analysis data
PBvIll_EBR_Dir = PB_Vs_Illumina_DataAnalysis_Dir + "/210112_EBR_H37rv_36CI_MM2vsPilon_V7"         

EBR_36CI_WGS40X_NPY_PATH = f"{PBvIll_EBR_Dir}/210112_EBR_V7_36CI.npy"

# Parse in the aggregated EBR-36CI array
EBR_36CI_Array_A4 = np.load(EBR_36CI_WGS40X_NPY_PATH)


# Path to the pickle of individual (per-isolate) EBR arrays.
# The load itself is disabled below, so dictOf_EBR_IndivIsolate_NPYs is NOT defined by this cell.
Pickle_PATH_dictOf_EBR_IndivIsolate_NPYs = PBvIll_EBR_Dir + "/210112_dictOf_EBR_V7_IndivIsolate_NPYs_36CI.pickle"                   

#with open(Pickle_PATH_dictOf_EBR_IndivIsolate_NPYs, "rb") as f: dictOf_EBR_IndivIsolate_NPYs = pickle.load(f)


In [10]:
# The plain mean comes out as NaN because the EBR array contains NaN entries
np.mean(EBR_36CI_Array_A4)

nan

In [11]:
# Genome-wide mean EBR, ignoring the NaN positions
np.nanmean(EBR_36CI_Array_A4)

0.9886257528374396

In [12]:
# Preview the aggregated EBR array
EBR_36CI_Array_A4

array([0.91666667, 0.91666667, 0.91666667, ..., 1.        , 1.        ,
       1.        ])

In [13]:
# 1-D array whose length should equal the H37Rv genome length printed when the GenBank file was parsed
print(EBR_36CI_Array_A4.shape)

(4411532,)


In [14]:
#len( dictOf_EBR_IndivIsolate_NPYs.keys() )

## Read back in pickle of Genmap pileup mappability calculations

In [15]:
# Genmap mappability results live inside the PB-vs-Illumina analysis directory
Genmap_Map_AnalysisDir = PB_Vs_Illumina_DataAnalysis_Dir + "/201027_Genmap_Mappability_H37rv_V1"
ParsedAndPickled_GenmapOutput = f"{Genmap_Map_AnalysisDir}/201027_ParsedAndPickled_GenmapOutput"

Pickle_PATH_dictOf_GM_PileupMap_Arrays = ParsedAndPickled_GenmapOutput + "/201027_dictOf_GM_PileupMap_Arrays.pickle"

# Load the dict of pileup-mappability arrays.
# NOTE: pickle.load can execute arbitrary code -- only load pickles from a trusted source.
with open(Pickle_PATH_dictOf_GM_PileupMap_Arrays, "rb") as f:
    dictOf_GM_PileupMap_Arrays = pickle.load(f)

In [16]:
# Available pileup-mappability tracks, keyed as K<number>_E<number>
# (presumably k-mer length and number of allowed mismatches -- confirm against the Genmap run)
dictOf_GM_PileupMap_Arrays.keys()

dict_keys(['K50_E0', 'K50_E2', 'K50_E4', 'K75_E0', 'K75_E2', 'K75_E4', 'K100_E0', 'K100_E2', 'K100_E4', 'K125_E0', 'K125_E2', 'K125_E4', 'K150_E0', 'K150_E2', 'K150_E4'])

## Read in GC content pickles (GC% calculated as a sliding window)

In [17]:
def _load_Pickle(pickle_PATH):
    """Unpickle and return the object stored at `pickle_PATH`.

    NOTE: pickle.load can execute arbitrary code -- only use on trusted files.
    """
    with open(pickle_PATH, "rb") as pickle_File:
        return pickle.load(pickle_File)


# Directory with the pre-computed sliding-window GC% arrays for H37Rv
Mtb_RefDir = "/n/data1/hms/dbmi/farhat/mm774/References"
GCcontent_OutputDir = f"{Mtb_RefDir}/191217_H37rv_GC_CircGenome_Arrays"

# One pickle per window size (10 bp, 50 bp, 100 bp)
Pickle_PATH_H37rv_GC_10bp_SW_Array = GCcontent_OutputDir + "/191217_H37rv_GC_10bp_SW_Array.pickle"
Pickle_PATH_H37rv_GC_50bp_SW_Array = GCcontent_OutputDir + "/191217_H37rv_GC_50bp_SW_Array.pickle"
Pickle_PATH_H37rv_GC_100bp_SW_Array = GCcontent_OutputDir + "/191217_H37rv_GC_100bp_SW_Array.pickle"

H37rv_GC_10bp_SW_Array = _load_Pickle(Pickle_PATH_H37rv_GC_10bp_SW_Array)
H37rv_GC_50bp_SW_Array = _load_Pickle(Pickle_PATH_H37rv_GC_50bp_SW_Array)
H37rv_GC_100bp_SW_Array = _load_Pickle(Pickle_PATH_H37rv_GC_100bp_SW_Array)

# Import and parse processed H37rv genome annotations

In [18]:
# Directory (relative to this notebook) with the processed H37Rv annotation tables
RepoRef_Dir = "../../References"

AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir = f"{RepoRef_Dir}/201027_H37rv_AnnotatedGenes_And_IntergenicRegions"

# Paths to the genes-only, intergenic-only, and combined annotation TSVs
H37Rv_GenomeAnnotations_Genes_TSV = f"{AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir}/H37Rv_GenomeAnnotations.Genes.tsv"
H37Rv_GenomeAnnotations_IntergenicRegions_TSV = f"{AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir}/H37Rv_GenomeAnnotations.IntergenicRegions.tsv"
H37Rv_GenomeAnnotations_Genes_And_IntergenicRegions_TSV = f"{AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir}/H37Rv_GenomeAnnotations.Genes.And.IntergenicRegions.tsv"    


## H37Rv gene annotations TSV (not loaded; only the combined table is used in this notebook)
#H37Rv_GenomeAnno_Genes_DF = pd.read_csv(H37Rv_GenomeAnnotations_Genes_TSV, sep = "\t")

## H37Rv intergenic regions TSV (not loaded)
#H37Rv_IntergenicRegions_DF = pd.read_csv(H37Rv_GenomeAnnotations_IntergenicRegions_TSV, sep = "\t")

## H37Rv genes + intergenic regions TSV (the table used for the feature-level analysis)
H37Rv_AllRegions_DF = pd.read_csv(H37Rv_GenomeAnnotations_Genes_And_IntergenicRegions_TSV, sep = "\t")


In [19]:
# (rows, columns) of the combined genes + intergenic regions annotation table
H37Rv_AllRegions_DF.shape

(7151, 17)

In [20]:
# Preview the first rows of the annotation table
H37Rv_AllRegions_DF.head(3)

Unnamed: 0,Chrom,Start,End,Strand,H37rv_GeneID,Symbol,Feature,ExcludedGroup_Category,PEandPPE_Subfamily,Product,Functional_Category,Is_Pseudogene,IntergenicRegion_IsNextTo_PLC_Gene,Intergenic_GeneToTheLeft,Intergenic_GeneToTheLeft_FuncCategory,Intergenic_GeneToTheRight,Intergenic_GeneToTheRight_FuncCategory
0,NC_000962.3,0,1524,+,Rv0001,dnaA,CDS,NotExcluded,,Chromosomal replication initiator protein DnaA,information pathways,No,,,,,
1,NC_000962.3,1524,2051,,IntergenicRegion_1_Rv0001-Rv0002,,Intergenic,Intergenic,,,Intergenic,,False,Rv0001,NotExcluded,Rv0002,NotExcluded
2,NC_000962.3,2051,3260,+,Rv0002,dnaN,CDS,NotExcluded,,DNA polymerase III (beta chain) DnaN (DNA nucl...,information pathways,No,,,,,


In [21]:
# Number of annotated regions of each feature type
H37Rv_AllRegions_DF["Feature"].value_counts()

CDS           4031
Intergenic    3072
tRNA            45
rRNA             3
Name: Feature, dtype: int64

# Calculate feature-level EBR, Pileup Mappability, and GC% across all annotated features of H37Rv

In [22]:
def _nanmean_Or_NaN(values):
    """Mean of `values` ignoring NaNs.

    Returns NaN directly (without the RuntimeWarning np.nanmean emits) when
    `values` is empty or contains only NaNs.
    """
    values = np.asarray(values)
    if values.size == 0 or np.isnan(values).all():
        return np.nan
    return np.nanmean(values)


# Pileup-mappability tracks (keys of dictOf_GM_PileupMap_Arrays) summarised per feature.
# Each key K produces a "Mean_MapScore_<K>" column, in this order.
PileupMap_Keys_ToSummarise = ["K50_E4", "K100_E4", "K125_E4"]

# One pandas Series per annotated feature: the annotation columns plus the metrics computed below
listOf_FeatureLevelAnalysis_Rows = []

# Passing `total` lets tqdm show a real progress bar (iterrows() has no length)
for _, row in tqdm(H37Rv_AllRegions_DF.iterrows(), total = len(H37Rv_AllRegions_DF)):

    # Coordinates are used as a 0-based, half-open interval [Start, End)
    gene_StartPos_0based = row["Start"]
    gene_EndPos_0based = row["End"]
    gene_Length = gene_EndPos_0based - gene_StartPos_0based
    gene_Slice = slice(gene_StartPos_0based, gene_EndPos_0based)

    ### EBR Analysis ###
    EBR_Aggr_Subset = EBR_36CI_Array_A4[gene_Slice]
    EBR_IsNAN = np.isnan(EBR_Aggr_Subset)

    # Mean EBR score across the feature (EXCLUDING NANs)
    gene_Mean_EBR_Aggr_Score = _nanmean_Or_NaN(EBR_Aggr_Subset)

    if gene_Length > 0 and len(EBR_Aggr_Subset) > 0:
        gene_ProportionWith_NANs_EBR_Aggr = EBR_IsNAN.sum() / gene_Length

        # NaN positions stay in the denominator but can never count as "perfect";
        # masking them first avoids comparing NaN with 1 (same count, no warning).
        gene_PercentPerfect_EBR_Aggr = (EBR_Aggr_Subset[~EBR_IsNAN] >= 1).sum() / len(EBR_Aggr_Subset) * 100
    else:
        # Zero-length (or out-of-range) feature: nothing to summarise
        gene_ProportionWith_NANs_EBR_Aggr = np.nan
        gene_PercentPerfect_EBR_Aggr = np.nan

    ### GC Content ###
    gene_Sequence = Mtb_H37rv_Sequence[gene_Slice]
    gene_GC_Content = gc_content(gene_Sequence) if len(gene_Sequence) > 0 else np.nan

    ### Add columns to a copy of the row (do not mutate the Series handed out by iterrows) ###
    feature_Row = row.copy()

    feature_Row["Mean_EBR_36CI"] = gene_Mean_EBR_Aggr_Score
    feature_Row["gene_PercentPerfect_EBR_36CI"] = gene_PercentPerfect_EBR_Aggr
    feature_Row["gene_ProportionWith_NANs_EBR_36CI"] = gene_ProportionWith_NANs_EBR_Aggr

    feature_Row["gene_GC_Content"] = gene_GC_Content

    ### Pileup Mappability Analysis ###
    for pmap_Key in PileupMap_Keys_ToSummarise:
        feature_Row[f"Mean_MapScore_{pmap_Key}"] = _nanmean_Or_NaN(dictOf_GM_PileupMap_Arrays[pmap_Key][gene_Slice])

    listOf_FeatureLevelAnalysis_Rows.append(feature_Row)


  from ipykernel import kernelapp as app
7151it [00:59, 120.94it/s]


In [23]:
# Assemble the per-feature rows into one table and append each feature's length (End - Start)
H37rv_FeatureLevelAnalysis_DF = (
    pd.DataFrame(listOf_FeatureLevelAnalysis_Rows)
    .assign(Length = lambda features_DF: features_DF["End"] - features_DF["Start"])
)

H37rv_FeatureLevelAnalysis_DF.shape

(7151, 25)

In [24]:
# Feature-type counts of the feature-level analysis table
H37rv_FeatureLevelAnalysis_DF["Feature"].value_counts()

CDS           4031
Intergenic    3072
tRNA            45
rRNA             3
Name: Feature, dtype: int64

### Subset for genes (creating the gene-level analysis DF)

In [25]:
# Row masks: gene-like features (CDS, tRNA, rRNA) vs. intergenic regions
_feature_IsGene = H37rv_FeatureLevelAnalysis_DF["Feature"].isin( ["CDS", "tRNA", "rRNA"] )
_feature_IsIntergenic = H37rv_FeatureLevelAnalysis_DF["Feature"].isin( ["Intergenic"] )

# Annotation columns that are dropped from the gene-level table
Columns_ToDropFrom_GLA = ['IntergenicRegion_IsNextTo_PLC_Gene', 'Intergenic_GeneToTheLeft', 'Intergenic_GeneToTheLeft_FuncCategory',
                          'Intergenic_GeneToTheRight', 'Intergenic_GeneToTheRight_FuncCategory']

H37rv_GeneLevelAnalysis_DF = H37rv_FeatureLevelAnalysis_DF[_feature_IsGene].drop(columns = Columns_ToDropFrom_GLA)

H37rv_IntergenicLevelAnalysis_DF = H37rv_FeatureLevelAnalysis_DF[_feature_IsIntergenic]

# Short aliases: Feature / Gene / Intergenic Level Analysis
FLA_DF, GLA_DF, ILA_DF = (
    H37rv_FeatureLevelAnalysis_DF,
    H37rv_GeneLevelAnalysis_DF,
    H37rv_IntergenicLevelAnalysis_DF,
)


In [26]:
# Columns of the gene-level table
H37rv_GeneLevelAnalysis_DF.columns

Index(['Chrom', 'Start', 'End', 'Strand', 'H37rv_GeneID', 'Symbol', 'Feature', 'ExcludedGroup_Category', 'PEandPPE_Subfamily', 'Product', 'Functional_Category', 'Is_Pseudogene', 'Mean_EBR_36CI', 'gene_PercentPerfect_EBR_36CI', 'gene_ProportionWith_NANs_EBR_36CI', 'gene_GC_Content', 'Mean_MapScore_K50_E4', 'Mean_MapScore_K100_E4', 'Mean_MapScore_K125_E4', 'Length'], dtype='object')

In [27]:
# (rows, columns): all features (genes + intergenic regions)
FLA_DF.shape

(7151, 25)

In [28]:
# (rows, columns): gene-level table (CDS, tRNA and rRNA features)
GLA_DF.shape

(4079, 20)

In [29]:
# (rows, columns): intergenic regions only
ILA_DF.shape

(3072, 25)

In [30]:
# Columns of the full feature-level table
FLA_DF.columns

Index(['Chrom', 'Start', 'End', 'Strand', 'H37rv_GeneID', 'Symbol', 'Feature', 'ExcludedGroup_Category', 'PEandPPE_Subfamily', 'Product', 'Functional_Category', 'Is_Pseudogene', 'IntergenicRegion_IsNextTo_PLC_Gene', 'Intergenic_GeneToTheLeft', 'Intergenic_GeneToTheLeft_FuncCategory', 'Intergenic_GeneToTheRight', 'Intergenic_GeneToTheRight_FuncCategory', 'Mean_EBR_36CI', 'gene_PercentPerfect_EBR_36CI', 'gene_ProportionWith_NANs_EBR_36CI', 'gene_GC_Content', 'Mean_MapScore_K50_E4', 'Mean_MapScore_K100_E4', 'Mean_MapScore_K125_E4', 'Length'], dtype='object')

In [31]:
# Preview: feature-level table
FLA_DF.head(2)

Unnamed: 0,Chrom,Start,End,Strand,H37rv_GeneID,Symbol,Feature,ExcludedGroup_Category,PEandPPE_Subfamily,Product,Functional_Category,Is_Pseudogene,IntergenicRegion_IsNextTo_PLC_Gene,Intergenic_GeneToTheLeft,Intergenic_GeneToTheLeft_FuncCategory,Intergenic_GeneToTheRight,Intergenic_GeneToTheRight_FuncCategory,Mean_EBR_36CI,gene_PercentPerfect_EBR_36CI,gene_ProportionWith_NANs_EBR_36CI,gene_GC_Content,Mean_MapScore_K50_E4,Mean_MapScore_K100_E4,Mean_MapScore_K125_E4,Length
0,NC_000962.3,0,1524,+,Rv0001,dnaA,CDS,NotExcluded,,Chromosomal replication initiator protein DnaA,information pathways,No,,,,,,0.99743,92.979003,0.0,58.923885,1.0,1.0,1.0,1524
1,NC_000962.3,1524,2051,,IntergenicRegion_1_Rv0001-Rv0002,,Intergenic,Intergenic,,,Intergenic,,False,Rv0001,NotExcluded,Rv0002,NotExcluded,0.999315,97.533207,0.0,56.925996,1.0,1.0,1.0,527


In [32]:
# Preview: gene-level table
GLA_DF.head(2)

Unnamed: 0,Chrom,Start,End,Strand,H37rv_GeneID,Symbol,Feature,ExcludedGroup_Category,PEandPPE_Subfamily,Product,Functional_Category,Is_Pseudogene,Mean_EBR_36CI,gene_PercentPerfect_EBR_36CI,gene_ProportionWith_NANs_EBR_36CI,gene_GC_Content,Mean_MapScore_K50_E4,Mean_MapScore_K100_E4,Mean_MapScore_K125_E4,Length
0,NC_000962.3,0,1524,+,Rv0001,dnaA,CDS,NotExcluded,,Chromosomal replication initiator protein DnaA,information pathways,No,0.99743,92.979003,0.0,58.923885,1.0,1.0,1.0,1524
2,NC_000962.3,2051,3260,+,Rv0002,dnaN,CDS,NotExcluded,,DNA polymerase III (beta chain) DnaN (DNA nucl...,information pathways,No,0.998093,93.134822,0.0,59.966915,1.0,1.0,1.0,1209


In [33]:
# Preview: intergenic-region table
ILA_DF.head(2)

Unnamed: 0,Chrom,Start,End,Strand,H37rv_GeneID,Symbol,Feature,ExcludedGroup_Category,PEandPPE_Subfamily,Product,Functional_Category,Is_Pseudogene,IntergenicRegion_IsNextTo_PLC_Gene,Intergenic_GeneToTheLeft,Intergenic_GeneToTheLeft_FuncCategory,Intergenic_GeneToTheRight,Intergenic_GeneToTheRight_FuncCategory,Mean_EBR_36CI,gene_PercentPerfect_EBR_36CI,gene_ProportionWith_NANs_EBR_36CI,gene_GC_Content,Mean_MapScore_K50_E4,Mean_MapScore_K100_E4,Mean_MapScore_K125_E4,Length
1,NC_000962.3,1524,2051,,IntergenicRegion_1_Rv0001-Rv0002,,Intergenic,Intergenic,,,Intergenic,,False,Rv0001,NotExcluded,Rv0002,NotExcluded,0.999315,97.533207,0.0,56.925996,1.0,1.0,1.0,527
3,NC_000962.3,3260,3279,,IntergenicRegion_2_Rv0002-Rv0003,,Intergenic,Intergenic,,,Intergenic,,False,Rv0002,NotExcluded,Rv0003,NotExcluded,1.0,100.0,0.0,73.684211,1.0,1.0,1.0,19


## Save TSV of gene level mappability and PBvsIll agreement + annotations (On O2)

In [34]:
# Define directory for analysis of SNP data

Repo_DataDir = "../../Data"

FeatureLevelAnalysis_Dir_GitRepo = Repo_DataDir + "/210113_H37Rv_FeatureLevelAnalysis_EBR_Pmap" 
FeatureLevelAnalysis_Dir_O2 = PBvIll_EBR_Dir + "/210113_H37Rv_FeatureLevelAnalysis_EBR_Pmap" 

#!mkdir $FeatureLevelAnalysis_Dir_O2 $FeatureLevelAnalysis_Dir_GitRepo
!mkdir $FeatureLevelAnalysis_Dir_O2


H37Rv_FeatureLevelAnalysis_EBR_Pmap_TSV_O2 = f"{FeatureLevelAnalysis_Dir_O2}/H37Rv_FeatureLevelAnalysis.EBR_And_Pmap.tsv"
H37Rv_FeatureLevelAnalysis_EBR_Pmap_TSV_GitRepo = f"{FeatureLevelAnalysis_Dir_GitRepo}/H37Rv_FeatureLevelAnalysis.EBR_And_Pmap.tsv"

H37Rv_GeneLevelAnalysis_EBR_Pmap_TSV_O2 = f"{FeatureLevelAnalysis_Dir_O2}/H37Rv_FeatureLevelAnalysis.EBR_And_Pmap.Genes.tsv"
H37Rv_GeneLevelAnalysis_EBR_Pmap_TSV_GitRepo = f"{FeatureLevelAnalysis_Dir_GitRepo}/H37Rv_FeatureLevelAnalysis.EBR_And_Pmap.Genes.tsv"

H37Rv_IntergenicLevelAnalysis_EBR_Pmap_TSV_O2 = f"{FeatureLevelAnalysis_Dir_O2}/H37Rv_FeatureLevelAnalysis.EBR_And_Pmap.IntergenicRegions.tsv"
H37Rv_IntergenicLevelAnalysis_EBR_Pmap_TSV_GitRepo = f"{FeatureLevelAnalysis_Dir_GitRepo}/H37Rv_FeatureLevelAnalysis.EBR_And_Pmap.IntergenicRegions.tsv"        



# Feature Level Analysis (Gene + intergenic)
FLA_DF.to_csv(H37Rv_FeatureLevelAnalysis_EBR_Pmap_TSV_O2, sep = "\t", index = False)
#FLA_DF.to_csv(H37Rv_FeatureLevelAnalysis_EBR_Pmap_TSV_GitRepo, sep = "\t", index = False)


# Gene level Analysis 
GLA_DF.to_csv(H37Rv_GeneLevelAnalysis_EBR_Pmap_TSV_O2, sep = "\t", index = False)
#GLA_DF.to_csv(H37Rv_GeneLevelAnalysis_EBR_Pmap_TSV_GitRepo, sep = "\t", index = False)


# Intergenic Region Analysis 
ILA_DF.to_csv(H37Rv_IntergenicLevelAnalysis_EBR_Pmap_TSV_O2, sep = "\t", index = False)
#ILA_DF.to_csv(H37Rv_IntergenicLevelAnalysis_EBR_Pmap_TSV_GitRepo, sep = "\t", index = False)



mkdir: cannot create directory ‘../../../210112_PBvsI_VCeval_AnalysisDir_V7_36CI/210112_EBR_H37rv_36CI_MM2vsPilon_V7/210113_H37Rv_FeatureLevelAnalysis_EBR_Pmap’: File exists


## Look at output directory on O2 directory

In [35]:
# List the files in the O2 output directory
!ls -lah $FeatureLevelAnalysis_Dir_O2

total 4.1M
drwxrwsr-x 2 mm774 farhat  207 Jan 18 13:29 .
drwxrwsr-x 4 mm774 farhat  398 Feb  2 19:19 ..
-rw-rw-r-- 1 mm774 farhat 808K Feb 26 16:24 H37Rv_FeatureLevelAnalysis.EBR_And_Pmap.Genes.tsv
-rw-rw-r-- 1 mm774 farhat 597K Feb 26 16:24 H37Rv_FeatureLevelAnalysis.EBR_And_Pmap.IntergenicRegions.tsv
-rw-rw-r-- 1 mm774 farhat 1.4M Feb 26 16:24 H37Rv_FeatureLevelAnalysis.EBR_And_Pmap.tsv


In [41]:
# Line counts of the TSVs in the O2 output directory (each file: one header line + one line per row)
!wc -l $FeatureLevelAnalysis_Dir_O2/*

   4080 ../../../210112_PBvsI_VCeval_AnalysisDir_V7_36CI/210112_EBR_H37rv_36CI_MM2vsPilon_V7/210113_H37Rv_FeatureLevelAnalysis_EBR_Pmap/H37Rv_FeatureLevelAnalysis.EBR_And_Pmap.Genes.tsv
   3073 ../../../210112_PBvsI_VCeval_AnalysisDir_V7_36CI/210112_EBR_H37rv_36CI_MM2vsPilon_V7/210113_H37Rv_FeatureLevelAnalysis_EBR_Pmap/H37Rv_FeatureLevelAnalysis.EBR_And_Pmap.IntergenicRegions.tsv
   7152 ../../../210112_PBvsI_VCeval_AnalysisDir_V7_36CI/210112_EBR_H37rv_36CI_MM2vsPilon_V7/210113_H37Rv_FeatureLevelAnalysis_EBR_Pmap/H37Rv_FeatureLevelAnalysis.EBR_And_Pmap.tsv
  14305 total


## Look at output directory in the Git repo Data/ directory

In [40]:
# List the files in the Git-repo output directory.
# Fails with "No such file or directory" unless that directory has been created and written by the save step above.
!ls -lah $FeatureLevelAnalysis_Dir_GitRepo

ls: cannot access ../../Data/210113_H37Rv_FeatureLevelAnalysis_EBR_Pmap: No such file or directory


In [39]:
# Line counts of the TSVs in the Git-repo output directory.
# Fails with "No such file or directory" unless that directory has been created and written by the save step above.
!wc -l $FeatureLevelAnalysis_Dir_GitRepo/*

wc: ../../Data/210113_H37Rv_FeatureLevelAnalysis_EBR_Pmap/*: No such file or directory
