# Processing

### Maximillian Marin
### mgmarin@g.harvard.edu
### 2020/10/27

Goal: Creating a single table with comprehensive annotations of H37Rv, along with additional annotations regarding groupings of genes (ie. PLC, Not Excluded, etc)


Specifically, I want to produce a TSV with H37Rv annotations (0-based coordinates) that has: <br>
1) Core information regarding each gene from Mycobrowser (Coordinates, functional annotation, etc) <br>
2) Annotate INTERGENIC REGIONS <br>
3) INTERGENIC REGIONs should be annotated with their neighboring genes <br>



In [85]:
import numpy as np
import pandas as pd
import vcf
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

%matplotlib inline

#### Pandas Viewing Settings

In [86]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

## Define PATHs to starting annotation files for H37Rv

### Starting annotations:

#### 1) Mycobacterium_tuberculosis_H37Rv_txt_v3.txt.tsv
#### 2) 190927_MycoBrowserV3_FunctionalCategories.tsv (This has all annotated features of H37Rv defined by Mycobrowser)
#### 3) Coscolla_etal_2015_Table_S9_GenesExcludedFromTBanalysis.tsv
#### 4) 190927_LouisReview_Indepth_PE_Annotations.tsv
#### 5) 190927_LouisReview_Indepth_PPE_Annotations.tsv


In [87]:
# Define path to references directory in the associated Git repository
GitRepo_References_Dir = "../../References"
# Define directory for aggregation of gene lists
H37rv_GeneAnnotationsAndLists_Dir = GitRepo_References_Dir + "/190927_H37rv_GeneAnnotationsAndLists" 

In [88]:
!ls -1 $H37rv_GeneAnnotationsAndLists_Dir

190927_Ates2020_Indepth_PE_Annotations.tsv
190927_Ates2020_Indepth_PPE_Annotations.tsv
190927_H37rv_ListOf_ESXgenes.tsv
190927_LouisReview_Indepth_PE_Annotations.tsv
190927_LouisReview_Indepth_PPE_Annotations.tsv
190927_MycoBrowserV3_FunctionalCategories.tsv
190927_Standard_GenesToFiler_PEPPEsAndCoscolla.tsv
190927_Standard_GenesToFiler_PEPPEsAndCoscolla.txt
190927_Vargas_etal_ST3_GeneCategories.csv
191007_Standard_GenesToFilter_PEPPEsMGEsAndCoscolla.tsv
191007_Standard_GenesToFitler_PEPPEsAndCoscolla.tsv
191217_FromRoger_H37rv_genome_summary.tsv
Coscolla_etal_2015_Table_S9_GenesExcludedFromTBanalysis.tsv


In [89]:
!head $H37rv_GeneAnnotationsAndLists_Dir/Coscolla_etal_2015_Table_S9_GenesExcludedFromTBanalysis.tsv

Gene name	Locus identifier	Description
-	Rv0094c	hypothetical protein
-	Rv0095c	hypothetical protein
-	Rv0257	hypothetical protein
-	Rv0277c	hypothetical protein
hspR	Rv0353	HEAT shock
-	Rv0393	REP13E12 repeat
-	Rv0397	REP13E12 repeat family
-	Rv0487	hypothetical protein
senX3	Rv0490	putative two


### A) Mycobrowser Functional Categories

In [90]:
#### A) Mycobrowser Functional Categories

H37rv_MycoBrowser_FuncCate_TSV_PATH = H37rv_GeneAnnotationsAndLists_Dir + "/190927_MycoBrowserV3_FunctionalCategories.tsv"

H37rv_MycoBrowser_FuncCate_DF = pd.read_csv(H37rv_MycoBrowser_FuncCate_TSV_PATH, sep="\t")

MycoBrowser_FunctionalCategoryMapping = dict(H37rv_MycoBrowser_FuncCate_DF[['Locus', 'Functional_Category']].values)

print( H37rv_MycoBrowser_FuncCate_DF.shape )


(4187, 3)


In [91]:
H37rv_MycoBrowser_FuncCate_DF["Functional_Category"].value_counts()

conserved hypotheticals                    1042
intermediary metabolism and respiration     939
cell wall and cell processes                772
lipid metabolism                            274
information pathways                        244
virulence, detoxification, adaptation       241
regulatory proteins                         201
PE/PPE                                      168
insertion seqs and phages                   147
stable RNAs                                 144
unknown                                      15
Name: Functional_Category, dtype: int64

### B) PE and PPE gene sub-family annotations

In [92]:
#### B) PE and PPE gene sub-family annotations
## Adding PPE and PE subfamily annotations to DF

Ates_2020_PE_Annotations_TSV_PATH = H37rv_GeneAnnotationsAndLists_Dir + "/190927_Ates2020_Indepth_PE_Annotations.tsv"      
Ates_2020_PPE_Annotations_TSV_PATH = H37rv_GeneAnnotationsAndLists_Dir + "/190927_Ates2020_Indepth_PPE_Annotations.tsv"   


PE_Anno_DF = pd.read_csv(Ates_2020_PE_Annotations_TSV_PATH, sep = "\t")
PPE_Anno_DF = pd.read_csv(Ates_2020_PPE_Annotations_TSV_PATH, sep = "\t")


PPE_Anno_DF["PEandPPE_Subfamily"] = "PPE_" +  "SL-" + PPE_Anno_DF["Sublineage"].map(str) +"_"+ PPE_Anno_DF["Subfamily"].map(str)
PE_Anno_DF["PEandPPE_Subfamily"] = "PE_" + PE_Anno_DF["Subfamily"].map(str) + "_" + PE_Anno_DF["PGRS"].fillna('')

ColumnsOfInterest_Mapping = ["H37Rv gene", "PEandPPE_Subfamily"]
PEandPPE_SubgroupMapping_DF = pd.concat( [ PPE_Anno_DF[ColumnsOfInterest_Mapping], PE_Anno_DF[ColumnsOfInterest_Mapping] ] )      

print(  PEandPPE_SubgroupMapping_DF.shape )

(160, 2)


#### C) Genes with repetitive sequence content as defined in Coscolla et. al. 2015

In [93]:
#### C) Genes with repetitive sequence content as defined in Coscolla et. al. 2015

CoscollaEtAl_ListOf_82genes_TSV_PATH = f"{H37rv_GeneAnnotationsAndLists_Dir}/Coscolla_etal_2015_Table_S9_GenesExcludedFromTBanalysis.tsv"

Coscolla_DF = pd.read_csv(CoscollaEtAl_ListOf_82genes_TSV_PATH, sep="\t")

# Import and parse H37rv genome annotations (MycoBrowser V3)

In [94]:
Mtb_RefDir = "/n/data1/hms/dbmi/farhat/mm774/References"

Mycobrowser_H37rv_RefFiles_Dir = f"{Mtb_RefDir}/190619_Mycobrowser_H37rv_ReferenceFiles"

Mycobrowser_H37rv_GenomeAnno_TSV = f"{Mycobrowser_H37rv_RefFiles_Dir}/Mycobacterium_tuberculosis_H37Rv_txt_v3.txt.tsv"

In [95]:
# !ls -1 $Mycobrowser_H37rv_RefFiles_Dir

In [96]:
Mycobrowser_H37rv_GenomeAnno_DF = pd.read_csv(Mycobrowser_H37rv_GenomeAnno_TSV, sep = "\t").set_index('Locus')

Mycobrowser_H37rv_GenomeAnno_DF["H37rv_GeneID"] = Mycobrowser_H37rv_GenomeAnno_DF.index
Mycobrowser_H37rv_GenomeAnno_DF = Mycobrowser_H37rv_GenomeAnno_DF.sort_values("Start")
Mycobrowser_H37rv_GenomeAnno_DF["chromStart_0based"] = Mycobrowser_H37rv_GenomeAnno_DF["Start"] - 1

Mycobrowser_H37rv_GenomeAnno_DF["Feature_Length"] = Mycobrowser_H37rv_GenomeAnno_DF["Stop"] - Mycobrowser_H37rv_GenomeAnno_DF["chromStart_0based"]


Mycobrowser_H37rv_GenomeAnno_DF["Chrom"] = Mycobrowser_H37rv_GenomeAnno_DF["Refseq_ID"]

Mycobrowser_H37rv_GenomeAnno_DF = Mycobrowser_H37rv_GenomeAnno_DF.reset_index(drop=True)

Mycobrowser_H37rv_GenomeAnno_Genes_DF = Mycobrowser_H37rv_GenomeAnno_DF[ ~Mycobrowser_H37rv_GenomeAnno_DF["Feature"].isin(['promoter', "-10_signal", "-35_signal", 'misc_RNA', "ncRNA"]) ]   

#Mycobrowser_H37rv_GenomeAnno_Genes_DF = Mycobrowser_H37rv_GenomeAnno_DF[ Mycobrowser_H37rv_GenomeAnno_DF["Feature"].isin(["CDS"]) ]                      


H37rv_GenomeAnno_ncRNA_DF = Mycobrowser_H37rv_GenomeAnno_DF[ Mycobrowser_H37rv_GenomeAnno_DF["Feature"].isin(['ncRNA']) ]
H37rv_GenomeAnno_tRNA_DF = Mycobrowser_H37rv_GenomeAnno_DF[ Mycobrowser_H37rv_GenomeAnno_DF["Feature"].isin(['tRNA']) ]


In [97]:
Mycobrowser_H37rv_GenomeAnno_DF["Feature"].value_counts()

CDS           4031
ncRNA           92
tRNA            45
promoter         6
-35_signal       4
-10_signal       4
rRNA             3
misc_RNA         2
Name: Feature, dtype: int64

In [98]:
Mycobrowser_H37rv_GenomeAnno_Genes_DF["Feature"].value_counts()

CDS     4031
tRNA      45
rRNA       3
Name: Feature, dtype: int64

In [99]:
Mycobrowser_H37rv_GenomeAnno_Genes_DF["Functional_Category"].value_counts()

conserved hypotheticals                    1042
intermediary metabolism and respiration     936
cell wall and cell processes                772
lipid metabolism                            272
information pathways                        242
virulence, detoxification, adaptation       239
regulatory proteins                         198
PE/PPE                                      168
insertion seqs and phages                   147
stable RNAs                                  48
unknown                                      15
Name: Functional_Category, dtype: int64

In [100]:
Mycobrowser_H37rv_GenomeAnno_Genes_DF["Functional_Category"].value_counts()

conserved hypotheticals                    1042
intermediary metabolism and respiration     936
cell wall and cell processes                772
lipid metabolism                            272
information pathways                        242
virulence, detoxification, adaptation       239
regulatory proteins                         198
PE/PPE                                      168
insertion seqs and phages                   147
stable RNAs                                  48
unknown                                      15
Name: Functional_Category, dtype: int64

In [101]:
Mycobrowser_H37rv_GenomeAnno_Genes_DF["Strand"].value_counts()

+    2049
-    2030
Name: Strand, dtype: int64

In [102]:
Mycobrowser_H37rv_GenomeAnno_Genes_DF.head(1)

Unnamed: 0,Refseq_ID,Mycobrowser_Version,Feature,Start,Stop,Score,Strand,Frame,Name,Function,Product,Comments,UniProt_AC,Is_Pseudogene,Functional_Category,Orthologues M. leprae,Orthologues M. marinum,Orthologues M. smegmatis,Orthologues M. bovis,Orthologues M. lepromatosis,Orthologues M. tuberculosis,Orthologues M. abscessus,Orthologues M. haemophilum,H37rv_GeneID,chromStart_0based,Feature_Length,Chrom
0,NC_000962.3,Mycobrowser_v3,CDS,1,1524,.,+,0.0,dnaA,Plays an important role in the initiation and ...,Chromosomal replication initiator protein DnaA,"Rv0001, (MT0001, MTV029.01, P49993), len: 507 ...",P9WNW3,No,information pathways,ML0001,MMAR_0001,MSMEG_6947,Mb0001,,,,,Rv0001,0,1524,NC_000962.3


## Strip down Mycobrowser Annotation table to core information

In [103]:
Mycobrowser_H37rv_GenomeAnno_Genes_DF.columns

Index(['Refseq_ID', 'Mycobrowser_Version', 'Feature', 'Start', 'Stop', 'Score', 'Strand', 'Frame', 'Name', 'Function', 'Product', 'Comments', 'UniProt_AC', 'Is_Pseudogene', 'Functional_Category', 'Orthologues M. leprae', 'Orthologues M. marinum', 'Orthologues M. smegmatis', 'Orthologues M. bovis', 'Orthologues M. lepromatosis', 'Orthologues M. tuberculosis', 'Orthologues M. abscessus', 'Orthologues M. haemophilum', 'H37rv_GeneID', 'chromStart_0based', 'Feature_Length', 'Chrom'], dtype='object')

In [104]:
MycoBrowser_TSV_ColumnsToKeep = ["Chrom", "chromStart_0based", "Stop", "Strand",
                                 "H37rv_GeneID", "Name", "Feature", "Functional_Category", "Is_Pseudogene", "Product"]

H37Rv_GenomeAnno_Genes_DF = Mycobrowser_H37rv_GenomeAnno_Genes_DF[MycoBrowser_TSV_ColumnsToKeep]

H37Rv_GenomeAnno_Genes_DF.columns = ["Chrom", "Start", "End", "Strand",
                                       "H37rv_GeneID", "Symbol", "Feature", "Functional_Category", "Is_Pseudogene", "Product"]


In [105]:
H37Rv_GenomeAnno_Genes_DF.shape

(4079, 10)

In [106]:
H37Rv_GenomeAnno_Genes_DF.head()

Unnamed: 0,Chrom,Start,End,Strand,H37rv_GeneID,Symbol,Feature,Functional_Category,Is_Pseudogene,Product
0,NC_000962.3,0,1524,+,Rv0001,dnaA,CDS,information pathways,No,Chromosomal replication initiator protein DnaA
1,NC_000962.3,2051,3260,+,Rv0002,dnaN,CDS,information pathways,No,DNA polymerase III (beta chain) DnaN (DNA nucl...
2,NC_000962.3,3279,4437,+,Rv0003,recF,CDS,information pathways,No,DNA replication and repair protein RecF (singl...
3,NC_000962.3,4433,4997,+,Rv0004,Rv0004,CDS,conserved hypotheticals,No,Conserved hypothetical protein
4,NC_000962.3,5239,7267,+,Rv0005,gyrB,CDS,information pathways,No,DNA gyrase (subunit B) GyrB (DNA topoisomerase...


# Add gene sub-grouping annotations to the Mycobrowser genome annotations

## A) Add PE/PPE Sub-family annotations to DF

In [107]:
# Add PE and PPE subfamily mapping
PEPPE_SubfamilyMapping = dict(PEandPPE_SubgroupMapping_DF[['H37Rv gene', 'PEandPPE_Subfamily']].values)
H37Rv_GenomeAnno_Genes_DF['PEandPPE_Subfamily'] = H37Rv_GenomeAnno_Genes_DF.H37rv_GeneID.map(PEPPE_SubfamilyMapping)

H37Rv_GenomeAnno_Genes_DF['PEandPPE_Subfamily'] = H37Rv_GenomeAnno_Genes_DF['PEandPPE_Subfamily'].fillna("None")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


## B) Add "identified Repetative Gene" (defined in Coscolla-et-al-2015) annotations to DF

In [108]:
ListOf_82_RepGenes_CoscollaEtAl = list(Coscolla_DF["Locus identifier"])


H37Rv_GenomeAnno_Genes_DF['Coscolla_etal_RepetitiveGene'] = ( H37Rv_GenomeAnno_Genes_DF["H37rv_GeneID"].isin(ListOf_82_RepGenes_CoscollaEtAl) )
H37Rv_GenomeAnno_Genes_DF['Coscolla_etal_RepetitiveGene'] = H37Rv_GenomeAnno_Genes_DF['Coscolla_etal_RepetitiveGene'].fillna("None")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [109]:
H37Rv_GenomeAnno_Genes_DF.shape

(4079, 12)

In [110]:
H37Rv_GenomeAnno_Genes_DF.head(2)

Unnamed: 0,Chrom,Start,End,Strand,H37rv_GeneID,Symbol,Feature,Functional_Category,Is_Pseudogene,Product,PEandPPE_Subfamily,Coscolla_etal_RepetitiveGene
0,NC_000962.3,0,1524,+,Rv0001,dnaA,CDS,information pathways,No,Chromosomal replication initiator protein DnaA,,False
1,NC_000962.3,2051,3260,+,Rv0002,dnaN,CDS,information pathways,No,DNA polymerase III (beta chain) DnaN (DNA nucl...,,False


## C) Annotate all genes by whether they are PLC or Not-Excluded:
a) Putative Low Confidence (PLC) <br>
b) Not commonly excluded (Non-PLC) <br>

In [111]:
# 1
H37Rv_GenomeAnno_Genes_PEPPEs_DF = H37Rv_GenomeAnno_Genes_DF[ H37Rv_GenomeAnno_Genes_DF["Functional_Category"] == "PE/PPE"]

# 2
H37Rv_GenomeAnno_Genes_82_CoscollaGenes_DF = H37Rv_GenomeAnno_Genes_DF[ H37Rv_GenomeAnno_Genes_DF["Coscolla_etal_RepetitiveGene"] == True]
H37Rv_GenomeAnno_Genes_69_CoscollaGenes_NoMGE_DF = H37Rv_GenomeAnno_Genes_82_CoscollaGenes_DF[ H37Rv_GenomeAnno_Genes_82_CoscollaGenes_DF["Functional_Category"] != "insertion seqs and phages"]       

# 3
H37Rv_GenomeAnno_Genes_MGEs_DF = H37Rv_GenomeAnno_Genes_DF[ H37Rv_GenomeAnno_Genes_DF["Functional_Category"] == "insertion seqs and phages"]


# Add "ExcludedGroup_Category" column to each of the 3 dataframes

# 1
H37Rv_GenomeAnno_Genes_PEPPEs_DF["ExcludedGroup_Category"] = "PE/PPEs"

# 2
H37Rv_GenomeAnno_Genes_69_CoscollaGenes_NoMGE_DF["ExcludedGroup_Category"] = "Coscolla Repetitive Genes"

# 3
H37Rv_GenomeAnno_Genes_MGEs_DF["ExcludedGroup_Category"] = "InsertionSeqs_And_Phages"



ExcludedGenes_384_DF = pd.concat( [H37Rv_GenomeAnno_Genes_PEPPEs_DF, H37Rv_GenomeAnno_Genes_69_CoscollaGenes_NoMGE_DF,
                                   H37Rv_GenomeAnno_Genes_MGEs_DF])


ListOf_ExcludedGenes_384 = ExcludedGenes_384_DF["H37rv_GeneID"]

IncludedGenes_3789_DF = H37Rv_GenomeAnno_Genes_DF[ ~H37Rv_GenomeAnno_Genes_DF["H37rv_GeneID"].isin(ListOf_ExcludedGenes_384) ]
IncludedGenes_3789_DF["ExcludedGroup_Category"] = "NotExcluded"

H37Rv_GenomeAnno_Genes_DF = pd.concat([IncludedGenes_3789_DF, ExcludedGenes_384_DF])
H37Rv_GenomeAnno_Genes_DF = H37Rv_GenomeAnno_Genes_DF.drop("Coscolla_etal_RepetitiveGene", axis = 1)
H37Rv_GenomeAnno_Genes_DF = H37Rv_GenomeAnno_Genes_DF.sort_values(["Start", "End"])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://

In [112]:
H37Rv_GenomeAnno_Genes_DF.head()

Unnamed: 0,Chrom,Start,End,Strand,H37rv_GeneID,Symbol,Feature,Functional_Category,Is_Pseudogene,Product,PEandPPE_Subfamily,ExcludedGroup_Category
0,NC_000962.3,0,1524,+,Rv0001,dnaA,CDS,information pathways,No,Chromosomal replication initiator protein DnaA,,NotExcluded
1,NC_000962.3,2051,3260,+,Rv0002,dnaN,CDS,information pathways,No,DNA polymerase III (beta chain) DnaN (DNA nucl...,,NotExcluded
2,NC_000962.3,3279,4437,+,Rv0003,recF,CDS,information pathways,No,DNA replication and repair protein RecF (singl...,,NotExcluded
3,NC_000962.3,4433,4997,+,Rv0004,Rv0004,CDS,conserved hypotheticals,No,Conserved hypothetical protein,,NotExcluded
4,NC_000962.3,5239,7267,+,Rv0005,gyrB,CDS,information pathways,No,DNA gyrase (subunit B) GyrB (DNA topoisomerase...,,NotExcluded


In [113]:
H37Rv_GenomeAnno_Genes_DF["Strand"].value_counts()

+    2049
-    2030
Name: Strand, dtype: int64

In [114]:
H37Rv_GenomeAnno_Genes_DF["Is_Pseudogene"].value_counts()

No     4066
Yes      13
Name: Is_Pseudogene, dtype: int64

In [115]:
H37Rv_GenomeAnno_Genes_DF["ExcludedGroup_Category"].value_counts()

NotExcluded                  3695
PE/PPEs                       168
InsertionSeqs_And_Phages      147
Coscolla Repetitive Genes      69
Name: ExcludedGroup_Category, dtype: int64

## Generate a DataFrame of all intergenic regions (and annotate by nearby genes)

In [116]:

PreviousGene_H37Rv_ID = "None"

PreviousGene_EndPos = -1000

PreviousGene_ExcludedGroupCategory = None

IntergenicGene_Counter = 0

listOf_IntergenicRegionTuples = []

for index, row  in H37Rv_GenomeAnno_Genes_DF.iterrows() :
    

    gene_StartPos_0based = (row['Start'])
    gene_EndPos_0based = row['End']
    
    gene_H37Rv_ID = row['H37rv_GeneID']
    gene_ExcludedGroupCategory = row['ExcludedGroup_Category']
    
    
    if IntergenicGene_Counter == 0: 
        IntergenicGene_Counter += 1
        continue
    
    prevGene_ClosestToLeft = H37Rv_GenomeAnno_Genes_DF[(H37Rv_GenomeAnno_Genes_DF["Start"] < gene_StartPos_0based)  ].sort_values(["End"]).tail(1)
    
    PreviousGene_H37Rv_ID = prevGene_ClosestToLeft["H37rv_GeneID"].values[0]
    PreviousGene_EndPos = prevGene_ClosestToLeft["End"].values[0]
    PreviousGene_ExcludedGroupCategory = prevGene_ClosestToLeft["ExcludedGroup_Category"].values[0]


    
    
    IntergenicRegionID = f"IntergenicRegion_{IntergenicGene_Counter}_{PreviousGene_H37Rv_ID}-{gene_H37Rv_ID}"
    
    
    IntergenicRegion_StartPos = PreviousGene_EndPos
    IntergenicRegion_End = gene_StartPos_0based
    
    GeneID_ToTheLeft = PreviousGene_H37Rv_ID
    GeneID_ToTheRight = gene_H37Rv_ID
    
    
    Gene_ToTheLeft_FuncCategory = PreviousGene_ExcludedGroupCategory
    Gene_ToTheRight_FuncCategory = gene_ExcludedGroupCategory

    IntergenicRegion_IsNextTo_PLC_Gene = (Gene_ToTheLeft_FuncCategory != "NotExcluded") or (Gene_ToTheRight_FuncCategory != "NotExcluded")
    
    
    
    Intergenic_Region_Info = ("NC_000962.3", IntergenicRegion_StartPos, IntergenicRegion_End, IntergenicRegionID,
                              IntergenicRegion_IsNextTo_PLC_Gene, GeneID_ToTheLeft, Gene_ToTheLeft_FuncCategory,
                              GeneID_ToTheRight, Gene_ToTheRight_FuncCategory)


    
    # if PreviousGene_EndPos >= gene_StartPos_0based: # ">=" is important, using ">" results in extra intragenic regions with length = 0
    
    NumberOfOverlappingGenes_WithEndOfIntegenicRegion = H37Rv_GenomeAnno_Genes_DF[(H37Rv_GenomeAnno_Genes_DF["Start"] <= gene_StartPos_0based) & (H37Rv_GenomeAnno_Genes_DF["End"] >= gene_StartPos_0based) & (H37Rv_GenomeAnno_Genes_DF["H37rv_GeneID"] != gene_H37Rv_ID)].shape[0] 
    
    #print(NumberOfOverlappingGenes_WithEndOfIntegenicRegion)
    if NumberOfOverlappingGenes_WithEndOfIntegenicRegion > 0:
        continue
    
    if IntergenicGene_Counter != 0: # Skip only the first gene
        #print(Intergenic_Region_Info)
        
        listOf_IntergenicRegionTuples.append(Intergenic_Region_Info)
    

    PreviousGene_H37Rv_ID = gene_H37Rv_ID
    PreviousGene_EndPos = gene_EndPos_0based
    PreviousGene_ExcludedGroupCategory = gene_ExcludedGroupCategory

    
    IntergenicGene_Counter += 1
    
    #if IntergenicGene_Counter > 50: break 
    
    
###### Add final Intergenic region after the last annotated gene of H37Rv ######

Final_EndOfGenomeCoordinate = 4411532
FinalGene_H37Rv_ID = "Rv0001"
FinalGene_ToTheRight_FuncCategory = "NotExcluded"


IntergenicRegion_StartPos = PreviousGene_EndPos
GeneID_ToTheLeft = PreviousGene_H37Rv_ID
Gene_ToTheLeft_FuncCategory = PreviousGene_ExcludedGroupCategory

IntergenicRegion_IsNextTo_PLC_Gene = (Gene_ToTheLeft_FuncCategory != "NotExcluded") or (Gene_ToTheRight_FuncCategory != "NotExcluded")

IntergenicRegionID = f"IntergenicRegion_{IntergenicGene_Counter}_{PreviousGene_H37Rv_ID}-{FinalGene_H37Rv_ID}"


Final_Intergenic_Region_Info = ("NC_000962.3", IntergenicRegion_StartPos, Final_EndOfGenomeCoordinate, IntergenicRegionID,
                                IntergenicRegion_IsNextTo_PLC_Gene, GeneID_ToTheLeft, Gene_ToTheLeft_FuncCategory,
                                FinalGene_H37Rv_ID, FinalGene_ToTheRight_FuncCategory)

listOf_IntergenicRegionTuples.append(Final_Intergenic_Region_Info)

################################################################################


H37Rv_IntergenicRegions_DF = pd.DataFrame(listOf_IntergenicRegionTuples)
H37Rv_IntergenicRegions_DF.columns = ['Chrom', 'Start', 'End', 'H37rv_GeneID', "IntergenicRegion_IsNextTo_PLC_Gene",
                                     'Intergenic_GeneToTheLeft', 'Intergenic_GeneToTheLeft_FuncCategory',
                                     'Intergenic_GeneToTheRight', 'Intergenic_GeneToTheRight_FuncCategory']



In [117]:
H37Rv_IntergenicRegions_DF.head(3)

Unnamed: 0,Chrom,Start,End,H37rv_GeneID,IntergenicRegion_IsNextTo_PLC_Gene,Intergenic_GeneToTheLeft,Intergenic_GeneToTheLeft_FuncCategory,Intergenic_GeneToTheRight,Intergenic_GeneToTheRight_FuncCategory
0,NC_000962.3,1524,2051,IntergenicRegion_1_Rv0001-Rv0002,False,Rv0001,NotExcluded,Rv0002,NotExcluded
1,NC_000962.3,3260,3279,IntergenicRegion_2_Rv0002-Rv0003,False,Rv0002,NotExcluded,Rv0003,NotExcluded
2,NC_000962.3,4997,5239,IntergenicRegion_3_Rv0004-Rv0005,False,Rv0004,NotExcluded,Rv0005,NotExcluded


In [118]:
H37Rv_IntergenicRegions_DF.shape

(3072, 9)

## Combine annotations for Genes & Intergenic regions

In [119]:
H37Rv_AllRegions_DF = pd.concat([H37Rv_GenomeAnno_Genes_DF, H37Rv_IntergenicRegions_DF], axis=0, ignore_index=True)

H37Rv_AllRegions_DF = H37Rv_AllRegions_DF[ ['Chrom',  'Start', 'End', 'Strand', 'H37rv_GeneID', 'Symbol',  'Feature', 'ExcludedGroup_Category', 'PEandPPE_Subfamily', 'Product',  'Functional_Category', 'Is_Pseudogene', 'IntergenicRegion_IsNextTo_PLC_Gene', 'Intergenic_GeneToTheLeft', 'Intergenic_GeneToTheLeft_FuncCategory','Intergenic_GeneToTheRight',  'Intergenic_GeneToTheRight_FuncCategory', ]  ]   


H37Rv_AllRegions_DF = H37Rv_AllRegions_DF.sort_values("Start")
H37Rv_AllRegions_DF = H37Rv_AllRegions_DF.reset_index(drop=True)

H37Rv_AllRegions_DF["Feature"] = H37Rv_AllRegions_DF["Feature"].fillna("Intergenic")

H37Rv_AllRegions_DF["ExcludedGroup_Category"] = H37Rv_AllRegions_DF["ExcludedGroup_Category"].fillna("Intergenic")

H37Rv_AllRegions_DF["Functional_Category"] = H37Rv_AllRegions_DF["Functional_Category"].fillna("Intergenic")


In [120]:
H37Rv_GenomeAnno_Genes_DF.shape

(4079, 12)

In [121]:
H37Rv_IntergenicRegions_DF.shape

(3072, 9)

In [122]:
H37Rv_AllRegions_DF.shape

(7151, 17)

In [123]:
H37Rv_AllRegions_DF["ExcludedGroup_Category"].value_counts()

NotExcluded                  3695
Intergenic                   3072
PE/PPEs                       168
InsertionSeqs_And_Phages      147
Coscolla Repetitive Genes      69
Name: ExcludedGroup_Category, dtype: int64

In [124]:
H37Rv_IntergenicRegions_DF.head(2)

Unnamed: 0,Chrom,Start,End,H37rv_GeneID,IntergenicRegion_IsNextTo_PLC_Gene,Intergenic_GeneToTheLeft,Intergenic_GeneToTheLeft_FuncCategory,Intergenic_GeneToTheRight,Intergenic_GeneToTheRight_FuncCategory
0,NC_000962.3,1524,2051,IntergenicRegion_1_Rv0001-Rv0002,False,Rv0001,NotExcluded,Rv0002,NotExcluded
1,NC_000962.3,3260,3279,IntergenicRegion_2_Rv0002-Rv0003,False,Rv0002,NotExcluded,Rv0003,NotExcluded


In [125]:
H37Rv_AllRegions_DF.head(2)

Unnamed: 0,Chrom,Start,End,Strand,H37rv_GeneID,Symbol,Feature,ExcludedGroup_Category,PEandPPE_Subfamily,Product,Functional_Category,Is_Pseudogene,IntergenicRegion_IsNextTo_PLC_Gene,Intergenic_GeneToTheLeft,Intergenic_GeneToTheLeft_FuncCategory,Intergenic_GeneToTheRight,Intergenic_GeneToTheRight_FuncCategory
0,NC_000962.3,0,1524,+,Rv0001,dnaA,CDS,NotExcluded,,Chromosomal replication initiator protein DnaA,information pathways,No,,,,,
1,NC_000962.3,1524,2051,,IntergenicRegion_1_Rv0001-Rv0002,,Intergenic,Intergenic,,,Intergenic,,False,Rv0001,NotExcluded,Rv0002,NotExcluded


In [126]:
H37Rv_AllRegions_DF[H37Rv_AllRegions_DF["ExcludedGroup_Category"] == np.nan]

Unnamed: 0,Chrom,Start,End,Strand,H37rv_GeneID,Symbol,Feature,ExcludedGroup_Category,PEandPPE_Subfamily,Product,Functional_Category,Is_Pseudogene,IntergenicRegion_IsNextTo_PLC_Gene,Intergenic_GeneToTheLeft,Intergenic_GeneToTheLeft_FuncCategory,Intergenic_GeneToTheRight,Intergenic_GeneToTheRight_FuncCategory


# Files to output:

1) Output a TSV which has the annotations for ALL GENES of H37Rv 

2) Output a TSV which has the annotations for ALL INTERGENIC REGIONS of H37Rv 

3) Output a TSV which has the annotations for ALL REGIONS (GENE & INTERGENIC) of H37Rv 


In [127]:
RepoRef_Dir = "../../References"

In [128]:
!ls -lah $RepoRef_Dir

total 1.1M
drwxrwsr-x 11 mm774 farhat  898 Oct 29 12:02 .
drwxrwsr-x 10 mm774 farhat  228 Jun 28  2019 ..
-rw-rw-r--  1 mm774 farhat 5.9K Jul  1  2019 190619_PEPPEsubfamily_Anno.tsv
-rw-rw-r--  1 mm774 farhat  139 Jul  1  2019 190701_PPE_MPTR_SubfamilyList_BasedOn_Pittius_etal2006.txt
drwxrwsr-x  2 mm774 farhat  834 Oct 28 21:49 190927_H37rv_GeneAnnotationsAndLists
-rw-rw-r--  1 mm774 farhat  25K Sep 29  2019 190927_LouisReviewPreview_PE-PPEs_Annotations_MGMmodified.xlsx
-rw-rw-r--  1 mm774 farhat  22K Sep 29  2019 190927_LouisReviewPreview_PE-PPEs_Annotations.xlsx
-rw-rw-r--  1 mm774 farhat  20K Jun 24  2020 200624_CoscollaExcludedGenes.bed
drwxrwsr-x  2 mm774 farhat  160 Sep  4 09:52 200901_H37rv_AnnotatedGenes_And_IntergenicRegions
drwxrwsr-x  2 mm774 farhat  371 Feb  2 15:37 201027_H37rv_AnnotatedGenes_And_IntergenicRegions
-rw-rw-r--  1 mm774 farhat 2.9K Jun 10  2019 Coscolla_etal_2015_Table_S9_GenesExcludedFromTBanalysis.tsv
drwxr-sr-x  2 mm774 farhat  115 May 19  2020 Hicketal20

In [129]:
RepoRef_Dir = "../../References"

AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir = f"{RepoRef_Dir}/201027_H37rv_AnnotatedGenes_And_IntergenicRegions"
!mkdir $AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir

H37Rv_GenomeAnnotations_Genes_TSV = f"{AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir}/H37Rv_GenomeAnnotations.Genes.tsv"
H37Rv_GenomeAnnotations_IntergenicRegions_TSV = f"{AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir}/H37Rv_GenomeAnnotations.IntergenicRegions.tsv"
H37Rv_GenomeAnnotations_Genes_And_IntergenicRegions_TSV = f"{AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir}/H37Rv_GenomeAnnotations.Genes.And.IntergenicRegions.tsv"
H37Rv_GenomeAnnotations_Genes_And_IntergenicRegions_BED = f"{AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir}/H37Rv_GenomeAnnotations.Genes.And.IntergenicRegions.bed"


mkdir: cannot create directory ‘../../References/201027_H37rv_AnnotatedGenes_And_IntergenicRegions’: File exists


## 1) Output H37Rv Gene Annotations TSV

In [130]:
H37Rv_GenomeAnno_Genes_DF.to_csv(H37Rv_GenomeAnnotations_Genes_TSV,
                                 sep = "\t", index = False, header = True)

## 2) Output H37Rv_Integenic_Regions_TSV

In [131]:
H37Rv_IntergenicRegions_DF.to_csv(H37Rv_GenomeAnnotations_IntergenicRegions_TSV,
                                 sep = "\t", index = False, header = True)

## 3) Output H37Rv_GeneAndIntegenic_Regions_TSV

In [132]:
H37Rv_AllRegions_DF.to_csv(H37Rv_GenomeAnnotations_Genes_And_IntergenicRegions_TSV,
                                 sep = "\t", index = False, header = True)

In [133]:
H37Rv_AllRegions_DF.columns

Index(['Chrom', 'Start', 'End', 'Strand', 'H37rv_GeneID', 'Symbol', 'Feature', 'ExcludedGroup_Category', 'PEandPPE_Subfamily', 'Product', 'Functional_Category', 'Is_Pseudogene', 'IntergenicRegion_IsNextTo_PLC_Gene', 'Intergenic_GeneToTheLeft', 'Intergenic_GeneToTheLeft_FuncCategory', 'Intergenic_GeneToTheRight', 'Intergenic_GeneToTheRight_FuncCategory'], dtype='object')

In [134]:
### To BED format too

In [135]:
columnsToKeep_ForBED = ['Chrom', 'Start', 'End', 'Strand', 'H37rv_GeneID', 'Symbol', 'ExcludedGroup_Category', 'PEandPPE_Subfamily', 'Functional_Category']

H37Rv_AllRegions_DF_BED = H37Rv_AllRegions_DF[columnsToKeep_ForBED]
H37Rv_AllRegions_DF_BED.head()

Unnamed: 0,Chrom,Start,End,Strand,H37rv_GeneID,Symbol,ExcludedGroup_Category,PEandPPE_Subfamily,Functional_Category
0,NC_000962.3,0,1524,+,Rv0001,dnaA,NotExcluded,,information pathways
1,NC_000962.3,1524,2051,,IntergenicRegion_1_Rv0001-Rv0002,,Intergenic,,Intergenic
2,NC_000962.3,2051,3260,+,Rv0002,dnaN,NotExcluded,,information pathways
3,NC_000962.3,3260,3279,,IntergenicRegion_2_Rv0002-Rv0003,,Intergenic,,Intergenic
4,NC_000962.3,3279,4437,+,Rv0003,recF,NotExcluded,,information pathways


In [136]:

H37Rv_AllRegions_DF_BED.to_csv(H37Rv_GenomeAnnotations_Genes_And_IntergenicRegions_BED,
                                 sep = "\t", index = False, header = False)

In [137]:
H37Rv_AllRegions_DF_BED.columns

Index(['Chrom', 'Start', 'End', 'Strand', 'H37rv_GeneID', 'Symbol', 'ExcludedGroup_Category', 'PEandPPE_Subfamily', 'Functional_Category'], dtype='object')

In [138]:
## Look at output directory

In [139]:
!ls -lah $AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir

total 4.4M
drwxrwsr-x  2 mm774 farhat  371 Feb  2 15:37 .
drwxrwsr-x 11 mm774 farhat  898 Oct 29 12:02 ..
-rw-rw-r--  1 mm774 farhat 348K Feb  2 22:26 201027_Mtb_H37rv_AllGenes_Info.bed
-rw-rw-r--  1 mm774 farhat  83K Feb  2 22:26 201027_Mtb_H37rv_AllIntergenicRegions.bed
-rw-rw-r--  1 mm774 farhat 623K Feb  2 22:28 H37Rv_GenomeAnnotations.Genes.And.IntergenicRegions.bed
-rw-rw-r--  1 mm774 farhat 1.1M Feb  2 22:28 H37Rv_GenomeAnnotations.Genes.And.IntergenicRegions.tsv
-rw-rw-r--  1 mm774 farhat 569K Feb  2 22:28 H37Rv_GenomeAnnotations.Genes.tsv
-rw-rw-r--  1 mm774 farhat 331K Feb  2 22:28 H37Rv_GenomeAnnotations.IntergenicRegions.tsv


In [140]:
!wc -l $AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir/*

   4079 ../../References/201027_H37rv_AnnotatedGenes_And_IntergenicRegions/201027_Mtb_H37rv_AllGenes_Info.bed
   3072 ../../References/201027_H37rv_AnnotatedGenes_And_IntergenicRegions/201027_Mtb_H37rv_AllIntergenicRegions.bed
   7151 ../../References/201027_H37rv_AnnotatedGenes_And_IntergenicRegions/H37Rv_GenomeAnnotations.Genes.And.IntergenicRegions.bed
   7152 ../../References/201027_H37rv_AnnotatedGenes_And_IntergenicRegions/H37Rv_GenomeAnnotations.Genes.And.IntergenicRegions.tsv
   4080 ../../References/201027_H37rv_AnnotatedGenes_And_IntergenicRegions/H37Rv_GenomeAnnotations.Genes.tsv
   3073 ../../References/201027_H37rv_AnnotatedGenes_And_IntergenicRegions/H37Rv_GenomeAnnotations.IntergenicRegions.tsv
  28607 total


## Let's inspect the dataframes which were just exported to TSV

### Genes

In [141]:
H37Rv_GenomeAnno_Genes_DF.shape

(4079, 12)

In [142]:
H37Rv_GenomeAnno_Genes_DF.head(2)

Unnamed: 0,Chrom,Start,End,Strand,H37rv_GeneID,Symbol,Feature,Functional_Category,Is_Pseudogene,Product,PEandPPE_Subfamily,ExcludedGroup_Category
0,NC_000962.3,0,1524,+,Rv0001,dnaA,CDS,information pathways,No,Chromosomal replication initiator protein DnaA,,NotExcluded
1,NC_000962.3,2051,3260,+,Rv0002,dnaN,CDS,information pathways,No,DNA polymerase III (beta chain) DnaN (DNA nucl...,,NotExcluded


### Intergenic Regions

In [143]:
H37Rv_IntergenicRegions_DF.shape

(3072, 9)

In [144]:
H37Rv_IntergenicRegions_DF.head(2)

Unnamed: 0,Chrom,Start,End,H37rv_GeneID,IntergenicRegion_IsNextTo_PLC_Gene,Intergenic_GeneToTheLeft,Intergenic_GeneToTheLeft_FuncCategory,Intergenic_GeneToTheRight,Intergenic_GeneToTheRight_FuncCategory
0,NC_000962.3,1524,2051,IntergenicRegion_1_Rv0001-Rv0002,False,Rv0001,NotExcluded,Rv0002,NotExcluded
1,NC_000962.3,3260,3279,IntergenicRegion_2_Rv0002-Rv0003,False,Rv0002,NotExcluded,Rv0003,NotExcluded


### All regions (Genes & Intergenic Regions)

In [145]:
H37Rv_AllRegions_DF.shape

(7151, 17)

In [146]:
H37Rv_AllRegions_DF.head(2)

Unnamed: 0,Chrom,Start,End,Strand,H37rv_GeneID,Symbol,Feature,ExcludedGroup_Category,PEandPPE_Subfamily,Product,Functional_Category,Is_Pseudogene,IntergenicRegion_IsNextTo_PLC_Gene,Intergenic_GeneToTheLeft,Intergenic_GeneToTheLeft_FuncCategory,Intergenic_GeneToTheRight,Intergenic_GeneToTheRight_FuncCategory
0,NC_000962.3,0,1524,+,Rv0001,dnaA,CDS,NotExcluded,,Chromosomal replication initiator protein DnaA,information pathways,No,,,,,
1,NC_000962.3,1524,2051,,IntergenicRegion_1_Rv0001-Rv0002,,Intergenic,Intergenic,,,Intergenic,,False,Rv0001,NotExcluded,Rv0002,NotExcluded


In [147]:
H37Rv_AllRegions_DF.head()

Unnamed: 0,Chrom,Start,End,Strand,H37rv_GeneID,Symbol,Feature,ExcludedGroup_Category,PEandPPE_Subfamily,Product,Functional_Category,Is_Pseudogene,IntergenicRegion_IsNextTo_PLC_Gene,Intergenic_GeneToTheLeft,Intergenic_GeneToTheLeft_FuncCategory,Intergenic_GeneToTheRight,Intergenic_GeneToTheRight_FuncCategory
0,NC_000962.3,0,1524,+,Rv0001,dnaA,CDS,NotExcluded,,Chromosomal replication initiator protein DnaA,information pathways,No,,,,,
1,NC_000962.3,1524,2051,,IntergenicRegion_1_Rv0001-Rv0002,,Intergenic,Intergenic,,,Intergenic,,False,Rv0001,NotExcluded,Rv0002,NotExcluded
2,NC_000962.3,2051,3260,+,Rv0002,dnaN,CDS,NotExcluded,,DNA polymerase III (beta chain) DnaN (DNA nucl...,information pathways,No,,,,,
3,NC_000962.3,3260,3279,,IntergenicRegion_2_Rv0002-Rv0003,,Intergenic,Intergenic,,,Intergenic,,False,Rv0002,NotExcluded,Rv0003,NotExcluded
4,NC_000962.3,3279,4437,+,Rv0003,recF,CDS,NotExcluded,,DNA replication and repair protein RecF (singl...,information pathways,No,,,,,


# Output H37rv gene and integenic region info in BED format

In [148]:
H37Rv_GenomeAnno_Genes_DF.head()

Unnamed: 0,Chrom,Start,End,Strand,H37rv_GeneID,Symbol,Feature,Functional_Category,Is_Pseudogene,Product,PEandPPE_Subfamily,ExcludedGroup_Category
0,NC_000962.3,0,1524,+,Rv0001,dnaA,CDS,information pathways,No,Chromosomal replication initiator protein DnaA,,NotExcluded
1,NC_000962.3,2051,3260,+,Rv0002,dnaN,CDS,information pathways,No,DNA polymerase III (beta chain) DnaN (DNA nucl...,,NotExcluded
2,NC_000962.3,3279,4437,+,Rv0003,recF,CDS,information pathways,No,DNA replication and repair protein RecF (singl...,,NotExcluded
3,NC_000962.3,4433,4997,+,Rv0004,Rv0004,CDS,conserved hypotheticals,No,Conserved hypothetical protein,,NotExcluded
4,NC_000962.3,5239,7267,+,Rv0005,gyrB,CDS,information pathways,No,DNA gyrase (subunit B) GyrB (DNA topoisomerase...,,NotExcluded


In [149]:
H37rv_AllGenesAnnotated_BED_DF = H37Rv_GenomeAnno_Genes_DF[["Start", "End", "H37rv_GeneID", "Symbol", "Functional_Category", "PEandPPE_Subfamily", "ExcludedGroup_Category"]]
H37rv_AllGenesAnnotated_BED_DF["Chrom"] = "NC_000962.3"
H37rv_AllGenesAnnotated_BED_DF = H37rv_AllGenesAnnotated_BED_DF[["Chrom", "Start", "End", "H37rv_GeneID", "Symbol", "Functional_Category", "PEandPPE_Subfamily", "ExcludedGroup_Category"]]     
H37rv_AllGenesAnnotated_BED_DF = H37rv_AllGenesAnnotated_BED_DF.sort_values(["Chrom", "Start", "End"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [150]:
H37rv_AllGenesAnnotated_BED_DF.shape

(4079, 8)

In [151]:
H37rv_AllGenesAnnotated_BED_DF.head()

Unnamed: 0,Chrom,Start,End,H37rv_GeneID,Symbol,Functional_Category,PEandPPE_Subfamily,ExcludedGroup_Category
0,NC_000962.3,0,1524,Rv0001,dnaA,information pathways,,NotExcluded
1,NC_000962.3,2051,3260,Rv0002,dnaN,information pathways,,NotExcluded
2,NC_000962.3,3279,4437,Rv0003,recF,information pathways,,NotExcluded
3,NC_000962.3,4433,4997,Rv0004,Rv0004,conserved hypotheticals,,NotExcluded
4,NC_000962.3,5239,7267,Rv0005,gyrB,information pathways,,NotExcluded


#### Output H37rv_AllGenesAnnotated_BED_DF to BED format

In [152]:
RepoRef_Dir = "../../References"

In [153]:
!ls -1 $RepoRef_Dir

190619_PEPPEsubfamily_Anno.tsv
190701_PPE_MPTR_SubfamilyList_BasedOn_Pittius_etal2006.txt
190927_H37rv_GeneAnnotationsAndLists
190927_LouisReviewPreview_PE-PPEs_Annotations_MGMmodified.xlsx
190927_LouisReviewPreview_PE-PPEs_Annotations.xlsx
200624_CoscollaExcludedGenes.bed
200901_H37rv_AnnotatedGenes_And_IntergenicRegions
201027_H37rv_AnnotatedGenes_And_IntergenicRegions
Coscolla_etal_2015_Table_S9_GenesExcludedFromTBanalysis.tsv
Hicketal2018_GenomicRegionsRemoved
HMMs_Pfam_PE_and_PPE
MTBC_RD_References
pLowConfideceRegions_CoscollaEtAlScheme_Files
References_README.md
Vargas_etal_SupTables_V2
Varges_etal_SupTables_V1


In [154]:
RepoRef_Dir = "../../References"
AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir = f"{RepoRef_Dir}/201027_H37rv_AnnotatedGenes_And_IntergenicRegions"

!mkdir $AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir

H37rv_AllGenesAnnotated_BED_DF_PATH = f"{AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir}/201027_Mtb_H37rv_AllGenes_Info.bed"


# Output pLC Gene Level bed file
H37rv_AllGenesAnnotated_BED_DF.to_csv(H37rv_AllGenesAnnotated_BED_DF_PATH, sep = "\t", index = False, header = False)


### Output BED file with ranges of intergenic regions which do NOT overlap with any annotated genes

RepoRef_Dir = "../../References"

H37rv_BED_GENOME = "/n/data1/hms/dbmi/farhat/mm774/References/GCF_000195955.2_ASM19595v2_genomic.fasta.bedtools.genome"     

H37rv_AllIntergenicRegions_BED_DF_PATH = f"{AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir}/201027_Mtb_H37rv_AllIntergenicRegions.bed"

!bedtools complement -i $H37rv_AllGenesAnnotated_BED_DF_PATH -g $H37rv_BED_GENOME > $H37rv_AllIntergenicRegions_BED_DF_PATH 

mkdir: cannot create directory ‘../../References/201027_H37rv_AnnotatedGenes_And_IntergenicRegions’: File exists


#### Look at output files

In [155]:
!wc -l $AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir/*

   4079 ../../References/201027_H37rv_AnnotatedGenes_And_IntergenicRegions/201027_Mtb_H37rv_AllGenes_Info.bed
   3072 ../../References/201027_H37rv_AnnotatedGenes_And_IntergenicRegions/201027_Mtb_H37rv_AllIntergenicRegions.bed
   7151 ../../References/201027_H37rv_AnnotatedGenes_And_IntergenicRegions/H37Rv_GenomeAnnotations.Genes.And.IntergenicRegions.bed
   7152 ../../References/201027_H37rv_AnnotatedGenes_And_IntergenicRegions/H37Rv_GenomeAnnotations.Genes.And.IntergenicRegions.tsv
   4080 ../../References/201027_H37rv_AnnotatedGenes_And_IntergenicRegions/H37Rv_GenomeAnnotations.Genes.tsv
   3073 ../../References/201027_H37rv_AnnotatedGenes_And_IntergenicRegions/H37Rv_GenomeAnnotations.IntergenicRegions.tsv
  28607 total


In [156]:
!wc -l $AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir/*IntergenicRegions*

   3072 ../../References/201027_H37rv_AnnotatedGenes_And_IntergenicRegions/201027_Mtb_H37rv_AllIntergenicRegions.bed
   7151 ../../References/201027_H37rv_AnnotatedGenes_And_IntergenicRegions/H37Rv_GenomeAnnotations.Genes.And.IntergenicRegions.bed
   7152 ../../References/201027_H37rv_AnnotatedGenes_And_IntergenicRegions/H37Rv_GenomeAnnotations.Genes.And.IntergenicRegions.tsv
   3073 ../../References/201027_H37rv_AnnotatedGenes_And_IntergenicRegions/H37Rv_GenomeAnnotations.IntergenicRegions.tsv
  20448 total


In [157]:
!ls -lah $AnnotatedGenes_And_IntergenicRegions_RepoRef_Dir

total 3.3M
drwxrwsr-x  2 mm774 farhat  371 Feb  2 15:37 .
drwxrwsr-x 11 mm774 farhat  898 Oct 29 12:02 ..
-rw-rw-r--  1 mm774 farhat 348K Feb  2 22:28 201027_Mtb_H37rv_AllGenes_Info.bed
-rw-rw-r--  1 mm774 farhat  83K Feb  2 22:28 201027_Mtb_H37rv_AllIntergenicRegions.bed
-rw-rw-r--  1 mm774 farhat 623K Feb  2 22:28 H37Rv_GenomeAnnotations.Genes.And.IntergenicRegions.bed
-rw-rw-r--  1 mm774 farhat 1.1M Feb  2 22:28 H37Rv_GenomeAnnotations.Genes.And.IntergenicRegions.tsv
-rw-rw-r--  1 mm774 farhat 569K Feb  2 22:28 H37Rv_GenomeAnnotations.Genes.tsv
-rw-rw-r--  1 mm774 farhat 331K Feb  2 22:28 H37Rv_GenomeAnnotations.IntergenicRegions.tsv


In [158]:
!head -n 2 $H37rv_AllIntergenicRegions_BED_DF_PATH

NC_000962.3	1524	2051
NC_000962.3	3260	3279


In [159]:
H37Rv_IntergenicRegions_DF.head(2)

Unnamed: 0,Chrom,Start,End,H37rv_GeneID,IntergenicRegion_IsNextTo_PLC_Gene,Intergenic_GeneToTheLeft,Intergenic_GeneToTheLeft_FuncCategory,Intergenic_GeneToTheRight,Intergenic_GeneToTheRight_FuncCategory
0,NC_000962.3,1524,2051,IntergenicRegion_1_Rv0001-Rv0002,False,Rv0001,NotExcluded,Rv0002,NotExcluded
1,NC_000962.3,3260,3279,IntergenicRegion_2_Rv0002-Rv0003,False,Rv0002,NotExcluded,Rv0003,NotExcluded


In [160]:
!tail -n 2 $H37rv_AllIntergenicRegions_BED_DF_PATH

NC_000962.3	4408897	4408968
NC_000962.3	4410929	4411532


In [161]:
H37Rv_IntergenicRegions_DF.tail(2)

Unnamed: 0,Chrom,Start,End,H37rv_GeneID,IntergenicRegion_IsNextTo_PLC_Gene,Intergenic_GeneToTheLeft,Intergenic_GeneToTheLeft_FuncCategory,Intergenic_GeneToTheRight,Intergenic_GeneToTheRight_FuncCategory
3070,NC_000962.3,4408897,4408968,IntergenicRegion_3071_Rv3920c-Rv3921c,False,Rv3920c,NotExcluded,Rv3921c,NotExcluded
3071,NC_000962.3,4410789,4411532,IntergenicRegion_3072_Rv3923c-Rv0001,False,Rv3923c,NotExcluded,Rv0001,NotExcluded


In [162]:
!head -n 1385 $H37rv_AllIntergenicRegions_BED_DF_PATH | tail -n 3

NC_000962.3	1989006	1989041
NC_000962.3	1989566	1989832
NC_000962.3	1992577	1993152


In [163]:
H37Rv_IntergenicRegions_DF.head(1385).tail(3)

Unnamed: 0,Chrom,Start,End,H37rv_GeneID,IntergenicRegion_IsNextTo_PLC_Gene,Intergenic_GeneToTheLeft,Intergenic_GeneToTheLeft_FuncCategory,Intergenic_GeneToTheRight,Intergenic_GeneToTheRight_FuncCategory
1382,NC_000962.3,1989006,1989041,IntergenicRegion_1383_Rv1757c-Rv1758,True,Rv1757c,InsertionSeqs_And_Phages,Rv1758,NotExcluded
1383,NC_000962.3,1989566,1989832,IntergenicRegion_1384_Rv1758-Rv1759c,True,Rv1758,NotExcluded,Rv1759c,PE/PPEs
1384,NC_000962.3,1992577,1993152,IntergenicRegion_1385_Rv1759c-Rv1760,True,Rv1759c,PE/PPEs,Rv1760,NotExcluded


# Output BED file of Putative Low Confidence regions (ranges of GENEs excluded in Coscolla et al. 2015 SCHEME)

In [164]:
ExcludedGenes_384_DF = H37Rv_GenomeAnno_Genes_DF[H37Rv_GenomeAnno_Genes_DF["ExcludedGroup_Category"] != "NotExcluded"]    

In [165]:
ExcludedGenes_384_DF.shape

(384, 12)

In [166]:
CoscollaExcludedGenes_BED_DF = ExcludedGenes_384_DF[["Start", "End", "H37rv_GeneID", "Symbol", "ExcludedGroup_Category", "PEandPPE_Subfamily"]]
CoscollaExcludedGenes_BED_DF["Chrom"] = "NC_000962.3"
CoscollaExcludedGenes_BED_DF = CoscollaExcludedGenes_BED_DF[["Chrom", "Start", "End", "H37rv_GeneID", "Symbol", "ExcludedGroup_Category", "PEandPPE_Subfamily"]]
#CoscollaExcludedGenes_BED_DF = CoscollaExcludedGenes_BED_DF.sort_values(["Chrom", "Start", "End"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [167]:
CoscollaExcludedGenes_BED_DF["ExcludedGroup_Category"].value_counts()

PE/PPEs                      168
InsertionSeqs_And_Phages     147
Coscolla Repetitive Genes     69
Name: ExcludedGroup_Category, dtype: int64

#### Output BED files to TSV

In [168]:
RepoRef_Dir = "../../References"
pLC_ExcludedRegionsScheme_RepoRef_Dir = f"{RepoRef_Dir}/pLowConfideceRegions_CoscollaEtAlScheme_Files"

!mkdir $pLC_ExcludedRegionsScheme_RepoRef_Dir

Mtb_H37rv_pLCRegions_Coscolla_BED_PATH = f"{pLC_ExcludedRegionsScheme_RepoRef_Dir}/201027_Mtb_H37rv_pLC_Regions_CoscollaExcludedGenes.bed"

Mtb_H37rv_pLCRegions_Coscolla_Subset_PEPPEs_BED_PATH = f"{pLC_ExcludedRegionsScheme_RepoRef_Dir}/201027_Mtb_H37rv_pLC_Regions_CoscollaExcludedGenes.PEPPEs.bed"

Mtb_H37rv_pLCRegions_Coscolla_Subset_MGEs_BED_PATH = f"{pLC_ExcludedRegionsScheme_RepoRef_Dir}/201027_Mtb_H37rv_pLC_Regions_CoscollaExcludedGenes.MGEs.bed"

Mtb_H37rv_pLCRegions_Coscolla_Subset_RepetitiveGenes_BED_PATH = f"{pLC_ExcludedRegionsScheme_RepoRef_Dir}/201027_Mtb_H37rv_pLC_Regions_CoscollaExcludedGenes.RepetitiveGenes.bed"



Mtb_H37rv_pLCRegions_Only_PE_PGRS_And_PPE_MPTR_BED_PATH = f"{pLC_ExcludedRegionsScheme_RepoRef_Dir}/201027_Mtb_H37rv_pLC_Regions_Subset_For_PE_PGRS_And_PPE_MPTR_Only_85_genes.bed"

Mtb_H37rv_pLCRegions_Coscolla_BED_MERGED_PATH = f"{pLC_ExcludedRegionsScheme_RepoRef_Dir}/201027_Mtb_H37rv_pLC_Regions_CoscollaExcludedRegion.Merged.bed"

# Output pLC Gene Level bed file
CoscollaExcludedGenes_BED_DF.to_csv(Mtb_H37rv_pLCRegions_Coscolla_BED_PATH, sep = "\t", index = False, header = False)

## A) 
!grep -e 'PE/PPEs' $Mtb_H37rv_pLCRegions_Coscolla_BED_PATH > $Mtb_H37rv_pLCRegions_Coscolla_Subset_PEPPEs_BED_PATH

## B) 
!grep -e 'InsertionSeqs_And_Phages' $Mtb_H37rv_pLCRegions_Coscolla_BED_PATH > $Mtb_H37rv_pLCRegions_Coscolla_Subset_MGEs_BED_PATH

## C) 
!grep -e 'Coscolla Repetitive Genes' $Mtb_H37rv_pLCRegions_Coscolla_BED_PATH > $Mtb_H37rv_pLCRegions_Coscolla_Subset_RepetitiveGenes_BED_PATH





# Output pLC Region Level bed file
!bedtools merge -i $Mtb_H37rv_pLCRegions_Coscolla_BED_PATH > $Mtb_H37rv_pLCRegions_Coscolla_BED_MERGED_PATH



### Output BED file with ranges of GENEs NOT excluded by the Coscolla SCHEME

RepoRef_Dir = "../../References"

H37rv_BED_GENOME = "/n/data1/hms/dbmi/farhat/mm774/References/GCF_000195955.2_ASM19595v2_genomic.fasta.bedtools.genome"     

Mtb_H37rv_HighConfidenceRegions_NONCoscollaRegions_BED_PATH = f"{pLC_ExcludedRegionsScheme_RepoRef_Dir}/201027_Mtb_H37rv_HighConfidence_Regions_NONpLC_Regions.bed"

!bedtools complement -i $Mtb_H37rv_pLCRegions_Coscolla_BED_PATH -g $H37rv_BED_GENOME > $Mtb_H37rv_HighConfidenceRegions_NONCoscollaRegions_BED_PATH 

mkdir: cannot create directory ‘../../References/pLowConfideceRegions_CoscollaEtAlScheme_Files’: File exists


In [169]:
!grep -e 'PGRS' -e 'MPTR' $Mtb_H37rv_pLCRegions_Coscolla_BED_PATH > $Mtb_H37rv_pLCRegions_Only_PE_PGRS_And_PPE_MPTR_BED_PATH

In [170]:
!ls -1 $pLC_ExcludedRegionsScheme_RepoRef_Dir

200630_Mtb_H37rv_HighConfidence_Regions_NONpLC_Regions.bed
200630_Mtb_H37rv_pLC_Regions_CoscollaExcludedGenes.bed
200630_Mtb_H37rv_pLC_Regions_CoscollaExcludedRegion.Merged.bed
200819_Mtb_H37rv_HighConfidence_Regions_NONpLC_Regions.bed
200819_Mtb_H37rv_pLC_Regions_CoscollaExcludedGenes.bed
200819_Mtb_H37rv_pLC_Regions_CoscollaExcludedRegion.Merged.bed
200819_Mtb_H37rv_pLC_Regions_Subset_For_PE_PGRS_And_PPE_MPTR_Only_85_genes.bed
201027_Mtb_H37rv_HighConfidence_Regions_NONpLC_Regions.bed
201027_Mtb_H37rv_pLC_Regions_CoscollaExcludedGenes.bed
201027_Mtb_H37rv_pLC_Regions_CoscollaExcludedGenes.MGEs.bed
201027_Mtb_H37rv_pLC_Regions_CoscollaExcludedGenes.PEPPEs.bed
201027_Mtb_H37rv_pLC_Regions_CoscollaExcludedGenes.RepetitiveGenes.bed
201027_Mtb_H37rv_pLC_Regions_CoscollaExcludedRegion.Merged.bed
201027_Mtb_H37rv_pLC_Regions_Subset_For_PE_PGRS_And_PPE_MPTR_Only_85_genes.bed


In [171]:
!wc -l $pLC_ExcludedRegionsScheme_RepoRef_Dir/*

   331 ../../References/pLowConfideceRegions_CoscollaEtAlScheme_Files/200630_Mtb_H37rv_HighConfidence_Regions_NONpLC_Regions.bed
   384 ../../References/pLowConfideceRegions_CoscollaEtAlScheme_Files/200630_Mtb_H37rv_pLC_Regions_CoscollaExcludedGenes.bed
   330 ../../References/pLowConfideceRegions_CoscollaEtAlScheme_Files/200630_Mtb_H37rv_pLC_Regions_CoscollaExcludedRegion.Merged.bed
   331 ../../References/pLowConfideceRegions_CoscollaEtAlScheme_Files/200819_Mtb_H37rv_HighConfidence_Regions_NONpLC_Regions.bed
   384 ../../References/pLowConfideceRegions_CoscollaEtAlScheme_Files/200819_Mtb_H37rv_pLC_Regions_CoscollaExcludedGenes.bed
   330 ../../References/pLowConfideceRegions_CoscollaEtAlScheme_Files/200819_Mtb_H37rv_pLC_Regions_CoscollaExcludedRegion.Merged.bed
    85 ../../References/pLowConfideceRegions_CoscollaEtAlScheme_Files/200819_Mtb_H37rv_pLC_Regions_Subset_For_PE_PGRS_And_PPE_MPTR_Only_85_genes.bed
   331 ../../References/pLowConfideceRegions_CoscollaEtAlScheme_Files/201027_

#### Let's check that the union of the two feature sets (and make sure it makes up all of H37rv)

In [172]:
# Let's check that the union of the two feature sets (and make sure it makes up all of H37rv)
!cat $Mtb_H37rv_pLCRegions_Coscolla_BED_PATH $Mtb_H37rv_HighConfidenceRegions_NONCoscollaRegions_BED_PATH | cut -f 1,2,3 | sort -k 1,1 -k2,2n | bedtools merge     

NC_000962.3	0	4411532


In [173]:
!head $H37rv_BED_GENOME

NC_000962.3	4411532


In [174]:
CoscollaExcludedGenes_BED_DF["ExcludedGroup_Category"].value_counts()

PE/PPEs                      168
InsertionSeqs_And_Phages     147
Coscolla Repetitive Genes     69
Name: ExcludedGroup_Category, dtype: int64

In [178]:
!wc -l $Mtb_H37rv_pLCRegions_Coscolla_Subset_PEPPEs_BED_PATH

168 ../../References/pLowConfideceRegions_CoscollaEtAlScheme_Files/201027_Mtb_H37rv_pLC_Regions_CoscollaExcludedGenes.PEPPEs.bed


In [179]:
!wc -l $Mtb_H37rv_pLCRegions_Coscolla_Subset_MGEs_BED_PATH

147 ../../References/pLowConfideceRegions_CoscollaEtAlScheme_Files/201027_Mtb_H37rv_pLC_Regions_CoscollaExcludedGenes.MGEs.bed


In [180]:
!wc -l $Mtb_H37rv_pLCRegions_Coscolla_Subset_RepetitiveGenes_BED_PATH

69 ../../References/pLowConfideceRegions_CoscollaEtAlScheme_Files/201027_Mtb_H37rv_pLC_Regions_CoscollaExcludedGenes.RepetitiveGenes.bed


In [86]:
HighConfidenceRegions_BED_DF = pd.read_csv(Mtb_H37rv_HighConfidenceRegions_NONCoscollaRegions_BED_PATH, sep = "\t", header = None)
HighConfidenceRegions_BED_DF.columns = ["Chrom", "Start", "End"]
HighConfidenceRegions_BED_DF["Length"] = HighConfidenceRegions_BED_DF["End"] - HighConfidenceRegions_BED_DF["Start"]
HighConfidenceRegions_BED_DF.shape

(331, 4)

In [87]:
LowConfidenceRegions_BED_DF = pd.read_csv(Mtb_H37rv_pLCRegions_Coscolla_BED_MERGED_PATH, sep = "\t", header = None)
LowConfidenceRegions_BED_DF.columns = ["Chrom", "Start", "End"]
LowConfidenceRegions_BED_DF["Length"] = LowConfidenceRegions_BED_DF["End"] - LowConfidenceRegions_BED_DF["Start"]
LowConfidenceRegions_BED_DF.shape

(330, 4)

In [88]:
LowConfidenceRegions_BED_DF["Length"].sum()

469501

In [89]:
HighConfidenceRegions_BED_DF["Length"].sum()

3942031

In [90]:
HighConfidenceRegions_BED_DF["Length"].sum() + LowConfidenceRegions_BED_DF["Length"].sum()

4411532

### What % of the genome is excluded in the Coscolla gene filtering scheme?

Answer: 10.6% of the H37rv reference genome is systematically excluded from a majority of analyses

In [91]:
LowConfidenceRegions_BED_DF["Length"].sum()

469501

In [92]:
LowConfidenceRegions_BED_DF["Length"].sum() / 4411532

0.10642584027498837

In [93]:
!head $Mtb_H37rv_pLCRegions_Coscolla_BED_PATH

NC_000962.3	33581	33794	Rv0031	Rv0031	InsertionSeqs_And_Phages	None
NC_000962.3	103709	104663	Rv0094c	Rv0094c	InsertionSeqs_And_Phages	None
NC_000962.3	104804	105215	Rv0095c	Rv0095c	InsertionSeqs_And_Phages	None
NC_000962.3	105323	106715	Rv0096	PPE1	PE/PPEs	PPE_SL-2_PPE-PPW
NC_000962.3	131381	132872	Rv0109	PE_PGRS1	PE/PPEs	PE_V_PGRS
NC_000962.3	149532	150996	Rv0124	PE_PGRS2	PE/PPEs	PE_V_PGRS
NC_000962.3	177542	179309	Rv0151c	PE1	PE/PPEs	PE_V_
NC_000962.3	179318	180896	Rv0152c	PE2	PE/PPEs	PE_V_
NC_000962.3	187432	188839	Rv0159c	PE3	PE/PPEs	PE_V_
NC_000962.3	188930	190439	Rv0160c	PE4	PE/PPEs	PE_V_


In [94]:
!grep 'PGRS' $Mtb_H37rv_pLCRegions_Coscolla_BED_PATH | wc -l 

65


In [95]:
!grep 'MPTR' $Mtb_H37rv_pLCRegions_Coscolla_BED_PATH | wc -l 

20


In [96]:
!grep -e 'PGRS' -e 'MPTR' $Mtb_H37rv_pLCRegions_Coscolla_BED_PATH | wc -l 

85


In [97]:
!grep PGRS $Mtb_H37rv_pLCRegions_Coscolla_BED_PATH

NC_000962.3	131381	132872	Rv0109	PE_PGRS1	PE/PPEs	PE_V_PGRS
NC_000962.3	149532	150996	Rv0124	PE_PGRS2	PE/PPEs	PE_V_PGRS
NC_000962.3	333436	336310	Rv0278c	PE_PGRS3	PE/PPEs	PE_V_PGRS
NC_000962.3	336559	339073	Rv0279c	PE_PGRS4	PE/PPEs	PE_V_PGRS
NC_000962.3	361333	363109	Rv0297	PE_PGRS5	PE/PPEs	PE_V_PGRS
NC_000962.3	622792	624577	Rv0532	PE_PGRS6	PE/PPEs	PE_V_PGRS
NC_000962.3	671995	675916	Rv0578c	PE_PGRS7	PE/PPEs	PE_V_PGRS
NC_000962.3	832980	833508	Rv0742	PE_PGRS8	PE/PPEs	PE_V_PGRS
NC_000962.3	835700	838052	Rv0746	PE_PGRS9	PE/PPEs	PE_V_PGRS
NC_000962.3	838450	840856	Rv0747	PE_PGRS10	PE/PPEs	PE_V_PGRS
NC_000962.3	846158	847913	Rv0754	PE_PGRS11	PE/PPEs	PE_V_PGRS
NC_000962.3	924950	925364	Rv0832	PE_PGRS12	PE/PPEs	PE_V_PGRS
NC_000962.3	925360	927610	Rv0833	PE_PGRS13	PE/PPEs	PE_V_PGRS
NC_000962.3	927836	930485	Rv0834c	PE_PGRS14	PE/PPEs	PE_V_PGRS
NC_000962.3	968423	970244	Rv0872c	PE_PGRS15	PE/PPEs	PE_V_PGRS
NC_000962.3	1090372	1093144	Rv0977	PE_PGRS16	PE/PPEs	PE_V_PGRS
NC_000962.3	1093360	109435

In [98]:
!ls -1 $pLC_ExcludedRegionsScheme_RepoRef_Dir

200630_Mtb_H37rv_HighConfidence_Regions_NONpLC_Regions.bed
200630_Mtb_H37rv_pLC_Regions_CoscollaExcludedGenes.bed
200630_Mtb_H37rv_pLC_Regions_CoscollaExcludedRegion.Merged.bed
200819_Mtb_H37rv_HighConfidence_Regions_NONpLC_Regions.bed
200819_Mtb_H37rv_pLC_Regions_CoscollaExcludedGenes.bed
200819_Mtb_H37rv_pLC_Regions_CoscollaExcludedRegion.Merged.bed
200819_Mtb_H37rv_pLC_Regions_Subset_For_PE_PGRS_And_PPE_MPTR_Only_85_genes.bed
201027_Mtb_H37rv_HighConfidence_Regions_NONpLC_Regions.bed
201027_Mtb_H37rv_pLC_Regions_CoscollaExcludedGenes.bed
201027_Mtb_H37rv_pLC_Regions_CoscollaExcludedRegion.Merged.bed
201027_Mtb_H37rv_pLC_Regions_Subset_For_PE_PGRS_And_PPE_MPTR_Only_85_genes.bed


In [99]:
!wc -l $pLC_ExcludedRegionsScheme_RepoRef_Dir/*

   331 ../../References/pLowConfideceRegions_CoscollaEtAlScheme_Files/200630_Mtb_H37rv_HighConfidence_Regions_NONpLC_Regions.bed
   384 ../../References/pLowConfideceRegions_CoscollaEtAlScheme_Files/200630_Mtb_H37rv_pLC_Regions_CoscollaExcludedGenes.bed
   330 ../../References/pLowConfideceRegions_CoscollaEtAlScheme_Files/200630_Mtb_H37rv_pLC_Regions_CoscollaExcludedRegion.Merged.bed
   331 ../../References/pLowConfideceRegions_CoscollaEtAlScheme_Files/200819_Mtb_H37rv_HighConfidence_Regions_NONpLC_Regions.bed
   384 ../../References/pLowConfideceRegions_CoscollaEtAlScheme_Files/200819_Mtb_H37rv_pLC_Regions_CoscollaExcludedGenes.bed
   330 ../../References/pLowConfideceRegions_CoscollaEtAlScheme_Files/200819_Mtb_H37rv_pLC_Regions_CoscollaExcludedRegion.Merged.bed
    85 ../../References/pLowConfideceRegions_CoscollaEtAlScheme_Files/200819_Mtb_H37rv_pLC_Regions_Subset_For_PE_PGRS_And_PPE_MPTR_Only_85_genes.bed
   331 ../../References/pLowConfideceRegions_CoscollaEtAlScheme_Files/201027_