# This notebook takes all of the genes from H37Rv and places them into different categories

In [6]:
# Stretch the notebook container to the full browser width.
# `display` / `HTML` are imported from the public `IPython.display` module
# rather than the internal `IPython.core.display` path.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [7]:
import vcf

%matplotlib inline
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.ticker as ticker
from itertools import compress
from pylab import MaxNLocator
import seaborn as sns; sns.set()
from matplotlib.colors import LogNorm
from matplotlib import gridspec
import ast
import itertools
import seaborn as sns
from sklearn.preprocessing import StandardScaler

import fastcluster
from sklearn import cluster, datasets
import scipy.cluster.hierarchy as hier
from sklearn.cluster import KMeans
import time
import sys

import Bio
from Bio.Alphabet import IUPAC
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast import NCBIXML
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import pairwise2
from Bio import SeqIO
from Bio.Graphics import GenomeDiagram
from Bio.SeqUtils import GC

from Bio.Align.Applications import MuscleCommandline
from StringIO import StringIO
from Bio import AlignIO
from Bio.Align import AlignInfo
from Bio.Seq import MutableSeq

import networkx as nx
import scipy

In [8]:
############ Filter Out tRNAs from H37Rv annotation ############

#H37Rv REFERENCE ANNOTATION
#(the delimiter is passed by keyword; recent pandas releases no longer accept it positionally)
H37Rv_annotation = pd.read_csv('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/H37Rv/h37rv_genome_summary.txt', sep = '\t').set_index('name')

#flag annotation rows for t-RNAs and other misc RNAs; a row is flagged when
# - its description contains the stand-alone (space-delimited) word 'tRNA', or
# - its index label contains 'Rvn'
RNAs_filter = [
    ('tRNA' in description.split(' ')) or ('Rvn' in CDS_i)
    for CDS_i , description in zip(H37Rv_annotation.index , H37Rv_annotation['description'])
]

#complement of the RNA mask, used to keep every row that was NOT flagged above
non_RNAs_filter = [not is_RNA for is_RNA in RNAs_filter]

H37Rv_annotation_no_RNA = H37Rv_annotation[non_RNAs_filter]

In [9]:
#Categories from Comas et al. 2010 (use to get essential, family-proteins & mobile genetic elements)
gene_categories_from_comas_2010 = pd.read_csv( '/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/CSV_files/gene_categories/gene_categories_from_comas_2010.csv' , sep = ',' , names = ['gene_category'])

#parse every row once: the row text is split on single spaces, the first token is taken as the
#gene ID and the full set of tokens is kept for the keyword matching done further down
parsed_gene_categories = []
for i in gene_categories_from_comas_2010.index:

    gene_category_list = gene_categories_from_comas_2010.loc[i].values[0].split(' ')
    parsed_gene_categories.append( (gene_category_list[0] , set(gene_category_list)) )

############ Family Proteins ############
#Family Proteins from Comas et al. 2010
#look at PE, PE-PGRS & PPE family proteins
family_protein_keywords = {'PE' , 'PE-PGRS' , 'PPE'}
family_protein_genes = [gene for gene , gene_category_tokens in parsed_gene_categories if family_protein_keywords & gene_category_tokens]

############ Antibiotic Resistance Genes (from Farhat website & Maha & Table with AR associated regions) ############
antibiotic_resistance_genes = ['Rv0005','Rv0006','Rv0341','Rv0342','Rv0343','Rv0667','Rv0668','Rv0682','Rv1207','Rv1484','Rv1630','Rv1694','Rv1908c','Rv2043c','Rv2245','Rv2428','Rv2447c','Rv2754c','Rv2763c','Rv2764c','Rv3423c','Rv3608c','Rv3793','Rv3794','Rv3795','Rv3806c','Rv3854c','Rv3919c']

############ Antigens (Not Including Family Proteins or Antibiotic Resistance Genes) ############
#Antigens from IEDB epitopes
epitope_peptide_seqs_genomic_coords = pd.read_csv('/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/CSV_files/epitope_peptide_sequences/iedb_filtered_epitope_peptides_seqs_and_genomic_coords_05_23_18.csv' ,  sep = ',').set_index('Unnamed: 0')
antigen_genes = list( set(epitope_peptide_seqs_genomic_coords.loc[:, 'H37Rv Homolog']) - set(family_protein_genes) - set(antibiotic_resistance_genes))

############ Essential & Mobile Genetic Elements ############
#tokens that mark a gene as a mobile genetic element
#NOTE(review): 'putative' is a very generic token to treat as a mobile-element marker - confirm it is intended
mobile_genetic_element_keywords = {'transposase' , 'integrase' , 'phage' , 'prophage' , 'IS' , 'IS1533' , 'phiRV1' , 'phiRv1' , 'phiRv2' , 'putative'}

#set copies of the exclusion lists so the membership tests inside the loop are constant-time
antigen_gene_set = set(antigen_genes)
antibiotic_resistance_gene_set = set(antibiotic_resistance_genes)

essential_genes = []
mobile_genetic_element_genes = []

#iterate through gene categories from comas et al. 2010
for gene , gene_category_tokens in parsed_gene_categories:

    is_antibiotic_resistance_gene = gene in antibiotic_resistance_gene_set

    #essential genes exclude anything already claimed as an antigen or an antibiotic resistance gene
    if ('essential' in gene_category_tokens) and (gene not in antigen_gene_set) and (not is_antibiotic_resistance_gene):
        essential_genes.append(gene)

    #the antibiotic-resistance exclusion has to guard the WHOLE keyword test: `and` binds tighter
    #than `or`, so tacking it onto the end of an or-chain would only guard the last keyword
    if (mobile_genetic_element_keywords & gene_category_tokens) and (not is_antibiotic_resistance_gene):
        mobile_genetic_element_genes.append(gene)

Number of Family-Protein genes also flagged as Antigen genes (kept in separate categories)

In [10]:
#count the genes found both among the PE/PPE family proteins and among the IEDB epitope homologs
len( set(family_protein_genes) & set(epitope_peptide_seqs_genomic_coords['H37Rv Homolog']) )

89

In [11]:
############ Construct into DataFrame ############
#categories in priority order: a gene gets the label of the FIRST collection it is found in
#(each gene list is converted to a set so the membership tests are constant-time)
category_priority = [
    ('Mobile Genetic Element' , set(mobile_genetic_element_genes)),
    ('PE/PPE' , set(family_protein_genes)),
    ('Antigen' , set(antigen_genes)),
    ('Antibiotic Resistance' , set(antibiotic_resistance_genes)),
    ('Essential' , set(essential_genes)),
]

gene_ids = list(H37Rv_annotation_no_RNA.index)
gene_category_labels = []

for gene_id in gene_ids:

    for category_label , category_genes in category_priority:
        if gene_id in category_genes:
            gene_category_labels.append(category_label)
            break

    else:
        #gene was not found in any of the collections above
        gene_category_labels.append('Non-Essential')

#build the table in one step (one row per non-RNA annotation entry, same index as the annotation)
gene_categories = pd.DataFrame({'gene_id' : gene_ids , 'Gene_Category' : gene_category_labels} , index = H37Rv_annotation_no_RNA.index , columns = ['gene_id', 'Gene_Category'])

In [12]:
#preview the first five rows of the category table
gene_categories.iloc[:5]

Unnamed: 0_level_0,gene_id,Gene_Category
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Rv0001,Rv0001,Essential
Rv0002,Rv0002,Non-Essential
Rv0003,Rv0003,Non-Essential
Rv0004,Rv0004,Non-Essential
Rv0005,Rv0005,Antibiotic Resistance


In [13]:
#(number of rows, number of columns) of the category table
gene_categories.shape

(3994, 2)

In [14]:
#number of genes labelled 'Essential'
int( (gene_categories['Gene_Category'] == 'Essential').sum() )

682

In [15]:
#number of genes labelled 'Non-Essential'
int( (gene_categories['Gene_Category'] == 'Non-Essential').sum() )

2752

In [16]:
#number of genes labelled 'Antigen'
int( (gene_categories['Gene_Category'] == 'Antigen').sum() )

257

In [17]:
#number of genes labelled 'PE/PPE'
int( (gene_categories['Gene_Category'] == 'PE/PPE').sum() )

167

In [18]:
#number of genes labelled 'Antibiotic Resistance'
int( (gene_categories['Gene_Category'] == 'Antibiotic Resistance').sum() )

28

In [19]:
#number of genes labelled 'Mobile Genetic Element'
int( (gene_categories['Gene_Category'] == 'Mobile Genetic Element').sum() ) #all of these CDS regions will be dropped

108

In [20]:
############ Save as CSV file ############
#write the gene -> category table (index included) to a comma-separated file
gene_categories_csv_path = '/n/data1/hms/dbmi/farhat/Roger/inhost_TB_dynamics_project/CSV_files/gene_categories/gene_categories.csv'
gene_categories.to_csv(gene_categories_csv_path , sep = ',')