## Native Locations
#### This notebook searches the local genomic context of the experimentally discovered defense systems in the specific strains in which they were discovered (referred to here as the "native location"). Data from this notebook were used to create Figure S4.

### 0. Packages and Files

In [None]:
#Packages used in this notebook
from Bio import SeqIO
import re, math
import random
import pandas as pd
import subprocess
import numpy as np
import glob
import os
import sys
from Bio import SearchIO
import pickle
from multiprocessing import Pool
#Folder that receives the region fasta files, their HMMER results and the summary table
OUT_FOLDER = '/home/cdoering/ChrisSysInContext/NativeLocations/'
#Root folder of the project; the defense domain inputs below live inside it
project_folder = '/home/cdoering/ChrisSysInContext/'
#HMM database of defense domains
DIdb = os.path.join(project_folder, "DefenseDomains.hmm")
#HMM database of all pVOG domains
VOGdb = "/home/cdoering/allVOG.hmm"
#File denoting if a given domain in DIdb is either "positive" (defense-related) or "negative" (housekeeping-related)
DIsignFile = os.path.join(project_folder, 'DISign.txt')

### 1. Local Region Analysis

In [None]:
#Tab-separated table without a header row: one row per system (or system component)
#giving the genome accession and protein ID of its native location
nativeLocColumns = ['System','Accession','ProtID']
sysLocs = pd.read_csv(
    'DefSysNativeLocations.txt',
    sep = '\t',
    names = nativeLocColumns,
)
sysLocs

In [None]:
MultiGeneSys = ['T4_12','T4_28','T4_RT06','T4_RT11','Lambda_36','Lambda_37','Lambda_49','Lambda_51','T7_2','T7_5']
SingleGeneSys = ['T4_11','T4_16','T4_34','T4_38','T4_43','T4_58','Lambda_11','Lambda_64','T7_38','T7_74','T7_43']

#sys2IDs: system name -> list of protein IDs belonging to that system.
#The loop variable is deliberately NOT called `sys`: that name would overwrite the
#`sys` module imported at the top of the notebook.
sys2IDs = dict()
for systemName in MultiGeneSys:
    #Rows of a multi-gene system are recognised by the system name being a substring
    #of the 'System' entry; regex = False makes this a literal (non-regex) match.
    isPartOfSystem = sysLocs['System'].str.contains(systemName, regex = False)
    sys2IDs[systemName] = sysLocs[isPartOfSystem]['ProtID'].tolist()
for systemName in SingleGeneSys:
    #Single-gene systems must match the 'System' entry exactly
    sys2IDs[systemName] = sysLocs[sysLocs['System'] == systemName]['ProtID'].tolist()

#parts2Sys: every 'System' entry -> the multi-gene system it belongs to (first match in
#MultiGeneSys order), or itself when it is not part of any multi-gene system.
parts2Sys = dict()
for part in sysLocs['System'].tolist():
    parts2Sys[part] = next((name for name in MultiGeneSys if name in part), part)
sys2IDs, parts2Sys

In [None]:
#Only the accession and taxonomy ID columns are needed from the assembly summaries.
#The first line of each file is skipped so that the second line is used as the header.
assemblyColumns = ['# assembly_accession','taxid','species_taxid']
genbank = pd.read_csv(
    '/home/cdoering/assembly_summary.txt',
    sep = '\t',
    skiprows = 1,
    usecols = assemblyColumns,
)
refseq = pd.read_csv(
    '/home/cdoering/assembly_summary_refseq.txt',
    sep = '\t',
    skiprows = 1,
    usecols = assemblyColumns,
)

In [None]:
#Double check presence of native location in database
genbankAcc = genbank['# assembly_accession'].tolist()
refseqAcc = refseq['# assembly_accession'].tolist()
#A set gives constant-time membership tests; scanning the full accession lists
#once per system is needlessly slow.
knownAccessions = set(genbankAcc) | set(refseqAcc)
for index, row in sysLocs.iterrows():
    if row['Accession'] in knownAccessions:
        print(row['System']+' present in DB')
    else:
        print(row['System']+' absent')

In [None]:
#Each file is read as a headerless table and collapsed to a plain list.
#read_csv(squeeze = True) was deprecated in pandas 1.4 and removed in 2.0;
#.squeeze('columns') is the documented replacement and turns a one-column table into a Series.
genbankLinkList = pd.read_csv('/home/cdoering/assembly_summary_full_genomes.txt',sep = '\t',header = None).squeeze('columns').tolist()
refseqLinkList = pd.read_csv('/home/cdoering/assembly_summary_refseq_links.txt',sep = '\t',header = None).squeeze('columns').tolist()

In [None]:
#Function to download a protein fasta file from the NCBI ftp page.
def DownloadFasta(genome_id,genbankLinkList,refseqLinkList):
    """Download and unzip the protein fasta file of one genome.

    Inputs:
        genome_id - Accession and strain ID for a given bacterial genome
        genbankLinkList - iterable of Genbank ftp site links (strings)
        refseqLinkList - iterable of Refseq ftp site links (strings)
    Output: None. Every link containing genome_id is tried in turn (all Genbank
    links first, then the Refseq links): wget fetches '*_protein.faa.gz' from the
    link into the genome storage folder, and the first time a matching archive is
    found on disk it is decompressed with gzip and the function returns.
    """
    storageDir = '/mnt/disks/storage/ncbi-genomes-2021-04-29/'
    for linkList in (genbankLinkList, refseqLinkList):
        for link in linkList:
            if genome_id not in link:
                continue
            subprocess.run(['wget','-P',storageDir,link+'/*_protein.faa.gz'])
            #Check whether the download actually produced an archive for this genome
            archives = glob.glob(storageDir+'*'+genome_id+'*_protein.faa.gz')
            if archives:
                subprocess.run(['gzip','-d',archives[0]])
                return
    return

In [None]:
def buildFasta(file,protIDs,sysName):
    """Write a fasta file of the proteins encoded around each requested protein.

    Inputs:
        file - path to a genome's '*_feature_table.txt' file
        protIDs - collection of protein accessions to look for in the feature table
        sysName - system name, used as the prefix of the output file name
    Output: None. For every protein of protIDs found among the CDS features, the
    proteins lying within 10kb on either side on the same contig are written to
    OUT_FOLDER+sysName+'_'+genome_id+'_'+protein+'.faa'. Existing output files are
    left untouched. If the genome has no protein fasta file a download is attempted,
    and the function gives up (with a message) if that fails.
    """
    genome_idwPath = ('_').join(file.split('_')[:-2]) #remove _feature_table.txt from file name
    genome_id = os.path.basename(genome_idwPath) #isolate accession and strain ID for genome
    protFile = genome_idwPath+'_protein.faa' #Makes fasta file name
    if not os.path.isfile(protFile):
        print('No fasta file, attempting download...')
        DownloadFasta(genome_id,genbankLinkList,refseqLinkList)
        if not os.path.isfile(protFile): #Check again for successful download and if not pass and continue
            print('Download failed. Continuing...')
            return
    #Read in feature table and identify any homologs present in this feature table
    FeatTable = pd.read_csv(file,sep = '\t')
    FeatTable = FeatTable[FeatTable['# feature'] == 'CDS']
    prots = FeatTable[(FeatTable['product_accession'].isin(protIDs)) | (FeatTable['non-redundant_refseq'].isin(protIDs))]
    aroundIDs = dict()
    #If homologs are present, grab 10kb on either side of the homologs
    for index, row in prots.iterrows():
        #Features must lie completely inside the window and on the same contig
        inWindow = ((FeatTable['start'] > row['start'] - 10000) &
                    (FeatTable['end'] < row['end'] + 10000) &
                    (FeatTable['genomic_accession'] == row['genomic_accession']))
        #Only filter on chromosome when the feature table provides one; comparing
        #against a missing (nan) chromosome would never match any row
        if str(row['chromosome']) != 'nan':
            inWindow = inWindow & (FeatTable['chromosome'] == row['chromosome'])
        neighbourIDs = [x for x in FeatTable[inWindow]['product_accession'].values.tolist() if str(x) != 'nan']
        if row['product_accession'] in protIDs:
            aroundIDs[row['product_accession']] = neighbourIDs
        elif row['non-redundant_refseq'] in protIDs:
            aroundIDs[row['non-redundant_refseq']] = neighbourIDs
    #Save the proteins of each region into their own fasta file
    for homolog, neighbourIDs in aroundIDs.items():
        OUT = OUT_FOLDER+sysName+'_'+genome_id+'_'+homolog+'.faa'
        if os.path.isfile(OUT):
            #Only skip this region; the remaining homologs still need their files
            continue
        prots2grab = set(neighbourIDs) | {homolog}
        protRecords = [record for record in SeqIO.parse(protFile,'fasta') if record.id in prots2grab]
        SeqIO.write(protRecords,OUT,'fasta')
    return

In [None]:
#Build the local region fasta file for the native location of every system
for index, row in sysLocs.iterrows():
    sysName = row['System']
    acc = row['Accession']
    #Entries marked 'No_ID(pseudo)' have no protein ID to search for. The marker must be
    #compared with the ID string itself: comparing it with the set of IDs is always False.
    if row['ProtID'] == 'No_ID(pseudo)':
        continue
    protIDs = set([row['ProtID']])
    featureTables = glob.glob('/mnt/disks/storage/ncbi-genomes-2021-04-29/*'+acc+'*_feature_table.txt')
    if not featureTables:
        #Report the missing genome instead of stopping the whole loop with an IndexError
        print('No feature table found for '+str(acc)+' ('+str(sysName)+'). Continuing...')
        continue
    file = featureTables[0]
    buildFasta(file,protIDs,sysName)

In [None]:
#Defense and Housekeeping Domain Calculations
def HMMERFiles(faFile):
    """Run hmmscan on a protein fasta file against the defense domain database (DIdb).

    Input: faFile - path to a protein fasta file
    Output: None. The tblout result is written next to faFile with the extension
    replaced by '_hmmer.txt'. Nothing is run when that file already exists.
    """
    tbloutPath = os.path.splitext(faFile)[0]+'_hmmer.txt'
    if not os.path.isfile(tbloutPath):
        print('Starting Hmmscan')
        EVAL = '0.00001' #E-value cutoff passed to hmmscan through -E
        subprocess.run(['hmmscan','-E',EVAL,'--tblout',tbloutPath,DIdb,faFile])
    return
#pVOG Calculations with Lower Evalue
def HMMERFiles_VOG(faFile):
    """Run hmmscan on a protein fasta file against the pVOG database (VOGdb).

    Input: faFile - path to a protein fasta file
    Output: None. The tblout result is written next to faFile with the extension
    replaced by '_VOG_hmmer.txt'. Nothing is run when that file already exists.
    """
    tbloutPath = os.path.splitext(faFile)[0]+'_VOG_hmmer.txt'
    if not os.path.isfile(tbloutPath):
        print('Starting HMMscan')
        EVAL = '0.000000000000001' #E-value cutoff passed to hmmscan through -E
        subprocess.run(['hmmscan','-E',EVAL,'--tblout',tbloutPath,VOGdb,faFile])
    return
#Searches HMMER file to find out if it was a hmmscan or hmmsearch run and return result as a string
def hmmFileType(fileName):
    """Return 'scan' or 'search' depending on the program named in a HMMER output file.

    Input: fileName - path to a HMMER output file
    Output: 'scan' if the '# Program:' line mentions hmmscan, 'search' if it mentions
    hmmsearch. When several such lines exist the last recognised one wins.
    Raises: ValueError if no '# Program:' line names either program.
    """
    with open(fileName,'r') as handle:
        programLines = [text for text in handle if text.startswith('# Program:')]
    detected = None
    for text in programLines:
        #hmmsearch is tested last so it takes precedence if a line names both programs
        for program, label in (('hmmscan','scan'),('hmmsearch','search')):
            if program in text:
                detected = label
    if detected is None:
        raise ValueError('HMMER file type was not found')
    return detected

#Function to extract domains from a given hmmsearch or hmmscan result tblout output
def HMMERhit_lister(filePath,searchType = 'scan'):
    """List the domain hits of every protein in a HMMER tblout file.

    Input:
        filePath - path to a hmmscan or hmmsearch tblout file
        searchType - 'scan' (default) or 'search', the program that produced the file
    Output: a dictionary where every key is a protein accession number and the value
    is a list of all its domain hits, in file order. Proteins without a COG, VOG or
    PFAM hit are absent from the dictionary; a file without any hit gives {}.
    Raises: ValueError if searchType is neither 'scan' nor 'search'.
    """
    #HMMER files were generated using both hmmscan and hmmsearch functions which have slightly different output styles
    #Note: hmmscan runs much faster for our purposes. Hmmsearch was used at first when I did not know this.
    if searchType == 'scan':
        columnIdx, columnNames = [0,1,2], ['Domain','DomainAcc','Query']
    elif searchType == 'search':
        columnIdx, columnNames = [0,2,3], ['Query','Domain','DomainAcc']
    else:
        #Fail with a clear message instead of an UnboundLocalError further down
        raise ValueError("searchType must be 'scan' or 'search', got "+repr(searchType))
    try:
        result = pd.read_csv(filePath,sep = ' ',comment = '#',header = None,skipinitialspace = True,
                             usecols = columnIdx,names = columnNames)
    except pd.errors.EmptyDataError:
        #Nothing but comment lines (or nothing at all) in the file: there are no hits
        return {}
    resultDict = {}
    for domain, domainAcc, query in zip(result['Domain'],result['DomainAcc'],result['Query']):
        #Due to differences in the formatting of the COG and pVOG vs PFAM databases the domain name ...
        #(and not a descriptive name) is stored in a different location (Domain vs DomainAcc for COG/pVOG vs PFAM)
        if domain.startswith('COG') or domain.startswith('VOG'):
            resultDict.setdefault(query,[]).append(domain)
        elif domainAcc.startswith('PF'):
            #Drop the version suffix of the PFAM accession (everything after the first '.')
            resultDict.setdefault(query,[]).append(domainAcc.split('.')[0])
    return resultDict

In [None]:
#Load in positive and negative association of defense island related domains from file
#Each non-empty line holds two whitespace-separated fields: the domain and its sign.
DISign = DIsignFile
posDI = set() #domains marked "positive"
negDI = set() #domains marked "negative"
with open(DISign) as f:
    for line in f:
        fields = line.split()
        if not fields:
            #Blank lines (e.g. at the end of the file) would otherwise break the unpacking below
            continue
        (domain, sign) = fields
        if sign == "negative":
            negDI.add(domain)
        elif sign == "positive":
            posDI.add(domain)

In [None]:
#Search every region fasta file against the defense domain database first and the
#pVOG database second; both functions skip files that already have a result.
faFiles = glob.glob(OUT_FOLDER+'*.faa')
for file in faFiles:
    for runHmmer in (HMMERFiles, HMMERFiles_VOG):
        runHmmer(file)

In [None]:
#Summary table with one row per entry of sysLocs:
#    Num. with Defense - proteins of the region (other than the system itself) with a positive defense domain
#    Num. Phage Associated - proteins of the region (other than the system itself) with a pVOG domain
#    Total Proteins in Region - number of records in the region fasta file
#    Defense Hits / pVOG Hits - (protein ID, [domains]) tuples behind the two counts
regionSummary = pd.DataFrame(sysLocs['System'].tolist(),columns = ['System'])
regionSummary['Num. with Defense'] = 0
regionSummary['Num. Phage Associated'] = 0
regionSummary['Total Proteins in Region'] = 0
regionSummary['Defense Hits'] = [[] for _ in range(len(regionSummary))]
regionSummary['pVOG Hits'] = [[] for _ in range(len(regionSummary))]

OUT = OUT_FOLDER+'NativeLocationsSummaries.txt'
for index, row in regionSummary.iterrows():
    #Components of a multi-gene system share the region of their parent system
    sysName = parts2Sys[row['System']]

    #First defense-domain result for this system (the pVOG results end in '_VOG_hmmer.txt')
    hmmerResults = glob.glob(OUT_FOLDER+sysName+'*_hmmer.txt')
    DIResults = [x for x in hmmerResults if '_VOG_hmmer.txt' not in x][0]

    #Proteins of the system itself are not counted as neighbours
    originalIDs = set(sys2IDs[sysName])

    #Strip the trailing '_hmmer.txt' to recover the shared file stem
    protAcc_wExt = os.path.basename(DIResults)
    protAcc = ('_').join(protAcc_wExt.split('_')[:-1])

    faFile = OUT_FOLDER+protAcc+'.faa'
    diFile = OUT_FOLDER+protAcc+'_hmmer.txt'
    vogFile = OUT_FOLDER+protAcc+'_VOG_hmmer.txt'

    #Read hmmer results into python
    DIsearchType = hmmFileType(diFile)
    DIhitDict = HMMERhit_lister(diFile,DIsearchType)

    VOGsearchType = hmmFileType(vogFile)
    VOGhitDict = HMMERhit_lister(vogFile,VOGsearchType)

    for record in SeqIO.parse(faFile,'fasta'):
        regionSummary.at[index,'Total Proteins in Region'] += 1

        if record.id in originalIDs:
            continue
        defenseDomains = [dom for dom in DIhitDict.get(record.id,[]) if dom in posDI]
        if defenseDomains:
            regionSummary.at[index,'Num. with Defense'] += 1
            regionSummary.at[index,'Defense Hits'].append((record.id,defenseDomains))
        vogDomains = [dom for dom in VOGhitDict.get(record.id,[]) if dom.startswith('VOG')]
        if vogDomains:
            regionSummary.at[index,'Num. Phage Associated'] += 1
            regionSummary.at[index,'pVOG Hits'].append((record.id,vogDomains))

#Write the finished table once instead of rewriting the file after every row
regionSummary.to_csv(OUT,sep = '\t')