## Taxonomic Data for Test and Control Systems
#### This notebook searches for all homologs of the experimentally discovered phage defense systems - and controls - in our genomic database. It then records the TaxID of each genome in which the systems are found and saves this information. Genome accessions for experimentally found systems are pulled from the files generated from the analysis in SysinContext_final.ipynb. Data from this notebook were used to create Figure 3E.

### 0. Needed packages and folders

In [None]:
from Bio import SeqIO
import re, math
import random
import pandas as pd
import subprocess
import numpy as np
import glob
import os
import sys
from Bio import SearchIO
import pickle
from multiprocessing import Pool
#Folder holding the downloaded genome files; the *feature_table.txt files are globbed from here
dbFolder = '/mnt/disks/storage/ncbi-genomes-2021-04-29/'

In [None]:
#Function to make a list of all accession numbers for a given original system name
def fetchHomologList(original_id):
    """Return the subject accessions listed in one BLAST output file.

    The file ``OUT_FOLDER + original_id + "_blastHomologs"`` is read and the
    first whitespace-separated column of every row is taken as the subject ID.
    IDs of the form ``db|accession|`` are reduced to ``accession``; IDs that
    contain no ``|`` are kept unchanged.

    Args:
        original_id: Name of the query protein/system, i.e. the file name
            without the ``_blastHomologs`` suffix.

    Returns:
        list of str: One accession per row, in file order (duplicates kept).
        An empty list if the BLAST output contains no rows.

    Raises:
        IndexError: If no matching ``_blastHomologs`` file exists.
    """
    OGFastaList = glob.glob(OUT_FOLDER+original_id+"_blastHomologs")[0] #get blast output file
    try:
        #sep=r'\s+' replaces the deprecated delim_whitespace=True; selecting column 0
        #replaces squeeze=True, which newer pandas no longer accepts
        hits = pd.read_csv(OGFastaList,sep = r'\s+',usecols=[0],header=None)[0]
    except pd.errors.EmptyDataError:
        #a search without hits leaves an empty file
        return []
    IDs = []
    for hit in hits:
        fields = hit.split("|")
        #drop genbank or refseq demarcation and keep only the accession number
        IDs.append(fields[1] if len(fields) > 1 else hit)
    return IDs

### 1. Build table of all genomic accessions with their TaxID and Species TaxID

In [None]:
#Collect the assembly accession stored in every feature table of the genome database
#and write the accessions, one per line, to ChrisDBAssemblyAccessions.txt
FTs = glob.glob(dbFolder+'*feature_table.txt')
gAccessions = [None]*len(FTs)
for ftNum, FT in enumerate(FTs):
    if ftNum % 1000 == 0:
        print(str(ftNum)+' out of '+str(len(gAccessions)))
    #Only the first value of the 'assembly' column is used, so a single data row is
    #parsed instead of the whole table (also avoids the removed squeeze=True option)
    firstRow = pd.read_csv(FT,sep = '\t',usecols = ['assembly'],nrows = 1)
    gAccessions[ftNum] = firstRow['assembly'].iloc[0]
with open('/home/cdoering/ChrisSysInContext/ChrisDBAssemblyAccessions.txt','w') as file:
    for accession in gAccessions:
        #the trailing space is stripped again when the file is read back
        file.write(accession+' \n')

In [None]:
#Read the saved accession list back in (one accession per line) and strip the padding.
#header=None labels the single column 0; selecting it replaces the removed squeeze=True option
gAccessions = pd.read_csv('/home/cdoering/ChrisSysInContext/ChrisDBAssemblyAccessions.txt',sep = '\t',header = None)[0]
gAccessions = gAccessions.str.strip().tolist()
gAccessions

In [None]:
#Assembly taxid and species taxid information for every genbank and refseq genome and store in dictionary
summaryCols = ['# assembly_accession','taxid','species_taxid']
genbank = pd.read_csv('/home/cdoering/assembly_summary.txt',sep = '\t',skiprows = 1,usecols = summaryCols)
refseq = pd.read_csv('/home/cdoering/assembly_summary_refseq.txt',sep = '\t',skiprows = 1,usecols = summaryCols)

#Map accession -> (taxid, species_taxid). Columns are addressed by name, so the result
#does not depend on column order in the file, and the frames are walked column-wise
#instead of row by row with iterrows
genbankDict = dict(zip(genbank['# assembly_accession'],
                       zip(genbank['taxid'],genbank['species_taxid'])))
refseqDict = dict(zip(refseq['# assembly_accession'],
                      zip(refseq['taxid'],refseq['species_taxid'])))
#refseq entries take precedence if an accession appears in both tables
Acc2Taxid = {**genbankDict, **refseqDict}

In [None]:
#Write to file the taxid for every genome in the dataset
#allDBTaxids.txt
#(the two lines above had lost their comment markers and were not valid Python)
allTaxids = pd.DataFrame(gAccessions,columns = ['Accession'])
#One dictionary lookup per accession; a KeyError is raised for an accession
#that is missing from Acc2Taxid
taxPairs = [Acc2Taxid[accession] for accession in allTaxids['Accession']]
allTaxids['Taxid'] = [pair[0] for pair in taxPairs]
allTaxids['Species Taxid'] = [pair[1] for pair in taxPairs]
allTaxids

In [None]:
#Save the accession/TaxID table as tab-separated text (path is relative to the working directory).
#index_label = False: per the pandas to_csv documentation, no header field is written for the index column
allTaxids.to_csv('Taxonomy/allDBTaxids.txt',sep = '\t',index_label = False)

In [None]:
#Reload the saved accession/TaxID table from disk and display it
#(presumably a checkpoint so the table need not be rebuilt - confirm)
allTaxids = pd.read_csv('Taxonomy/allDBTaxids.txt',sep = '\t')
allTaxids

### 2. TaxIDs for Experimental Systems

In [None]:
#For every summary file in Cov80Summaries, look up the TaxID and species TaxID of each
#listed genome and write them to <summary name minus its last '_' field>_TaxIDs
systems = glob.glob('/home/cdoering/ChrisSysInContext/Cov80Summaries/*')
#the loop variable is no longer called 'sys', which replaced the imported sys module
for summaryPath in systems:
    OUT = '/home/cdoering/ChrisSysInContext/Taxonomy/'+('_').join(os.path.basename(summaryPath).split('_')[:-1])+'_TaxIDs'
    homologs = pd.read_csv(summaryPath,sep = '\t',skiprows = 6,usecols = ['Accession'])
    records = []
    for path in homologs['Accession']:
        #Rebuild '<accession>.<version>' from the file name:
        #text before the first '.', then the text between that '.' and the next '_'
        file = os.path.basename(path)
        Acc = file.split('.')[0]
        version = file.split('.')[1].split('_')[0]
        Acc = Acc + '.' + version
        taxInfo = Acc2Taxid[Acc] #KeyError if the accession is unknown
        records.append((Acc,taxInfo[0],taxInfo[1]))
    outDF = pd.DataFrame(records,columns = ['Accession','Taxid','Species Taxid'])
    outDF.to_csv(OUT,sep = '\t',index_label = False)

### 3. TaxIDs for Control systems

In [None]:
OUT_FOLDER = 'TaxonomyControls/' #Folder to output data to (relative path; also read by fetchHomologList)
IN_FASTA = '20211208_ControlSystems.faa' #Input list of proteins to search for
#Name prefixes of the systems made of several proteins; the per-protein files and folders
#of a system are found by globbing OUT_FOLDER + prefix + '*'
#NOTE(review): 'Zorya_I' has an underscore but 'ZoryaII' does not - confirm both match the FASTA record IDs
MultiGeneSys = ['RM_1','RM_2','RM_3','RM_4','Cas','Zorya_I','ZoryaII','Kiwa','Durantia']

In [None]:
#Take in fasta file with all proteins and split into individual files, create folders to store each proteins results
for record in SeqIO.parse(IN_FASTA,'fasta'):
    SUB = OUT_FOLDER+record.id
    #makedirs also creates OUT_FOLDER itself when it is missing and is a no-op
    #when the sub-folder already exists
    os.makedirs(SUB,exist_ok = True)
    proteinFasta = OUT_FOLDER+record.id+'.faa'
    #never overwrite a per-protein FASTA written by an earlier run
    if not os.path.isfile(proteinFasta):
        SeqIO.write(record,proteinFasta,'fasta')

In [None]:
#BLASTP for Multi-Gene Systems. No coverage requirement
#Run blastp on all proteins against the non-redundant database to get protein accessions
#NOTE(review): the prefix glob below also matches any other .faa whose name merely starts
#with the same text as a system name - confirm the names are unambiguous
EVAL = '0.00001'
num = 1
for system in MultiGeneSys:
    starters = glob.glob(OUT_FOLDER+system+"*.faa")
    for init in starters:
        print('Blasting #'+str(num)+', '+init)
        num += 1
        hitName = os.path.splitext(os.path.basename(init))[0]
        blastOut = OUT_FOLDER+hitName+'_blastHomologs'
        #an existing output file means this query was already searched
        if os.path.isfile(blastOut):
            continue
        command = ['blastp','-query',init,
                   '-db','/mnt/disks/storage/nr/nr',
                   '-evalue',EVAL,
                   '-out',blastOut,
                   '-outfmt','6 sseqid evalue length qlen qstart qend slen sstart send',
                   '-max_target_seqs','10000000',
                   '-num_threads','6',
                   '-taxidlist','bacterial.ids']
        result = subprocess.run(command)
        if result.returncode != 0:
            #A failed search must not leave a partial output behind: the existence
            #check above would treat it as a finished search on the next run
            print('blastp failed for '+init+' (exit code '+str(result.returncode)+')')
            if os.path.isfile(blastOut):
                os.remove(blastOut)

In [None]:
#BLASTP for Single-Gene Systems
#Run blastp on all proteins against the non-redundant database to get protein accessions
#NOTE: this glob matches every .faa in OUT_FOLDER, including the multi-gene component files.
#Those are skipped here only if their output already exists; otherwise they are searched
#with the 80% query-coverage filter below
EVAL = '0.00001'
starters = glob.glob(OUT_FOLDER+"*.faa")
num = 1
for init in starters:
    print('Blasting #'+str(num)+' out of '+str(len(starters)))
    num += 1
    hitName = os.path.splitext(os.path.basename(init))[0]
    blastOut = OUT_FOLDER+hitName+'_blastHomologs'
    #an existing output file means this query was already searched
    if os.path.isfile(blastOut):
        continue
    command = ['blastp','-query',init,
               '-db','/mnt/disks/storage/nr/nr',
               '-evalue',EVAL,
               '-out',blastOut,
               '-outfmt','6 sseqid evalue length qlen qstart qend slen sstart send',
               '-max_target_seqs','10000000',
               '-num_threads','6',
               '-taxidlist','bacterial.ids',
               '-qcov_hsp_perc','80']
    result = subprocess.run(command)
    if result.returncode != 0:
        #A failed search must not leave a partial output behind: the existence
        #check above would treat it as a finished search on the next run
        print('blastp failed for '+init+' (exit code '+str(result.returncode)+')')
        if os.path.isfile(blastOut):
            os.remove(blastOut)

In [None]:
#Gather the subject accessions of every BLAST output file in OUT_FOLDER
homologs = glob.glob(OUT_FOLDER+"*_blastHomologs")
homologs2main = dict() #To map protein accessions onto original names/IDs
protIDs = set() #All homolog accessions
for log in homologs:
    #remove the trailing _blastHomologs part of the globbed name (the folder prefix is kept)
    main = ('_').join(log.split('_')[:-1])
    try:
        #sep=r'\s+' replaces the deprecated delim_whitespace=True; selecting column 0
        #replaces squeeze=True, which newer pandas no longer accepts
        hits = pd.read_csv(log,sep = r'\s+',usecols=[0],header=None)[0] #read in all IDs
    except pd.errors.EmptyDataError:
        continue #a search without hits leaves an empty file
    IDs = []
    for hit in hits:
        fields = hit.split("|")
        #remove genbank or refseq demarcation attached to accession by | mark;
        #an ID without any | is kept unchanged
        IDs.append(fields[1] if len(fields) > 1 else hit)
    #Update in place instead of rebuilding the whole dictionary for every file;
    #as before, a later file overwrites the entry of an accession seen earlier
    homologs2main.update(dict.fromkeys(IDs,main))
    protIDs.update(IDs) #add all IDs into set
protIDs

In [None]:
#Multiprocessing
#Search through feature tables to see if protein homologs are contained within those files
def CheckForHomologs(FT):
    """Find which homolog accessions occur in one feature table.

    Args:
        FT: Path of a tab-separated feature table that has the columns
            'product_accession' and 'non-redundant_refseq'.

    Returns:
        tuple: ``(FT, Accs)`` where ``Accs`` is a list (in no particular order,
        without duplicates) of the values from either column that are members
        of the global ``protIDs``. The list is empty when nothing matches.
    """
    FeatTable = pd.read_csv(FT,sep = '\t',usecols = ['product_accession','non-redundant_refseq'])
    #Progress message through an external printf, as before. The path is now passed as
    #an argument rather than as the format string, so '%' or '\' in a path is printed
    #literally, and each message ends with a newline
    command = ['printf','%s\n','Checking table '+FT]
    subprocess.run(command)
    #A single pass over each column; the former any() pre-check scanned the columns twice
    Accs = set()
    for column in ('product_accession','non-redundant_refseq'):
        Accs.update(ID for ID in FeatTable[column] if ID in protIDs)
    return (FT,list(Accs))
if __name__ == '__main__':
    FTs = glob.glob('/mnt/disks/storage/ncbi-genomes-2021-04-29/*_feature_table.txt')
    #one task per feature table, spread over the worker processes of the pool
    with Pool() as pool:
        haveHomologs = pool.map(CheckForHomologs,FTs)

In [None]:
#Tabulate the (feature table path, matching accessions) pairs returned by the pool
haveHomolog = pd.DataFrame(haveHomologs,columns = ['Files','ID Present'])
#Save as tab-separated text; each accession list is stored as text and parsed again on reload
haveHomolog.to_csv(OUT_FOLDER+'HomologsinFiles_TaxControls.txt',sep='\t')

In [None]:
import ast #imported in this cell: only needed to parse the saved list column

#Reload the saved table. The 'ID Present' column holds the text form of Python lists;
#ast.literal_eval turns it back into lists and, unlike pd.eval, accepts nothing but
#plain literals, so content of the file is never evaluated as an expression
haveHomolog = pd.read_csv(OUT_FOLDER+'HomologsinFiles_TaxControls.txt',sep = '\t',index_col = 0,converters={'ID Present': ast.literal_eval})

In [None]:
#Keep only the feature tables whose accession list is not empty; rows that still
#contain a missing value afterwards are dropped as well
haveHomolog = haveHomolog.loc[haveHomolog['ID Present'].str.len().ne(0)].dropna()
haveHomolog

In [None]:
#For each single-protein control system, list every genome whose feature table contains
#at least one of its homologs and write the genome TaxIDs to TaxControlSummaries/<system>_TaxIDs
SingleGeneSys = ['ToxN']
#the loop variable is no longer called 'sys', which replaced the imported sys module
for system in SingleGeneSys:
    OUT = OUT_FOLDER+'TaxControlSummaries/'+system+'_TaxIDs'
    homologs = fetchHomologList(system)
    homologSet = set(homologs) #set membership instead of scanning the list for every accession
    fileFound = [path for path, present in zip(haveHomolog['Files'],haveHomolog['ID Present'])
                 if not homologSet.isdisjoint(present)]
    records = []
    for path in fileFound:
        #Rebuild '<accession>.<version>' from the file name. The folder part is removed
        #first so that a '.' in a folder name cannot shift the split
        file = os.path.basename(path)
        Acc = file.split('.')[0]
        version = file.split('.')[1].split('_')[0]
        Acc = Acc + '.' + version
        taxInfo = Acc2Taxid[Acc] #KeyError if the accession is unknown
        records.append((Acc,taxInfo[0],taxInfo[1]))
    outDF = pd.DataFrame(records,columns = ['Accession','Taxid','Species Taxid'])
    os.makedirs(os.path.dirname(OUT),exist_ok = True) #the summary folder may not exist yet
    outDF.to_csv(OUT,sep = '\t',index_label = False)

In [None]:
#For each multi-protein control system, list the genomes in which a homolog of the first
#component has homologs of all other components on the same contig within 10 kb,
#then write the genome TaxIDs to TaxControlSummaries/<system>_TaxIDs
#the loop variable is no longer called 'sys', which replaced the imported sys module
for system in MultiGeneSys:
    OUT = OUT_FOLDER+'TaxControlSummaries/'+system+'_TaxIDs'
    if os.path.isfile(OUT):
        continue #summary already written by an earlier run
    fileFound = []
    #Component names are taken from the per-protein folders OUT_FOLDER/<system>*/
    #NOTE(review): glob order decides which component becomes the anchor (parts[0]) - confirm
    #this is intended; an IndexError is raised below when no folder matches
    parts = [os.path.basename(part[:-1]) for part in glob.glob(OUT_FOLDER+system+'*/')]
    initHomologs = set(fetchHomologList(parts[0]))
    otherHomologs = [set(fetchHomologList(ID)) for ID in parts[1:]]
    for path, present in zip(haveHomolog['Files'],haveHomolog['ID Present']):
        if initHomologs.isdisjoint(present):
            continue #no homolog of the anchor component in this genome
        FeatTable = pd.read_csv(path,sep = '\t')
        FeatTable = FeatTable[FeatTable['# feature'] == 'CDS']
        prots = FeatTable[(FeatTable['product_accession'].isin(initHomologs)) |
                          (FeatTable['non-redundant_refseq'].isin(initHomologs))]

        for indexFT, rowFT in prots.iterrows():
            #Features lying entirely inside the window from 10 kb upstream of the anchor
            #start to 10 kb downstream of its end, on the same genomic accession
            up10 = rowFT['start'] - 10000
            down10 = rowFT['end'] + 10000
            nearby = ((FeatTable['start'] > up10) &
                      (FeatTable['end'] < down10) &
                      (FeatTable['genomic_accession'] == rowFT['genomic_accession']))
            #the chromosome is compared as well whenever the anchor row has one
            if not pd.isna(rowFT['chromosome']):
                nearby = nearby & (FeatTable['chromosome'] == rowFT['chromosome'])
            aroundProt = FeatTable[nearby]
            #Set intersection replaces the nested any() scans, which rebuilt the
            #column lists for every single homolog ID
            nearbyIDs = set(aroundProt['product_accession']) | set(aroundProt['non-redundant_refseq'])
            if all(not partHomologs.isdisjoint(nearbyIDs) for partHomologs in otherHomologs):
                #NOTE(review): a genome is appended once per qualifying anchor, so it can
                #appear several times in the output - confirm this is intended
                fileFound.append(path)
    records = []
    for path in fileFound:
        #Rebuild '<accession>.<version>' from the file name. The folder part is removed
        #first so that a '.' in a folder name cannot shift the split
        file = os.path.basename(path)
        Acc = file.split('.')[0]
        version = file.split('.')[1].split('_')[0]
        Acc = Acc + '.' + version
        taxInfo = Acc2Taxid[Acc] #KeyError if the accession is unknown
        records.append((Acc,taxInfo[0],taxInfo[1]))
    outDF = pd.DataFrame(records,columns = ['Accession','Taxid','Species Taxid'])
    os.makedirs(os.path.dirname(OUT),exist_ok = True) #the summary folder may not exist yet
    outDF.to_csv(OUT,sep = '\t',index_label = False)