## Taxonomic Data for Test and Control Systems
#### This notebook searches for all homologs of the experimentally discovered phage defense systems - and controls - in our genomic database. It then records the TaxID of each genome in which the systems are found and saves this information. Genome accessions for experimentally found systems are pulled from the files generated by the analysis in SysinContext_final.ipynb. Data from this notebook were used to create Figure 3E.

### 0. Needed packages and folders

In [1]:
from Bio import SeqIO
import re, math
import random
import pandas as pd
import subprocess
import numpy as np
import glob
import os
import sys
from Bio import SearchIO
import pickle
from multiprocessing import Pool
#Folder holding the downloaded NCBI genome files (trailing slash included so names can be appended directly)
dbFolder = "/mnt/disks/storage/ncbi-genomes-2021-04-29/"

In [2]:
#Function to make a list of all accession numbers for a given original system name
def fetchHomologList(original_id):
    """Return the subject accessions stored in a protein's BLAST output file.

    The file looked up is OUT_FOLDER + original_id + "_blastHomologs". Its first
    whitespace-delimited column is expected to hold "|"-separated identifiers;
    the second "|" field of each is kept.

    Parameters:
        original_id: name the BLAST output file was written under.

    Returns:
        List of accession strings in file order (duplicates kept); an empty
        list when the BLAST output file exists but contains no rows.

    Raises:
        FileNotFoundError: no matching BLAST output file exists.
    """
    matches = glob.glob(OUT_FOLDER+original_id+"_blastHomologs") #get blast output file
    if not matches:
        raise FileNotFoundError('No BLAST output found for '+original_id+' in '+OUT_FOLDER)
    try:
        #sep=r'\s+' and .iloc[:, 0] replace delim_whitespace/squeeze, which newer pandas no longer accepts
        firstColumn = pd.read_csv(matches[0],sep = r'\s+',usecols=[0],header=None).iloc[:, 0] #grab accession numbers
    except pd.errors.EmptyDataError:
        return [] #an empty file has no homologs to report
    #drop genbank or refseq demarcation and keep only the accession number
    return [x.split("|")[1] for x in firstColumn.tolist()]

### 1. Build table of all genomic accessions with their TaxID and Species TaxID

In [19]:
#Collect the assembly accession recorded in each feature table of the database folder
FTs = glob.glob(dbFolder+'*feature_table.txt')
gAccessions = [None]*len(FTs)
for ftNum, FT in enumerate(FTs):
    if ftNum % 1000 == 0:
        print(str(ftNum)+' out of '+str(len(gAccessions))) #progress report every 1000 tables
    #Only the first value of the 'assembly' column is used, so read a single data row
    #instead of the whole table (also avoids squeeze=True, removed in pandas 2.0)
    gAccessions[ftNum] = pd.read_csv(FT,sep = '\t',usecols = ['assembly'],nrows = 1)['assembly'].tolist()[0]
#One accession per line; each line ends with ' \n' (the trailing space is stripped again on reload)
with open('/home/cdoering/ChrisSysInContext/ChrisDBAssemblyAccessions.txt','w') as handle:
    handle.writelines(accession+' \n' for accession in gAccessions)

0 out of 844603
1000 out of 844603
2000 out of 844603
3000 out of 844603
4000 out of 844603
5000 out of 844603
6000 out of 844603
7000 out of 844603
8000 out of 844603
9000 out of 844603
10000 out of 844603
11000 out of 844603
12000 out of 844603
13000 out of 844603
14000 out of 844603
15000 out of 844603
16000 out of 844603
17000 out of 844603
18000 out of 844603
19000 out of 844603
20000 out of 844603
21000 out of 844603
22000 out of 844603
23000 out of 844603
24000 out of 844603
25000 out of 844603
26000 out of 844603
27000 out of 844603
28000 out of 844603
29000 out of 844603
30000 out of 844603
31000 out of 844603
32000 out of 844603
33000 out of 844603
34000 out of 844603
35000 out of 844603
36000 out of 844603
37000 out of 844603
38000 out of 844603
39000 out of 844603
40000 out of 844603
41000 out of 844603
42000 out of 844603
43000 out of 844603
44000 out of 844603
45000 out of 844603
46000 out of 844603
47000 out of 844603
48000 out of 844603
49000 out of 844603
50000 out of 

398000 out of 844603
399000 out of 844603
400000 out of 844603
401000 out of 844603
402000 out of 844603
403000 out of 844603
404000 out of 844603
405000 out of 844603
406000 out of 844603
407000 out of 844603
408000 out of 844603
409000 out of 844603
410000 out of 844603
411000 out of 844603
412000 out of 844603
413000 out of 844603
414000 out of 844603
415000 out of 844603
416000 out of 844603
417000 out of 844603
418000 out of 844603
419000 out of 844603
420000 out of 844603
421000 out of 844603
422000 out of 844603
423000 out of 844603
424000 out of 844603
425000 out of 844603
426000 out of 844603
427000 out of 844603
428000 out of 844603
429000 out of 844603
430000 out of 844603
431000 out of 844603
432000 out of 844603
433000 out of 844603
434000 out of 844603
435000 out of 844603
436000 out of 844603
437000 out of 844603
438000 out of 844603
439000 out of 844603
440000 out of 844603
441000 out of 844603
442000 out of 844603
443000 out of 844603
444000 out of 844603
445000 out of

791000 out of 844603
792000 out of 844603
793000 out of 844603
794000 out of 844603
795000 out of 844603
796000 out of 844603
797000 out of 844603
798000 out of 844603
799000 out of 844603
800000 out of 844603
801000 out of 844603
802000 out of 844603
803000 out of 844603
804000 out of 844603
805000 out of 844603
806000 out of 844603
807000 out of 844603
808000 out of 844603
809000 out of 844603
810000 out of 844603
811000 out of 844603
812000 out of 844603
813000 out of 844603
814000 out of 844603
815000 out of 844603
816000 out of 844603
817000 out of 844603
818000 out of 844603
819000 out of 844603
820000 out of 844603
821000 out of 844603
822000 out of 844603
823000 out of 844603
824000 out of 844603
826000 out of 844603
827000 out of 844603
828000 out of 844603
829000 out of 844603
830000 out of 844603
831000 out of 844603
832000 out of 844603
833000 out of 844603
834000 out of 844603
835000 out of 844603
836000 out of 844603
837000 out of 844603
838000 out of 844603
839000 out of

In [3]:
#Reload the saved accession list (one accession per line, no header row)
#.iloc[:, 0] selects the single column; the squeeze=True argument is no longer accepted by pandas >= 2.0
gAccessions = pd.read_csv('/home/cdoering/ChrisSysInContext/ChrisDBAssemblyAccessions.txt',sep = '\t',header = None).iloc[:, 0]
gAccessions = gAccessions.str.strip().tolist() #remove the trailing space stored after each accession
gAccessions

['GCF_004684395.1',
 'GCA_006551295.1',
 'GCA_015152675.1',
 'GCF_012844135.1',
 'GCA_013639475.1',
 'GCF_017590815.1',
 'GCA_013056095.1',
 'GCA_012151455.1',
 'GCF_000944655.1',
 'GCF_013297315.1',
 'GCA_014186835.1',
 'GCA_003200495.1',
 'GCA_900447105.1',
 'GCF_000939175.1',
 'GCF_000744635.1',
 'GCA_002268935.1',
 'GCA_017376595.1',
 'GCA_006373055.1',
 'GCF_009809275.1',
 'GCF_001633495.1',
 'GCA_011770725.1',
 'GCA_005037655.1',
 'GCF_000166715.1',
 'GCF_003007435.1',
 'GCA_903811565.1',
 'GCA_013505735.1',
 'GCF_007352705.1',
 'GCA_009517135.1',
 'GCA_011113595.1',
 'GCF_001642165.1',
 'GCA_012877075.1',
 'GCA_006948605.1',
 'GCF_015700435.1',
 'GCF_000363185.1',
 'GCA_000727145.1',
 'GCA_013381865.1',
 'GCA_015815185.1',
 'GCA_007871985.1',
 'GCA_900030285.1',
 'GCA_016231205.1',
 'GCF_004346085.1',
 'GCA_011821025.1',
 'GCA_016544365.1',
 'GCA_004398805.1',
 'GCA_014290695.1',
 'GCA_012499665.1',
 'GCA_007021765.1',
 'GCA_011876665.1',
 'GCF_003333715.1',
 'GCA_011161695.1',


In [4]:
#Assembly taxid and species taxid information for every genbank and refseq genome and store in dictionary
summaryCols = ['# assembly_accession','taxid','species_taxid']
genbank = pd.read_csv('/home/cdoering/assembly_summary.txt',sep = '\t',skiprows = 1,usecols = summaryCols)
refseq = pd.read_csv('/home/cdoering/assembly_summary_refseq.txt',sep = '\t',skiprows = 1,usecols = summaryCols)

#Map accession -> (taxid, species_taxid). Columns are addressed by name, so the result does not
#depend on column order, and zipping whole columns avoids a Python-level iterrows() pass.
genbankDict = dict(zip(genbank['# assembly_accession'],
                       zip(genbank['taxid'].tolist(),genbank['species_taxid'].tolist())))
refseqDict = dict(zip(refseq['# assembly_accession'],
                      zip(refseq['taxid'].tolist(),refseq['species_taxid'].tolist())))
#If an accession appears in both tables the refseq entry wins (it is unpacked last)
Acc2Taxid = {**genbankDict, **refseqDict}

In [4]:
#Build the table of TaxIDs for every genome in the dataset
#(the table is saved as allDBTaxids.txt)
#Look up each accession once; an accession missing from Acc2Taxid raises KeyError rather than being skipped
taxPairs = [Acc2Taxid[accession] for accession in gAccessions]
#Build all three columns in one go instead of writing 2 cells per row with .at
allTaxids = pd.DataFrame({'Accession': gAccessions,
                          'Taxid': [pair[0] for pair in taxPairs],
                          'Species Taxid': [pair[1] for pair in taxPairs]},
                         columns = ['Accession','Taxid','Species Taxid'])
allTaxids

Unnamed: 0,Accession,Taxid,Species Taxid
0,GCF_004684395.1,1496,1496
1,GCA_006551295.1,28901,28901
2,GCA_015152675.1,28901,28901
3,GCF_012844135.1,492670,492670
4,GCA_013639475.1,28901,28901
...,...,...,...
844598,GCA_010051105.1,1639,1639
844599,GCF_000879375.1,674982,674982
844600,GCA_017251475.1,28901,28901
844601,GCF_003188145.1,1639,1639


In [62]:
#Save the accession/TaxID table as tab-separated text; index_label=False writes no header field for the index
allTaxids.to_csv(
    'Taxonomy/allDBTaxids.txt',
    sep='\t',
    index_label=False,
)

In [5]:
#Reload the saved accession/TaxID table from its tab-separated file
allTaxids = pd.read_csv(
    'Taxonomy/allDBTaxids.txt',
    sep='\t',
)
allTaxids

Unnamed: 0,Accession,Taxid,Species Taxid
0,GCF_004684395.1,1496,1496
1,GCA_006551295.1,28901,28901
2,GCA_015152675.1,28901,28901
3,GCF_012844135.1,492670,492670
4,GCA_013639475.1,28901,28901
...,...,...,...
844598,GCA_010051105.1,1639,1639
844599,GCF_000879375.1,674982,674982
844600,GCA_017251475.1,28901,28901
844601,GCF_003188145.1,1639,1639


### 2. TaxIDs for Experimental Systems

In [5]:
#For every experimental-system summary file, write a table of genome accession, TaxID and species TaxID
systems = glob.glob('/home/cdoering/ChrisSysInContext/Cov80Summaries/*')
for summaryPath in systems: #not named sys, which would overwrite the imported sys module
    #Output name: summary file name minus its last '_'-separated piece, plus '_TaxIDs'
    OUT = '/home/cdoering/ChrisSysInContext/Taxonomy/'+('_').join(os.path.basename(summaryPath).split('_')[:-1])+'_TaxIDs'
    homologs = pd.read_csv(summaryPath,sep = '\t',skiprows = 6,usecols = ['Accession'])
    accessions = []
    for entry in homologs['Accession']:
        #Entry file names start with '<accession>.<version>_'; keep '<accession>.<version>'
        fields = os.path.basename(entry).split('.')
        accessions.append(fields[0] + '.' + fields[1].split('_')[0])
    #An accession missing from Acc2Taxid raises KeyError rather than being skipped
    outDF = pd.DataFrame({'Accession': accessions,
                          'Taxid': [Acc2Taxid[Acc][0] for Acc in accessions],
                          'Species Taxid': [Acc2Taxid[Acc][1] for Acc in accessions]},
                         columns = ['Accession','Taxid','Species Taxid'])
    outDF.to_csv(OUT,sep = '\t',index_label = False)

### 3. TaxIDs for Control systems

In [6]:
#Settings for the control-system search
OUT_FOLDER = "TaxonomyControls/"  #Folder to output data to
IN_FASTA = "20211208_ControlSystems.faa"  #Input list of proteins to search for
#Name prefixes of the control systems that are handled as multi-gene systems
MultiGeneSys = [
    "RM_1",
    "RM_2",
    "RM_3",
    "RM_4",
    "Cas",
    "Zorya_I",
    "ZoryaII",
    "Kiwa",
    "Durantia",
]

In [28]:
#Take in fasta file with all proteins and split into individual files, create folders to store each proteins results
for record in SeqIO.parse(IN_FASTA,'fasta'):
    #makedirs also creates OUT_FOLDER itself when it is missing and does nothing if the folder already exists
    os.makedirs(OUT_FOLDER+record.id,exist_ok = True)
    proteinFasta = OUT_FOLDER+record.id+'.faa'
    #Do not overwrite a protein file written by an earlier run
    if not os.path.isfile(proteinFasta):
        SeqIO.write(record,proteinFasta,'fasta')

In [29]:
#BLASTP for Multi-Gene Systems. No coverage requirement
#Run blastp on all proteins against the non-redundant database to get protein accessions
EVAL = '0.00001'
num = 1
for system in MultiGeneSys:
    #Every protein file whose name starts with the system name
    starters = glob.glob(OUT_FOLDER+system+"*.faa")
    for init in starters:
        print('Blasting #'+str(num)+', '+init)
        num += 1
        hitName = os.path.splitext(os.path.basename(init))[0]
        blastOut = OUT_FOLDER+hitName+'_blastHomologs'
        if os.path.isfile(blastOut):
            continue #an existing output file is treated as a finished search
        command = ['blastp','-query',init,
                   '-db','/mnt/disks/storage/nr/nr',
                   '-evalue',EVAL,
                   '-out',blastOut,
                   '-outfmt','6 sseqid evalue length qlen qstart qend slen sstart send',
                   '-max_target_seqs','10000000',
                   '-num_threads','6',
                   '-taxidlist','bacterial.ids']
        result = subprocess.run(command)
        if result.returncode != 0:
            #A failed run must not leave a file behind, otherwise the check above
            #would accept the partial output as complete on the next run
            print('blastp exited with code '+str(result.returncode)+' for '+init)
            if os.path.isfile(blastOut):
                os.remove(blastOut)

Blasting #1, TaxonomyControls/RM_1-A.faa
Blasting #2, TaxonomyControls/RM_1-C.faa
Blasting #3, TaxonomyControls/RM_1-B.faa
Blasting #4, TaxonomyControls/RM_2-A.faa
Blasting #5, TaxonomyControls/RM_2-B.faa
Blasting #6, TaxonomyControls/RM_3-A.faa
Blasting #7, TaxonomyControls/RM_3-B.faa
Blasting #8, TaxonomyControls/RM_4-A.faa
Blasting #9, TaxonomyControls/RM_4-B.faa
Blasting #10, TaxonomyControls/Cas-Cas9.faa
Blasting #11, TaxonomyControls/Cas1.faa
Blasting #12, TaxonomyControls/Cas-Cas1.faa
Blasting #13, TaxonomyControls/Cas2.faa
Blasting #14, TaxonomyControls/Cas-Cas2.faa
Blasting #15, TaxonomyControls/Cas9.faa
Blasting #16, TaxonomyControls/Zorya_I-A.faa
Blasting #17, TaxonomyControls/Zorya_I-B.faa
Blasting #18, TaxonomyControls/Zorya_I-C.faa
Blasting #19, TaxonomyControls/Zorya_I-D.faa
Blasting #20, TaxonomyControls/ZoryaII-B.faa
Blasting #21, TaxonomyControls/ZoryaII-A.faa
Blasting #22, TaxonomyControls/ZoryaII-C.faa
Blasting #23, TaxonomyControls/Kiwa-B.faa
Blasting #24, Taxonomy

In [30]:
#BLASTP for Single-Gene Systems
#Run blastp on all proteins against the non-redundant database to get protein accessions
EVAL = '0.00001'
starters = glob.glob(OUT_FOLDER+"*.faa")
num = 1
for init in starters:
    print('Blasting #'+str(num)+' out of '+str(len(starters)))
    num += 1
    hitName = os.path.splitext(os.path.basename(init))[0]
    blastOut = OUT_FOLDER+hitName+'_blastHomologs'
    if os.path.isfile(blastOut):
        #Any protein that already has an output file is skipped, so the 80% query
        #coverage filter below only applies to searches actually run here
        continue
    command = ['blastp','-query',init,
               '-db','/mnt/disks/storage/nr/nr',
               '-evalue',EVAL,
               '-out',blastOut,
               '-outfmt','6 sseqid evalue length qlen qstart qend slen sstart send',
               '-max_target_seqs','10000000',
               '-num_threads','6',
               '-taxidlist','bacterial.ids',
               '-qcov_hsp_perc','80']
    result = subprocess.run(command)
    if result.returncode != 0:
        #A failed run must not leave a file behind, otherwise the check above
        #would accept the partial output as complete on the next run
        print('blastp exited with code '+str(result.returncode)+' for '+init)
        if os.path.isfile(blastOut):
            os.remove(blastOut)

Blasting #1 out of 31
Blasting #2 out of 31
Blasting #3 out of 31
Blasting #4 out of 31
Blasting #5 out of 31
Blasting #6 out of 31
Blasting #7 out of 31
Blasting #8 out of 31
Blasting #9 out of 31
Blasting #10 out of 31
Blasting #11 out of 31
Blasting #12 out of 31
Blasting #13 out of 31
Blasting #14 out of 31
Blasting #15 out of 31
Blasting #16 out of 31
Blasting #17 out of 31
Blasting #18 out of 31
Blasting #19 out of 31
Blasting #20 out of 31
Blasting #21 out of 31
Blasting #22 out of 31
Blasting #23 out of 31
Blasting #24 out of 31
Blasting #25 out of 31
Blasting #26 out of 31
Blasting #27 out of 31
Blasting #28 out of 31
Blasting #29 out of 31
Blasting #30 out of 31
Blasting #31 out of 31


In [7]:
#Gather the accessions of every BLAST hit across all *_blastHomologs files
homologs = glob.glob(OUT_FOLDER+"*_blastHomologs")
homologs2main = dict() #To map protein accessions onto original names/IDs
protIDs = set() #All homolog accessions
for log in homologs:
    #remove _blastHomolog part of globbed name (the folder part of the path is kept)
    main = ('_').join(log.split('_')[:-1])
    try:
        #sep=r'\s+' and .iloc[:, 0] replace delim_whitespace/squeeze, which newer pandas no longer accepts
        IDs = pd.read_csv(log,sep = r'\s+',usecols=[0],header=None).iloc[:, 0].tolist() #read in all IDs
    except pd.errors.EmptyDataError:
        continue #a search without hits leaves an empty file; nothing to add
    IDs = [x.split("|")[1] for x in IDs] #remove genbank or refseq demarkation attached to accession by | mark
    #Update in place; rebuilding the dictionary on every file copies all earlier entries again.
    #As before, an accession hit by several proteins maps to the last file processed.
    homologs2main.update(dict.fromkeys(IDs,main))
    protIDs.update(IDs) #add all IDs into set
protIDs

{'WP_201599727.1',
 'WP_106176411.1',
 'WP_124218902.1',
 'CDB91907.1',
 'OHE82557.1',
 'EBM1955409.1',
 'EAB3492478.1',
 'WP_179912142.1',
 'WP_102479551.1',
 'WP_128164746.1',
 'WP_193809650.1',
 'OYU48007.1',
 'WP_151528700.1',
 'MBQ6518639.1',
 'RYE59367.1',
 'WP_187536032.1',
 'EEP3672499.1',
 'WP_189698421.1',
 'WP_069329714.1',
 'EAH9355227.1',
 'MBL7759847.1',
 'WP_013746110.1',
 'WP_033582600.1',
 'WP_160208104.1',
 'WP_195492178.1',
 'WP_003630904.1',
 'WP_080500670.1',
 'WP_123882106.1',
 'PWM33557.1',
 'EDP6908368.1',
 'AZS27584.1',
 'WP_155330963.1',
 'WP_129950061.1',
 'WP_014425141.1',
 'EFP1008486.1',
 'MYG08591.1',
 'WP_150873456.1',
 'WP_084849751.1',
 'WP_038788694.1',
 'WP_177344534.1',
 'NLD58990.1',
 'WP_135199350.1',
 'VLH00712.1',
 'MBD9292999.1',
 'MAO56318.1',
 'HAR99442.1',
 'MBR0190960.1',
 'MBR4310567.1',
 'WP_172127038.1',
 'RPF29022.1',
 'WP_151048957.1',
 'MBK8942500.1',
 'WP_134602161.1',
 'WP_100257349.1',
 'MBD8956418.1',
 'QDX03602.1',
 'WP_147803977

In [32]:
#Multiprocessing 
#Search through feature tables to see if protein homologs are contained within those files
def CheckForHomologs(FT):
    """Report which homolog accessions occur in one feature table.

    Parameters:
        FT: path of a tab-separated feature table that has the columns
            'product_accession' and 'non-redundant_refseq'.

    Returns:
        Tuple (FT, accessions) where accessions is a list, without duplicates
        and in no particular order, of the values from either column that are
        members of the global protIDs; the list is empty when none are.
    """
    FeatTable = pd.read_csv(FT,sep = '\t',usecols = ['product_accession','non-redundant_refseq'])
    #Progress message written straight to file descriptor 1 (where the previous printf
    #child process wrote) without starting a new process for every table
    os.write(1,('Checking table '+FT).encode())
    Accs = set()
    for column in ('product_accession','non-redundant_refseq'):
        #Missing values are read as NaN and are never members of protIDs
        Accs.update(ID for ID in FeatTable[column] if ID in protIDs)
    return (FT,list(Accs))
if __name__ == '__main__':
    #Check every feature table in the database folder, one task per file, using a default-sized pool
    FTs = glob.glob('/mnt/disks/storage/ncbi-genomes-2021-04-29/*_feature_table.txt')
    with Pool() as workers:
        haveHomologs = workers.map(CheckForHomologs, FTs)

In [33]:
#Tabulate the (file, accessions found) pairs and save them as tab-separated text
haveHomolog = pd.DataFrame(data=haveHomologs, columns=['Files', 'ID Present'])
haveHomolog.to_csv(
    OUT_FOLDER+'HomologsinFiles_TaxControls.txt',
    sep='\t',
)

In [10]:
import ast #standard library; imported here because only this cell needs it
#The 'ID Present' column holds the text of a Python list of accessions.
#ast.literal_eval parses exactly such literals and evaluates nothing else, which
#pd.eval (a general expression evaluator) is not designed for.
haveHomolog = pd.read_csv(OUT_FOLDER+'HomologsinFiles_TaxControls.txt',sep = '\t',index_col = 0,converters={'ID Present': ast.literal_eval})

In [11]:
#Keep only the feature tables whose accession list is not empty, then drop any row with a missing value
hitCounts = haveHomolog['ID Present'].str.len()
haveHomolog = haveHomolog.loc[hitCounts != 0].dropna()
haveHomolog

Unnamed: 0,Files,ID Present
0,/mnt/disks/storage/ncbi-genomes-2021-04-29/GCF...,"[WP_003434070.1, WP_003435381.1, WP_016729011...."
3,/mnt/disks/storage/ncbi-genomes-2021-04-29/GCF...,"[WP_025284995.1, WP_168984774.1, WP_076425056...."
5,/mnt/disks/storage/ncbi-genomes-2021-04-29/GCF...,"[WP_003456371.1, WP_140562167.1, WP_061415384...."
7,/mnt/disks/storage/ncbi-genomes-2021-04-29/GCA...,[BCB51680.1]
8,/mnt/disks/storage/ncbi-genomes-2021-04-29/GCF...,"[WP_012775649.1, WP_011922226.1, WP_011922218...."
...,...,...
844576,/mnt/disks/storage/ncbi-genomes-2021-04-29/GCF...,"[WP_000578076.1, WP_053264748.1, WP_001305642...."
844589,/mnt/disks/storage/ncbi-genomes-2021-04-29/GCF...,"[WP_000028651.1, WP_000190897.1]"
844591,/mnt/disks/storage/ncbi-genomes-2021-04-29/GCF...,"[WP_107232319.1, WP_107232290.1]"
844595,/mnt/disks/storage/ncbi-genomes-2021-04-29/GCF...,"[WP_112048993.1, WP_072200547.1, WP_000578061...."


In [37]:
#Single-gene control systems: a genome counts as a hit when any of its recorded accessions is a homolog
SingleGeneSys = ['ToxN']
os.makedirs(OUT_FOLDER+'TaxControlSummaries/',exist_ok = True) #make sure the output folder exists before writing
for system in SingleGeneSys: #not named sys, which would overwrite the imported sys module
    OUT = OUT_FOLDER+'TaxControlSummaries/'+system+'_TaxIDs'
    homologs = set(fetchHomologList(system)) #set membership is constant-time; a list is scanned for every test
    #Each feature table is listed once, however many of its accessions are homologs
    fileFound = [path for path, present in zip(haveHomolog['Files'],haveHomolog['ID Present'])
                 if not homologs.isdisjoint(present)]
    accessions = []
    for path in fileFound:
        #Take the file name first so that a '.' in a directory name cannot shift the fields;
        #file names start with '<accession>.<version>_'
        fields = os.path.basename(path).split('.')
        accessions.append(fields[0] + '.' + fields[1].split('_')[0])
    #An accession missing from Acc2Taxid raises KeyError rather than being skipped
    outDF = pd.DataFrame({'Accession': accessions,
                          'Taxid': [Acc2Taxid[Acc][0] for Acc in accessions],
                          'Species Taxid': [Acc2Taxid[Acc][1] for Acc in accessions]},
                         columns = ['Accession','Taxid','Species Taxid'])
    outDF.to_csv(OUT,sep = '\t',index_label = False)

In [35]:
#Multi-gene control systems: a locus counts as a hit when a homolog of one part has homologs of
#every other part among the CDS features lying strictly within 10 kb either side of it
for system in MultiGeneSys: #not named sys, which would overwrite the imported sys module
    OUT = OUT_FOLDER+'TaxControlSummaries/'+system+'_TaxIDs'
    if os.path.isfile(OUT):
        continue #summary already written by an earlier run
    os.makedirs(OUT_FOLDER+'TaxControlSummaries/',exist_ok = True) #make sure the output folder exists before writing
    fileFound = []
    #Part names are the result folders whose name starts with the system name
    parts = [os.path.basename(part[:-1]) for part in glob.glob(OUT_FOLDER+system+'*/')]
    #Sets give constant-time membership tests; the first part returned by glob anchors the search
    initHomologs = set(fetchHomologList(parts[0]))
    otherHomologs = [set(fetchHomologList(ID)) for ID in parts[1:]]
    for ftPath, idsPresent in zip(haveHomolog['Files'],haveHomolog['ID Present']):
        if initHomologs.isdisjoint(idsPresent):
            continue #no homolog of the anchoring part in this genome
        FeatTable = pd.read_csv(ftPath,sep = '\t')
        FeatTable = FeatTable[FeatTable['# feature'] == 'CDS']
        prots = FeatTable[(FeatTable['product_accession'].isin(initHomologs)) | (FeatTable['non-redundant_refseq'].isin(initHomologs))]

        for indexFT, rowFT in prots.iterrows():
            up10 = rowFT['start'] - 10000
            down10 = rowFT['end'] + 10000
            nearby = ((FeatTable['start'] > up10) &
                      (FeatTable['end'] < down10) &
                      (FeatTable['genomic_accession'] == rowFT['genomic_accession']))
            #The chromosome is only compared when the anchoring row has one recorded
            if not pd.isna(rowFT['chromosome']):
                nearby = nearby & (FeatTable['chromosome'] == rowFT['chromosome'])
            aroundProt = FeatTable[nearby]
            #All accessions seen in the neighbourhood, from either accession column
            neighbours = set(aroundProt['product_accession']) | set(aroundProt['non-redundant_refseq'])
            #Every remaining part needs at least one homolog in the neighbourhood
            if all(not partHomologs.isdisjoint(neighbours) for partHomologs in otherHomologs):
                #NOTE(review): a genome is appended once per matching locus, so it can be listed
                #more than once here, while the single-gene cell lists each genome once - confirm intended
                fileFound.append(ftPath)
    accessions = []
    for path in fileFound:
        #Take the file name first so that a '.' in a directory name cannot shift the fields;
        #file names start with '<accession>.<version>_'
        fields = os.path.basename(path).split('.')
        accessions.append(fields[0] + '.' + fields[1].split('_')[0])
    #An accession missing from Acc2Taxid raises KeyError rather than being skipped
    outDF = pd.DataFrame({'Accession': accessions,
                          'Taxid': [Acc2Taxid[Acc][0] for Acc in accessions],
                          'Species Taxid': [Acc2Taxid[Acc][1] for Acc in accessions]},
                         columns = ['Accession','Taxid','Species Taxid'])
    outDF.to_csv(OUT,sep = '\t',index_label = False)