# Process phage reference genome

Phage genomes were downloaded from NCBI GenBank using search keywords: [phage] AND [pseudomonas].

This search returned 1,950 sequence records (as of 15 December 2020).

By manual inspection, this download also includes phages of other bacteria, such as Salmonella and E. coli. This notebook removes those FASTA entries and saves the cleaned version.

In [1]:
from Bio import SeqIO
from core_acc_modules import paths

In [2]:
# Keep only the FASTA records whose (lowercased) description mentions the
# keyword; everything else in the raw download is dropped.
keyword = "pseudomonas"
cleaned_records = []
for record in SeqIO.parse(paths.RAW_PHAGE_REF, "fasta"):
    description = record.description.lower()
    # Print every record, kept or not, so the filtering can be checked by eye.
    print(f"{record.id} {description} {len(record)}")
    if keyword not in description:
        continue
    cleaned_records.append(record)

NC_028999.1 nc_028999.1 pseudomonas phage phipa3, complete genome 309208
HQ630627.1 hq630627.1 pseudomonas phage phipa3, complete genome 309208
CP019649.1 cp019649.1 salmonella enterica subsp. enterica serovar typhimurium var. monophasic 4,5,12:i:- strain tw-stm6 chromosome, complete genome 4999862
MT133560.1 mt133560.1 pseudomonas phage fnug, complete genome 278899
MK599315.1 mk599315.1 pseudomonas phage pa1c, complete genome 304671
MH725810.1 mh725810.1 pseudomonas phage payy-2, complete genome 92348
MF974178.1 mf974178.1 pseudomonas phage ys35, complete genome 93296
NC_016765.1 nc_016765.1 pseudomonas phage vb_paes_pmg1, complete genome 54024
HQ711985.1 hq711985.1 pseudomonas phage vb_paes_pmg1, complete genome 54024
NC_031063.1 nc_031063.1 pseudomonas phage pev2, complete genome 72697
KU948710.1 ku948710.1 pseudomonas phage pev2, complete genome 72697
NC_020083.1 nc_020083.1 serratia phage phimam1, complete genome 157834
JX878496.1 jx878496.1 serratia phage phimam1, complete genome

MN553585.1 mn553585.1 unverified: pseudomonas phage 4phic20-1 clone 9, complete genome 42232
MN553584.1 mn553584.1 unverified: pseudomonas phage 4phic20-2 clone 7, complete genome 72474
MN553583.1 mn553583.1 unverified: pseudomonas phage 4phic20-2 clone 5, complete genome 55941
MK817115.1 mk817115.1 escherichia phage vb_ecom_phapec6, complete genome 352598
MK511027.1 mk511027.1 pseudomonas phage vb_pae_cf136a, partial genome 37654
MK511014.1 mk511014.1 pseudomonas phage vb_pae_br243a, partial genome 46492
MK511013.1 mk511013.1 pseudomonas phage vb_pae_br161a, partial genome 42421
MK511011.1 mk511011.1 pseudomonas phage vb_pae_cf213a, partial genome 52658
MK511010.1 mk511010.1 pseudomonas phage vb_pae_cf177c, partial genome 52755
MK511009.1 mk511009.1 pseudomonas phage vb_pae_cf136b, partial genome 37242
MK511006.1 mk511006.1 pseudomonas phage vb_pae_cf118a, partial genome 52758
MK511005.1 mk511005.1 pseudomonas phage vb_pae_cf81a, partial genome 52823
MK511003.1 mk511003.1 pseudomonas 

NC_017972.1 nc_017972.1 pseudomonas phage lu11, complete genome 280538
NC_012418.1 nc_012418.1 pseudomonas phage phikf77, complete genome 43152
KX711710.1 kx711710.1 pseudomonas phage vb_pae-tbilisim32, complete genome 42965
MH179480.1 mh179480.1 pseudomonas phage 98pflur60pp, complete genome 74361
KX074201.1 kx074201.1 pseudomonas phage phiyy segment l, complete sequence 6648
LC102729.1 lc102729.1 pseudomonas phage r18 dna, complete genome 63560
KR054033.1 kr054033.1 pseudomonas phage dl68, complete genome 66111
KR054032.1 kr054032.1 pseudomonas phage dl64, complete genome 72378
KR054031.1 kr054031.1 pseudomonas phage dl62, complete genome 42508
KR054030.1 kr054030.1 pseudomonas phage dl60, complete genome 66103
KR054029.1 kr054029.1 pseudomonas phage dl54, complete genome 45673
KR054028.1 kr054028.1 pseudomonas phage dl52, complete genome 65867
KP340288.1 kp340288.1 pseudomonas phage phiktn6, complete genome 65994
KC758116.1 kc758116.1 pseudomonas phage lko4, complete genome 61818
LN

NC_011756.1 nc_011756.1 pseudomonas phage sn, complete genome 66390
NC_007805.1 nc_007805.1 pseudomonas phage f10, complete genome 39199
NC_048687.1 nc_048687.1 pseudomonas phage pmbt14, complete genome 47820
NC_048639.1 nc_048639.1 pseudomonas phage zc08, complete genome 70774
NC_048638.1 nc_048638.1 pseudomonas phage zc03, complete genome 69844
MT104473.1 mt104473.1 pseudomonas phage mr13, complete genome 47456
NC_047957.1 nc_047957.1 pseudomonas phage alpheus, complete genome 45756
NC_047956.1 nc_047956.1 pseudomonas phage achelous, complete genome 46585
NC_047955.1 nc_047955.1 pseudomonas phage nerthus, complete genome 45779
NC_047954.1 nc_047954.1 pseudomonas phage njord, complete genome 46644
NC_047953.1 nc_047953.1 pseudomonas phage vb_paep_130_113, complete genome 44205
NC_047852.1 nc_047852.1 pseudomonas phage phinfs, complete genome 42351
MT104470.1 mt104470.1 pseudomonas phage mr7, complete genome 41705
MT108725.1 mt108725.1 pseudomonas phage epa5, complete genome 64096
MT10

MF417910.1 mf417910.1 uncultured caudovirales phage clone 2ax_5, partial genome 38191
KU160495.1 ku160495.1 exiguobacterium phage vb_eaus-123, complete genome 30924
NC_007804.2 nc_007804.2 escherichia phage phiv10, complete genome 39104
DQ126339.2 dq126339.2 enterobacteria phage phiv10, complete genome 39104
EU710883.1 eu710883.1 erwinia phage phiea21-4, complete genome 84576
MT708550.1 mt708550.1 achromobacter phage mano, complete genome 42452
MN131143.1 mn131143.1 pseudomonas virus pa11p1, complete genome 66049
MF417926.1 mf417926.1 uncultured caudovirales phage clone 9f_1, partial genome 33961
MF417914.1 mf417914.1 uncultured caudovirales phage clone 8s_2, partial genome 37831
MF417913.1 mf417913.1 uncultured caudovirales phage clone 9s_2, partial genome 37831
MF417942.1 mf417942.1 uncultured caudovirales phage clone 7f_6, partial genome 21262
MG209611.1 mg209611.1 aphanizomenon phage vb_aphas-cl131, complete genome 112793
HM072038.1 hm072038.1 bacillus phage phi105, complete genome

In [3]:
# Save the filtered records in FASTA format. The call is left as the cell's
# last expression so the notebook displays the number of records written.
SeqIO.write(sequences=cleaned_records, handle=paths.PHAGE_REF, format="fasta")

1519