In [None]:
!pip install biopython



In [None]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqUtils import gc_fraction
from Bio.Data import CodonTable
from Bio import Entrez
from Bio.SeqFeature import SeqFeature, FeatureLocation
from functools import reduce
import pandas as pd


In [None]:
# Download the file of relative bacterium T.oleivorans MIL-1 in genbank format
!sh -c "$(curl -fsSL ftp://ftp.ncbi.nlm.nih.gov/entrez/entrezdirect/install-edirect.sh)"


Entrez Direct has been successfully downloaded and installed.

In order to complete the configuration process, please execute the following:

  echo "export PATH=/root/edirect:\${PATH}" >> ${HOME}/.bashrc

or manually edit the PATH variable assignment in your .bashrc file.

Would you like to do that automatically now? [y/N]
^C


In [None]:
!echo "export PATH=\${PATH}:/root/edirect" >> ${HOME}/.bashrc
!export PATH=${PATH}:${HOME}/edirect
!$HOME/edirect/efetch -db nuccore -id HF680312 -format gb  >  T_oleivorans_MIL_1.gbk



In [None]:
!head T_oleivorans_MIL_1.gbk

LOCUS       HF680312             3920328 bp    DNA     circular BCT 27-FEB-2015
DEFINITION  Thalassolituus oleivorans MIL-1 complete genome.
ACCESSION   HF680312
VERSION     HF680312.1
DBLINK      BioProject: PRJEB1425
            BioSample: SAMEA2272589
KEYWORDS    complete genome.
SOURCE      Thalassolituus oleivorans MIL-1
  ORGANISM  Thalassolituus oleivorans MIL-1
            Bacteria; Pseudomonadota; Gammaproteobacteria; Oceanospirillales;


In [None]:
!head -n 30 gms2.lst


# GeneMark.hmm-2 LST format
# GeneMark.hmm-2 prokaryotic version: 1.25_lic
# File with sequence: scaffolds.fasta
# File with native parameters: GMS2.mod
# Native species name and build: unspecified GeneMarkS-2-1.14_1.25_lic
# File with MetaGeneMark parameters: /content/gms2_linux_64/mgm_11.mod
# translation table: 11
# output date start: Wed Oct 23 15:38:38 2024

# sequence-region 1 4682
SequenceID: scaffold1_cov248
     1   +   <2    1123    1122 atypical
     2   +    1120    1581     462 atypical TTGAGC 7 1
     3   +    1587    1871     285 atypical AGCTAG 5 1
     4   +    1953    2447     495 native TGCCCC 5 1
     5   +    2504    3388     885 atypical TTGACG 5 1
     6   +    3385    3813     429 atypical AGGGGA 4 1
     7   +    4131    4640     510 native ACGCTC 7 1
# scaffold1_cov248	total_logodd	258.723	average_length	598	average_density	1.50

# sequence-region 1 2295
SequenceID: scaffold2_cov158
     8   +    56    442     387 atypical AGGTGG 6 1
     9   -    658    1065 

In [None]:
scaffolds = dict()
for scaffold in SeqIO.parse("scaffolds.fasta", "fasta"):
    scaffold.annotations['molecule_type'] = 'DNA'
    scaffolds[scaffold.id] = scaffold

In [None]:
!head proteins.fasta

>1 scaffold1_cov248 2 1123 + gene_type=atypical partial=10
FVKMQHAARALQAAWESEDFSKTYSETYVKGEKRELVEALGFDPTQITEQQIDEAMAMAN
LVIEDPSLRDMLYQFVKDYAEAQHAIEITNVAGTAAFELILTIIMAAVTGGVGAVAAIGS
KAHLIKKFQKVGDLLSDFAKATRKLKLQGKKRKAKGNSAKFSNFDTDEIQAKKTDAHGAE
TGPKNSANATVPKSVPMTQEKYDEIINLERGNRPNDVGEYLSKDYVDSHLNKFKEEGGAF
IVIEEWISAPEYTSFPKDGKFVGLSSEMDQVVTKYKDSGGDWRVLRDELNLGENTDLSSA
KISYVKLSPNDPSFEYSMPNGNERGAYEHEWVPGGLTKSGTSEATLSGGDRIIHNNNVDN
LKKNSGLVVEPLQ
>2 scaffold1_cov248 1120 1581 + gene_type=atypical
MNKNLFIKALNRNETKDFFLGNGNYFSRNRESHNHDYVLFITGWVKKYIDISPKENLGTF


In [None]:
proteins = [iter for iter in SeqIO.parse("proteins.fasta", "fasta")]
proteins[0]

SeqRecord(seq=Seq('FVKMQHAARALQAAWESEDFSKTYSETYVKGEKRELVEALGFDPTQITEQQIDE...PLQ'), id='1', name='1', description='1 scaffold1_cov248 2 1123 + gene_type=atypical partial=10', dbxrefs=[])

In [None]:
def get_strand(info):
    return 1 if info[4] == '+' else -1

def create_feature(trans):
    info = trans.description.split(' ')
    strand = get_strand(info)
    scaff = SeqFeature(FeatureLocation(int(info[2]), int(info[3]), strand=strand), type="CDS")
    scaff.qualifiers['locus_tag'] = [info[0]]
    scaff.qualifiers['translation'] = [trans.seq]
    return scaff, info[1]


for trans in SeqIO.parse("proteins.fasta", "fasta"):
    feature, scaffold_id = create_feature(trans)
    scaffolds[scaffold_id].features.append(feature)

In [None]:
def extract_protein_info(genome):
    return {
        f.qualifiers['protein_id'][0]: f.qualifiers['product']
        for f in genome.features
        if 'protein_id' in f.qualifiers
    }

mil_1_genome = SeqIO.read("/content/T_oleivorans_MIL_1.gbk", "genbank")
mil_fs = extract_protein_info(mil_1_genome)

In [None]:
mil_fs

{'CCU70450.1': ['chromosomal replication initiator protein'],
 'CCU70451.1': ['DNA polymerase III subunit beta'],
 'CCU70452.1': ['DNA replication and repair protein RecF'],
 'CCU70453.1': ['DNA gyrase subunit B'],
 'CCU70454.1': ['hypothetical protein'],
 'CCU70455.1': ['hypothetical protein'],
 'CCU70456.1': ['hypothetical protein'],
 'CCU70457.1': ['hypothetical protein'],
 'CCU70458.1': ['hypothetical protein'],
 'CCU70459.1': ['hypothetical protein'],
 'CCU70460.1': ['transposase orfA'],
 'CCU70461.1': ['transposase orfB'],
 'CCU70462.1': ['hypothetical protein'],
 'CCU70463.1': ['metal-dependent hydrolase'],
 'CCU70464.1': ['small-conductance mechanosensitive channel-like protein'],
 'CCU70465.1': ['hypothetical protein'],
 'CCU70466.1': ['acyltransferase'],
 'CCU70467.1': ['histidinol phosphatase'],
 'CCU70468.1': ['glycyl-tRNA synthetase, beta subunit'],
 'CCU70469.1': ['glycyl-tRNA synthetase, alpha subunit'],
 'CCU70470.1': ['spermidine synthase'],
 'CCU70471.1': ['hypothetic

More on hypothetical proteins:

https://en.wikipedia.org/wiki/Hypothetical_protein

Example of a hypothetical protein: https://www.ncbi.nlm.nih.gov/protein/CCU71441.1

Not hypothetical: https://www.ncbi.nlm.nih.gov/protein/461491448


In [None]:
def read_hits(file_path):
    names = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore']
    return pd.read_csv(file_path, sep='\t', header=None, names=names)

def filter_hits(hits, substring):
    return (hits[hits['sseqid'].str.contains(substring)]
              .sort_values('bitscore', ascending=False)
              .drop_duplicates('qseqid'))

mil_hits = (read_hits('scaffolds.hits_from_MIL_1.txt')
            .pipe(filter_hits, substring="CCU"))


In [None]:
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

sorted_mil_hits = mil_hits.sort_index()

print(sorted_mil_hits)

       qseqid                               sseqid   pident  length  mismatch  gapopen  qstart  qend  sstart  send         evalue  bitscore
0           1  lcl|HF680312.1_prot_CCU73181.1_2732   97.319     373        10        0       1   373     174   546   0.000000e+00     751.0
7           2  lcl|HF680312.1_prot_CCU73182.1_2733   99.346     153         1        0       1   153       1   153  1.340000e-111     310.0
8           3  lcl|HF680312.1_prot_CCU73183.1_2734  100.000      94         0        0       1    94       1    94   8.690000e-68     194.0
9           4  lcl|HF680312.1_prot_CCU73181.1_2732   98.758     161         2        0       1   161      45   205  3.970000e-116     336.0
15          5  lcl|HF680312.1_prot_CCU73181.1_2732   91.034     145        10        2       1   145     229   370   8.210000e-82     253.0
...       ...                                  ...      ...     ...       ...      ...     ...   ...     ...   ...            ...       ...
10433    3614  lcl|H

In [None]:
def update_scaffold(mil_hit):
    qseqid = str(mil_hit['qseqid'])
    if qseqid in scaffolds:
        sseqid_parts = mil_hit['sseqid'].split('_')
        product = mil_fs[sseqid_parts[2]]
        scaffolds[qseqid].qualifiers['product'] = product

mil_hits.apply(update_scaffold, axis=1)

Unnamed: 0,0
8495,
9902,
4811,
8273,
9665,
...,...
5581,
3723,
10268,
465,


In [None]:
!wget https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.dat.gz
!gzip -d uniprot_sprot.dat.gz
!grep '^ID\|^DE   RecName: Full=' uniprot_sprot.dat > SwissProtNames.txt

In [None]:
!head SwissProtNames.txt

ID   001R_FRG3G              Reviewed;         256 AA.
DE   RecName: Full=Putative transcription factor 001R;
ID   002L_FRG3G              Reviewed;         320 AA.
DE   RecName: Full=Uncharacterized protein 002L;
ID   002R_IIV3               Reviewed;         458 AA.
DE   RecName: Full=Uncharacterized protein 002R;
ID   003L_IIV3               Reviewed;         156 AA.
DE   RecName: Full=Uncharacterized protein 003L;
ID   003R_FRG3G              Reviewed;         438 AA.
DE   RecName: Full=Uncharacterized protein 3R;


In [None]:
def load_swissprot(file_path):
    x = None
    swissprot = dict()
    with open(file_path) as file:
        for line in file:
            if line.startswith('ID'):
                x = line.split()[1]
            elif line.startswith('DE'):
                swissprot[x] = line.split('=')[1][:-2]
    return swissprot

def update_scaffold_swiss(swiss_hit, scaffolds, swissprot):
    qseqid = str(swiss_hit['qseqid'])
    if qseqid in scaffolds:
        sseqid = swiss_hit['sseqid'].split('|')[-1]
        scaffolds[qseqid].qualifiers['product'] = swissprot.get(sseqid, None)
names = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore'] #from BLAST specification
swissprot = load_swissprot('SwissProtNames.txt')
swiss_hits = pd.read_csv('scaffolds.hits_from_SwissProt.txt', sep='\t', header=None, names=names)
swiss_hits = swiss_hits.sort_values('bitscore', ascending=False).drop_duplicates('qseqid')
swiss_hits.apply(update_scaffold_swiss, axis=1, scaffolds=scaffolds, swissprot=swissprot)

Unnamed: 0,0
323,
585,
726,
673,
665,
742,
317,
688,
320,
606,


In [None]:
print(swiss_hits) #54 proteins, just like in the other notebook with data preparation

     qseqid                    sseqid  pident  length  mismatch  gapopen  qstart  qend  sstart  send         evalue  bitscore
323     645      sp|Q07736|T1RA_ECOLX  57.876     838       314        9       1   836       4   804   0.000000e+00     976.0
585    2611      sp|P08956|T1RK_ECOLI  40.701    1199       642       23       8  1171       5  1169   0.000000e+00     837.0
726    3028     sp|C7DLJ6|OLHYD_ELIME  62.258     620       223        3       7   623       6   617   0.000000e+00     809.0
673    3000      sp|Q9X447|YACK_RHIML  52.975     706       320        5      27   725      16   716   0.000000e+00     754.0
665    2997      sp|P0AGH1|YHHJ_ECOLI  65.684     373       128        0       4   376       2   374  1.490000e-170     483.0
742    3033      sp|P76113|CURA_ECOLI  65.306     343       119        0       4   346       3   345  1.500000e-170     481.0
317     186      sp|P23840|DIND_ECOLI  77.323     269        61        0       1   269       1   269  3.380000e-157   

Let's look up some of the proteins, e.g. T1RA_ECOLX is a ferment https://www.uniprot.org/uniprotkb/Q07736/entry

In [None]:
SeqIO.write(scaffolds.values(), "GENOME.gbk", "genbank")


83

In [None]:
!head -n 54 GENOME.gbk 
#genome was annotated normally, although in principle there could be more fields and details in the genbank format, you can see what fields there are here https://www.ncbi.nlm.nih.gov/genbank/samplerecord/
#other formats exist
#we see fields with information about coding sequences and amino acid sequences after translation

LOCUS       scaffold1_cov248        4682 bp    DNA              UNK 01-JAN-1980
DEFINITION  scaffold1_cov248.
ACCESSION   scaffold1_cov248
VERSION     scaffold1_cov248
KEYWORDS    .
SOURCE      .
  ORGANISM  .
            .
FEATURES             Location/Qualifiers
     CDS             3..1123
                     /locus_tag="1"
                     /translation="FVKMQHAARALQAAWESEDFSKTYSETYVKGEKRELVEALGFDPT
                     QITEQQIDEAMAMANLVIEDPSLRDMLYQFVKDYAEAQHAIEITNVAGTAAFELILTII
                     MAAVTGGVGAVAAIGSKAHLIKKFQKVGDLLSDFAKATRKLKLQGKKRKAKGNSAKFSN
                     FDTDEIQAKKTDAHGAETGPKNSANATVPKSVPMTQEKYDEIINLERGNRPNDVGEYLS
                     KDYVDSHLNKFKEEGGAFIVIEEWISAPEYTSFPKDGKFVGLSSEMDQVVTKYKDSGGD
                     WRVLRDELNLGENTDLSSAKISYVKLSPNDPSFEYSMPNGNERGAYEHEWVPGGLTKSG
                     TSEATLSGGDRIIHNNNVDNLKKNSGLVVEPLQ"
     CDS             1121..1581
                     /locus_tag="2"
                     /translation="MNKNLFIKALNRNETKDFFLGNGNY

Genes predicted: 3618

Of these, annotated using comparison with the bacterium MIL-1: 3330

Of these, annotated using the SwissProt DB: 54

Functions left unannotated: 288

In [None]:
!apt-get update && apt-get install ncbi-blast+

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
0% [Connecting to archive.ubuntu.com] [1 InRelease 14.2 kB/129 kB 11%] [Connected to cloud.r-project                                                                                                    Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
0% [Connecting to archive.ubuntu.com (185.125.190.82)] [1 InRelease 51.8 kB/129 kB 40%] [2 InRelease0% [Connecting to archive.ubuntu.com (185.125.190.82)] [1 InRelease 88.0 kB/129 kB 68%] [Connected t0% [Connecting to archive.ubuntu.com (185.125.190.82)] [Waiting for headers] [Waiting for headers] [                                                                                                    Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Ign:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:5 https://r2u.stat.illinois.edu/ubuntu jammy Releas

In [None]:
!$HOME/edirect/efetch -db nuccore -id HF680312 -format gene_fasta  >  T_oleivorans_MIL_1.genes.fasta

In [None]:
!head T_oleivorans_MIL_1.genes.fasta

>lcl|HF680312.1_gene_1 [gene=dnaA] [locus_tag=TOL_0001] [location=35..1624] [gbkey=Gene]
ATGACGCTAGTAACAGGGGTAACCGTGACGGGTTTGTGGCAAGACTGCCTTGGGCATTTACAGCATGAGT
TGCCTGCGCAGCAATACAACACATGGATCAGACCTTTGATCGCGAGTACGGAAAACGGTCAGTTAGTTCT
GAGCGCCCCTAATCGCTTTGTCAAAGATTGGGTTAAAGACAAGTATCTGCAGCGTATTCAAGAAATCTTA
TCCGAACTTAACGGCGGTCGTATCACTCATGTCGATGTGACTGCGGGTGAGTCGCGTCCTATGTTTAGTC
CTCAGGCTGCGCCAAGACCAGAACCACGGCCTGCGGCGTCGTCGGTTGAAGGATTTGCTTTTGCTGCGCC
TCGAGTGGAAGCTGAAGAACCAACCAGTACTTTTTCACCGATTGCATCTTCACCGTTAAAAGAATCACCT
TCTACTAATAACAACAATGAGTTTGGACGTCAGTCTTCTTCTAATCTCATTTTGCCTGGTCAGGCTTCTT
TTAATACTGATCCTATGCCGTCGGCTCCGGTTTCTAATAAACCGAAACGTAATGTGCAGGTTGAAGGTGG
TATTCAGCATCAGAGTTTCTTGAATTCGACATTTACCTTTAAAACCTTTGTTGAAGGTAAATCAAACCAG


In [None]:
def read_genbank_features(genbank_file):
    records = SeqIO.read(genbank_file, "genbank").features
    positive_strand_rRNA = []
    negative_strand_rRNA = []

    for feature in records:
        if feature.type == 'rRNA':
            if feature.location.strand == 1:
                positive_strand_rRNA.append((feature.location.start, feature.location.end))
            else:
                negative_strand_rRNA.append((feature.location.start, feature.location.end))

    return positive_strand_rRNA, negative_strand_rRNA

def read_genome_sequence(fasta_file):
    genome_sequence = ''
    for seq_record in SeqIO.parse(fasta_file, "fasta"):
        genome_sequence += str(seq_record.seq)
    return genome_sequence

def write_rRNA_to_fasta(output_file, positive_strand_rRNA, negative_strand_rRNA, genome_sequence):
    with open(output_file, 'w') as file:
        for start, end in positive_strand_rRNA:
            file.write(f'> rRNA {start} {end}\n{genome_sequence[start:end]}\n')
        negative_strand_genome = Seq(genome_sequence).reverse_complement()
        for start, end in negative_strand_rRNA:
            file.write(f'> rRNA {start} {end}\n{negative_strand_genome[start:end]}\n')

In [None]:
genbank_file = "T_oleivorans_MIL_1.gbk"
fasta_file = "T_oleivorans_MIL_1.genes.fasta"
output_file = "rRNA.fasta"

In [None]:
positive_strand_rRNA, negative_strand_rRNA = read_genbank_features(genbank_file)
genome_sequence = read_genome_sequence(fasta_file)
write_rRNA_to_fasta(output_file, positive_strand_rRNA, negative_strand_rRNA, genome_sequence)

In [None]:
print(positive_strand_rRNA)

[(ExactPosition(341494), ExactPosition(343033)), (ExactPosition(343487), ExactPosition(346374)), (ExactPosition(346568), ExactPosition(346684))]


In [None]:
print(negative_strand_rRNA)

[(ExactPosition(2580484), ExactPosition(2580600)), (ExactPosition(2580781), ExactPosition(2583668)), (ExactPosition(2583918), ExactPosition(2585457)), (ExactPosition(3418645), ExactPosition(3418761)), (ExactPosition(3418942), ExactPosition(3421829)), (ExactPosition(3422079), ExactPosition(3423618)), (ExactPosition(3423993), ExactPosition(3424109)), (ExactPosition(3424290), ExactPosition(3427177)), (ExactPosition(3427427), ExactPosition(3428966))]


In [None]:
!cat rRNA.fasta

> rRNA 341494 343033
TGCCAACACACAAAATAGAAAGCACAAGCGAAGTAGGCATTGACCCTGACTGGATGGAAGCCATGGCATTTGCATGGCTAGGCTGGCGTACGATAAATGGTCTGCCGGGAAATATTCCGAGTGTTACCGGTGCCGCTGGTGAGCGAATTTTAGGTGGGATCTACAAGGCCTAGAACAGGCTTTTAGAACAATGCATAACCTTCTTTTTGCAGTCGTTCGACTTCAAACTGACCTGAGACTACGCTGTCAATGAACGGCGGAAGGTCGGTCATTTCGATACCGCGGTCTCCAAGCCAACGAGAACAAACTTTGATATCGATGATATTGAAGGCTTCTAATCGTGAGGGTGCTGAACATACCTGATCACAATTCAAAATCAAGTTCACCGAGTAAATTGCCGACGGATTGTTTACACTTGCCGGCTATATAGAAGAGAAGCTGAATAAGGACACCTGTTTCATCATGGCGTTGCCAGGCCCACTACAGTATTTTCCAGCCAGACATATTTTCGCTGCGGCGCTGGTTGTGTTTTTAGTAATCGTCGTCGCCGCATGGCCGACTCCCGAATCCAACATAACTCAGCAAAACTATGTTATAGAGTTGCCCGAGCCAAGTGAGGAGCCGATGCCGGATACTCCGCAACTGGACTGGGAGGAAGACAAAGTTAAATCCGGTGATAGTTTATCGGTGGTTTTTGGGCGGCATAATTTAAGTGCGGTTGATGTCATTGATATTGCAGCGGCAGTCCCGCGCGAAGTCATTACTTTACGTCCAGGGCAGAGCCTTCGTTGGGTCCGTACTGCCGATAACCACATTACTCACCTAGAAATAGATATTAGCCCTCTTGCCAAACACAGCATTACTCGCGATGCAGAAGGCAAATTGCAATATGAGCTACTGCAACGCGATGCTGACTTTATTCCGCGCTTTGCTCACGCCGTTATCGACAACTCGCTCTTCCTTGATGGAGGACGCGCAG

In [None]:
!blastn -query rRNA.fasta -subject scaffolds.fasta > blastn_out.gbk

In [None]:
!cat blastn_out.gbk #check the percentage of similarity in identities, and other information

BLASTN 2.12.0+


Reference: Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb
Miller (2000), "A greedy algorithm for aligning DNA sequences", J
Comput Biol 2000; 7(1-2):203-14.



Database: User specified sequence set (Input: scaffolds.fasta).
           83 sequences; 3,915,504 total letters



Query= rRNA 341494 343033

Length=1539
                                                                      Score     E
Sequences producing significant alignments:                          (Bits)  Value

scaffold3_cov273                                                      2150    0.0  


> scaffold3_cov273
Length=3871046

 Score = 2150 bits (1164),  Expect = 0.0
 Identities = 1186/1197 (99%), Gaps = 0/1197 (0%)
 Strand=Plus/Minus

Query  343      GAGGGTGCTGAACATACCTGATCACAATTCAAAATCAAGTTCACCGAGTAAATTGCCGAC  402
                ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
Sbjct  3869284  GAGGGTGCTGAACATACCTGATCACAATTCAAAATCAAGTTCACCGAGTAAATTGCCGAC  3869225

Query  403      GGA

In [None]:
import re
def add_blast_annotations(genbank_file, blast_output, output_file):
    records = SeqIO.parse(genbank_file, "genbank")
    updated_records = []
    with open(blast_output, 'r') as blast_file:
        for line in blast_file:
            if line.startswith("Query="):
                match = re.search(r'(\d+)\s+(\d+)', line)
                if match:
                    query_start = int(match.group(1))
                    query_end = int(match.group(2))

            elif line.startswith("Score ="):
                # extracting match information
                scaffold = line.split()[1]
                score = line.split('=')[1].split()[0]
                e_value = line.split('=')[2].split()[0]
                identities = line.split('=')[3].strip()
                feature = SeqFeature(
                    location=FeatureLocation(start=query_start - 1, end=query_end),
                    type="match",
                    qualifiers={
                        "score": score,
                        "e_value": e_value,
                        "identities": identities,
                        "note": f"BLAST match with {scaffold}"
                    }
                )

                for record in records:
                    record.features.append(feature)
                    updated_records.append(record)

    SeqIO.write(updated_records, output_file, "genbank")


In [None]:
genbank_file = "GENOME.gbk"
output_file = "T_oleivorans_Barents_updated.gbk"
blast_output = "blastn_out.gbk"
add_blast_annotations(genbank_file, blast_output, output_file) #fix
