In [1]:
from Bio.SeqRecord import SeqRecord

# Print the class's built-in documentation: the constructor signature followed
# by its main and additional attributes.
help(SeqRecord)

Help on class SeqRecord in module Bio.SeqRecord:

class SeqRecord(builtins.object)
 |  SeqRecord(seq: Union[ForwardRef('Seq'), ForwardRef('MutableSeq'), NoneType], id: str | None = '<unknown id>', name: str = '<unknown name>', description: str = '<unknown description>', dbxrefs: list[str] | None = None, features: list['SeqFeature'] | None = None, annotations: dict[str, typing.Union[str, int]] | None = None, letter_annotations: dict[str, collections.abc.Sequence[typing.Any]] | None = None) -> None
 |
 |  A SeqRecord object holds a sequence and information about it.
 |
 |  Main attributes:
 |   - id          - Identifier such as a locus tag (string)
 |   - seq         - The sequence itself (Seq object or similar)
 |
 |  Additional attributes:
 |   - name        - Sequence name, e.g. gene name (string)
 |   - description - Additional text (string)
 |   - dbxrefs     - List of database cross references (list of strings)
 |   - features    - Any (sub)features defined (list of SeqFeature obj

### The SeqRecord object

This class allows higher-level information, such as identifiers and sequence features, to be associated with a sequence.

The SeqRecord class itself is quite simple, and offers the following information as attributes:

`.seq` => The sequence itself, typically a `Seq` object.

`.id` => The primary ID used to identify the sequence – a string. In most cases this is something like an accession number.

`.name` => A “common” name/id for the sequence – a string. In some cases this will be the same as the accession number, but it could also be a clone name. I think of this as being analogous to the LOCUS id in a GenBank record.

`.description` => A human-readable description or expressive name for the sequence – a string.

`.letter_annotations` => Holds per-letter annotations using a (restricted) dictionary of additional information about the letters in the sequence. The keys are the name of the information, and the information is contained in the value as a Python sequence (i.e. a list, tuple or string) with the same length as the sequence itself. This is often used for quality scores or secondary structure information (e.g. from Stockholm/PFAM alignment files).

`.annotations` => A dictionary of additional information about the sequence. The keys are the name of the information, and the information is contained in the value. This allows the addition of more “unstructured” information to the sequence.

`.features` => A list of `SeqFeature` objects with more structured information about the features on a sequence (e.g. position of genes on a genome, or domains on a protein sequence).

`.dbxrefs` => A list of database cross-references as strings.


### Creating a SeqRecord

Usually you won’t create a `SeqRecord` “by hand”, but will instead use `Bio.SeqIO` to read in a sequence file for you.

#### SeqRecord objects from scratch

To create a `SeqRecord`, at a minimum you just need a `Seq` object.

In [None]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# The sequence is the only argument supplied here; id, name and description
# are left at their "<unknown ...>" placeholder defaults.
simple_seq = Seq("GATC")
simple_seq_r = SeqRecord(seq=simple_seq)

In [3]:
simple_seq_r

SeqRecord(seq=Seq('GATC'), id='<unknown id>', name='<unknown name>', description='<unknown description>', dbxrefs=[])

In [4]:
# The record's fields are ordinary attributes and can be assigned after
# construction.
simple_seq_r.id = "AC12345"
simple_seq_r.description = "Made up sequence I wish I could write a paper about"
print(simple_seq_r.description)

Made up sequence I wish I could write a paper about


In [6]:
simple_seq_r.seq

Seq('GATC')

In [7]:
# Including an identifier is very important if you want to write the
# SeqRecord to a file, so it is normally passed when the object is created
# rather than assigned afterwards.

simple_seq = Seq("GATC")
simple_seq_r = SeqRecord(simple_seq, id="AC12345")

In [8]:
# Arbitrary annotations can be attached to the SeqRecord through its
# `annotations` dictionary.

simple_seq_r.annotations["evidence"] = "None. I just made it up."
simple_seq_r.annotations

{'evidence': 'None. I just made it up.'}

#### SeqRecord objects from FASTA files

In [1]:
from Bio import SeqIO

# Load the FASTA file as a SeqRecord and display it. In the output, id and
# name are identical and the description starts with that same identifier.
record = SeqIO.read("NC_005816.fna", "fasta")
record

SeqRecord(seq=Seq('TGTAACGAACGGTGCAATAGTGATCCACACCCAACGCCTGAAATCAGATCCAGG...CTG'), id='gi|45478711|ref|NC_005816.1|', name='gi|45478711|ref|NC_005816.1|', description='gi|45478711|ref|NC_005816.1| Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1, complete sequence', dbxrefs=[])

In [2]:
record.seq

Seq('TGTAACGAACGGTGCAATAGTGATCCACACCCAACGCCTGAAATCAGATCCAGG...CTG')

In [3]:
record.id

'gi|45478711|ref|NC_005816.1|'

In [4]:
record.name

'gi|45478711|ref|NC_005816.1|'

In [5]:
record.description

'gi|45478711|ref|NC_005816.1| Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1, complete sequence'

#### SeqRecord objects from GenBank files

In [1]:
from Bio import SeqIO

# Load the GenBank file as a SeqRecord and display it. The format name "gb"
# is used here; later cells pass "genbank" for the same file.
record = SeqIO.read("NC_005816.gb", "gb")
record

SeqRecord(seq=Seq('TGTAACGAACGGTGCAATAGTGATCCACACCCAACGCCTGAAATCAGATCCAGG...CTG'), id='NC_005816.1', name='NC_005816', description='Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1, complete sequence', dbxrefs=['Project:58037'])

In [2]:
record.seq

Seq('TGTAACGAACGGTGCAATAGTGATCCACACCCAACGCCTGAAATCAGATCCAGG...CTG')

In [3]:
record.id

'NC_005816.1'

In [4]:
record.name

'NC_005816'

In [10]:
record.description

'Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1, complete sequence'

In [5]:
len(record.annotations)

13

In [7]:
len(record.annotations.keys())

13

In [12]:
record.annotations["source"]

'Yersinia pestis biovar Microtus str. 91001'

In [13]:
record.dbxrefs

['Project:58037']

Finally, and perhaps most interestingly, all the entries in the features table (e.g. the genes or CDS features) get recorded as `SeqFeature` objects in the `features` list.

In [15]:
len(record.features)

41

In [13]:
record.annotations['references']

[Reference(title='Genetics of metabolic variations between Yersinia pestis biovars and the proposal of a new biovar, microtus', ...),
 Reference(title='Complete genome sequence of Yersinia pestis strain 91001, an isolate avirulent to humans', ...),
 Reference(title='Direct Submission', ...),
 Reference(title='Direct Submission', ...)]

### Feature, location and position objects

#### SeqFeature objects
Sequence features are an essential part of describing a sequence.
They provide a way to organize and easily get at the more “abstract” information that is known about the sequence.
The key idea about each `SeqFeature` object is to describe a region on a parent sequence, typically a `SeqRecord` object. That region is described with a location object, typically a range between two positions.

<img width="800" src="seq_feature.png" alt="SeqFeature-Attributes"/>

#### Positions and locations
The key idea about each `SeqFeature` object is to describe a region on a parent sequence, for which we use a location object, typically describing a range between two positions.

In [6]:
from Bio import SeqIO

# Position of interest, given as a Python-style index into the sequence.
my_snp = 4350
record = SeqIO.read("NC_005816.gb", "genbank")

# `position in feature` is a membership test against the feature's location,
# so this lists every feature that covers the SNP together with its database
# cross-references (None when the feature has no "db_xref" qualifier).
for feature in record.features:
    if my_snp in feature:
        db_xref = feature.qualifiers.get("db_xref")
        print(f"{feature.type} {db_xref}")

source ['taxon:229193']
gene ['GeneID:2767712']
CDS ['GI:45478716', 'GeneID:2767712']


In [3]:
record.features

[SeqFeature(SimpleLocation(ExactPosition(0), ExactPosition(9609), strand=1), type='source', qualifiers=...),
 SeqFeature(SimpleLocation(ExactPosition(0), ExactPosition(1954), strand=1), type='repeat_region'),
 SeqFeature(SimpleLocation(ExactPosition(86), ExactPosition(1109), strand=1), type='gene', qualifiers=...),
 SeqFeature(SimpleLocation(ExactPosition(86), ExactPosition(1109), strand=1), type='CDS', qualifiers=...),
 SeqFeature(SimpleLocation(ExactPosition(86), ExactPosition(959), strand=1), type='misc_feature', qualifiers=...),
 SeqFeature(SimpleLocation(BeforePosition(110), ExactPosition(209), strand=1), type='misc_feature', qualifiers=...),
 SeqFeature(SimpleLocation(ExactPosition(437), ExactPosition(812), strand=1), type='misc_feature', qualifiers=...),
 SeqFeature(SimpleLocation(ExactPosition(1105), ExactPosition(1888), strand=1), type='gene', qualifiers=...),
 SeqFeature(SimpleLocation(ExactPosition(1105), ExactPosition(1888), strand=1), type='CDS', qualifiers=...),
 SeqFeatu

In [5]:
# Take the first entry of the feature list; the output shows it is the
# 'source' feature spanning [0:9609].
f = record.features[0]
f

SeqFeature(SimpleLocation(ExactPosition(0), ExactPosition(9609), strand=1), type='source', qualifiers=...)

#### Sequence described by a feature or location

In [11]:
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, SimpleLocation

# Parent sequence that the feature's location refers to.
seq = Seq("ACCGAGACGGCAAAGGCTAGCATAGGTATGAGACTTCCTTCCTGCCAGTGCTGAGGAACTGGGAGCCTAC")
# A "gene" feature located at [5:18] with strand=-1.
feature = SeqFeature(SimpleLocation(5, 18, strand=-1), type="gene")

In [15]:
# Manual extraction: slice the parent sequence by the location's start and
# end, then reverse-complement the slice because the feature was built with
# strand=-1.
feature_seq = seq[feature.location.start : feature.location.end].reverse_complement()
feature_seq

Seq('AGCCTTTGCCGTC')

In [17]:
# Recommended alternative: let the feature extract its own region from the
# parent sequence. The output is the same as slicing by start/end and
# reverse-complementing by hand.

feature_seq = feature.extract(seq)
feature_seq

Seq('AGCCTTTGCCGTC')

### Comparison

In [1]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# Two independently constructed records with the same sequence and the same id.
record1 = SeqRecord(Seq("ACGT"), id="test")
record2 = SeqRecord(Seq("ACGT"), id="test")

In [19]:
# Comparing two SeqRecord objects with == raises NotImplementedError on
# purpose. Catch it so the notebook still runs top to bottom, and print the
# message instead of leaving a traceback.
try:
    record1 == record2
except NotImplementedError as err:
    print(f"{type(err).__name__}: {err}")

NotImplementedError: SeqRecord comparison is deliberately not implemented. Explicitly compare the attributes of interest.

In [2]:
record1.seq == record2.seq

True

In [5]:
record1.id == record2.id

True

### The format method

In [24]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# Build a protein record by hand. Python joins the adjacent string literals
# into a single sequence string before it reaches Seq().
record = SeqRecord(
    seq=Seq(
        "MMYQQGCFAGGTVLRLAKDLAENNRGARVLVVCSEITAVTFRGPSETHLDSMVGQALFGD"
        "GAGAVIVGSDPDLSVERPLYELVWTGATLLPDSEGAIDGHLREVGLTFHLLKDVPGLISK"
        "NIEKSLKEAFTPLGISDWNSTFWIAHPGGPAILDQVEAKLGLKEEKMRATREVLSEYGNM"
        "SSAC"
    ),
    id="gi|14150838|gb|AAK54648.1|AF376133_1",
    description="chalcone synthase [Cucumis sativus]",
)

# Render the record as FASTA text and print it.
print(record.format("fasta"))

>gi|14150838|gb|AAK54648.1|AF376133_1 chalcone synthase [Cucumis sativus]
MMYQQGCFAGGTVLRLAKDLAENNRGARVLVVCSEITAVTFRGPSETHLDSMVGQALFGD
GAGAVIVGSDPDLSVERPLYELVWTGATLLPDSEGAIDGHLREVGLTFHLLKDVPGLISK
NIEKSLKEAFTPLGISDWNSTFWIAHPGGPAILDQVEAKLGLKEEKMRATREVLSEYGNM
SSAC



### Slicing a SeqRecord

In [27]:
from Bio import SeqIO

# Reload the full GenBank record; it is the parent that gets sliced in this
# section.
record = SeqIO.read("NC_005816.gb", "genbank")
record

SeqRecord(seq=Seq('TGTAACGAACGGTGCAATAGTGATCCACACCCAACGCCTGAAATCAGATCCAGG...CTG'), id='NC_005816.1', name='NC_005816', description='Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1, complete sequence', dbxrefs=['Project:58037'])

In [36]:
len(record)

9609

In [33]:
len(record.features)

41

In [31]:
print(record.features[20])

type: gene
location: [4342:4780](+)
qualifiers:
    Key: db_xref, Value: ['GeneID:2767712']
    Key: gene, Value: ['pim']
    Key: locus_tag, Value: ['YP_pPCP05']



In [32]:
print(record.features[21])

type: CDS
location: [4342:4780](+)
qualifiers:
    Key: codon_start, Value: ['1']
    Key: db_xref, Value: ['GI:45478716', 'GeneID:2767712']
    Key: gene, Value: ['pim']
    Key: locus_tag, Value: ['YP_pPCP05']
    Key: note, Value: ['similar to many previously sequenced pesticin immunity protein entries of Yersinia pestis plasmid pPCP, e.g. gi| 16082683|,ref|NP_395230.1| (NC_003132) , gi|1200166|emb|CAA90861.1| (Z54145 ) , gi|1488655| emb|CAA63439.1| (X92856) , gi|2996219|gb|AAC62543.1| (AF053945) , and gi|5763814|emb|CAB531 67.1| (AL109969)']
    Key: product, Value: ['pesticin immunity protein']
    Key: protein_id, Value: ['NP_995571.1']
    Key: transl_table, Value: ['11']
    Key: translation, Value: ['MGGGMISKLFCLALIFLSSSGLAEKNTYTAKDILQNLELNTFGNSLSHGIYGKQTTFKQTEFTNIKSNTKKHIALINKDNSWMISLKILGIKRDEYTVCFEDFSLIRPPTYVAIHPLLIKKVKSGNFIVVKEIKKSIPGCTVYYH']



In [37]:
# Slicing a SeqRecord returns a new SeqRecord. For this slice the outputs
# show 2 of the parent's 41 features kept, with their coordinates shifted to
# the slice (gene at [42:480]), while dbxrefs come back empty.
sub_record = record[4300:4800]
sub_record

SeqRecord(seq=Seq('ATAAATAGATTATTCCAAATAATTTATTTATGTAAGAACAGGATGGGAGGGGGA...TTA'), id='NC_005816.1', name='NC_005816', description='Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1, complete sequence', dbxrefs=[])

In [38]:
len(sub_record)

500

In [39]:
len(sub_record.features)

2

In [40]:
print(sub_record.features[0])

type: gene
location: [42:480](+)
qualifiers:
    Key: db_xref, Value: ['GeneID:2767712']
    Key: gene, Value: ['pim']
    Key: locus_tag, Value: ['YP_pPCP05']



In [41]:
print(sub_record.features[1])

type: CDS
location: [42:480](+)
qualifiers:
    Key: codon_start, Value: ['1']
    Key: db_xref, Value: ['GI:45478716', 'GeneID:2767712']
    Key: gene, Value: ['pim']
    Key: locus_tag, Value: ['YP_pPCP05']
    Key: note, Value: ['similar to many previously sequenced pesticin immunity protein entries of Yersinia pestis plasmid pPCP, e.g. gi| 16082683|,ref|NP_395230.1| (NC_003132) , gi|1200166|emb|CAA90861.1| (Z54145 ) , gi|1488655| emb|CAA63439.1| (X92856) , gi|2996219|gb|AAC62543.1| (AF053945) , and gi|5763814|emb|CAB531 67.1| (AL109969)']
    Key: product, Value: ['pesticin immunity protein']
    Key: protein_id, Value: ['NP_995571.1']
    Key: transl_table, Value: ['11']
    Key: translation, Value: ['MGGGMISKLFCLALIFLSSSGLAEKNTYTAKDILQNLELNTFGNSLSHGIYGKQTTFKQTEFTNIKSNTKKHIALINKDNSWMISLKILGIKRDEYTVCFEDFSLIRPPTYVAIHPLLIKKVKSGNFIVVKEIKKSIPGCTVYYH']



In [42]:
sub_record.annotations

{'molecule_type': 'DNA'}

In [None]:
# The slice's annotations contain only 'molecule_type', so the topology is
# set explicitly here before the record is formatted as GenBank.
sub_record.annotations["topology"] = "linear"

In [46]:
# The slice inherited the parent's id ('NC_005816.1'); give it a different one.
sub_record.id = "NC_005816.2"
sub_record.id

'NC_005816.2'

In [50]:
# Mark the record as partial in its description, then render it as GenBank text.
sub_record.description = "Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1, partial"
# NOTE: `seq` holds the formatted GenBank text returned by format(), not a Seq.
seq = sub_record.format("genbank")

In [51]:
print(seq)

LOCUS       NC_005816                500 bp    DNA     linear   UNK 01-JAN-1980
DEFINITION  Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1, partial.
ACCESSION   NC_005816
VERSION     NC_005816.2
KEYWORDS    .
SOURCE      .
  ORGANISM  .
            .
FEATURES             Location/Qualifiers
     gene            43..480
                     /gene="pim"
                     /locus_tag="YP_pPCP05"
                     /db_xref="GeneID:2767712"
     CDS             43..480
                     /gene="pim"
                     /locus_tag="YP_pPCP05"
                     /note="similar to many previously sequenced pesticin
                     immunity protein entries of Yersinia pestis plasmid pPCP,
                     e.g. gi| 16082683|,ref|NP_395230.1| (NC_003132) ,
                     gi|1200166|emb|CAA90861.1| (Z54145 ) , gi|1488655|
                     emb|CAA63439.1| (X92856) , gi|2996219|gb|AAC62543.1|
                     (AF053945) , and gi|5763814|emb|CAB531 67.1| (AL

In [54]:
# Preview only the first four header lines (LOCUS, DEFINITION, ACCESSION,
# VERSION). Selecting by line count instead of a hard-coded character offset
# keeps the preview correct if the description or id changes length.
print("\n".join(seq.splitlines()[:4]) + "\n...")

LOCUS       NC_005816                500 bp    DNA     linear   UNK 01-JAN-1980
DEFINITION  Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1, partial.
ACCESSION   NC_005816
VERSION     NC_005816.2
...


### Adding SeqRecord objects

In [57]:
from Bio import SeqIO

# next() pulls a single item from SeqIO.parse, so only the first read of the
# FASTQ file is used.
record = next(SeqIO.parse("example.fastq", "fastq"))
record

SeqRecord(seq=Seq('GTTGTACTTCGTTCAATCGGTAGGTGTTTAACCGGATGGTCACGCCTACCGTGA...GTC'), id='ee15a423-b008-44be-a4b2-5441d11b0b94', name='ee15a423-b008-44be-a4b2-5441d11b0b94', description='ee15a423-b008-44be-a4b2-5441d11b0b94 runid=fa1d76e661ac2bbb53a002e85e75a30e91827c51 sampleid=1 read=5087 ch=53 start_time=2019-10-18T22:14:05Z', dbxrefs=[])

In [58]:
len(record)

192

In [59]:
print(record.seq)

GTTGTACTTCGTTCAATCGGTAGGTGTTTAACCGGATGGTCACGCCTACCGTGACAAAGAGATTGTCGGTGTCTTTGTGTTTCTGTTGGTGCTGATATTGCATTATGCATGAACGTAATGCCCATTAGTTGTGAATCCACCATGCGCGGAAGATAGAGCGACAGGCAAGTCACAAAGACACCGACAACTGTC


In [62]:
print(record.letter_annotations["phred_quality"])

[2, 2, 3, 5, 3, 5, 14, 15, 18, 20, 23, 23, 16, 7, 8, 6, 3, 15, 5, 9, 7, 6, 12, 13, 28, 26, 21, 23, 20, 7, 8, 3, 13, 4, 7, 3, 6, 4, 4, 5, 2, 5, 8, 10, 13, 13, 15, 11, 5, 10, 5, 4, 13, 12, 14, 10, 11, 4, 5, 7, 8, 3, 18, 25, 15, 5, 31, 15, 24, 33, 37, 28, 29, 34, 34, 23, 7, 22, 23, 15, 17, 24, 37, 22, 28, 27, 28, 8, 10, 31, 10, 13, 21, 34, 34, 37, 37, 34, 31, 12, 23, 4, 17, 20, 22, 24, 27, 33, 23, 26, 23, 23, 19, 16, 17, 16, 18, 19, 11, 11, 26, 25, 23, 13, 14, 11, 2, 16, 2, 5, 7, 4, 7, 7, 15, 24, 26, 33, 28, 30, 30, 19, 23, 27, 28, 27, 31, 22, 24, 9, 12, 25, 33, 20, 19, 15, 11, 23, 28, 33, 28, 19, 19, 19, 25, 27, 20, 22, 16, 12, 33, 20, 28, 36, 35, 17, 13, 20, 21, 26, 16, 16, 15, 13, 20, 10, 11, 9, 8, 4, 4, 9]


In [64]:
# Keep the first 20 letters of the read (indices 0-19).
left = record[:20]
left.seq

Seq('GTTGTACTTCGTTCAATCGG')

In [65]:
left.letter_annotations["phred_quality"]

[2, 2, 3, 5, 3, 5, 14, 15, 18, 20, 23, 23, 16, 7, 8, 6, 3, 15, 5, 9]

In [67]:
# Keep indices 21-25. Index 20 is covered by neither `record[:20]` nor this
# slice.
right = record[21:26]
right.seq

Seq('AGGTG')

In [68]:
right.letter_annotations["phred_quality"]

[6, 12, 13, 28, 26]

In [69]:
# Adding two SeqRecords joins their sequences; the output shows the id, name
# and description carried over to the result.
edited = left + right
edited

SeqRecord(seq=Seq('GTTGTACTTCGTTCAATCGGAGGTG'), id='ee15a423-b008-44be-a4b2-5441d11b0b94', name='ee15a423-b008-44be-a4b2-5441d11b0b94', description='ee15a423-b008-44be-a4b2-5441d11b0b94 runid=fa1d76e661ac2bbb53a002e85e75a30e91827c51 sampleid=1 read=5087 ch=53 start_time=2019-10-18T22:14:05Z', dbxrefs=[])

In [71]:
print(edited.letter_annotations["phred_quality"])

[2, 2, 3, 5, 3, 5, 14, 15, 18, 20, 23, 23, 16, 7, 8, 6, 3, 15, 5, 9, 6, 12, 13, 28, 26]


In [72]:
# The same edit written as a single expression: slice twice and add.
edited = record[:20] + record[21:26]

In [73]:
from Bio import SeqIO

# Rebind `record` to the GenBank plasmid record (it previously held a FASTQ read).
record = SeqIO.read("NC_005816.gb", "genbank")
record

SeqRecord(seq=Seq('TGTAACGAACGGTGCAATAGTGATCCACACCCAACGCCTGAAATCAGATCCAGG...CTG'), id='NC_005816.1', name='NC_005816', description='Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1, complete sequence', dbxrefs=['Project:58037'])

In [74]:
len(record)

9609

In [75]:
len(record.features)

41

In [76]:
record.dbxrefs

['Project:58037']

In [77]:
record.annotations.keys()

dict_keys(['molecule_type', 'topology', 'data_file_division', 'date', 'accessions', 'sequence_version', 'gi', 'keywords', 'source', 'organism', 'taxonomy', 'references', 'comment'])

In [78]:
# Shift the origin by 2000 letters: join the tail of the record to its head.
# The outputs that follow show the result has one feature fewer (40 vs 41),
# empty dbxrefs, and only 'molecule_type' left in its annotations.

shifted = record[2000:] + record[:2000]
shifted

SeqRecord(seq=Seq('GATACGCAGTCATATTTTTTACACAATTCTCTAATCCCGACAAGGTCGTAGGTC...GGA'), id='NC_005816.1', name='NC_005816', description='Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1, complete sequence', dbxrefs=[])

In [87]:
len(shifted.features)

40

In [89]:
shifted.dbxrefs

[]

In [90]:
shifted.annotations.keys()

dict_keys(['molecule_type'])

In [91]:
# Restore the cross-references. The [:] slice copies the list so the two
# records do not share the same list object.
shifted.dbxrefs = record.dbxrefs[:]
shifted.dbxrefs

['Project:58037']

In [92]:
# Restore the annotations. copy() is shallow: the dictionary itself is new,
# but its values (e.g. the 'references' list) are still shared with `record`.
shifted.annotations = record.annotations.copy()
shifted.annotations.keys()

dict_keys(['molecule_type', 'topology', 'data_file_division', 'date', 'accessions', 'sequence_version', 'gi', 'keywords', 'source', 'organism', 'taxonomy', 'references', 'comment'])

In [93]:
from Bio import SeqIO

rec = SeqIO.read("NC_005816.gb", "genbank")
# One-line summary: id, sequence length, feature count, dbxref count,
# annotation count.
print(rec.id, len(rec), len(rec.features), len(rec.dbxrefs), len(rec.annotations))

NC_005816.1 9609 41 1 13


In [94]:
# Reverse-complement the whole record and give the result a new id. With this
# call the output shows the length and feature count unchanged (9609, 41)
# while dbxrefs and annotations come back empty (0, 0).
rc = rec.reverse_complement(id="TESTING")
print(rc.id, len(rc), len(rc.features), len(rc.dbxrefs), len(rc.annotations))

TESTING 9609 41 0 0
