# Section 2

In [1]:
# The previous section covered how to work with sequences in Biopython.
# As a quick reminder, build a Seq object from a string and print it:
import Bio
from Bio.Seq import Seq
my_seq = Seq("AGTACACTGGT")
print(my_seq)

AGTACACTGGT


## The SeqRecord object

The SeqRecord (Sequence Record) class is defined in the Bio.SeqRecord module. This class allows higher-level
features to be associated with a sequence, such as:
* .seq – The sequence itself, typically a Seq object.
* .id – The primary ID used to identify the sequence.
* .description – A human-readable description or expressive name for the sequence.

In [2]:
# At a minimum, creating a SeqRecord only requires a Seq object;
# no id, name or description is passed here.
from Bio.SeqRecord import SeqRecord
simple_seq_r = SeqRecord(my_seq)


In [3]:
# The id, description and name can also be assigned as attributes after the
# record has been created. The id is printed first to show its value before
# one is assigned.
print('ID before identification ',simple_seq_r.id)
simple_seq_r.id = "AC12345"
simple_seq_r.description = "Made up sequence I wish I could write a paper about"
simple_seq_r.name = "SequenceName"
print(simple_seq_r.description)
# Printing the record itself shows a summary of all of its fields.
print(simple_seq_r)

ID before identification  <unknown id>
Made up sequence I wish I could write a paper about
ID: AC12345
Name: SequenceName
Description: Made up sequence I wish I could write a paper about
Number of features: 0
Seq('AGTACACTGGT')


### Iterating over the records in a sequence file

###### The Bio.SeqIO module for reading and writing sequence file formats works with SeqRecord objects

In [4]:
# Bio.SeqIO.parse() returns an iterator that yields SeqRecord objects, so it is
# normally consumed with a for loop, as shown below.
# For a file that contains only a single record, use Bio.SeqIO.read() instead.


from Bio import SeqIO
for seq_record in SeqIO.parse("exFasta.fasta", "fasta"):
    # seq_record is a SeqRecord: show its id, the repr of its sequence and
    # its length, one item per line.
    print(seq_record.id, repr(seq_record.seq), len(seq_record), sep="\n")
# To load another format instead, all that needs to change is the filename
# and the format string.

gi|2765658|emb|Z78533.1|CIZ78533
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGG...CGC')
740
gi|2765657|emb|Z78532.1|CCZ78532
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAACAG...GGC')
753
gi|2765656|emb|Z78531.1|CFZ78531
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGCAG...TAA')
748
gi|2765655|emb|Z78530.1|CMZ78530
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAAACAACAT...CAT')
744
gi|2765654|emb|Z78529.1|CLZ78529
Seq('ACGGCGAGCTGCCGAAGGACATTGTTGAGACAGCAGAATATACGATTGAGTGAA...AAA')
733
gi|2765652|emb|Z78527.1|CYZ78527
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGTAG...CCC')
718
gi|2765651|emb|Z78526.1|CGZ78526
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGTAG...TGT')
730
gi|2765650|emb|Z78525.1|CAZ78525
Seq('TGTTGAGATAGCAGAATATACATCGAGTGAATCCGGAGGACCTGTGGTTATTCG...GCA')
704
gi|2765649|emb|Z78524.1|CFZ78524
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGATAGTAG...AGC')
740
gi|2765648|emb|Z78523.1|CHZ78523
Seq('CGTAACCAGGTTTCCGT

In [5]:
# The parser yields SeqRecord objects, so a list comprehension can collect a
# single attribute (here the id) from every record in the file.
identifiers = [record.id for record in SeqIO.parse("exFasta.fasta", "fasta")]
identifiers

['gi|2765658|emb|Z78533.1|CIZ78533',
 'gi|2765657|emb|Z78532.1|CCZ78532',
 'gi|2765656|emb|Z78531.1|CFZ78531',
 'gi|2765655|emb|Z78530.1|CMZ78530',
 'gi|2765654|emb|Z78529.1|CLZ78529',
 'gi|2765652|emb|Z78527.1|CYZ78527',
 'gi|2765651|emb|Z78526.1|CGZ78526',
 'gi|2765650|emb|Z78525.1|CAZ78525',
 'gi|2765649|emb|Z78524.1|CFZ78524',
 'gi|2765648|emb|Z78523.1|CHZ78523',
 'gi|2765647|emb|Z78522.1|CMZ78522',
 'gi|2765646|emb|Z78521.1|CCZ78521',
 'gi|2765645|emb|Z78520.1|CSZ78520',
 'gi|2765644|emb|Z78519.1|CPZ78519',
 'gi|2765643|emb|Z78518.1|CRZ78518',
 'gi|2765642|emb|Z78517.1|CFZ78517',
 'gi|2765641|emb|Z78516.1|CPZ78516',
 'gi|2765640|emb|Z78515.1|MXZ78515',
 'gi|2765639|emb|Z78514.1|PSZ78514',
 'gi|2765638|emb|Z78513.1|PBZ78513',
 'gi|2765637|emb|Z78512.1|PWZ78512',
 'gi|2765636|emb|Z78511.1|PEZ78511',
 'gi|2765635|emb|Z78510.1|PCZ78510',
 'gi|2765634|emb|Z78509.1|PPZ78509',
 'gi|2765633|emb|Z78508.1|PLZ78508',
 'gi|2765632|emb|Z78507.1|PLZ78507',
 'gi|2765631|emb|Z78506.1|PLZ78506',
 

### Modifying data
The attributes of a SeqRecord can be modified directly

In [6]:

for seq_record in SeqIO.parse("exFasta3Rec.fasta", "fasta"):
    # Select the records whose sequence starts with "CG". The check is made on
    # the sequence itself rather than by indexing into repr(seq): the repr
    # carries a "Seq('" display prefix, so positions 5 and 6 of that string
    # only happen to line up with the first two bases.
    if seq_record.seq.startswith("CG"):
        # SeqRecord attributes can be modified directly by assignment.
        seq_record.id = "new_id"
        print(seq_record.id)
        print(repr(seq_record.seq))
        print(len(seq_record))


new_id
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGG...CGC')
740
new_id
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAACAG...GGC')
753


### Writing Sequence Files

Bio.SeqIO.write() is the function for sequence output (writing files). It takes three arguments:
1. One or more SeqRecord objects (e.g. a list)
2. The filename to write to
3. The sequence format

In [7]:
# Build three protein SeqRecord objects by hand. Adjacent string literals are
# joined by Python into a single sequence string for each Seq.
rec1 = SeqRecord(
    Seq(
        "MMYQQGCFAGGTVLRLAKDLAENNRGARVLVVCSEITAVTFRGPSETHLDSMVGQALFGD"
        "GAGAVIVGSDPDLSVERPLYELVWTGATLLPDSEGAIDGHLREVGLTFHLLKDVPGLISK"
        "NIEKSLKEAFTPLGISDWNSTFWIAHPGGPAILDQVEAKLGLKEEKMRATREVLSEYGNM"
        "SSAC",
    ),
    id="gi|14150838|gb|AAK54648.1|AF376133_1",
    description="chalcone synthase [Cucumis sativus]",
)
rec2 = SeqRecord(
    Seq(
        "YPDYYFRITNREHKAELKEKFQRMCDKSMIKKRYMYLTEEILKENPSMCEYMAPSLDARQ"
        "DMVVVEIPKLGKEAAVKAIKEWGQ",
    ),
    id="gi|13919613|gb|AAK33142.1|",
    description="chalcone synthase [Fragaria vesca subsp. bracteata]",
)
rec3 = SeqRecord(
    Seq(
        "MVTVEEFRRAQCAEGPATVMAIGTATPSNCVDQSTYPDYYFRITNSEHKVELKEKFKRMC"
        "EKSMIKKRYMHLTEEILKENPNICAYMAPSLDARQDIVVVEVPKLGKEAAQKAIKEWGQP"
        "KSKITHLVFCTTSGVDMPGCDYQLTKLLGLRPSVKRFMMYQQGCFAGGTVLRMAKDLAEN"
        "NKGARVLVVCSEITAVTFRGPNDTHLDSLVGQALFGDGAAAVIIGSDPIPEVERPLFELV"
        "SAAQTLLPDSEGAIDGHLREVGLTFHLLKDVPGLISKNIEKSLVEAFQPLGISDWNSLFW"
        "IAHPGGPAILDQVELKLGLKQEKLKATRKVLSNYGNMSSACVLFILDEMRKASAKEGLGT"
        "TGEGLEWGVLFGFGPGLTVETVVLHSVAT",
    ),
    id="gi|13925890|gb|AAK49457.1|",
    description="chalcone synthase [Nicotiana tabacum]",
)
# Collect the records in a list so they can be handled together.
my_records = [rec1, rec2, rec3]

In [8]:
# Bio.SeqIO.write() returns the number of SeqRecord objects written to the file.
NumofSequencesWrote=SeqIO.write(my_records, "section2_example.fasta", "fasta")
# Note: if Bio.SeqIO.write() is told to write to a file that already exists,
# the old file is overwritten without any warning.

#### Multiple exons from genomic DNA
1.	The file genomic_dna.txt contains a section of genomic DNA, and the file exons.txt contains a list of start/stop positions of exons. Each exon is on a separate line and the start and stop positions are separated by a comma.
2.	Write a function that will extract the exon segments and concatenate them.

In [13]:
def ExtractandConcaExons(inputFileName,exonsFileName):
    """Extract the exon segments from a DNA file and concatenate them.

    Parameters
    ----------
    inputFileName : str
        Path to a text file. Its whole content, exactly as read, is used as
        the DNA sequence that the exon positions index into.
    exonsFileName : str
        Path to a text file with one exon per line, written as ``start,stop``.
        Positions are 0-based and the stop position is inclusive. Blank and
        whitespace-only lines are ignored.

    Returns
    -------
    str
        The exon segments joined in the order they are listed in the exons
        file (an empty string when no exon is listed).

    Raises
    ------
    ValueError
        If a start or stop field is not an integer.
    IndexError
        If a non-blank line has no comma-separated stop field.
    """
    # Context managers guarantee the files are closed even if reading raises.
    with open(inputFileName) as dnaFile:
        dna = dnaFile.read()
    print('Before transcription:',dna)

    with open(exonsFileName) as exonsPosFile:
        exonLines = exonsPosFile.read().splitlines()

    # Collect the segments and join once instead of growing a string in a loop.
    exonSegments = []
    for line in exonLines:
        line = line.strip()
        if not line:
            # Skip empty or whitespace-only lines (e.g. a trailing newline).
            continue
        print('positions:',line)
        startEndPos = line.split(",")
        start = int(startEndPos[0])
        stop = int(startEndPos[1])
        # +1 because the stop position in the file is inclusive.
        exonSegments.append(dna[start:stop+1])
    return "".join(exonSegments)

In [14]:
# Run the exon extraction on the example files and print the concatenated result.
returned=ExtractandConcaExons("genomic_dna_Copy.txt","exons_Copy.txt")
print('After transcription:',returned)

Before transcription: TCGATCGTACCGTCGACGATG
positions: 5,10
positions: 12,15
After transcription: CGTACCTCGA
