# Python Project Part 2: Data Parsing with Biopython

## Based on the Biopython tutorial, Chapters 2 and 5: Parsing sequence files
Focus on SARS-CoV-2 data

In [1]:
from Bio import SeqIO

Try these examples from Chapter 2, Sections 2.4.1 - 2.4.2
Then complete the exercises in Chapter 5 up to and including section 5.5.3
Note: You will need to make compressed versions of some of the files;
      for the files downloaded in the exercises, use the examples from the tutorial.
      
      https://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc47

# CHAPTER 5 Tutorial Below

Sections 5.1 to 5.5.3
Make sure to put in a few notes with the markdown language. Briefly explain what each part is doing.

## 5.1 Parsing or Reading Sequences

In [None]:
# 5.1.1
# Iterate over every record in the FASTA file and then in the GenBank file,
# printing each record's id, the repr of its sequence, and its length.

for path, file_format in (
    ("sarscov2.fasta", "fasta"),
    ("sarscov2.gbk", "genbank"),
):
    for record in SeqIO.parse(path, file_format):
        print(record.id, repr(record.seq), len(record), sep="\n")

# Collect only the identifiers of the GenBank records into a list.
identifiers = [record.id for record in SeqIO.parse("sarscov2.gbk", "genbank")]
identifiers

In [None]:
# 5.1.2
# Step through the parser by hand: each next() call returns the following
# record from the iterator created by SeqIO.parse.

record_iterator = SeqIO.parse("sarscov2.fasta", "fasta")

first_record = next(record_iterator)
print(first_record.id)
print(first_record.description)

# The FASTA file may hold only one record; giving next() a default of None
# avoids an unhandled StopIteration once the iterator is exhausted.
second_record = next(record_iterator, None)
if second_record is None:
    print("No second record in sarscov2.fasta")
else:
    print(second_record.id)
    print(second_record.description)

# When only the first record is needed, parse() and next() can be combined.
lonely_record = next(SeqIO.parse("sarscov2.gbk", "genbank"))

In [None]:
# 5.1.3
# Load every GenBank record into a list so records can be counted and
# accessed by index (last and first).


def _print_summary(record):
    """Print the record's id, the repr of its sequence, and its length."""
    print(record.id, repr(record.seq), len(record), sep="\n")


records = list(SeqIO.parse("sarscov2.gbk", "genbank"))

print("Found %i records" % len(records))

# Negative indices count from the end of the list.
print("The last record")
last_record = records[-1]
_print_summary(last_record)

# Index 0 is the first element.
print("The first record")
first_record = records[0]
_print_summary(first_record)

In [None]:
# 5.1.4
# Inspect the annotations of the first GenBank record, then build a list of
# organism/species values for the records of each file.

record_iterator = SeqIO.parse("sarscov2.gbk", "genbank")
first_record = next(record_iterator)
print(first_record)

# The annotations are used like a dictionary: print it whole, then its keys
# and values, then two individual entries.
first_annotations = first_record.annotations
print(first_annotations)
print(first_annotations.keys())
print(first_annotations.values())

for key in ("source", "organism"):
    print(first_annotations[key])

# GenBank: read the "organism" annotation of every record.
all_species_g = [
    record.annotations["organism"]
    for record in SeqIO.parse("sarscov2.gbk", "genbank")
]
print(all_species_g)

# FASTA: take the second whitespace-separated token of each description.
# NOTE(review): assumes that token is the species name - confirm against the
# header lines of sarscov2.fasta.
all_species_f = [
    record.description.split()[1]
    for record in SeqIO.parse("sarscov2.fasta", "fasta")
]
print(all_species_f)



In [None]:
# 5.1.5
# Change attributes of a parsed record and show the effect.

record_iterator = SeqIO.parse("sarscov2.fasta", "fasta")
first_record = next(record_iterator)
# Only the last expression of a cell is echoed, so the id is printed
# explicitly before and after the change.
print(first_record.id)
first_record.id = "new_id"
print(first_record.id)

# Parse the file again to start from an unmodified record, then set both the
# id and the description before formatting the record as FASTA text.
record_iterator = SeqIO.parse("sarscov2.fasta", "fasta")
first_record = next(record_iterator)
first_record.id = "new_id"
first_record.description = first_record.id + " " + "desired new description"
# Show only the first 200 characters of the FASTA-formatted text.
print(first_record.format("fasta")[:200])

## 5.2 Parsing Sequences from Compressed Files

In [None]:
# 5.2
# Sum the lengths of all records three ways: from a filename, from an open
# file handle, and from a gzip-compressed file.

import gzip
import shutil
from pathlib import Path

# A filename is passed directly to SeqIO.parse.
print(sum(len(r) for r in SeqIO.parse("ls_orchid.gbk", "gb")))

# An already opened file handle is passed instead of a filename.
with open("ls_orchid.gbk") as handle:
    print(sum(len(r) for r in SeqIO.parse(handle, "gb")))

# Create the compressed file from ls_orchid.gbk when it is missing, so the
# gzip example below has something to read; an existing archive is kept.
if not Path("ls_orchid.gbk.gz").exists():
    with open("ls_orchid.gbk", "rb") as plain:
        with gzip.open("ls_orchid.gbk.gz", "wb") as packed:
            shutil.copyfileobj(plain, packed)

# "rt" opens the gzip file in text mode before handing it to the parser.
with gzip.open("ls_orchid.gbk.gz", "rt") as handle:
    print(sum(len(r) for r in SeqIO.parse(handle, "gb")))


## 5.3 Parsing Sequences from the Net

In [5]:
# 5.3.1

In [6]:
# 5.3.2

## 5.4 Sequence files as Dictionaries

In [8]:
# 5.4.1

In [9]:
# 5.4.1.1

In [10]:
# 5.4.1.2

In [11]:
# 5.4.2

In [12]:
# 5.4.2.1

In [13]:
# 5.4.2.2

In [14]:
# 5.4.3

In [15]:
# 5.4.3.2

In [16]:
# 5.4.4

In [17]:
# 5.4.5

## 5.5 Writing Sequence Files

In [18]:
# 5.5.1

In [None]:
# 5.5.2

In [None]:
# 5.5.3