## Loading fasta files via Biopython
https://github.com/peterjc/biopython_workshop/blob/master/reading_sequence_files/README.rst

In [20]:
# load library
from Bio import SeqIO

In [44]:
# read a single fasta file
record = SeqIO.read("Acidithiobacillus_ferrooxidans.fasta", "fasta")

# print sequence ID
print('ID:', record.id)

# print sequence length
print('length:', len(record))

# make a variable containing only the 
seq = str(record.seq)
print(seq[1:50])

ID: gi|218665024|ref|NC_011761.1|
length: 2982397
AGTTAAAAGAAAAAATATAAATTATTTTTATAAAGAGACGCCATCGATC


## Loading fasta files via Screed
http://screed.readthedocs.org/en/latest/screed.html

In [30]:
import screed

In [86]:
# assign filename to variable
Genomefastafile = "Acidithiobacillus_ferrooxidans.fasta"

# Loop through all the records in fasta file and assign to 
# variables (assuming only one genome is in the file)
for record in screed.open(Genomefastafile):
    Genome_seqname = record.name
    Genome_sequence = record.sequence
    
print(Genome_seqname)
print(Genome_sequence[1:100])

gi|218665024|ref|NC_011761.1| Acidithiobacillus ferrooxidans ATCC 23270, complete genome
AGTTAAAAGAAAAAATATAAATTATTTTTATAAAGAGACGCCATCGATCCCTTTCCAGTCCTGGCATTCTAGGAGCACATCCCGATGAAAATCACCATA


## Loading fasta files via base Python
http://pythonforbiologists.com/index.php/introduction-to-python-for-biologists/reading-and-writing-files/

In [46]:
fasta_file = open("Acidithiobacillus_ferrooxidans.fasta")
seq = fasta_file.read().rstrip("\n")
print(seq[1:150])
# would need to use regex to get rid of ID

gi|218665024|ref|NC_011761.1| Acidithiobacillus ferrooxidans ATCC 23270, complete genome
TAGTTAAAAGAAAAAATATAAATTATTTTTATAAAGAGACGCCATCGATCCCTTTCCAGT


## Find Crispr repeats

### Load Crispr repeat seqeunces

In [59]:
# assign filename to variable
Crisprfastafile = "Acidithiobacillus_ferrooxidans_Crispr.fasta"

# Create empty lists to append sequences to
Crispr_seqname = list()
Crispr_sequence = list()

# Loop through all the records in fasta file and add them the the lists
for record in screed.open(Crisprfastafile):
    Crispr_seqname.append(record.name)
    Crispr_sequence.append(record.sequence) 

# zip the two lists together to make a dictionary
Crispr_repeats = dict(zip(Crispr_seqname, Crispr_sequence))
 
# print the dictionary to view the seqeunces and their names    
print(Crispr_repeats)

{'NC_011761_5': 'GGGCGATTCGTTTCACCTCCTCCGC', 'NC_011761_3': 'GTATGCCGCCAGGTGCGCGGCTGAGAAC', 'NC_011761_6': 'CCTGGTCAGTACAACAACGGCTACGG', 'NC_011761_4': 'GTATGCCGCCATATGCGCAGCTTGTAAT', 'NC_011761_1': 'CTTCTCAGCCGCGCGTGTGGCGGCATACCGC', 'NC_011761_2': 'CTTCTCAGCCGCGCGTGTGGCGGCATACCGC'}


### Access the sequence of a single repeat

In [90]:
# select the sequence from the dictionary by subsetting it's key
Crispr_repeats['NC_011761_2']

'CTTCTCAGCCGCGCGTGTGGCGGCATACCGC'

### Import regular expression library

In [89]:
import re

### Find a single repeat in genome

In [145]:
# use re.search to find sequence in genome, inputs are 1) pattern and 2) string to search for pattern
# '(' and ')' surround the string that you want to capture
second_repeat = re.search('(' + Crispr_repeats['NC_011761_2'] + ')', Genome_sequence)

# re.search object has has a method called group() for what it matched (one group per capture group)
print('matching sequence for NC_011761_2 Crispr is:', second_repeat.group(1))

matching sequence for NC_011761_2 Crispr is: CTTCTCAGCCGCGCGTGTGGCGGCATACCGC


In [146]:
# re.search object also has position information!
print('matching sequence postition for NC_011761_2 Crispr is:', second_repeat.span())

matching sequence postition for NC_011761_2 Crispr is: (934846, 934877)


### Compile a pattern ahead of time for speed and readability

In [160]:
# specify and compile pattern ahead of time
NC_011761_2pattern = re.compile('(' + Crispr_repeats['NC_011761_2'] + ')')

# then use re.search 
second_repeat = NC_011761_2pattern.search(Genome_sequence)
print('matching sequence for NC_011761_2 Crispr is:', second_repeat.group(1))

matching sequence for NC_011761_2 Crispr is: CTTCTCAGCCGCGCGTGTGGCGGCATACCGC


### Use re.findall to find multiple repeats

In [151]:
# specify and compile pattern ahead of time
NC_011761_2pattern = re.compile('(' + Crispr_repeats['NC_011761_2'] + ')')

# search for all occurrences of pattern
second_repeat_matches = NC_011761_2pattern.findall(Genome_sequence)
print('All the matches are:', second_repeat_matches)

All the matches are: ['CTTCTCAGCCGCGCGTGTGGCGGCATACCGC', 'CTTCTCAGCCGCGCGTGTGGCGGCATACCGC']


### Find unknown sequence between Crispr repeats

In [153]:
# put capture group between sequences
NC_011761_2spacerspattern = re.compile(Crispr_repeats['NC_011761_2'] + '([ATCG]+)' + Crispr_repeats['NC_011761_2'])

# search for all occurrences of pattern
second_repeat_spacers = NC_011761_2spacerspattern.findall(Genome_sequence)
print('All the matches are:', second_repeat_spacers)

All the matches are: ['TGATCATCTTGGTAGACCGCGACATATCA']


## Challenge

1. Find and print all the spacers for the 3rd Crispr (NC_011761_3) of A. ferrooxidans. 

2. Can you find the position of the spacer(s) as well?

### Finding positions of multiple matches

In [172]:
# put capture group between sequences
NC_011761_3spacerspattern = re.compile(Crispr_repeats['NC_011761_3'] + '([ATCG]+)' + Crispr_repeats['NC_011761_3'])

# use re.finditer instead of re.findall or search
thirdspacers = NC_011761_3spacerspattern.finditer(Genome_sequence)

# loop over thirdspacers to access matches and positions
for match in thirdspacers:
    print(match.group(1))

In [200]:
for mo in re.finditer('('+ 'CTTCTCAGCCGCGCGTGTGGCGGCATACCGC' + ')', Genome_sequence):
    print((mo.start(), mo.end(), mo.group(1)))

(934846, 934877, 'CTTCTCAGCCGCGCGTGTGGCGGCATACCGC')
(934906, 934937, 'CTTCTCAGCCGCGCGTGTGGCGGCATACCGC')


In [198]:
'(' + Crispr_repeats['NC_011761_3'] + ')'

'(GTATGCCGCCAGGTGCGCGGCTGAGAAC)'