In [1]:
from fasta import *
from checkGenome import *

## The first thing to do is to perform a BLAST search

In [69]:
# Load the 2U1 files.
# Alternative inputs used previously (swap the path below to use one of them):
#   files/2U1_all_candidates_PSI_BLAST_unique.fasta
#   files/Python_bivittaus_full_PSI.fasta
full_record = SeqIO.to_dict(
    SeqIO.parse(
        "files/2U1_and_2U1_like_candidates_BLAST_results_unique_X_seqs_removed.fasta",
        "fasta"
    )
)

#### How many sequences do we have in the full records?

In [70]:
# Number of records loaded from the FASTA file.
print(len(full_record))

577


#### Now we can choose to exclude sequences with certain words appearing in the header and set a minimum length

In [71]:
# Positional arguments are the header terms to exclude; `records` is the full
# set of loaded records and `length` is the minimum length to keep.
filtered_ids = build_species_count(
    'unnamed protein product',
    '2J2-like', '2J2', '2J', '2D17', '2J6', '2J6-like',
    '2P3', '2P3-like', '2B4-like', '2B4', '2D15', '2D15-like',
    'CYP2N', 'vitamin D',
    '2B15', '2B15-like', '2D14', '2D14-like', '2D20',
    '2C31-like', '2C20-like', '2C31', '2C20', '2B19-like', '2D3',
    '2C15-like', '2H2-like', '2C5-like', '2D49', '2B12-like',
    'subfamily J', 'subfamily j', 'subfamily V',
    '2H1', '2H1-like', '2D6', '2D6-like', '2D26', '2D26-like',
    'Cyp2v1', '2C1-like', '2V1',
    records=full_record,
    length=400
)

#### `filtered_ids` maps each species name to the IDs that come from that species, so we can use it to easily see how many unique species we have and which species are over-represented.

In [72]:
# One entry in filtered_ids per species, so its length is the species count.
print("There are %s unique species in our dataset." % len(filtered_ids))

There are 202 unique species in our dataset.


#### `plot_record_number` is a function that plots the number of IDs per species. We can set a minimum number of IDs that a species must have in order to be plotted.

In [6]:
# Plot the number of IDs per species with no minimum (min=0), i.e. every species.
plotthis = plot_record_number(filtered_ids, min=0)
# Render the figure in the notebook. `py` arrives via the star imports above;
# presumably plotly's notebook interface -- TODO confirm.
py.iplot(plotthis, filename='inline_bar')

In [None]:
# Same plot as above, restricted to species that meet the min=4 ID threshold.
plotthis = plot_record_number(filtered_ids, min=4)
# Render the figure in the notebook (same filename as the other plot cells).
py.iplot(plotthis, filename='inline_bar')

In [None]:
# Same plot again, restricted to species that meet the min=5 ID threshold.
plotthis = plot_record_number(filtered_ids, min=5)
# Render the figure in the notebook (same filename as the other plot cells).
py.iplot(plotthis, filename='inline_bar')

#### We can also just extract the names using `get_species_names`, which also accepts a minimum number of IDs required and can print the count for each species

In [58]:
# List the species that meet the min=5 ID threshold (names only).
get_species_names(filtered_ids, min=5)

Danio rerio
Oryzias melastigma
Branchiostoma floridae
Branchiostoma belcheri


In [59]:
# Same listing as above, but counts=True also shows the number of IDs per species.
get_species_names(filtered_ids, min=5, counts=True)

Danio rerio 9
Oryzias melastigma 9
Branchiostoma floridae 20
Branchiostoma belcheri 20


#### `count_ids` is a function that counts the total number of sequences in filtered_ids, not just the number of unique species.

In [60]:
# Total number of sequence IDs across all species (not the number of species).
print("There are %s total sequences in our filtered dataset." % count_ids(filtered_ids))

There are 295 total sequences in our filtered dataset.


#### As before, it can also take a minimum number of IDs required

In [61]:
# Minimum number of IDs required per species, passed to count_ids as `min`.
min_num = 1
print(
    "There are %s total sequences in our filtered dataset that have %d or more IDs per species."
    % (count_ids(filtered_ids, min=min_num), min_num)
)

There are 162 total sequences in our filtered dataset that have 1 or more IDs per species.


In [11]:
# Check genomic locations for species that meet the min=5 ID threshold.
# Judging by the output below, it prints each species name, a list of
# identifiers and how many there are. visualise="linear" presumably selects
# the layout of the visualisation -- TODO confirm against checkGenome.
check_genomic_location(filtered_ids, min=5, visualise="linear")

Pygocentrus nattereri
['108440017', '108440016', '108440015', '108440014', '108440013', '108440010', '108429574']
7
Danio rerio
['494153', '768294', '555510', '492484', '101882126', '324212', '556280', '792207', '414933']
9
Oryzias melastigma
[]
0
Branchiostoma floridae
['7255781', '7253488', '7252428', '7251670', '7250689', '7246431', '7238522', '7238246', '7232835', '7231615', '7231442', '7230529', '7228756', '7228733', '7224336', '7215875', '7215870', '7214905', '7214642', '7208936']
20
Branchiostoma belcheri
['109487178', '109486409', '109483948', '109483947', '109483946', '109481879', '109481878', '109481830', '109477107', '109477038', '109475266', '109474037', '109472880', '109472128', '109471849', '109470627', '109467919', '109467880', '109461727', '109461725']
20


#### Because `filtered_ids` just contains the species names and the IDs for each species, we need to map these IDs back to their full records. We can use the function `map_ids_to_records`, which allows us to select either all the records in `filtered_ids` or just the unique species.

In [73]:
# Map the IDs in filtered_ids back to their full records: first every ID,
# then only the unique species (unique=True).
filtered_records = map_ids_to_records(filtered_ids, full_record)
filtered_records_unique = map_ids_to_records(filtered_ids, full_record, unique=True)

# Check that the numbers are correct: all records first, unique records second.
print("%d\n%d" % (len(filtered_records), len(filtered_records_unique)))

420
202


#### And now we can save these records to a new FASTA file using `write_fasta`

In [74]:
# Save every filtered record to its own FASTA file ...
write_fasta(filtered_records, "files/filtered_records.fasta")
# ... and the unique-species subset to a separate file alongside it.
write_fasta(filtered_records_unique, "files/filtered_records_unique.fasta")