In [2]:
from fasta import *
from checkGenome import *

In [3]:
# Parse the FASTA input and collect its records into a dictionary.
# Alternative input (kept for reference, currently not used):
#   "files/2U1_all_candidates_PSI_BLAST_unique.fasta"
fasta_path = "files/Python_bivittaus_full_PSI.fasta"
fasta_records = SeqIO.parse(fasta_path, "fasta")
full_record = SeqIO.to_dict(fasta_records)

#### How many sequences do we have in the full records?

In [4]:
# Report how many records were loaded from the FASTA file.
record_total = len(full_record)
print(record_total)

1000


#### Now we can choose to exclude sequences with certain words appearing in the header and set a minimum length

In [5]:
# Build the species -> IDs grouping from the loaded records, passing length=450
# as the length threshold (the notebook text describes it as a minimum length).
# NOTE(review): the saved output of this cell is "IndexError: list index out of
# range", yet later cells use `filtered_ids`, so that value must come from an
# earlier run — re-run from a fresh kernel and confirm this call succeeds.
filtered_ids = build_species_count(records=full_record, length=450)

IndexError: list index out of range

#### `filtered_ids` is actually a list that maps each species name to the IDs that come from that species, so we can use it to easily see how many unique species we have and which species are over-represented.

In [6]:
# One entry per species, so the length is the number of unique species.
species_total = len(filtered_ids)
print("There are %s unique species in our dataset." % species_total)

There are 162 unique species in our dataset.


#### `plot_record_number` is a function that plots the number of IDs per species. We can set a minimum number of IDs that a species must have in order to be plotted.

In [7]:
# Plot the number of IDs per species with the minimum-ID threshold set to 0.
# NOTE(review): `py` is not imported explicitly in this notebook; it presumably
# arrives through one of the star imports — confirm.
plotthis = plot_record_number(filtered_ids, min=0)
py.iplot(plotthis, filename='inline_bar')

In [8]:
# Same plot of IDs per species, now with the minimum-ID threshold raised to 4.
# `plotthis` is reassigned here, replacing the figure data from the previous cell.
plotthis = plot_record_number(filtered_ids, min=4)
py.iplot(plotthis, filename='inline_bar')

In [10]:
# Same plot of IDs per species with the minimum-ID threshold set to 5, the
# value also passed to get_species_names and check_genomic_location below.
plotthis = plot_record_number(filtered_ids, min=5)
py.iplot(plotthis, filename='inline_bar')

#### We can also just extract the names using `get_species_names`, which also accepts a minimum number of IDs required and can print out the count for each species

In [13]:
# List the species whose number of IDs passes the min=5 threshold.
# NOTE(review): the saved outputs suggest the threshold is exclusive — species
# shown with exactly 5 IDs in the counts listing are absent from this listing —
# confirm against the helper's definition.
get_species_names(filtered_ids, min=5)

Strongylocentrotus purpuratus
Acanthaster planci
Saccoglossus kowalevskii
Branchiostoma belcheri
Ciona intestinalis
Aplysia californica
Hypsibius dujardini


In [15]:
# Same listing with a lower threshold (min=2); with counts=True the saved
# output also shows the number of IDs next to each species name.
get_species_names(filtered_ids, min=2, counts=True)

Rattus norvegicus 3
Papio anubis 3
Strongylocentrotus purpuratus 12
Acanthaster planci 15
Saccoglossus kowalevskii 17
Branchiostoma belcheri 49
Priapulus caudatus 5
Ciona intestinalis 8
Aplysia californica 6
Lingula anatina 4
Biomphalaria glabrata 5
Hypsibius dujardini 13


#### `count_ids` is a function that counts the total number of sequences in filtered_ids, not just the number of unique species.

In [16]:
# Total number of sequences across all species in filtered_ids.
sequence_total = count_ids(filtered_ids)
print("There are %s total sequences in our filtered dataset." % sequence_total)

There are 272 total sequences in our filtered dataset.


#### As before, it can also take a minimum number of IDs required

In [30]:
# Count sequences again, this time passing a per-species ID threshold.
min_num = 1
sequences_at_threshold = count_ids(filtered_ids, min=min_num)
print("There are %s total sequences in our filtered dataset that have %d or more IDs per species." % (sequences_at_threshold, min_num))

There are 18 total sequences in our filtered dataset that have 1 or more IDs per species.


In [17]:
# Check the genomic location of the IDs for species passing the min=5
# threshold; the saved output lists each species, its IDs and their count.
# NOTE(review): visualise="linear" presumably selects a linear layout for the
# visualisation — confirm against the helper's definition.
check_genomic_location(filtered_ids, min=5, visualise="linear")

Strongylocentrotus purpuratus
['100893396', '100888399', '100888078', '757026', '591567', '582422', '581885', '580612', '577176', '577116', '576998', '581193']
12
Acanthaster planci
['110986213', '110986212', '110983833', '110983243', '110981708', '110981220', '110980619', '110979585', '110979233', '110977588', '110975828', '110975423', '110974947', '110974561', '110974401']
15
Saccoglossus kowalevskii
['102810228', '102803428', '102802877', '100378407', '100376031', '100375497', '100374288', '100374138', '100373349', '100371843', '100370778', '100370626', '100370125', '100368785', '100368446', '100368302', '100368284']
17
Branchiostoma belcheri
['109487146', '109486409', '109484168', '109484145', '109483948', '109483947', '109483946', '109483935', '109483920', '109483337', '109483074', '109481879', '109481878', '109481830', '109480295', '109478642', '109477107', '109477038', '109475266', '109475079']
20
Ciona intestinalis
['100185251', '100183929', '100181554', '100185839', '100185727

#### Because `filtered_ids` just contains the species names and the IDs of these species, we need to map these IDs back to their full records. We can use the function `map_ids_to_records`, which allows us to select either all the records in `filtered_ids` or just the unique species.

In [61]:
# Resolve the IDs held in filtered_ids back to their entries in full_record:
# once for every ID, and once with unique=True.
filtered_records = map_ids_to_records(filtered_ids, full_record)
filtered_records_unique = map_ids_to_records(filtered_ids, full_record, unique=True)

# Print the size of each result so the numbers can be checked by eye.
for mapped_records in (filtered_records, filtered_records_unique):
    print(len(mapped_records))

136
46


#### And now we can save these records to a new FASTA file using `write_fasta`

In [62]:
# Save both record sets to their own FASTA file, in the same order as before.
fasta_outputs = [
    (filtered_records, "files/filtered_records.fasta"),
    (filtered_records_unique, "files/filtered_records_unique.fasta"),
]
for records_to_save, output_path in fasta_outputs:
    write_fasta(records_to_save, output_path)
