In [1]:
from fasta import *
from checkGenome import *

In [2]:
# Parse the 2U1 candidate FASTA file and collect its records into a dict
fasta_path = "files/2U1_and_2U1_like_candidates_BLAST_results_unique_X_seqs_removed.fasta"
parsed_records = SeqIO.parse(fasta_path, "fasta")
full_record = SeqIO.to_dict(parsed_records)

#### How many sequences do we have in the full records?

In [3]:
# Report how many entries were loaded into full_record
record_total = len(full_record)
print(record_total)

577


#### Now we can choose to exclude sequences with certain words appearing in the header and set a minimum length

In [4]:
# Group the record IDs by species, applying a length threshold of 450
length_cutoff = 450
filtered_ids = build_species_count(length=length_cutoff, records=full_record)

#### `filtered_ids` is actually a list that maps species names to the IDs that come from each species, so we can use it to easily see how many unique species we have and which species are overrepresented.

In [5]:
# The number of entries in filtered_ids is reported as the number of unique species
species_total = len(filtered_ids)
print("There are %s unique species in our dataset." % species_total)

There are 172 unique species in our dataset.


#### `plot_record_number` is a function that plots the number of IDs per species. We can set a minimum number of IDs that a species must have in order to be plotted.

In [6]:
# Bar chart of the number of IDs per species with no minimum (min=0).
# The filename is specific to this threshold so the figure does not share a
# name with the other plots in this notebook.
# NOTE(review): assumes `py` is Plotly's plotting module, where reusing a
# filename can overwrite a previously saved figure - confirm in the imports.
plotthis = plot_record_number(filtered_ids, min=0)
py.iplot(plotthis, filename='species_id_counts_min_0')

In [7]:
# Bar chart of the number of IDs per species, limited to species with a
# minimum of 2 IDs.
# The filename is specific to this threshold so the figure does not share a
# name with the other plots in this notebook.
# NOTE(review): assumes `py` is Plotly's plotting module, where reusing a
# filename can overwrite a previously saved figure - confirm in the imports.
plotthis = plot_record_number(filtered_ids, min=2)
py.iplot(plotthis, filename='species_id_counts_min_2')

In [8]:
# Bar chart of the number of IDs per species, limited to species with a
# minimum of 20 IDs.
# The filename is specific to this threshold so the figure does not share a
# name with the other plots in this notebook.
# NOTE(review): assumes `py` is Plotly's plotting module, where reusing a
# filename can overwrite a previously saved figure - confirm in the imports.
plotthis = plot_record_number(filtered_ids, min=20)
py.iplot(plotthis, filename='species_id_counts_min_20')

#### We can also just extract the names using `get_species_names`, which also accepts a minimum number of IDs required and can print out the number of IDs for each species

In [9]:
# List the species that meet the minimum number of IDs
id_threshold = 5
get_species_names(filtered_ids, min=id_threshold)

Strongylocentrotus purpuratus
Acanthaster planci
Saccoglossus kowalevskii
Branchiostoma belcheri
Ciona intestinalis
Aplysia californica
Biomphalaria glabrata
Hypsibius dujardini


In [10]:
# List the species that meet the minimum number of IDs, with their counts
species_options = dict(min=5, counts=True)
get_species_names(filtered_ids, **species_options)

Strongylocentrotus purpuratus 14
Acanthaster planci 15
Saccoglossus kowalevskii 17
Branchiostoma belcheri 51
Ciona intestinalis 9
Aplysia californica 6
Biomphalaria glabrata 8
Hypsibius dujardini 13


#### `count_ids` is a function that counts the total number of sequences in filtered_ids, not just the number of unique species.

In [11]:
# Total number of sequences across every species in filtered_ids
sequence_total = count_ids(filtered_ids)
print ("There are %s total sequences in our filtered dataset." % sequence_total)

There are 351 total sequences in our filtered dataset.


#### As before, it can also take a minimum number of IDs required

In [12]:
min_num = 5
# Count only the sequences from species that meet the min_num threshold
qualifying_total = count_ids(filtered_ids, min=min_num)
summary = "There are %s total sequences in our filtered dataset that have %d or more IDs per species."
print (summary % (qualifying_total, min_num))

There are 133 total sequences in our filtered dataset that have 5 or more IDs per species.


In [13]:
# Print gene-region details (start, end, length, chromosome) for IDs in filtered_ids.
# NOTE(review): min=20 presumably restricts this to species with at least 20 IDs - confirm in checkGenome.
species_min_ids = 20
check_genomic_location(filtered_ids, min=species_min_ids)

Branchiostoma belcheri
['109487146', '109486409', '109484168', '109484145', '109483948', '109483947', '109483946', '109483935', '109483920', '109483337', '109483074', '109481879', '109481878', '109481830', '109480295', '109478642', '109477107', '109477038', '109475266', '109475079']

Gene id is 109487146 
Gene region starts at 1229419
Gene region ends at 1235098
Gene region is 5679 nucleotides long 
Chromosome is unassigned

Gene id is 109486409 
Gene region starts at 3644259
Gene region ends at 3648598
Gene region is 4339 nucleotides long 
Chromosome is unassigned

Gene id is 109484168 
Gene region starts at 1643154
Gene region ends at 1648384
Gene region is 5230 nucleotides long 
Chromosome is unassigned

Gene id is 109484145 
Gene region starts at 1649892
Gene region ends at 1653532
Gene region is 3640 nucleotides long 
Chromosome is unassigned

Gene id is 109483948 
Gene region starts at 239259
Gene region ends at 249557
Gene region is 10298 nucleotides long 
Chromosome is unassign

#### Because `filtered_ids` contains only the species names and the IDs belonging to each species, we need to map these IDs back to their full records. We can use the function `map_ids_to_records`, which allows us to select either all the records in `filtered_ids` or just the unique species.

In [14]:
# Map the species -> ID groupings back to their full sequence records:
# once for every filtered ID, and once with unique=True for the unique species.
filtered_records = map_ids_to_records(filtered_ids, full_record)
filtered_records_unique = map_ids_to_records(filtered_ids, full_record, unique=True)

# Check that the numbers are correct. The assertions fail loudly instead of
# relying on the reader to compare the printed values with earlier cells.
assert len(filtered_records) == count_ids(filtered_ids), "record count differs from the number of filtered IDs"
assert len(filtered_records_unique) == len(filtered_ids), "unique record count differs from the number of species"
print (len(filtered_records))
print (len(filtered_records_unique))

351
172


#### And now we can save these records to a new FASTA file using `write_fasta`

In [15]:
# Save both record sets, each to its own FASTA file
for records, out_path in (
    (filtered_records, "files/filtered_records.fasta"),
    (filtered_records_unique, "files/filtered_records_unique.fasta"),
):
    write_fasta(records, out_path)
