In [1]:
from fasta import *
import alignment
import utilities
import annotations
from checkGenome import *
from ipywidgets import widgets
from ipywidgets import *
from traitlets import *
from IPython.display import display

# Loading the files

The first thing we want to do is load in our files. Loading them in this way actually loads them into a dictionary where the keys are the record IDs and the values are the full records.

In [2]:
# Load the 2U1 files.
# All four FASTA files live in the same directory, so the directory is named once
# here; edit DATA_DIR to point the notebook at a different copy of the data.
DATA_DIR = "/Users/gabefoley/Dropbox/PhD/Projects/2U1/2U1_2018/Excluding plants fungi nematodes insects and bacteria"


def load_fasta_records(filename):
    """Parse the FASTA file `filename` inside DATA_DIR into a dictionary keyed by record ID."""
    return SeqIO.to_dict(SeqIO.parse(DATA_DIR + "/" + filename, "fasta"))


records = load_fasta_records("concatenated_output.fasta")
records_40 = load_fasta_records("40_percent_concatenated_output.fasta")
records_50 = load_fasta_records("50_percent_concatenated_output.fasta")
records_55 = load_fasta_records("55_percent_concatenated_output.fasta")


#### How many sequences do we have in the full records?

In [3]:
# Print the size of each record set, one count per line, in the order:
# records, records_40, records_50, records_55.
for record_set in (records, records_40, records_50, records_55):
    print (len(record_set))


19998
4686
622
421


# Creating subsets of the records
### Including / excluding based on header annotations

Now the workflow moves to including and excluding certain sequences. `subset_records` allows us to provide a list of terms which we either want or don't want in the header description. We can also give a minimum length for sequences to meet for inclusion.

We don't ever alter the original `full_record`, we just create new dictionary objects that are subsets.

We can either provide arguments directly to the function or we can pass in a list variable, such as `header_terms`.

In the example below `only_2U1_records` is set to only include sequences which have either '2U1' or '2U1-like' in the header. And `filtered_records` will contain the full set of sequences as we are not providing a length minimum (and the default is 0) and we are passing in the currently empty `header_terms`.

In [4]:
# A blank list to hold the header terms we want to exclude or include.
# NOTE(review): no cell visible in this notebook reads `header_terms` afterwards — confirm it is still needed.
header_terms = []

In [5]:
def _apply_length_and_x_filters(record_set):
    """Run `subset_records` (length=400, mode='exclude') and then `exclude_character` with "X"."""
    length_filtered = subset_records(records=record_set, length=400, mode='exclude')
    return exclude_character(length_filtered, "X")


# Each name is rebound to its filtered version, so later cells see only the filtered sets.
records = _apply_length_and_x_filters(records)
records_40 = _apply_length_and_x_filters(records_40)
records_50 = _apply_length_and_x_filters(records_50)
records_55 = _apply_length_and_x_filters(records_55)

print ("The number of sequences without filtering for identity to the original candidates is %s" % len(records))
print ("The number of sequences with 40%% identity to the original candidates is %s" % len(records_40))
print ("The number of sequences with 50%% identity to the original candidates is %s" % len(records_50))
print ("The number of sequences with 55%% identity to the original candidates is %s" % len(records_55))



The number of sequences without filtering for identity to the original candidates is 15739
The number of sequences with 40% identity to the original candidates is 3047
The number of sequences with 50% identity to the original candidates is 299
The number of sequences with 55% identity to the original candidates is 275


In [7]:
# Compare the 50% and 55% record sets.
# NOTE(review): presumably the records present in one set but not the other — confirm against getDifferentRecords.
differentRecords = getDifferentRecords(records_50, records_55)

print (len(differentRecords))

# Show the full header description of every differing record.
for rec_id, rec in differentRecords.items():
    print (rec.description)
# Optional common-name lookup by record ID, left disabled:
#     annotations.getCommonNameFromID(rec_id)

    

24
PIO41157.1 hypothetical protein AB205_0089410 [Rana catesbeiana]
XP_006768464.1 PREDICTED: cytochrome P450 2J2 [Myotis davidii]
XP_014379039.1 PREDICTED: cytochrome P450 2A13-like isoform X1 [Alligator sinensis]
XP_016404253.1 PREDICTED: cytochrome P450 2J3-like isoform X2 [Sinocyclocheilus rhinocerous]
XP_021107397.1 cytochrome P450 2B4 isoform X2 [Heterocephalus glaber]
XP_004941452.1 PREDICTED: vitamin D 25-hydroxylase isoform X2 [Gallus gallus]
ELW64418.1 Cytochrome P450 2D17 [Tupaia chinensis]
XP_014801266.1 PREDICTED: cytochrome P450 2D1-like [Calidris pugnax]
KYO44822.1 hypothetical protein Y1Q_0006987 [Alligator mississippiensis]
OCA35976.1 hypothetical protein XENTR_v90015294mg [Xenopus tropicalis]
XP_006123186.1 PREDICTED: cytochrome P450 2J2-like, partial [Pelodiscus sinensis]
XP_013766542.1 PREDICTED: cytochrome P450 2J6-like [Pundamilia nyererei]
EPY88763.1 cytochrome P450 2C26-like isoform 2 [Camelus ferus]
XP_013865517.1 PREDICTED: cytochrome P450 2K4-like isoform X2 

In [None]:
# NOTE(review): `exclude_list` is assigned but the calls below pass their patterns as literals — confirm whether it should be used.
exclude_list = ["2U1"]

# Regex-based exclusion on each record set (mode="exclude").
exclude_records = subset_records_with_regex("2U1", records=records, mode="exclude")
exclude_records_40 = subset_records_with_regex("2U1", records=records_40, mode="exclude")
# NOTE(review): this call uses the pattern "grab" rather than "2U1" like the two above — confirm that is intended.
exclude_records_55 = subset_records_with_regex("grab",records=records_55, mode="exclude")

# Report how many records each result holds.
for excluded_set in (exclude_records, exclude_records_40, exclude_records_55):
    print (len(excluded_set))


In [None]:
# Print the header description of every record in exclude_records_55.
for record_id in exclude_records_55:
    excluded_record = exclude_records_55[record_id]
    print (excluded_record.description)

In [None]:
# Print the header description of every record in exclude_records_40.
for record_id in exclude_records_40:
    excluded_record = exclude_records_40[record_id]
    print (excluded_record.description)

In [None]:
# Build the species counts from the 40% identity record set (`records_40`);
# the later cells shown in this notebook all reuse this `species_counts`.
species_counts = build_species_count(records=records_40)


In [None]:
# List every species name returned for min=1, one per line.
species_names = get_species_names(species_counts, min=1)
for species_name in species_names:
    print (species_name)

### Plotting the frequency of proteins per species
`plot_record_number` is a function that plots the numbers of IDs per species. We can set a minimum number of IDs that a species must have in order to be plotted.

In [None]:
# Bar plot of record numbers per species, requested with min=3.
bar_figure = plot_record_number(species_counts, "Bar", min=3)
py.iplot(bar_figure, filename='inline_bar')

In [None]:
# Bar plot of record numbers per species, requested with min=2.
bar_figure = plot_record_number(species_counts, "Bar", min=2)
py.iplot(bar_figure, filename='inline_bar')

In [None]:
# Bar plot of record numbers per species, requested with min=5.
bar_figure = plot_record_number(species_counts, "Bar", min=5)
py.iplot(bar_figure, filename='inline_bar')

#### We can also just extract the names using `get_species_names`, which also accepts a minimum number of IDs required and can print out the number of counts for each species

In [None]:
# Same listing as before, but requested with min=3 and counts=True.
species_names_with_counts = get_species_names(species_counts, min=3, counts=True)
for name_with_count in species_names_with_counts:
    print (name_with_count)

### Counting the total number of sequences with multiple hits
`count_ids` is a function that counts the total number of sequences in a species count dictionary, not just the number of unique species.

As before, it can also take a minimum number of IDs required

In [None]:
# Minimum number of IDs per species used for the second count below.
min_num = 5

total_ids = count_ids(species_counts)
print ("There are %s total sequences in our filtered dataset." % total_ids)

total_ids_at_min = count_ids(species_counts, min=min_num)
print ("There are %s total sequences in our filtered dataset that have %d or more IDs per species." % (total_ids_at_min, min_num))

# Generating datasets containing information about species with multiple hits
For each species that has more than the given number of hits, we create:
1. A FASTA file of the protein sequences from that species
2. An alignment of the protein sequences
3. An information file telling us where in the genome the protein maps to
4. A visual diagram of the genome mapping the proteins to the genome

In [None]:
def generate_multiple_hit_data(species_names, species_counts, full_record, file_path):
    """Run the genomic-location check for `species_counts`, once to file and once as a diagram.

    Makes two `check_genomic_location` calls, both with ``min=1``: the first with
    ``file_path=file_path + " gene locations "``, the second with
    ``visualise="linear"``.

    Parameters:
        species_names: currently unused; only the disabled per-species step below reads it.
        species_counts: passed unchanged to `check_genomic_location`.
        full_record: currently unused; only the disabled per-species step below reads it.
        file_path: string prefix to which " gene locations " is appended for the first call.

    Returns:
        None.
    """
    # Disabled per-species step (FASTA export followed by a MAFFT alignment), kept for reference:
    #     for name in species_names:
    #         seqs = map_species_to_records(species_counts[name], full_record)
    #         write_fasta(seqs, file_path + name + " sequences")
    #         alignmentFile = alignment.alignWithMAFFT(file_path + name + " sequences")
    #         alignment.writeAlignment(alignmentFile, file_path + name + ".aln", "fasta")

    check_genomic_location(species_counts, min=1, file_path=file_path + " gene locations ")
    check_genomic_location(species_counts, min=1, visualise="linear")


species_names = get_species_names(species_counts, min=1)
# `full_record` is not assigned in any cell of this notebook, so referencing it fails
# on a fresh kernel. `records_40` is the record set `species_counts` was built from,
# so it is passed as the full-record argument instead.
generate_multiple_hit_data(species_names, species_counts, records_40, "files/multiple_hits/")

Or we could just use parts of this function. The cell below will just print out the locations of the proteins in the genome. We could save this to disk by providing an argument to the `file_path` variable or visualise it by providing either 'linear' or 'circular' to the `visualise` variable.

In [None]:
# Called with only min=5 — no `file_path` or `visualise` argument is passed here.
check_genomic_location(species_counts, min=5)

# Saving the records to FASTA files
Because `species_counts` just contains the species names and the IDs found for each species, we need to map these IDs back to their full records. We can use the function `map_ids_to_records`, which allows us to select either all of the records or just the unique species.

In [None]:
# `species_counts` was built from `records_40`, and the files written from these
# results are named "2U1_40_percent*", so the IDs are mapped back against
# `records_40` (the original looked them up in `records_55`, a different record set).
filtered_records = map_ids_to_records(species_counts, records_40)
filtered_records_unique = map_ids_to_records(species_counts, records_40, unique=True)

# Check that the numbers are correct
print (len(filtered_records))
print (len(filtered_records_unique))

And now we can save these records to a new FASTA file using `write_fasta`

In [None]:
# Destination files for the mapped records and for the unique-species subset.
all_records_path = "/Users/gabefoley/Dropbox/PhD/Projects/2U1/2U1_2018/Excluding plants fungi nematodes insects and bacteria/2U1_40_percent.fasta"
unique_records_path = "/Users/gabefoley/Dropbox/PhD/Projects/2U1/2U1_2018/Excluding plants fungi nematodes insects and bacteria/2U1_40_percent_unique.fasta"

write_fasta(filtered_records, all_records_path)
write_fasta(filtered_records_unique, unique_records_path)
# Alternative project-relative destination, left disabled:
# write_fasta(filtered_records_unique, "files/2U1_BLAST_filtered_records_unique.fasta")

We can also use the function `map_species_to_records` to just map a particular species to a FASTA file.

In [None]:
# `full_record` is not assigned in any cell of this notebook; `records_40` is the
# record set `species_counts` was built from, so the species' IDs are looked up there.
priapulus_caudatus = map_species_to_records(species_counts['Priapulus caudatus'], records_40)
write_fasta(priapulus_caudatus, "files/priapulus_caudatus.fasta")