In [4]:
from math import pi

In [5]:
import fasta
import alignment
import utilities
from checkGenome import *
from ipywidgets import widgets
from ipywidgets import *
from traitlets import *
from IPython.display import display

# Loading the files

The first thing we want to do is load in our files. Loading them in this way actually loads them into a dictionary where the keys are the record IDs and the values are the full records.

In [7]:
# Location of the reduced-exon 2U1 FASTA file.
# NOTE(review): absolute, machine-specific path - it will not resolve on another machine.
two_u1_fasta_path = "/Users/gabefoley/Dropbox/PhD/Projects/2U1/2U1_2018/Excluding plants fungi nematodes insects and bacteria/180312_fifty_percent_identity/Alignment_exclusion/2U1_50_percent_motif_reduced_exons.fasta"

# Read the sequences from that file into full_record
full_record = utilities.load_sequences(two_u1_fasta_path)

#### How many sequences do we have in the full records?

In [8]:
# Count the loaded records and show the total
record_total = len(full_record)
print(record_total)

177


In [11]:
# For every record ID, show the record's description followed by its name
for record in full_record:
    entry = full_record[record]
    print(entry.description)
    print(entry.name)

XP_010739611.2*TAG 5* XP_010739611.2 PREDICTED: cytochrome P450 2U1 [Larimichthys crocea]
XP_010739611.2*TAG
XP_019110856.1*TAG 5* XP_019110856.1 PREDICTED: cytochrome P450 2U1-like [Larimichthys crocea]
XP_019110856.1*TAG
XP_023283957.1*TAG 5* XP_023283957.1 cytochrome P450 2U1 [Seriola lalandi dorsalis]
XP_023283957.1*TAG
XP_022621190.1*TAG 5* XP_022621190.1 cytochrome P450 2U1 [Seriola dumerili]
XP_022621190.1*TAG
XP_018539665.1*TAG 5* XP_018539665.1 PREDICTED: LOW QUALITY PROTEIN: cytochrome P450 2U1 [Lates calcarifer]
XP_018539665.1*TAG
XP_020447513.1*TAG 5* XP_020447513.1 cytochrome P450 2U1 isoform X1 [Monopterus albus]
XP_020447513.1*TAG
XP_019956775.1*TAG 5* XP_019956775.1 PREDICTED: cytochrome P450 2U1 [Paralichthys olivaceus]
XP_019956775.1*TAG
XP_004549474.1*TAG 5* XP_004549474.1 cytochrome P450 2U1 [Maylandia zebra]
XP_004549474.1*TAG
XP_006787161.1*TAG 5* XP_006787161.1 PREDICTED: cytochrome P450 2U1-like [Neolamprologus brichardi]
XP_006787161.1*TAG
XP_005457042.1*TAG 5*

# Creating subsets of the records
### Including / excluding based on header annotations

Now the workflow moves to including and excluding certain sequences. `subset_records` allows us to provide a list of terms which we either want or don't want in the header description. We can also give a minimum length for sequences to meet for inclusion.

We don't ever alter the original `full_record`, we just create new dictionary objects that are subsets.

We can either provide arguments directly to the function or we can pass in a list variable, such as `header_terms`.

In the example below `only_2U1_records` is set to only include sequences which have either '2U1' or '2U1-like' in the header. `filtered_records` is built from the same two terms with `mode='exclude'`, so it contains the sequences that have neither term in the header. Both calls also set a minimum length of 400 with `length=400`. The currently empty `header_terms` list is not used in this cell.

In [12]:
# Start with no include / exclude terms; the list is filled in later
header_terms = list()

In [15]:
# Split the records on whether the header mentions '2U1' / '2U1-like'.
# Both calls pass the same terms and the same minimum length (400); only the
# mode differs: "include" keeps matching headers, "exclude" keeps the rest.
only_2U1_records = subset_records("2U1", "2U1-like", records=full_record, length=400, mode="include")
filtered_records = subset_records("2U1", "2U1-like", records=full_record, length=400, mode="exclude")

print ("The number of sequences with either 2U1 or 2U1-like in the header is %s " % (len(only_2U1_records)))
# filtered_records is the exclude-mode subset, so it is not expected to be the
# same size as full_record; report it against the total instead of claiming equality.
print ("The number of sequences with neither 2U1 nor 2U1-like in the header is %s out of %s in total" % (len(filtered_records), len(full_record)))

The number of sequences with either 2U1 or 2U1-like in the header is 170 
The number of sequences we've filtered is 7 which should be equal to 177


### Adding terms to the `header_terms` variable
The following section makes it easy to add in terms to the `header_terms` variable and to save these files for later use.

Let's first print out the terms in our variable and the length of it. As you add to the list you can always come back and rerun this cell to peek inside the `header_terms` variable

In [None]:
# Peek at the terms collected so far, then at how many there are
term_count = len(header_terms)
print(header_terms)
print(term_count)

The first thing we might be interested in doing is to print out the header information of the sequences we currently have.

In [16]:
# Show the header description of every record currently in filtered_records
for record in filtered_records:
    entry = filtered_records[record]
    print(entry.description)

CAG11477.1*TAG 5* CAG11477.1 unnamed protein product, partial [Tetraodon nigroviridis]
AGN04284.1 cytochrome P450 [Oryzias melastigma]
KTF87224.1*TAG 5* KTF87224.1 hypothetical protein cypCar_00019558 [Cyprinus carpio]
EFB16740.1 hypothetical protein PANDA_005136, partial [Ailuropoda melanoleuca]
EHH26107.1*TAG 5* EHH26107.1 hypothetical protein EGK_15996, partial [Macaca mulatta]
ARO89866.1 cytochrome P450 Cyp2u1 [Andrias davidianus]
NP_001106471.1*TAG 5* NP_001106471.1 cytochrome P450 family 2 subfamily U member 1 [Xenopus tropicalis]


The cell below will add items to our `header_terms` variable. Hit run on the cell and you'll see an input box - simply add the words you want to add, separated by spaces.

# Evaluating how many hits per species

`build_species_count` builds a dictionary which has the set of unique species as its keys and a list of the sequence IDs that belong to each unique species as its values. So we can use it to easily see how many unique species we have and which species are over-represented.

In [17]:
# Group the record IDs by species, then report how many distinct species there are
species_counts = build_species_count(records=full_record)
unique_species_total = len(species_counts)
print("There are %s unique species in our dataset." % unique_species_total)

There are 168 unique species in our dataset.


### Plotting the frequency of proteins per species
`plot_record_number` is a function that plots the numbers of IDs per species. We can set a minimum number of IDs that a species must have in order to be plotted.

In [18]:
# Bar chart of IDs per species, limited to species with at least 3 IDs
plotthis = plot_record_number(
    species_counts,
    "Bar",
    min_length=3,
)
py.iplot(plotthis, filename='inline_bar')

In [20]:
# Same bar chart with the threshold lowered to 2 IDs per species
plotthis = plot_record_number(
    species_counts,
    "Bar",
    min_length=2,
)
py.iplot(plotthis, filename='inline_bar')

In [None]:
# Bar chart of IDs per species, limited to species with at least 5 IDs.
# The threshold keyword is `min_length`, matching the other
# plot_record_number calls in this notebook (this cell previously passed `min`).
plotthis = plot_record_number(species_counts, "Bar", min_length=5)
py.iplot(plotthis, filename='inline_bar')

#### We can also just extract the names using `get_species_names`, which also accepts a minimum number of IDs required and can print out the count for each species

In [38]:
# Collect the species names from species_counts, passing min=1 as the minimum
# number of IDs a species must have, then print each returned entry on its own line.
species_names = get_species_names(species_counts, min=1)
for name in species_names:
    print (name)

Sclerotinia_sclerotiorum_1980_UF-70
Aspergillus_wentii_DTO_134E9
Aspergillus_cristatus
Aspergillus_niger
Aspergillus_carbonarius_ITEM_5010
Aspergillus_brasiliensis_CBS_101740
Aspergillus_kawachii_IFO_4308
Aspergillus_ruber_CBS_135680
Aspergillus_glaucus_CBS_516.65
Penicillium_arizonense
Penicillium_italicum
Penicillium_freii
Penicillium_polonicum
Penicillium_nordicum
Penicillium_solitum
Penicillium_camemberti_FM_013
Penicillium_digitatum_Pd1
Penicillium_expansum
Aspergillus_bombycis
Aspergillus_nomius_NRRL_13137
Aspergillus_parasiticus_SU-1
Byssochlamys_spectabilis_No._5
Penicillium_flavigenum
Penicillium_nalgiovense
Penicillium_steckii
Penicillium_decumbens
Aspergillus_terreus_NIH2624
Aspergillus_oryzae
Penicillium_brasilianum
Aspergillus_sydowii_CBS_593.65
Aspergillus_versicolor_CBS_583.65
Cordyceps_confragosa_RCEF_1005
Cordyceps_confragosa
Penicillium_griseofulvum
Aspergillus_niger_ATCC_1015
Aspergillus_luchuensis
Aspergillus_lentulus
Aspergillus_turcosus
Penicillium_antarcticum
Pen

In [36]:
# Same lookup with a minimum of 3 IDs per species; counts=True asks for the
# per-species count to be included alongside each name. Each returned entry is
# printed on its own line.
species_names_with_counts = get_species_names(species_counts, min=3, counts=True)
for name in species_names_with_counts:
    print (name)

Sclerotinia_sclerotiorum_1980_UF-70 4
Aspergillus_wentii_DTO_134E9 7
Aspergillus_cristatus 5
Aspergillus_niger 5
Aspergillus_ruber_CBS_135680 4
Aspergillus_glaucus_CBS_516.65 4
Penicillium_arizonense 5
Penicillium_freii 4
Penicillium_expansum 4
Aspergillus_bombycis 4
Aspergillus_nomius_NRRL_13137 5
Aspergillus_parasiticus_SU-1 5
Penicillium_flavigenum 4
Penicillium_nalgiovense 4
Aspergillus_versicolor_CBS_583.65 4
Aspergillus_turcosus 4
Penicillium_antarcticum 4
Serendipita_indica_DSM_11827 4
Fomitiporia_mediterranea_MF3/22 5
Fibularhizoctonia_sp._CBS_109695 6
Aureobasidium_namibiae_CBS_147.97 4
Aureobasidium_subglaciale_EXF-2481 4
Aureobasidium_pullulans_EXF-150 5
Neofusicoccum_parvum_UCRNP2 4
Aureobasidium_melanogenum_CBS_110374 4
Alternaria_alternata 4


### Counting the total number of sequences with multiple hits
`count_ids` is a function that counts the total number of sequences in a species count dictionary, not just the number of unique species.

As before, it can also take a minimum number of IDs required

In [None]:
# Threshold: minimum number of IDs a species needs for the second total
min_num = 5

# Total over every species in species_counts
overall_total = count_ids(species_counts)
print("There are %s total sequences in our filtered dataset." % overall_total)

# Total restricted to species that meet the threshold
thresholded_total = count_ids(species_counts, min=min_num)
print("There are %s total sequences in our filtered dataset that have %d or more IDs per species." % (thresholded_total, min_num))

# Generating datasets containing information about species with multiple hits
For each species that has more than the given number of hits, we create 
1. A FASTA file of the protein sequences from that species
2. An alignment of the protein sequences
3. An information file telling us where in the genome the protein maps to
4. A visual diagram of the genome mapping the proteins to the genome

In [30]:
def generate_multiple_hit_data(species_names, species_counts, full_record, file_path, min_hits=1):
    """Save and draw the genomic locations of the proteins in `species_counts`.

    Calls `check_genomic_location` twice over `species_counts`: once with
    `file_path + " gene locations "` as its `file_path` argument, and once
    with `visualise="linear"`.

    The per-species FASTA export and MAFFT alignment step is currently
    disabled (see the commented-out block in the body), so `species_names`
    and `full_record` are accepted but not used.

    Args:
        species_names: Species names to process. Unused while the FASTA /
            alignment step is disabled.
        species_counts: Species-to-IDs mapping, forwarded unchanged to
            `check_genomic_location`.
        full_record: Full set of records. Unused while the FASTA /
            alignment step is disabled.
        file_path: Prefix for the gene-location output; the literal
            " gene locations " is appended to it.
        min_hits: Value forwarded as `min` to both `check_genomic_location`
            calls. Defaults to 1, the value that was previously hard-coded.
    """
    # Disabled: per-species FASTA export and MAFFT alignment.
    # for name in species_names:
    #     seqs = map_species_to_records(species_counts[name], full_record)
    #     write_fasta(seqs, file_path + name + " sequences")
    #     alignmentFile = alignment.alignWithMAFFT(file_path + name + " sequences")
    #     alignment.writeAlignment(alignmentFile, file_path + name + ".aln", "fasta")

    # First pass passes a file_path for the locations, second pass asks for the linear diagram.
    check_genomic_location(species_counts, min=min_hits, file_path=file_path + " gene locations ")
    check_genomic_location(species_counts, min=min_hits, visualise="linear")


# Take every species with at least one ID and generate its location data
species_names = get_species_names(species_counts, min=1)
generate_multiple_hit_data(
    species_names=species_names,
    species_counts=species_counts,
    full_record=full_record,
    file_path="files/multiple_hits/",
)

Cercospora beticola
PIA94173.1 OR PIA98779.1 OR 
{}
Aspergillus arachidicola
PIG81432.1 OR PIG85841.1 OR PIG86610.1 OR 
{}
Fomitiporia mediterranea MF3/22
XP_007263416.1 OR XP_007270208.1 OR XP_007270219.1 OR XP_007270266.1 OR XP_007270267.1 OR 
{'XP_007270267.1': '18677144'}
Auricularia subglabra TFB-10046 SS5
XP_007343182.1 OR XP_007343183.1 OR 
{'XP_007343183.1': '18856100'}
Trametes versicolor FP-101664 SS1
XP_008033399.1 OR XP_008038061.1 OR 
{'XP_008038061.1': '19407809'}
Cercospora beticola
PIA94173.1 OR PIA98779.1 OR 
{}
Aspergillus arachidicola
PIG81432.1 OR PIG85841.1 OR PIG86610.1 OR 
{}
Fomitiporia mediterranea MF3/22
XP_007263416.1 OR XP_007270208.1 OR XP_007270219.1 OR XP_007270266.1 OR XP_007270267.1 OR 
{'XP_007270267.1': '18677144'}
Auricularia subglabra TFB-10046 SS5
XP_007343182.1 OR XP_007343183.1 OR 
{'XP_007343183.1': '18856100'}
Trametes versicolor FP-101664 SS1
XP_008033399.1 OR XP_008038061.1 OR 
{'XP_008038061.1': '19407809'}


Or we could just use parts of this function. The cell below will just print out the locations of the proteins in the genome. We could save this to disk by providing an argument to the `file_path` variable or visualise it by providing either 'linear' or 'circular' to the `visualise` variable.

In [None]:
# Look up the genomic locations for species_counts with min=5.
# No `file_path` or `visualise` argument is passed in this call.
check_genomic_location(species_counts, min=5)

# Saving the records to FASTA files
Because `species_counts` just contains the species names and the IDs belonging to each species, we need to map these IDs back to their full records. We can use the function `map_ids_to_records`, which allows us to select all the records in `species_counts` or just the unique species.

In [None]:
# Map the IDs held in species_counts back to their full records:
# once for everything, once with unique=True
filtered_records = map_ids_to_records(species_counts, full_record)
filtered_records_unique = map_ids_to_records(species_counts, full_record, unique=True)

# Sanity check: show the size of each mapping
for subset in (filtered_records, filtered_records_unique):
    print(len(subset))

And now we can save these records to a new FASTA file using `write_fasta`

In [None]:
# Destination FASTA file for the mapped records
smaller_records_path = "files/2U1_BLAST_smaller_records.fasta"
write_fasta(filtered_records, smaller_records_path)
# write_fasta(filtered_records_unique, "files/2U1_BLAST_filtered_records_unique.fasta")

We can also use the function `map_species_to_records` to just map a particular species to a FASTA file.

In [None]:
# Species whose records are exported on their own.
# NOTE(review): names printed from get_species_names elsewhere in this notebook
# use underscores rather than spaces - confirm this key exists in species_counts.
species_key = 'Priapulus caudatus'
species_ids = species_counts[species_key]

# Map that species' IDs to full records and write them to their own FASTA file
priapulus_caudatus = map_species_to_records(species_ids, full_record)
write_fasta(priapulus_caudatus, "files/priapulus_caudatus.fasta")