In [None]:
from fasta import *
import alignment
import utilities
import annotations
from checkGenome import *
from ipywidgets import widgets
from ipywidgets import *
from traitlets import *
from IPython.display import display

# Loading the files

The first thing we want to do is load in our files. Loading them in this way actually loads them into a dictionary where the keys are the record IDs and the values are the full records.

In [None]:
# Load the 2U1 files
# All four FASTA files live in the same directory, so the location is kept in
# one place. Edit DATA_DIR to point the notebook at a different copy of the data.
DATA_DIR = "/Users/gabefoley/Dropbox/PhD/Projects/2U1/2U1_2018/Excluding plants fungi nematodes insects and bacteria"


def load_fasta_records(filename):
    """Parse one FASTA file from DATA_DIR into a dictionary built by SeqIO.to_dict.

    filename -- name of the FASTA file inside DATA_DIR.

    Returns whatever SeqIO.to_dict produces for the parsed file (per the notes
    in this notebook: record ID -> full record).
    NOTE(review): SeqIO is presumably Biopython's Bio.SeqIO re-exported by the
    star imports at the top of the notebook -- confirm.
    """
    return SeqIO.to_dict(SeqIO.parse(DATA_DIR + "/" + filename, "fasta"))


# Unfiltered set, followed by the 40% / 50% / 55% identity sets
records = load_fasta_records("concatenated_output.fasta")
records_40 = load_fasta_records("40_percent_concatenated_output.fasta")
records_50 = load_fasta_records("50_percent_concatenated_output.fasta")
records_55 = load_fasta_records("55_percent_concatenated_output.fasta")


#### How many sequences do we have in the full records?

In [None]:
# Report the size of each record set, one count per line, in the order:
# unfiltered, 40%, 50%, 55%
for record_set in (records, records_40, records_50, records_55):
    print(len(record_set))


# Creating subsets of the records
### Including / excluding based on header annotations

Now the workflow moves to including and excluding certain sequences. `subset_records` allows us to provide a list of terms which we either want or don't want in the header description. We can also give a minimum length for sequences to meet for inclusion.

We don't ever alter the original `full_record`; we just create new dictionary objects that are subsets.

We can either provide arguments directly to the function or we can pass in a list variable, such as `header_terms`.

In the example below `only_2U1_records` is set to only include sequences which have either '2U1' or '2U1-like' in the header. And `filtered_records` will contain the full set of sequences as we are not providing a length minimum (and the default is 0) and we are passing in the currently empty `header_terms`.

In [None]:
# Terms to include or exclude when filtering on header text; starts out empty
header_terms = list()

In [None]:
def _apply_standard_filters(record_set):
    """Run the two filters used for every record set in this notebook.

    First subset_records is called with length=400 and mode='exclude', then
    exclude_character is called with "X" on the result.
    NOTE(review): presumably this drops sequences shorter than 400 and
    sequences containing an 'X' -- confirm against the helper definitions.
    """
    length_filtered = subset_records(records=record_set, length=400, mode='exclude')
    return exclude_character(length_filtered, "X")


# Each name is rebound to its filtered version, so the unfiltered dictionaries
# are no longer reachable under these names after this cell has run.
records = _apply_standard_filters(records)
records_40 = _apply_standard_filters(records_40)
records_50 = _apply_standard_filters(records_50)
records_55 = _apply_standard_filters(records_55)

# Report how many sequences remain in each set
for message, record_set in (
    ("The number of sequences without filtering for identity to the original candidates is %s", records),
    ("The number of sequences with 40%% identity to the original candidates is %s", records_40),
    ("The number of sequences with 50%% identity to the original candidates is %s", records_50),
    ("The number of sequences with 55%% identity to the original candidates is %s", records_55),
):
    print(message % len(record_set))



Below is the code to get the IDs of records that appear in one set of records but not in another. Here we use it to work out which records are in our 50% identity set but not our 55% identity set.

In [None]:
# Compare the 50% identity set against the 55% identity set
differentRecords = getDifferentRecords(records_50, records_55)

# How many records differ between the two sets
print(len(differentRecords))

# Print the description of every differing record, one per line
for record_id in differentRecords:
    record = differentRecords[record_id]
    print(record.description)
    # Disabled: annotations.getCommonNameFromID(record_id)

    

In [None]:
def annotateHeader(full_records, annotate_records, headerText):
    """Append headerText to the description of selected records.

    full_records     -- mapping of record ID -> record; every selected record
                        must have a string ``description`` attribute. The
                        records are modified in place.
    annotate_records -- container of record IDs; an ID is selected when
                        ``record_id in annotate_records`` is true.
    headerText       -- text appended to each selected description.

    Returns the same ``full_records`` object that was passed in.

    A description that already ends with headerText is left alone, so
    re-running the calling cell does not stack the same tag repeatedly.
    """
    for record_id in full_records:
        if record_id not in annotate_records:
            continue
        record = full_records[record_id]
        # Skip records that already carry the tag (keeps the call idempotent)
        if not record.description.endswith(headerText):
            record.description += headerText
    return full_records

# Tag appended to the description of every 50% record that is also in differentRecords
added_at_50_tag = "***Added_at_50***"
records_50 = annotateHeader(records_50, differentRecords, added_at_50_tag)

# Show every description in the 50% set so the tagged ones can be checked by eye
for record in records_50.values():
    print(record.description)
    

# Saving the records to FASTA files
Now let's save the annotated records to a FASTA file.

In [None]:
# Convert the annotated 50% dictionary with map_dict_to_records
# (presumably into a plain sequence of records -- confirm against the helper)
annotated_records = map_dict_to_records(records_50)

# Print how many records came back
annotated_count = len(annotated_records)
print(annotated_count)

And now we can save these records to a new FASTA file using `write_fasta`.

In [None]:
# Destination file for the annotated 50% identity records
output_fasta = "/Users/gabefoley/Dropbox/PhD/Projects/2U1/2U1_2018/Excluding plants fungi nematodes insects and bacteria/180312_fifty_percent_identity/2U1_50_percent.fasta"
write_fasta(annotated_records, output_fasta)

We can also use the function `map_species_to_records` to just map a particular species to a FASTA file.