In [1]:
from fasta import *
import alignment
import utilities
import annotations
from checkGenome import *
from ipywidgets import widgets
from ipywidgets import *
from traitlets import *
from IPython.display import display

# Loading the files

The first thing we want to do is load in our files. Loading them in this way actually loads them into a dictionary where the keys are the record IDs and the values are the full records.

In [2]:
# Load the 2U1 files
# Directory that holds every input FASTA file; change this one line to relocate the data.
DATA_DIR = "/Users/gabefoley/Dropbox/PhD/Projects/2U1/2U1_2018/Excluding plants fungi nematodes insects and bacteria"


def _load_fasta(filename):
    """Parse the FASTA file `filename` inside DATA_DIR into a dictionary.

    Returns the result of `SeqIO.to_dict`, i.e. a dictionary keyed by record ID
    whose values are the full records.
    """
    return SeqIO.to_dict(SeqIO.parse(DATA_DIR + "/" + filename, "fasta"))


# The unfiltered set, followed by the sets at 40%, 50% and 55% identity
records = _load_fasta("concatenated_output.fasta")
records_40 = _load_fasta("40_percent_concatenated_output.fasta")
records_50 = _load_fasta("50_percent_concatenated_output.fasta")
records_55 = _load_fasta("55_percent_concatenated_output.fasta")


#### How many sequences do we have in the full records?

In [3]:
# Report the number of sequences in each record set, one count per line,
# in the order: unfiltered, 40%, 50%, 55% identity.
for record_set in (records, records_40, records_50, records_55):
    print(len(record_set))


19998
4686
622
421


# Creating subsets of the records
### Including / excluding based on header annotations

Now the workflow moves to including and excluding certain sequences. `subset_records` allows us to provide a list of terms which we either want or don't want in the header description. We can also give a minimum length for sequences to meet for inclusion.

We don't ever alter the original `full_record`, we just create new dictionary objects that are subsets.

We can either provide arguments directly to the function or we can pass in a list variable, such as `header_terms`.

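As an example, a subset such as `only_2U1_records` could be set to only include sequences which have either '2U1' or '2U1-like' in the header, and a subset such as `filtered_records` would contain the full set of sequences if we provide no length minimum (the default is 0) and pass in the currently empty `header_terms`. In the cells below, each record set is instead filtered with a minimum length of 400 and then passed to `exclude_character` with the character "X".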

In [4]:
# Starts empty; intended to collect header terms to include or exclude.
# NOTE(review): not referenced by any of the cells visible in this notebook.
header_terms = list()

In [5]:
# Length threshold handed to `subset_records` (the minimum length a sequence must meet).
MIN_SEQUENCE_LENGTH = 400
# Character handed to `exclude_character`.
# NOTE(review): presumably sequences containing it are dropped - confirm against the helper.
EXCLUDED_CHARACTER = "X"


def _filter_records(record_set):
    """Apply the notebook's standard two-step filter to one record dictionary.

    First calls `subset_records` with the length threshold and mode='exclude',
    then passes the result to `exclude_character`. Returns whatever
    `exclude_character` returns.
    """
    length_filtered = subset_records(records=record_set, length=MIN_SEQUENCE_LENGTH, mode='exclude')
    return exclude_character(length_filtered, EXCLUDED_CHARACTER)


# The same filter is applied to every record set. The names are re-bound because
# later cells refer to `records_50` and `records_55`.
records = _filter_records(records)
records_40 = _filter_records(records_40)
records_50 = _filter_records(records_50)
records_55 = _filter_records(records_55)

print ("The number of sequences without filtering for identity to the original candidates is %s" % (len(records)))
print ("The number of sequences with 40%% identity to the original candidates is %s" % (len(records_40)))
print ("The number of sequences with 50%% identity to the original candidates is %s" % (len(records_50)))
print ("The number of sequences with 55%% identity to the original candidates is %s" % (len(records_55)))



The number of sequences without filtering for identity to the original candidates is 15739
The number of sequences with 40% identity to the original candidates is 3047
The number of sequences with 50% identity to the original candidates is 299
The number of sequences with 55% identity to the original candidates is 275


Below is the code to get the IDs of records that appear in one set of records but not in another. Here we use it to work out which records are in our 50%-identity set but not in our 55%-identity set.

In [6]:
# Records that are in the 50% identity set but not in the 55% identity set
differentRecords = getDifferentRecords(records_50, records_55)

# How many records differ between the two sets
print(len(differentRecords))

# Show the full header description of each differing record
for record_id in differentRecords:
    record = differentRecords[record_id]
    print(record.description)
    # Optional (disabled): annotations.getCommonNameFromID(record_id)

    

24
EDL30916.1 mCG15468 [Mus musculus]
KTF79201.1 hypothetical protein cypCar_00047955 [Cyprinus carpio]
EPY88763.1 cytochrome P450 2C26-like isoform 2 [Camelus ferus]
XP_006123186.1 PREDICTED: cytochrome P450 2J2-like, partial [Pelodiscus sinensis]
KYO44822.1 hypothetical protein Y1Q_0006987 [Alligator mississippiensis]
ELW64418.1 Cytochrome P450 2D17 [Tupaia chinensis]
XP_016404253.1 PREDICTED: cytochrome P450 2J3-like isoform X2 [Sinocyclocheilus rhinocerous]
XP_014379039.1 PREDICTED: cytochrome P450 2A13-like isoform X1 [Alligator sinensis]
XP_023361104.1 uncharacterized protein LOC100926939 [Sarcophilus harrisii]
PIK38219.1 putative steroid 17-alpha-hydroxylase/17,20 lyase [Apostichopus japonicus]
XP_006768464.1 PREDICTED: cytochrome P450 2J2 [Myotis davidii]
XP_021107396.1 cytochrome P450 2B4 isoform X1 [Heterocephalus glaber]
XP_014801266.1 PREDICTED: cytochrome P450 2D1-like [Calidris pugnax]
XP_021107397.1 cytochrome P450 2B4 isoform X2 [Heterocephalus glaber]
XP_006825012.1 PR

In [7]:
def annotateHeader(full_records, annotate_records, headerText):
    """Append `headerText` to the description of selected records.

    Every record in `full_records` whose key is also found in
    `annotate_records` gets `headerText` appended to its `description`.
    The records are modified in place. A description that already ends with
    `headerText` is left unchanged, so re-running the cell does not stack the
    same marker several times.

    Args:
        full_records: dictionary mapping record IDs to record objects that
            expose a string `description` attribute.
        annotate_records: container of record IDs to annotate; membership is
            tested with `in` (a dictionary keyed by ID works).
        headerText: the text to append to each matching description.

    Returns:
        The same `full_records` object that was passed in.
    """
    for record_id in full_records:
        if record_id not in annotate_records:
            continue
        record = full_records[record_id]
        # Guard against repeated annotation when the cell is executed again
        if not record.description.endswith(headerText):
            record.description += headerText
    return full_records

# Mark the records that are in the 50% set but not in the 55% set
records_50 = annotateHeader(
    full_records=records_50,
    annotate_records=differentRecords,
    headerText="***Added_at_50***",
)

# Print every header in the 50% set so the annotated ones can be spotted
for record in records_50.values():
    print(record.description)
    

XP_005940065.1 PREDICTED: cytochrome P450 2U1 isoform X1 [Haplochromis burtoni]
XP_017318806.1 PREDICTED: cytochrome P450 2U1 [Ictalurus punctatus]
XP_005872504.1 PREDICTED: cytochrome P450 2U1 [Myotis brandtii]
ERE89448.1 cytochrome P450 2U1-like protein [Cricetulus griseus]
XP_005978933.1 PREDICTED: cytochrome P450 2U1-like [Pantholops hodgsonii]
XP_021330598.1 cytochrome P450 2U1 isoform X1 [Danio rerio]
ETE67196.1 Cytochrome protein, partial [Ophiophagus hannah]
XP_021014502.1 cytochrome P450 2U1 isoform X1 [Mus caroli]
XP_004380279.1 cytochrome P450 2U1 [Trichechus manatus latirostris]
XP_006825012.1 PREDICTED: cytochrome P450 2D6-like [Saccoglossus kowalevskii]***Added_at_50***
XP_003929520.1 PREDICTED: cytochrome P450 2U1 [Saimiri boliviensis boliviensis]
XP_019272361.1 PREDICTED: cytochrome P450 2U1 isoform X3 [Panthera pardus]
XP_003410436.1 cytochrome P450 2U1 isoform X1 [Loxodonta africana]
XP_021251581.1 cytochrome P450 2U1 [Numida meleagris]
XP_006061937.1 PREDICTED: cytoc

BAF82691.1 unnamed protein product [Homo sapiens]
KYO44822.1 hypothetical protein Y1Q_0006987 [Alligator mississippiensis]***Added_at_50***
XP_007899581.1 PREDICTED: cytochrome P450 2U1 [Callorhinchus milii]
XP_001366063.1 PREDICTED: cytochrome P450 2U1 [Monodelphis domestica]
XP_006629927.2 PREDICTED: cytochrome P450 2U1 [Lepisosteus oculatus]
NP_898898.1 cytochrome P450 2U1 [Homo sapiens]
XP_021540185.1 cytochrome P450 2U1 [Neomonachus schauinslandi]
EPY88763.1 cytochrome P450 2C26-like isoform 2 [Camelus ferus]***Added_at_50***
XP_014414682.1 PREDICTED: cytochrome P450 2U1 [Camelus ferus]
OCA35976.1 hypothetical protein XENTR_v90015294mg [Xenopus tropicalis]***Added_at_50***
XP_006787161.1 PREDICTED: cytochrome P450 2U1-like [Neolamprologus brichardi]
XP_019110856.1 PREDICTED: cytochrome P450 2U1-like [Larimichthys crocea]
KTF79021.1 hypothetical protein cypCar_00041961 [Cyprinus carpio]
XP_020757615.1 cytochrome P450 2U1 [Odocoileus virginianus texanus]
XP_023129780.1 cytochrome P4

# Saving the records to FASTA files
Now let's save the annotated records to a FASTA file.

In [8]:
# Convert the annotated 50% dictionary with `map_dict_to_records`
# NOTE(review): presumably this yields a flat collection of records - confirm against the helper.
annotated_records = map_dict_to_records(records_50)

# Report how many records will be written out
annotated_count = len(annotated_records)
print(annotated_count)

empty dict
299


And now we can save these records to a new FASTA file using `write_fasta`

In [10]:
# Destination FASTA file for the annotated 50% identity records
output_fasta_path = "/Users/gabefoley/Dropbox/PhD/Projects/2U1/2U1_2018/Excluding plants fungi nematodes insects and bacteria/180312_fifty_percent_identity/2U1_50_percent.fasta"
write_fasta(annotated_records, output_fasta_path)

We can also use the function `map_species_to_records` to just map a particular species to a FASTA file.