In [1]:
import utilities
import fasta
import alignment_curation
import glob

In [9]:
# Read in and collate the FASTA files.
# Every input and output of this analysis lives under one project directory, so the
# shared prefix is written once and the individual paths are built from it.
project_dir = "/Users/gabefoley/Dropbox/PhD/Projects/2U1/2U1_2018/Excluding plants fungi nematodes insects and bacteria/180312_fifty_percent_identity/"

cyp2U1_records = utilities.load_sequences(project_dir + "2U1_50_percent.fasta")

# glob.glob returns its matches in arbitrary order; sorting makes sure the files are
# handed to load_sequences in the same order on every run.
fasta_files = sorted(glob.glob(project_dir + "Further_exclusions/April_2018/CYP2R1/Candidates/Full_FASTA/*.fasta"))

cyp2R1_records = utilities.load_sequences(*fasta_files)

# The trailing "/" is kept on purpose: later cells build file paths by appending to this string.
working_dir = project_dir + "Further_exclusions/April_2018/CYP2U1_CYP2R1/"

In [3]:
# Report how many records were loaded for each family before any filtering.
cyp2U1_raw_count = len(cyp2U1_records)
print("The total length of CYP2U1 hits before cleaning them up is %d " % cyp2U1_raw_count)
cyp2R1_raw_count = len(cyp2R1_records)
print("The total length of CYP2R1 hits before cleaning them up is %d " % cyp2R1_raw_count)

The total length of CYP2U1 hits before cleaning them up is 299 
The total length of CYP2R1 hits before cleaning them up is 690 


In [4]:
# Only include records > 400 amino acids, with no X characters, and with the Cytochrome P450 motif

# Cytochrome P450 motif from Guengerich, Waterman & Egli 2016 (doi: 10.1016/j.tips.2016.05.006) FXXGXbXXCXG
# The pattern is passed to fasta.subset_on_motif in the filtering cell; presumably it is
# interpreted as a regular expression ("." = any residue, "[HRK]" = one of H/R/K) - confirm.
# NOTE(review): the cited motif FXXGXbXXCXG has an X between G and the basic residue b,
# whereas the pattern below places [HRK] immediately after G (i.e. FXXGbXXCXG).
# Confirm which form is intended before relying on the filtered counts.
cytochrome_p450_motif = "F..G[HRK]..C.G"



In [5]:
def _filter_candidates(raw_records):
    """Run the three filtering steps on one set of records.

    Calls, in order, fasta.subset_records (length=400, mode='exclude'),
    fasta.exclude_character ("X") and fasta.subset_on_motif (with
    cytochrome_p450_motif), then converts the result with
    fasta.map_dict_to_records.

    Returns a tuple: (result of the motif filter, result of map_dict_to_records).
    """
    filtered = fasta.subset_records(records=raw_records, length=400, mode='exclude')
    filtered = fasta.exclude_character(filtered, "X")
    filtered = fasta.subset_on_motif(filtered, cytochrome_p450_motif)
    return filtered, fasta.map_dict_to_records(filtered)


# Same pipeline for both families; the *_reduced_records names are reused by later cells.
cyp2U1_reduced_records, cyp2U1_cleaned_records = _filter_candidates(cyp2U1_records)
cyp2R1_reduced_records, cyp2R1_cleaned_records = _filter_candidates(cyp2R1_records)


In [6]:
# Report how many records remain for each family once the filters have been applied.
cyp2U1_cleaned_count = len(cyp2U1_cleaned_records)
print("The total length of CYP2U1 hits after cleaning them up is %d " % cyp2U1_cleaned_count)
cyp2R1_cleaned_count = len(cyp2R1_cleaned_records)
print("The total length of CYP2R1 hits after cleaning them up is %d " % cyp2R1_cleaned_count)

The total length of CYP2U1 hits after cleaning them up is 260 
The total length of CYP2R1 hits after cleaning them up is 370 


In [None]:
# Write the cleaned sequence file to disk
# NOTE(review): both write calls below are commented out, so running this cell writes nothing.
# Uncomment them to regenerate the two FASTA files. Presumably these files are the starting
# point for the manually edited "*_I.fasta" files loaded in a later cell - confirm.

# fasta.write_fasta(cyp2U1_cleaned_records, "/Users/gabefoley/Dropbox/PhD/Projects/2U1/2U1_2018/Excluding plants fungi nematodes insects and bacteria/180312_fifty_percent_identity/Further_exclusions/April_2018/CYP2U1_CYP2R1/cyp2U1_50_percent_identity.fasta")

# fasta.write_fasta(cyp2R1_cleaned_records, "/Users/gabefoley/Dropbox/PhD/Projects/2U1/2U1_2018/Excluding plants fungi nematodes insects and bacteria/180312_fifty_percent_identity/Further_exclusions/April_2018/CYP2U1_CYP2R1/cyp2R1_50_percent_identity.fasta")

## Now we manually remove sequences with long insertions - these are the sequences we removed

In [15]:
# Load the sequence sets that remain after the manual removal of long insertions.
# working_dir already ends with "/", so the file name is appended without a second separator.
cyp2U1_insertions_removed = utilities.load_sequences(working_dir + "cyp2U1_50_percent_identity_I.fasta")
cyp2R1_insertions_removed = utilities.load_sequences(working_dir + "cyp2R1_50_percent_identity_I.fasta")

# fasta.compare_fasta prints the differing IDs itself and, per the recorded output of
# this cell, returns None - so it is called directly rather than wrapped in print(),
# which only added a stray "None" line after each list.
print ("These are the sequences we removed from CYP2U1 for having insertions")
fasta.compare_fasta(cyp2U1_reduced_records, cyp2U1_insertions_removed)

print ("These are the sequences we removed from CYP2R1 for having insertions")
fasta.compare_fasta(cyp2R1_reduced_records, cyp2R1_insertions_removed)


These are the sequences we removed from CYP2U1 for having insertions
PIK38219.1
XP_002605102.1
PIO41157.1
XP_014379039.1
XP_023361104.1
KKF09825.1
KYO44822.1
OCA35976.1
None
These are the sequences we removed from CYP2R1 for having insertions
XP_015485136.1
XP_018112723.1
XP_018112722.1
XP_018112721.1
OWK58087.1
XP_009688434.1
PIO41157.1
PIO29018.1
XP_017660785.1
OCA35976.1
XP_014379039.1
XP_002605102.1
None


## Now we reduce the alignment down on the basis of internal deletions

In [16]:
# Run the automated curation on the CYP2R1 alignment (the "_I" file, i.e. after the
# manual insertion removal) and write the result under working_dir.
# Paths are built from working_dir (which ends with "/") instead of repeating the
# absolute directory, matching how the previous cell locates its input files.
# NOTE(review): 0.1 and 20 are passed positionally; their meaning is defined by
# alignment_curation.automated_curation - confirm and consider naming them.

# The equivalent CYP2U1 run is currently disabled:
# alignment_curation.automated_curation(working_dir + "cyp2U1_50_percent_identity_I.aln", 0.1, 20, outpath=working_dir + "cyp2U1_50_percent_I_D")

alignment_curation.automated_curation(
    working_dir + "cyp2R1_50_percent_identity_I.aln",
    0.1,
    20,
    outpath=working_dir + "cyp2R1_50_percent_I_D",
)

defaultdict(<class 'list'>, {406: ['XP_015852825.1']})
The candidate sequence is
['XP_015852825.1']
Count is  1
defaultdict(<class 'list'>, {28: ['XP_015852823.1'], 30: ['AAI68986.1', 'XP_006507901.1'], 52: ['XP_005216116.1'], 302: ['POI34245.1'], 310: ['XP_023372804.1', 'XP_018880964.1', 'XP_017708555.1', 'XP_023051618.1', 'XP_017202860.1']})
checking for most deletions
The candidate sequence is
['XP_023372804.1', 'XP_018880964.1', 'XP_017708555.1', 'XP_023051618.1', 'XP_017202860.1']
Count is  2
defaultdict(<class 'list'>, {28: ['XP_015852823.1'], 30: ['AAI68986.1', 'XP_006507901.1'], 52: ['XP_005216116.1'], 302: ['POI34245.1']})
The candidate sequence is
['POI34245.1']
Count is  3
defaultdict(<class 'list'>, {28: ['XP_015852823.1', 'AAI68986.1', 'XP_006507901.1'], 52: ['XP_005216116.1'], 221: ['XP_015852826.1']})
The candidate sequence is
['XP_015852826.1']
Count is  4
defaultdict(<class 'list'>, {28: ['XP_015852823.1', 'AAI68986.1', 'XP_006507901.1'], 52: ['XP_005216116.1'], 80: ['

In [None]:
## Now we check the exon counts of the sequences in CYP2U1 and CYP2R1
