In [2]:
import utilities
import fasta
import alignment_curation
import glob

## First pull and collate the hits from the BLAST results

In [2]:
# # Get the output from the BLAST search into the full FASTA files
# filepaths = glob.glob("/Users/gabefoley/Dropbox/PhD/Projects/2U1/2U1_2018/Excluding plants fungi nematodes insects and bacteria/180312_fifty_percent_identity/Further_exclusions/April_2018/CYP2R1/Candidates/Results/*.fasta")

# utilities.save_ids(*filepaths, percent_identity=50, output_dir="/Users/gabefoley/Dropbox/PhD/Projects/2U1/2U1_2018/Excluding plants fungi nematodes insects and bacteria/180312_fifty_percent_identity/Further_exclusions/April_2018/CYP2R1/Candidates/Full_FASTA/")

In [3]:
# Read in and collate the full-length FASTA files produced from the BLAST hits.
# glob.glob returns matches in an arbitrary, filesystem-dependent order, so the paths are
# sorted to make the load order (and anything downstream that depends on it) reproducible.
fasta_files = sorted(glob.glob("/Users/gabefoley/Dropbox/PhD/Projects/2U1/2U1_2018/Excluding plants fungi nematodes insects and bacteria/180312_fifty_percent_identity/Further_exclusions/April_2018/CYP2R1/Candidates/Full_FASTA/*.fasta"))

records = utilities.load_sequences(*fasta_files)

In [4]:
# Report how many hits were loaded before any filtering is applied.
hit_count_before = len(records)
print("The total length of hits before cleaning them up is %d " % hit_count_before)

The total length of hits before cleaning them up is 690 


In [5]:
# Clean the hits in three successive stages: a length filter (intended to keep records
# longer than 400 amino acids), removal of records containing "X" characters, and a
# requirement that the Cytochrome P450 motif is present.

# Motif source: Guengerich, Waterman & Egli 2016 (doi: 10.1016/j.tips.2016.05.006) FXXGXbXXCXG
# NOTE(review): the cited motif has 11 positions while the pattern below has 10 - confirm which is intended.
cytochrome_p450_motif = "F..G[HRK]..C.G"

# Each stage gets its own name so intermediate results can be inspected; the final
# result is still published as `reduced_records`.
length_filtered_records = fasta.subset_records(records=records, length=400, mode='exclude')
x_free_records = fasta.exclude_character(length_filtered_records, "X")
reduced_records = fasta.subset_on_motif(x_free_records, cytochrome_p450_motif)

In [6]:
# Report how many hits remain after the length, "X" and motif filters.
hit_count_after = len(reduced_records)
print("The total length of hits after cleaning them up is %d " % hit_count_after)

The total length of hits after cleaning them up is 370 


## Subsetting on only 2R1 annotated sequences

The easiest way to do this without accidentally dropping genuine 2R1 sequences is to first create a negative set that excludes every sequence whose header matches a vitamin D 25-hydroxylase / CYP2R1 term, check what is left over, and then reuse those same header terms as the terms we want to include.

In [7]:
# Header substrings used to recognise a record as CYP2R1 / vitamin D 25-hydroxylase.
header_terms = (
    "vitamin D 25-hydroxylase",
    "CYP2R1",
    "2r1",
    "family 2 subfamily r",
    "family 2, subfamily r",
)

# Negative set: subset in "exclude" mode (case-insensitive) on the terms above, so the
# remaining records can be reviewed by hand.
exclusion_options = dict(records=reduced_records, mode="exclude", ignore_case=True)
not_2R1_records = fasta.subset_records(*header_terms, **exclusion_options)

In [8]:
# for record in not_2R1_records.values():
#     print (record.description)

I then went through these records manually and added back any sequence that either mapped to 2R1 or whose header did not mention a particular cytochrome P450:
* AAI68986.1
* KTF79658.1
* AAI55663.1
* AGN04335.1
* AAI54114.1

In [9]:
# Accessions added back after manually reviewing the negative set.
manually_confirmed_ids = ("AAI68986.1", "KTF79658.1", "AAI55663.1", "AGN04335.1", "AAI54114.1")

# Append only the accessions that are not already present, so re-running this cell
# does not keep growing header_terms with duplicate entries.
header_terms += tuple(term for term in manually_confirmed_ids if term not in header_terms)

# Rebuild the negative set with the extended term list.
not_2R1_records = fasta.subset_records(*header_terms, records=reduced_records, mode="exclude", ignore_case=True)

In [10]:
# for record in not_2R1_records.values():
#     print (record.description)

Now we can switch from excluding to including with the same header terms, which leaves only the sequences that we believe to be cytochrome P450 2R1.

In [None]:
# Select the records matched by the header terms. Matching is case-insensitive here to
# mirror the exclusion step above: several terms are lower-case ("2r1",
# "family 2 subfamily r"), so a case-sensitive include would not be the complement of
# the reviewed negative set.
only_2R1_records = fasta.subset_records(*header_terms, records=reduced_records, mode="include", ignore_case=True)
print("The total length of hits after cleaning them up and only including cytochrome P450 2R1 is %d " % len(only_2R1_records))

# Convert the selected records for writing out in the next cell.
cleaned_records = fasta.map_dict_to_records(only_2R1_records)

In [None]:
# Save the cleaned CYP2R1 sequences to a single FASTA file on disk.
cleaned_fasta_path = "/Users/gabefoley/Dropbox/PhD/Projects/2U1/2U1_2018/Excluding plants fungi nematodes insects and bacteria/180312_fifty_percent_identity/Further_exclusions/April_2018/CYP2R1/CYP2R1_50_percent_identity.fasta"
fasta.write_fasta(cleaned_records, cleaned_fasta_path)

## Now we reduce the alignment down on the basis of internal deletions

In [11]:
# alignment.align_with_mafft("/Users/gabefoley/Dropbox/PhD/Projects/2U1/2U1_2018/Excluding plants fungi nematodes insects and bacteria/180312_fifty_percent_identity/Further_exclusions/April_2018/CYP2R1/CYP2R1_50_percent_identity.fasta")

# Run the automated curation on the "insertions removed" alignment.
# NOTE(review): 0.1 and 20 are positional settings defined by alignment_curation - confirm their meaning there.
curation_input = "/Users/gabefoley/Dropbox/PhD/Projects/2U1/2U1_2018/Excluding plants fungi nematodes insects and bacteria/180312_fifty_percent_identity/Further_exclusions/April_2018/CYP2R1/CYP2R1_50_percent_identity_insertions_removed.aln"
curation_output = "/Users/gabefoley/Dropbox/PhD/Projects/2U1/2U1_2018/Excluding plants fungi nematodes insects and bacteria/180312_fifty_percent_identity/Further_exclusions/April_2018/CYP2R1/Automated_alignment/CYP2R1_50_percent_identity_insertions_removed"
alignment_curation.automated_curation(curation_input, 0.1, 20, outpath=curation_output)


defaultdict(<class 'list'>, {64: ['XP_015852823.1'], 399: ['XP_015852825.1']})
The candidate sequence is
['XP_015852825.1']
Count is  1
defaultdict(<class 'list'>, {149: ['XP_015852823.1', 'AAI68986.1', 'XP_006507901.1'], 160: ['XP_005216116.1']})
The candidate sequence is
['XP_005216116.1']
Count is  2
defaultdict(<class 'list'>, {149: ['XP_015852823.1', 'AAI68986.1', 'XP_006507901.1', 'XP_005216114.1', 'XP_010828870.1', 'XP_011951097.1']})
checking for most deletions
The candidate sequence is
['XP_015852823.1', 'AAI68986.1', 'XP_006507901.1', 'XP_005216114.1', 'XP_010828870.1', 'XP_011951097.1']
Count is  3
defaultdict(<class 'list'>, {108: ['XP_015852826.1']})
The candidate sequence is
['XP_015852826.1']
Count is  4
defaultdict(<class 'list'>, {47: ['XP_008851744.1'], 63: ['XP_015852824.1'], 82: ['XP_017658800.1'], 94: ['KYO22076.1', 'KYO22077.1']})
checking for most deletions
The candidate sequence is
['KYO22076.1', 'KYO22077.1']
Count is  5
defaultdict(<class 'list'>, {47: ['XP_00

In [5]:
# Run the automated curation again, this time on the "I_D_I" alignment, with the same settings.
# NOTE(review): 0.1 and 20 are positional settings defined by alignment_curation - confirm their meaning there.
idi_alignment_path = "/Users/gabefoley/Dropbox/PhD/Projects/2U1/2U1_2018/Excluding plants fungi nematodes insects and bacteria/180312_fifty_percent_identity/Further_exclusions/April_2018/CYP2R1/Automated_alignment/CYP2R1_50_percent_identity_I_D_I.aln"
idi_output_prefix = "/Users/gabefoley/Dropbox/PhD/Projects/2U1/2U1_2018/Excluding plants fungi nematodes insects and bacteria/180312_fifty_percent_identity/Further_exclusions/April_2018/CYP2R1/Automated_alignment/CYP2R1_50_percent_identity_I_D_I"
alignment_curation.automated_curation(idi_alignment_path, 0.1, 20, outpath=idi_output_prefix)

The candidate sequence is
None
Count is  1
We are finished
