* This notebook requires a FASTA file and an HMMER profile (`.hmm`) file.
* It searches the sequences in the FASTA file with the profile and checks whether each sequence has a hit.
* It then creates a new FASTA file that contains the sequence content only from the start of the profile hit until the C-terminus.
* If a sequence does not have a hit for the profile it will not appear in the new FASTA file.

In [4]:
import os
import subprocess

from Bio import SearchIO

import sequence

# Input: FASTA file containing the sequences to be searched
fasta_path = './a_regions_1128.fasta'

# Input: HMMER profile the sequences are searched with
hmm_path = './pore.hmm'

# Intermediate: text report written by the hmmer search
outpath = './a_regions_1128.txt'

# Output: FASTA file that receives the trimmed sequences
fasta_outpath = './a_regions_1128_pore.fasta'

In [5]:
# Load the input sequences; later cells look them up by name to trim them
seqs = sequence.readFastaFile(fasta_path)

# Search the sequences with the profile.
# The command is passed as an argument list (no shell involved), so paths
# containing spaces or shell metacharacters are passed through unchanged.
# check=True raises subprocess.CalledProcessError when hmmsearch exits with a
# non-zero status, instead of letting later cells parse a stale or missing
# results file.
#   -o <file>   write the human-readable report to <file>
#   --domT 1    report domains with a bit score >= 1
hmmsearch_result = subprocess.run(
    ["hmmsearch", "-o", outpath, "--domT", "1", hmm_path, fasta_path],
    check=True,
)

# Show the exit status as the cell output (0 on success)
hmmsearch_result.returncode

0

In [6]:
def get_pos_dict_from_hmm(path):
    """Map each hit sequence ID to the coordinates of its best profile match.

    Reads the hmmsearch text report at ``path`` with Biopython's
    ``SearchIO.read(path, 'hmmer3-text')`` and returns one
    ``(hit_start, hit_end)`` tuple per hit sequence.

    A single sequence can produce several HSPs (one per matched domain).
    When that happens the HSP with the highest bit score is kept; on a tie the
    HSP that appears first in the report wins.  Without this, every later HSP
    for the same sequence would silently overwrite the earlier one.

    As a diagnostic, the number of HSPs and the number of loaded sequences are
    printed, followed by a warning line when there are more HSPs than
    sequences.  The sequence count comes from the notebook-level ``seqs``
    variable, which must already be defined when this function is called.

    Args:
        path: Path to the hmmsearch text output file.

    Returns:
        dict mapping hit ID -> (hit_start, hit_end) of the best-scoring HSP,
        using the coordinates exactly as Biopython reports them.
    """
    qresult = SearchIO.read(path, 'hmmer3-text')

    print(len(qresult.hsps))
    print(len(seqs))

    # More HSPs than sequences means at least one sequence matched the
    # profile in more than one place.
    if len(qresult.hsps) > len(seqs):
        print("ERROR: More HSPs than expected")

    # Keep only the highest-scoring HSP for every hit sequence.
    best_hsp = {}
    for hsp in qresult.hsps:
        hit_id = hsp.hit.id
        current = best_hsp.get(hit_id)
        if current is None or hsp.bitscore > current.bitscore:
            best_hsp[hit_id] = hsp

    return {
        hit_id: (hsp.hit_start, hsp.hit_end)
        for hit_id, hsp in best_hsp.items()
    }
            
# Look up where the profile matched each sequence, keyed by sequence name
pos_dict = get_pos_dict_from_hmm(outpath)

trimmed = []

for seq in seqs:
    if seq.name not in pos_dict:
        # No profile match: report it and leave the sequence out of the output
        print(seq.name + " didn't have a hit for pore")
        continue

    # Keep everything from the start of the match through the end of the sequence
    hit_start = pos_dict[seq.name][0]
    tail = "".join(seq.sequence[hit_start:])
    trimmed.append(sequence.Sequence(tail, name=seq.name))

# Write only the sequences that had a hit
sequence.writeFastaFile(fasta_outpath, trimmed)

585
564
ERROR: More HSPs than expected
NZ_VOBM01000105.1_information_Pseudomonas_sp._region_A1_expanded_4032976_4035892_backward_joined_A2_expanded_4031147_4032953_backward didn't have a hit for pore
NZ_QUMR01000030.1_information_Pseudomonas_sp._region_A1_expanded_2440753_2444485_backward_joined_A2_expanded_2438435_2440754_backward didn't have a hit for pore
NZ_QVIG01000004.1_information_Kitasatospora_xanthocidica_region_TcdA1_expanded_1514345_1522262_backward didn't have a hit for pore
NZ_FONE01000126.1_information_Nitrosomonas_sp._region_TcdA1_expanded_1271605_1280425_forward didn't have a hit for pore
NZ_CBLI010000886.1_information_Yersinia_wautersii_region_TcdA1_expanded_3904237_3906139_backward didn't have a hit for pore
NC_020418.1_information_Morganella_morganii_region_TcdA1_expanded_2005504_2009848_backward didn't have a hit for pore
NZ_FORB01000034.1_information_Pseudomonas_sp._region_TcdA1_expanded_1386611_1386896_backward didn't have a hit for pore
NZ_FORB01000034.1_informat