In [None]:
import json, collections, re, pandas, os, glob
import Bio
import Bio.SeqIO


In [None]:

class convert_uniprot():
    """Look up names for UniProt accessions from a tab-separated table.

    The table must provide a ``From`` column (the lookup key) and a ``To``
    column (the value returned by ``convert``).
    """

    def __init__(
        self, table_fname: str = '/Users/dp/pma/RBP missense mutations/domains/uniprot_gene_id_to_symbol.txt'):
        """Read ``table_fname`` and build the ``From`` -> ``To`` dictionary."""
        table = pandas.read_csv(table_fname, sep='\t')
        self.df = table
        # When a key is listed more than once, the last row wins.
        self.uni_to_name = {
            source: target for source, target in zip(table['From'], table['To'])}

    def convert(self, uniprot_id: str):
        """Return the mapped name for ``uniprot_id``.

        Everything from the first '.' onwards is dropped before the lookup.
        Identifiers absent from the table are returned in that trimmed form.
        """
        accession = uniprot_id.partition('.')[0]
        if accession in self.uni_to_name:
            return self.uni_to_name[accession]
        return accession

    def is_uniprot(self, uniprot_id):
        """Tell whether ``uniprot_id`` (trimmed at the first '.') is in the table."""
        return uniprot_id.partition('.')[0] in self.uni_to_name
    
# Shared converter instance used by parse_blast(). It is built with the default
# table path, so that file is read as soon as this cell runs.
uniprot_id_converter = convert_uniprot()

In [None]:

def parse_blast(blast_json_filename: str, evalue_cutoff: float = 1E-50) -> str:
    """Summarise a BLAST JSON export as one tab-separated row per query.

    Each entry of ``BlastOutput2`` is read either as a ``bl2seq`` report
    (subject named by the hit's ``title``) or as a ``search`` report (subject
    named by the hit's ``id``). For ``search`` hits whose id contains
    ``sp|<accession>|``, the accession is translated with the module-level
    ``uniprot_id_converter`` when that converter knows it. Only subjects with
    at least one alignment whose e-value is below ``evalue_cutoff`` are kept.

    Rows are ordered by the number found in the query title after the first
    '-' and any letters, and before an '_' (pattern ``-<letters><digits>_``).
    Titles without such a number are listed last, in the order they were read.

    The table is also printed, followed by a variant without the query column.

    Args:
        blast_json_filename: Path to the BLAST JSON file.
        evalue_cutoff: Alignments must have an e-value strictly below this.

    Returns:
        One line per query: ``query<TAB>subjects<TAB>gene``. ``subjects`` is a
        comma-separated list. ``gene`` is the text before the first '_' of the
        subject when exactly one subject matched, otherwise empty.
    """
    # Close the file deterministically instead of leaking the handle.
    with open(blast_json_filename) as handle:
        blast = json.load(handle)['BlastOutput2']

    def collect(hit, match_name, query_hits):
        # Record the e-value of every alignment that passes the cutoff.
        for alignment in hit['hsps']:
            if alignment['evalue'] < evalue_cutoff:
                query_hits[match_name].append(alignment['evalue'])

    hits = {}
    for entry in blast:
        report = entry['report']['results']

        if 'bl2seq' in report:
            report = report['bl2seq']
            if not report:
                # Nothing to name the query from; skip instead of IndexError.
                continue
            query = report[0]['query_title']
            hits[query] = collections.defaultdict(list)

            for comparison in report:
                for hit in comparison['hits']:
                    collect(hit, hit['description'][0]['title'], hits[query])
        else:
            report = report['search']
            query = report['query_title']
            hits[query] = collections.defaultdict(list)

            for hit in report['hits']:
                match_name = hit['description'][0]['id']

                m = re.search(r'sp\|(.+)\|', match_name)
                if m is not None and uniprot_id_converter.is_uniprot(m.group(1)):
                    match_name = uniprot_id_converter.convert(m.group(1))

                collect(hit, match_name, hits[query])

    number_in_title = re.compile(r'[^-]*-[A-Za-z]*(\d+)_')

    def sort_key(title):
        # Titles without a number sort last instead of raising on `.group`.
        found = number_in_title.search(title)
        return float(found.group(1)) if found else float('inf')

    rows = ''
    rows_no_fname = ''

    for query in sorted(hits, key=sort_key):
        s = ','.join(hits[query])
        rows += f'{query}\t{s}'
        rows_no_fname += f'{s}'

        if len(hits[query]) == 1:
            gene_name = s.split('_')[0]
            rows += f'\t{gene_name}'
        else:
            rows += '\t'

        rows += '\n'
        rows_no_fname += '\n'

    print(rows)
    print('---')
    print(rows_no_fname)
    print('---')
    return rows

# BLAST JSON exports from individual sequencing runs. Every assignment
# overwrites the previous one, so only the last uncommented path is parsed;
# the earlier lines are kept as a record of runs handled before.
input_filename = '/Users/dp/Desktop/sanger/seqs/191011_maxipreps_missense_pSLX3/U1BKW19Z114-Alignment.json'
input_filename = '/Users/dp/Desktop/sanger/seqs/191010_minipreps_missense_pSLX3/TZ4CJWR5114-Alignment.json'
input_filename = '/Users/dp/Desktop/sanger/seqs/191017_pSLX3_maxipreps_missense/UKTE09DF114-Alignment.json'
input_filename = '/Users/dp/Desktop/sanger/seqs/191018_maxipreps_missense_pSLX4_Ct_except_PCBP1_is_pSLX3/UPKNZDWC114-Alignment.json'
input_filename = '/Users/dp/Desktop/sanger/seqs/191028_maxiprep_missense_both_N_and_C_all_no_uORF/VGUSHWK111N-Alignment.json'
#input_filename = '/Users/dp/Desktop/sanger/seqs/191019_maxiprep_cds_sequencing_verifications_missense/UYC92776114-Alignment.json'
input_filename = '/Users/dp/Desktop/sanger/seqs/191210_rpl5_rpl10_in_pSLX4_and_upf2_pSLX3/Z58M621R114-Alignment.json'
input_filename = '/Users/dp/Desktop/sanger/seqs/200117_UPF2_DIS3_SMAD3_SMAD4_BRCA1_OAS1_BCLAF1_all_Nt_pSLX3_except_BCLAF1_Ct/26NJZ0YA114-Alignment.json'
input_filename = '/Users/dp/Desktop/sanger/seqs/200117_UPF2_DIS3_SMAD3_SMAD4_BRCA1_OAS1_BCLAF1_all_Nt_pSLX3_except_BCLAF1_Ct/26NMC1R6014-Alignment.json'
input_filename = '/Users/dp/Desktop/sanger/seqs/200122_BARD1_BRCA1_sequencixng/2H5096ZD016-Alignment.json'
input_filename = '/Users/dp/Desktop/sanger/seqs/190219_SMAD3_BRCA1_OTHERS_RESEQ_PSLX3_except_BCLAF1_is_pSLX4/4XHTKXTS016-Alignment.json'
#+input_filename = '/Users/dp/Desktop/sanger/seqs/190218_SMAD3_4_BRCA1_PLSX3_OTHERS_ALL_PSLX3_OR_PSLX4/4UKMRK2S014-Alignment.json'
# Parse the selected export; parse_blast() also prints the resulting table.
out = parse_blast(input_filename)

# Save the table as parsed_blast.txt in the same directory as the input file.
with open(os.path.dirname(input_filename) + '/parsed_blast.txt', 'w') as f:
    f.write(out)

In [None]:
def check_uORF_and_tag_status_in_dir(input_folder: str = None, input_filename: str = None) -> str:
    """Run ``check_uORF_and_tag_status_in_seq`` on a set of reads and report.

    Two input modes are supported:

    * ``input_folder`` given: every ``*.seq`` file in that folder is read as a
      single-record FASTA file and labelled with its file name.
    * ``input_folder`` omitted: ``input_filename`` is read as a multi-record
      FASTA file and each record is labelled with ``record.name``.

    Reads are ordered by the number found in the label after the first '-' and
    any letters, and before an '_' (pattern ``-<letters><digits>_``). Labels
    without such a number are listed last.

    Two blocks are printed: ``label<TAB>result`` lines, then the result lines
    on their own.

    Args:
        input_folder: Folder containing ``*.seq`` files. Used when given.
        input_filename: Multi-record FASTA file. Used when no folder is given.

    Returns:
        The labelled report (the first printed block).

    Raises:
        ValueError: If neither ``input_folder`` nor ``input_filename`` is given.
    """
    if input_folder is None and input_filename is None:
        raise ValueError("Provide either input_folder or input_filename.")

    number_in_name = re.compile(r'[^-]*-[A-Za-z]*(\d+)_')

    def sort_key(name):
        # Names without a number sort last instead of raising on `.group`.
        found = number_in_name.search(os.path.basename(name))
        return float(found.group(1)) if found else float('inf')

    if input_folder is None:
        # Input file, not folder.
        input_folder = os.path.dirname(input_filename)
        print(f"Parsing fasta in {input_folder}.")

        labelled = [
            (record.name, check_uORF_and_tag_status_in_seq(record.seq))
            for record in Bio.SeqIO.parse(input_filename, 'fasta')]
        labelled.sort(key=lambda pair: sort_key(pair[0]))
    else:
        # Input folder. os.path.join keeps an empty folder name relative to
        # the working directory instead of globbing the filesystem root.
        fnames = glob.glob(os.path.join(input_folder, '*.seq'))
        # The path is a tie-breaker so the order does not depend on glob.
        fnames.sort(key=lambda path: (sort_key(path), path))
        labelled = [
            (os.path.basename(fname),
             check_uORF_and_tag_status_in_seq(Bio.SeqIO.read(fname, 'fasta').seq))
            for fname in fnames]

    # Each result already ends with a newline.
    lines = ''.join(f'{label}\t{result}' for label, result in labelled)
    lines_just_result = ''.join(result for _, result in labelled)

    print(lines)
    print(lines_just_result)
    return lines
    
def check_uORF_and_tag_status_in_seq(seq: str) -> str:
    """Report which construct landmarks are present in one sequencing read.

    The result is a single newline-terminated line with four space-separated
    fields: read direction, uORF call, N-terminal tag call, C-terminal call.
    The uORF and N-terminal tag fields are left empty for reverse reads.
    """
    full_tag = 'ATGGATTATAAAGATGACGACGATAAAGCAGGTTACCCATACGATGTGCCTGACTATGCTGCAGGTTCACATCATCACCACCATCATGGATCCATG'
    tag_minus_last_atg = full_tag[:-3]

    # Presence of the downstream marker labels the read as reverse, '(R)'.
    is_reverse_read = 'TCCACCACACTGGACTAGTGGATC' in seq
    direction = '(R)' if is_reverse_read else '(F)'

    if is_reverse_read:
        uorf_call = ''
        tag_call = ''
    else:
        sees_uorf = 'ATGGCATAACGACTAA' in seq
        sees_no_uorf = 'CTGGCATAACGACTAA' in seq
        if sees_uorf == sees_no_uorf:
            # Both variants found, or neither: no call can be made.
            uorf_call = 'Unclear uORF.'
        elif sees_uorf:
            uorf_call = 'uORF.'
        else:
            uorf_call = 'No uORF.'

        if full_tag in seq:
            tag_call = 'Has entire tag.'
        elif tag_minus_last_atg in seq:
            tag_call = "See entire tag, but no second ATG starting the protein of interest (may be intentional)."
        else:
            tag_call = "Don't see entire tag."

    # The reverse-complement marker takes precedence when both are present.
    if 'CACTAATGATGGTGGTGATG' in seq:
        c_terminal_call = 'C-t (R).'
    elif 'CATCACCACCATCATTAGTG' in seq:
        c_terminal_call = 'C-t (F).'
    else:
        c_terminal_call = ''

    # 'TGA>TCA TAA>TTA TAG>CTA': the three stop codons as they appear here,
    # directly after the shared 'TCTTTATAATCTGGATC' stretch.
    stop_signatures = (
        'TCTTTATAATCTGGATCTTA',
        'TCTTTATAATCTGGATCTCA',
        'TCTTTATAATCTGGATCCTA',
    )
    if any(signature in seq for signature in stop_signatures):
        c_terminal_call += ' Has a stop before C-t tag. Not usable.'

    return ' '.join((direction, uorf_call, tag_call, c_terminal_call)) + '\n'


# Check every record of the combined FASTA file; the function prints its report.
check_uORF_and_tag_status_in_dir(input_filename='/Users/dp/Desktop/sanger/seqs/200403_re_maxi_prep_for_more_vector/all.fa')