In [25]:
import shutil, os, subprocess
import pandas as pd

# Standard genetic code, sense codons only (stop codons are handled separately
# so the caller can choose how they are rendered).
_CODON_TABLE = {
    'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M',
    'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T',
    'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K',
    'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R',
    'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L',
    'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
    'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q',
    'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R',
    'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
    'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',
    'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E',
    'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G',
    'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S',
    'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L',
    'TAC': 'Y', 'TAT': 'Y',
    'TGC': 'C', 'TGT': 'C', 'TGG': 'W',
}
_STOP_CODONS = frozenset(('TAA', 'TAG', 'TGA'))
# Characters outside A/C/G/T are passed through unchanged by str.translate.
_COMPLEMENT = str.maketrans('ACGT', 'TGCA')


def translate(seq, frame=1, stop_symbol=''):
    """Translate an upper-case DNA string into a protein string.

    Parameters
    ----------
    seq : str
        Nucleotide sequence using the letters A, C, G, T.
    frame : int
        Reading frame, 1-6. Frames 1-3 read the given strand starting at
        offset 0, 1, 2. Frames 4-6 read the reverse complement of ``seq``
        starting at offset 0, 1, 2.
    stop_symbol : str
        Text emitted for each stop codon. The default ``''`` drops stop codons
        and keeps translating (the historical behaviour); pass ``'_'`` to get
        output that ``split_translation`` can split into ORF fragments.

    Returns
    -------
    str
        The translated sequence. Translation ends at the first codon that is
        not in the codon table (e.g. one containing ``N`` or lower-case
        letters); a trailing partial codon is ignored.

    Raises
    ------
    ValueError
        If ``frame`` is not one of 1-6.
    """
    if frame not in (1, 2, 3, 4, 5, 6):
        raise ValueError(f'frame must be an integer from 1 to 6, got {frame!r}')

    if frame > 3:
        # Reverse-strand frames must read the REVERSE complement; complementing
        # without reversing would read the opposite strand 3'->5'.
        seq = seq.translate(_COMPLEMENT)[::-1]

    offset = (int(frame) - 1) % 3
    residues = []
    # Stop two short of the end so only complete codons are visited.
    for pos in range(offset, len(seq) - 2, 3):
        codon = seq[pos:pos + 3]
        if codon in _STOP_CODONS:
            residues.append(stop_symbol)
            continue
        amino_acid = _CODON_TABLE.get(codon)
        if amino_acid is None:
            # Unrecognised codon: stop here, as the original implementation did.
            break
        residues.append(amino_acid)
    return ''.join(residues)

def split_translation(full_seq, len_cutoff=10):
    """Split a translated sequence on '_' and keep only the long fragments.

    A fragment is kept when its length is strictly greater than ``len_cutoff``.
    Returns the kept fragments as a list, in their original order.
    """
    kept = []
    for fragment in full_seq.split('_'):
        if len(fragment) > len_cutoff:
            kept.append(fragment)
    return kept

def print_fasta(sub_list, prefix='test'):
    """Print each sequence in ``sub_list`` as a FASTA record on stdout.

    Records are named ``>{prefix}_{n}``, where ``n`` is the zero-based position
    of the sequence in ``sub_list``.
    """
    for number, sequence in enumerate(sub_list):
        header = f'>{prefix}_{number}'
        print(header, sequence, sep='\n')
        
def save_fasta(sub_list, prefix, output):
    """Write each sequence in ``sub_list`` to ``output`` as a FASTA record.

    The file is opened in write mode, so existing content is replaced. Records
    are named ``>{prefix}_{n}``, where ``n`` is the zero-based position of the
    sequence in ``sub_list``.
    """
    records = (
        f'>{prefix}_{position}\n{sequence}\n'
        for position, sequence in enumerate(sub_list)
    )
    with open(output, 'w') as handle:
        handle.writelines(records)
            
def _iter_fasta_records(handle):
    """Yield ``(header_line, sequence)`` for each FASTA record in ``handle``.

    ``header_line`` keeps its leading '>'. Sequence lines are concatenated
    after trailing whitespace/newlines are removed. Records with no sequence
    lines are skipped, as are any lines that precede the first header.
    """
    header = None
    chunks = []
    for raw in handle:
        # rstrip() instead of slicing off one character: the final line may
        # lack a newline, and CRLF files would otherwise leave a stray '\r'.
        line = raw.rstrip()
        if line.startswith('>'):
            if header is not None and chunks:
                yield header, ''.join(chunks)
            header = line
            chunks = []
        elif header is not None and line:
            chunks.append(line)
    if header is not None and chunks:
        yield header, ''.join(chunks)


def process_fasta(infile, frames, outfile):
    """Translate every record of a nucleotide FASTA file in the given frames.

    For each input record and each frame in ``frames``, one record is appended
    to ``outfile`` with the header ``{original header line}__fr_{frame}``
    followed by the output of ``translate(sequence, frame)``.

    Parameters
    ----------
    infile : str
        Path of the nucleotide FASTA file to read.
    frames : iterable of int
        Reading frames passed to ``translate``.
    outfile : str
        Path of the protein FASTA file. It is opened in append mode, so
        existing content is kept; remove the file first for a clean run.

    Notes
    -----
    NOTE(review): the frame tag is appended to the whole header line. If input
    headers contain whitespace, tools that truncate ids at the first space
    would lose the tag -- confirm input headers are single tokens.
    """
    # Materialise once so a generator argument is not exhausted after the
    # first record.
    frames = list(frames)
    with open(infile, 'r') as source, open(outfile, 'a') as sink:
        for header, nucleotides in _iter_fasta_records(source):
            for frame in frames:
                protein = translate(nucleotides, frame)
                sink.write(f'{header}__fr_{frame}\n{protein}\n')


In [56]:
# --- configuration ---
results_dir = '/home/ubuntu/data/longread/blast_results/'
target_path = '/home/ubuntu/data/longread/A10_TSO1.fa'
target_suffix = '.fa'
frames = [1,2,3]
proteome_ref = '/home/ubuntu/data/longread/proteome/UP000005640_9606.fasta'
n_threads = 4

# Sample id = input file name without the suffix. Strip the suffix only from
# the END of the name; str.split(suffix) would cut at its first occurrence.
cell_id = os.path.basename(target_path)
if target_suffix and cell_id.endswith(target_suffix):
    cell_id = cell_id[:-len(target_suffix)]

cell_results_dir = os.path.join(results_dir, cell_id)
protein_fasta = os.path.join(cell_results_dir, f'{cell_id}_protein.fa')
blast_out = os.path.join(cell_results_dir, f'{cell_id}_blastp_out.txt')

# --- create a fresh results subdir ---
# process_fasta appends to its output, so stale results must really be gone;
# let any filesystem error surface instead of silently continuing.
if os.path.isdir(cell_results_dir):
    shutil.rmtree(cell_results_dir)
    print('deleted existing results subdir')
os.makedirs(cell_results_dir)
print('created results subdir')

# --- translate nucl to prot ---
process_fasta(target_path, frames, protein_fasta)
print('translated to protein')

# --- run blast ---
# NOTE(review): -max_target_seqs 1 is commonly reported to limit hits during
# the search rather than guarantee the single best hit -- confirm acceptable.
blast_cmd_list = ['blastp',
                  '-query', protein_fasta,
                  '-db', proteome_ref,
                  '-outfmt', '6 qseqid sseqid evalue',
                  '-max_target_seqs', '1',
                  '-out', blast_out,
                  '-num_threads', str(n_threads),
                 ]
# Human-readable form of the command, for the log only (arguments containing
# spaces are shown in double quotes).
blast_cmd = ' '.join(f'"{arg}"' if ' ' in arg else arg for arg in blast_cmd_list)
print(blast_cmd)
# Pass the argument list directly (no shell) so paths are never re-parsed, and
# raise on a non-zero exit so later cells cannot read missing or stale output.
subprocess.run(blast_cmd_list, check=True).returncode


deleted existing results subdir
created results subdir
translated to protein
blastp -query /home/ubuntu/data/longread/blast_results/A10_TSO1/A10_TSO1_protein.fa -db /home/ubuntu/data/longread/proteome/UP000005640_9606.fasta -outfmt "6 qseqid sseqid evalue" -max_target_seqs 1 -out /home/ubuntu/data/longread/blast_results/A10_TSO1/A10_TSO1_blastp_out.txt -num_threads 4


0

In [52]:
# Load the tabular blastp output for the sample configured in the pipeline
# cell (reuses results_dir / cell_id instead of repeating the path).
blast_hits = pd.read_csv(
    os.path.join(results_dir, cell_id, f'{cell_id}_blastp_out.txt'),
    sep='\t',
    header=None,
    names=['qsid', 'ssid', 'evalue'],
)

# Query ids were written as '<read id>__fr_<frame>'. Split on the LAST
# '__fr_' so read ids that themselves contain '__' are parsed correctly.
query_parts = blast_hits['qsid'].str.rsplit('__fr_', n=1, expand=True)
# Subject ids are split on '|'; field 1 is used as the accession and field 2
# as the entry name.
subject_parts = blast_hits['ssid'].str.split('|', expand=True)

annotated_hits = blast_hits.assign(
    qsid=query_parts[0],
    frame=query_parts[1],
    uniprot_id=subject_parts[1],
    common_name=subject_parts[2],
)
assert annotated_hits[['frame', 'uniprot_id', 'common_name']].notna().all().all(), \
    'unexpected query/subject id format in BLAST output'

# Keep the single lowest-e-value hit per read (across all frames).
results = (
    annotated_hits
    .sort_values('evalue', ascending=True)
    .groupby(['qsid'])
    .head(1)
)
results['common_name'].value_counts().to_frame().head(50)


Unnamed: 0,common_name
HG2A_HUMAN,170
ATP6_HUMAN,88
NU4M_HUMAN,68
RS3_HUMAN,65
EF1A1_HUMAN,60
RS18_HUMAN,60
COX7C_HUMAN,41
MGMT_HUMAN,39
DRC6_HUMAN,25
SSPN_HUMAN,25


In [54]:
# Is any best hit annotated with the entry name 'CD74_HUMAN'?
# NOTE(review): UniProt entry names need not match gene symbols; the CD74 gene
# product appears to be listed as HG2A_HUMAN -- confirm before reading a False
# here as "CD74 absent".
bool((results['common_name'] == 'CD74_HUMAN').any())

False

In [45]:
# Number of distinct reads (query ids) that have a best hit.
results['qsid'].nunique(dropna=False)

322