In [75]:
import shutil, os, subprocess, mygene, glob, tqdm
import pandas as pd

# Standard genetic code. Stop codons (TAA, TAG, TGA) map to the empty string,
# so they contribute nothing to the translated protein.
_CODON_TABLE = {
    'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
    'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
    'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
    'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
    'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
    'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
    'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
    'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
    'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
    'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
    'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
    'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
    'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
    'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
    'TAC':'Y', 'TAT':'Y', 'TAA':'', 'TAG':'',
    'TGC':'C', 'TGT':'C', 'TGA':'', 'TGG':'W',
}

# Base -> complementary base. Characters outside ACGT are left unchanged by
# str.translate, so ambiguous bases no longer crash the reverse frames.
_COMPLEMENT = str.maketrans('ACGT', 'TGCA')


def translate(seq, frame=1):
    """Translate a nucleotide string into a protein string in one reading frame.

    Args:
        seq: Upper-case DNA sequence (the codon table only has A/C/G/T keys).
        frame: Reading frame, 1-6.
            1-3 read ``seq`` as given, starting at offset 0, 1 or 2.
            4-6 read the reverse complement of ``seq`` at offset 0, 1 or 2.

    Returns:
        The concatenated one-letter amino-acid codes. Stop codons add nothing
        (they map to ''), so translation continues through them; no '_'
        separator is ever emitted. Translation stops at the first codon that
        is not in the table, which includes a trailing partial codon and any
        codon containing a character other than A/C/G/T.

    Raises:
        ValueError: If ``frame`` is not one of 1-6.
    """
    if frame not in (1, 2, 3, 4, 5, 6):
        raise ValueError(f'frame must be an integer from 1 to 6, got {frame!r}')

    if frame > 3:
        # Reverse strand: complement AND reverse so codons are read 5'->3'.
        seq = seq.translate(_COMPLEMENT)[::-1]

    # Frames 1/4 start at 0, 2/5 at 1, 3/6 at 2.
    start_val = (int(frame) - 1) % 3

    residues = []
    for i in range(start_val, len(seq), 3):
        residue = _CODON_TABLE.get(seq[i:i + 3])
        if residue is None:
            # Unknown or incomplete codon: stop translating here.
            break
        residues.append(residue)
    return ''.join(residues)

def split_translation (full_seq, len_cutoff = 10):
    """Split a translated sequence on '_' and keep only the longer pieces.

    Args:
        full_seq: String to split at every '_' character.
        len_cutoff: Pieces must be strictly longer than this to be kept.

    Returns:
        List of the surviving pieces, in their original order.
    """
    pieces = full_seq.split('_')
    return list(filter(lambda piece: len(piece) > len_cutoff, pieces))

def print_fasta (sub_list, prefix='test'):
    """Print each sequence as a FASTA record to stdout.

    Args:
        sub_list: Iterable of sequence strings.
        prefix: Record-name prefix; record i is named '<prefix>_<i>'.
    """
    # Generator keeps formatting and printing interleaved, one record at a time.
    records = (f'>{prefix}_{idx}\n{entry}' for idx, entry in enumerate(sub_list))
    for record in records:
        print(record)
        
def save_fasta (sub_list, prefix, output):
    """Write each sequence as a FASTA record to a file.

    Args:
        sub_list: Iterable of sequence strings.
        prefix: Record-name prefix; record i is named '<prefix>_<i>'.
        output: Path of the file to write. It is opened in 'w' mode, so any
            existing content is replaced.
    """
    with open(output, 'w') as handle:
        handle.writelines(
            f'>{prefix}_{idx}\n{entry}\n' for idx, entry in enumerate(sub_list)
        )
            
def _write_translated_entry (outf, entry_name, seq_lines, frames):
    """Write one translated record per requested frame for a single FASTA entry."""
    print_seq = ''.join(seq_lines)
    for frame in frames:
        full_seq = translate(print_seq, frame)
        outf.write(f'{entry_name}__fr_{frame}\n{full_seq}\n')


def process_fasta (infile, frames, outfile):
    """Translate every entry of a nucleotide FASTA file in the given frames.

    For each entry and each frame, a record is appended to ``outfile`` whose
    header is the original header line (including its leading '>') followed by
    '__fr_<frame>', and whose body is ``translate(sequence, frame)`` on a
    single line.

    Args:
        infile: Path to the nucleotide FASTA file to read.
        frames: Iterable of frame numbers passed to ``translate``.
        outfile: Path of the protein FASTA file. It is opened in append mode,
            so existing content is kept. It is opened once up front and is
            therefore created even if ``infile`` holds no sequence.

    Notes:
        * Headers that are not followed by any sequence line are skipped.
        * Sequence lines seen before the first header are written under an
          empty header name.
        * NOTE(review): the frame tag is appended after the whole header line;
          if headers contain whitespace, tools that cut the id at the first
          space will not see the tag -- confirm against the input files.
    """
    with open(infile, 'r') as f, open(outfile, 'a') as outf:
        entry_name = ''
        seq_lines = []
        for line in f:
            # Strip only the line terminator; slicing off the last character
            # would eat a real base when the file lacks a final newline.
            line = line.rstrip('\r\n')
            if line.startswith('>'):
                # A new header closes the entry collected so far.
                if seq_lines:
                    _write_translated_entry(outf, entry_name, seq_lines, frames)
                    seq_lines = []
                entry_name = line
            else:
                seq_lines.append(line)
        # Flush the final entry, which has no following header to trigger it.
        if seq_lines:
            _write_translated_entry(outf, entry_name, seq_lines, frames)
            
def run_blast(target_path,
              target_suffix,
              frames,
              proteome_ref,
              results_dir,
              n_threads,
              evalue_cutoff
             ):
    """Translate one sample, BLAST it against the proteome and summarise hits.

    Steps: derive the cell id from the file name, recreate the per-cell results
    sub-directory, write the translated protein FASTA with ``process_fasta``,
    run ``blastp``, then call ``process_blastp_results``.

    Args:
        target_path: Path to the sample's nucleotide FASTA file.
        target_suffix: File-name suffix removed from the end of the base name
            to obtain the cell id.
        frames: Frame numbers forwarded to ``process_fasta``.
        proteome_ref: Value passed to ``blastp -db``.
        results_dir: Results directory prefix. It is concatenated directly
            with the cell id, so it must end with a path separator.
        n_threads: Value passed to ``blastp -num_threads``.
        evalue_cutoff: Forwarded to ``process_blastp_results``.

    Raises:
        FileNotFoundError: If the ``blastp`` executable cannot be found.
    """
    import shlex  # only needed to echo a copy-pasteable command line

    # extract cell id: strip the suffix only where it ends the file name, so a
    # name that merely contains the suffix text (e.g. 'my.fancy.fa' with
    # suffix '.fa') is not truncated at the first occurrence
    cell_id = os.path.basename(target_path)
    if target_suffix and cell_id.endswith(target_suffix):
        cell_id = cell_id[:-len(target_suffix)]

    # recreate results subdir from scratch (process_fasta appends to its
    # output, so stale files must not survive)
    sample_dir = f'{results_dir}{cell_id}'
    shutil.rmtree(sample_dir, ignore_errors=True)
    os.makedirs(sample_dir, exist_ok=True)

    protein_fasta = f'{sample_dir}/{cell_id}_protein.fa'
    blast_out = f'{sample_dir}/{cell_id}_blastpOut.txt'

    # translate nucl to prot
    process_fasta(f'{target_path}', frames, protein_fasta)

    # run blast; arguments are passed as a list (no shell), so paths with
    # spaces or shell metacharacters are handled and -outfmt needs no quoting
    blast_cmd_list = ['blastp',
                      '-query', protein_fasta,
                      '-db', f'{proteome_ref}',
                      '-task', 'blastp-fast',
                      '-outfmt', '6 qseqid sseqid evalue',
                      '-max_target_seqs', '1',
                      '-out', blast_out,
                      '-num_threads', f'{n_threads}'
                     ]
    print(' '.join(shlex.quote(arg) for arg in blast_cmd_list))
    return_code = subprocess.call(blast_cmd_list)
    if return_code != 0:
        # Keep going (best effort, as before) but make the failure visible.
        print(f'####### blastp exited with status {return_code} for {cell_id}')

    # process results
    process_blastp_results(results_dir, cell_id, evalue_cutoff)

def process_blastp_results(results_dir, cell_id, evalue_cutoff, lookup_path=None):
    """Turn a cell's tabular BLAST output into per-gene hit counts.

    Reads ``<results_dir><cell_id>/<cell_id>_blastpOut.txt`` (tab-separated,
    three columns: query id, subject id, e-value), keeps the lowest-e-value
    row per read, and writes two files next to it:

    * ``<cell_id>_undeterminedReads.csv`` -- best hits with
      e-value >= ``evalue_cutoff``.
    * ``<cell_id>_geneCounts.csv`` -- the lookup table without its 'uniprot'
      column plus a 'count' column: the number of reads whose best hit
      (e-value < ``evalue_cutoff``) is that UniProt id, 0 where there is none.

    Args:
        results_dir: Results directory prefix; concatenated directly with
            ``cell_id``, so it must end with a path separator.
        cell_id: Name of the per-cell sub-directory and file prefix.
        evalue_cutoff: Threshold separating significant from undetermined hits.
        lookup_path: CSV with a 'uniprot' column, read with ``index_col=0``.
            When omitted, ``<results_dir>scope2field.csv`` (where
            ``make_scope2field_lookup`` writes it) is used if it exists,
            otherwise the previously hard-coded location.

    Notes:
        Query ids are split at the last '__fr_' into read id and frame.
        Subject ids are split on '|' and fields 1 and 2 are taken as UniProt
        id and common name, so at least three '|'-separated fields are needed.
    """
    sample_prefix = f'{results_dir}{cell_id}/{cell_id}'

    # ingest table; a BLAST run without any hit leaves an empty file, which
    # pandas refuses to parse, so treat that as "no rows"
    try:
        results = pd.read_csv(f'{sample_prefix}_blastpOut.txt',
                              sep = '\t',
                              header = None
                             )
    except pd.errors.EmptyDataError:
        results = pd.DataFrame(columns=[0, 1, 2])
    results.columns = ['qsid' , 'ssid', 'evalue']
    results['evalue'] = results['evalue'].astype(float)

    # parse frame and ids; split at the LAST frame tag so read ids that
    # themselves contain '__' are not truncated
    id_parts = [x.rsplit('__fr_', 1) for x in results['qsid']]
    results['frame'] = [p[1] if len(p) == 2 else '' for p in id_parts]
    results['qsid'] = [p[0] for p in id_parts]
    results['uniprot_id'] = [x.split('|')[1] for x in results['ssid']]
    results['common_name'] = [x.split('|')[2] for x in results['ssid']]

    # return best-match frame
    results = results.sort_values('evalue', ascending = True).groupby(['qsid']).head(1)

    # print undetermined reads to disk
    undetermined_reads = results[results.evalue >= evalue_cutoff]
    undetermined_reads.to_csv(f'{sample_prefix}_undeterminedReads.csv')

    # filter results to significant matches and count reads per uniprot id
    significant = results[results.evalue < evalue_cutoff]
    hit_counts = significant['uniprot_id'].value_counts()

    # ingest uniprot to symbol df for lookup
    if lookup_path is None:
        lookup_path = f'{results_dir}scope2field.csv'
        if not os.path.exists(lookup_path):
            lookup_path = '/home/ubuntu/data/longread/proteome/scope2field.csv'
    scope2field_df = pd.read_csv(lookup_path, index_col = 0)

    # attach counts to every lookup row (same rows and order as a left merge
    # on 'uniprot'); mapping through the counts Series avoids relying on the
    # column names value_counts()/reset_index() produce, which changed in
    # pandas 2.0
    gene_counts = scope2field_df.reset_index(drop=True)
    gene_counts['count'] = (gene_counts['uniprot']
                            .map(hit_counts)
                            .fillna(0)
                            .astype(int)
                           )
    gene_counts = gene_counts.drop('uniprot', axis=1)

    # write to disk
    gene_counts.to_csv(f'{sample_prefix}_geneCounts.csv')

# create results dir
def create_results_dir(results_dir):
    """Replace ``results_dir`` with a fresh, empty directory.

    Any existing directory tree at ``results_dir`` is deleted first, then the
    directory (and any missing parents) is created. Failures are reported on
    stdout instead of being raised, so the function stays best-effort.

    Args:
        results_dir: Path of the directory to (re)create. Everything below it
            is removed -- make sure it does not point at input data.
    """
    try:
        shutil.rmtree(f'{results_dir}')
        print('####### deleted existing results subdir')
    except FileNotFoundError:
        # Nothing to delete on a first run.
        pass
    except OSError as err:
        print(f'####### could not delete existing results subdir: {err}')
    try:
        os.makedirs(f'{results_dir}')
        print('####### created results subdir')
    except FileExistsError:
        # Deletion failed above; the old directory is reused as-is.
        pass
    except OSError as err:
        print(f'####### could not create results subdir: {err}')

# function to create blast db and config
def make_protein_db (ref_path):
    """Build a protein BLAST database from a FASTA file with ``makeblastdb``.

    Runs ``makeblastdb -in <ref_path> -dbtype prot``. The exit status is
    checked so the success banner is printed only when the tool succeeded.

    Args:
        ref_path: Path to the protein FASTA file passed to ``-in``.

    Raises:
        FileNotFoundError: If the ``makeblastdb`` executable cannot be found.
    """
    cmd = f'makeblastdb -in {ref_path} -dbtype prot'
    print(cmd)
    # Argument list, no shell: a path with spaces or shell metacharacters is
    # passed through as one argument.
    return_code = subprocess.call(['makeblastdb', '-in', f'{ref_path}', '-dbtype', 'prot'])
    if return_code == 0:
        print('####### created reference blast database')
    else:
        print(f'####### makeblastdb exited with status {return_code}')

# function to create name conversion look up table
def make_scope2field_lookup (ref_path, scope, field, species, results_dir):
    """Build and save an id-conversion table for every entry of a FASTA file.

    The second '|'-separated field of each header line in ``ref_path`` is
    collected and sent to ``mygene.MyGeneInfo().querymany``. Queries flagged
    as not found and rows with missing values are dropped, and the remaining
    (query, field) pairs are written to ``<results_dir>scope2field.csv`` with
    the query column renamed to ``scope``.

    Args:
        ref_path: Path to the FASTA file whose headers are parsed.
        scope: Passed as ``scopes=`` to the query; also the name given to the
            id column in the saved table.
        field: Passed as ``fields=`` to the query; name of the value column.
        species: Passed as ``species=`` to the query.
        results_dir: Output directory prefix; concatenated directly with the
            file name, so it must end with a path separator.

    Raises:
        IndexError: If a header line contains no '|' separator.
    """
    # ingest uniprot proteome: extract the id field of every header line
    with open(ref_path, 'r') as f:
        uniprot_list = [line.rstrip('\r\n').split('|')[1]
                        for line in f
                        if line.startswith('>')]

    # query for gene symbols
    mg = mygene.MyGeneInfo()
    scope2field = mg.querymany(uniprot_list,
                               scopes=scope,
                               fields=field,
                               species=species,
                               returnall=False,
                               as_dataframe=True
                              )

    # filter and save results; the 'notfound' column is only present when at
    # least one query had no hit, so guard the filter
    if 'notfound' in scope2field.columns:
        scope2field = scope2field[scope2field['notfound'] != True]
    scope2field = (scope2field
                   .reset_index()
                   .loc[:, ['query', field]]
                   .rename(columns = {'query':scope})
                   .dropna()
                  )
    scope2field.to_csv(f'{results_dir}scope2field.csv')
    print('####### created scope-to-field look-up table')

# main function to process dir of samples
def process_dir_samples (dir_path,
                         ref_path,
                         target_suffix,
                         frames,
                         n_threads,
                         evalue_cutoff,
                         results_dir,
                         scope,
                         field,
                         species
                        ):
    """Run the whole translate / BLAST / count pipeline over a directory.

    Prepares the results directory, the BLAST protein database and the
    id look-up table once, then calls ``run_blast`` for every file matching
    ``<dir_path>*<target_suffix>``.

    Args:
        dir_path: Directory prefix holding the sample files; used verbatim in
            the glob pattern, so it must end with a path separator.
        ref_path: Proteome FASTA used for the database and the look-up table.
        target_suffix: File-name suffix selecting the sample files.
        frames: Frame numbers to translate in.
        n_threads: Thread count handed to BLAST.
        evalue_cutoff: Significance threshold for hits.
        results_dir: Output directory prefix (recreated from scratch).
        scope: Query scope for the look-up table.
        field: Field to retrieve for the look-up table.
        species: Species for the look-up table query.
    """
    # one-off setup: output location, reference database, name look-up table
    create_results_dir(results_dir)
    make_protein_db (ref_path)
    make_scope2field_lookup (ref_path, scope, field, species, results_dir)

    # per-sample work, with a progress bar over the matched files
    sample_paths = glob.glob(f'{dir_path}*{target_suffix}')
    for sample in tqdm.tqdm(sample_paths):
        print(f'####### processing {sample}')
        run_blast(target_path=sample,
                  target_suffix=target_suffix,
                  frames=frames,
                  proteome_ref=ref_path,
                  results_dir=results_dir,
                  n_threads=n_threads,
                  evalue_cutoff=evalue_cutoff
                 )
    

In [None]:
# --- input / output locations ---
dir_path = '/home/ubuntu/data/longread/full_data/'
target_suffix = '.fa'
ref_path = '/home/ubuntu/data/longread/proteome/UP000005640_9606.fasta'
results_dir = '/home/ubuntu/data/longread/full_data/blast_results/'

# --- translation and BLAST settings ---
frames = [1,2,3]
n_threads = 4
evalue_cutoff = 0.01

# --- id look-up query settings ---
scope = 'uniprot'
field = 'symbol'
species = 'human'

# run the pipeline over every sample file in dir_path
process_dir_samples(
    dir_path=dir_path,
    ref_path=ref_path,
    target_suffix=target_suffix,
    frames=frames,
    n_threads=n_threads,
    evalue_cutoff=evalue_cutoff,
    results_dir=results_dir,
    scope=scope,
    field=field,
    species=species,
)


####### created results subdir
makeblastdb -in /home/ubuntu/data/longread/proteome/UP000005640_9606.fasta -dbtype prot
####### created reference blast database
querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-9000...done.
querying 9001-10000...done.
querying 10001-11000...done.
querying 11001-12000...done.
querying 12001-13000...done.
querying 13001-14000...done.
querying 14001-15000...done.
querying 15001-16000...done.
querying 16001-17000...done.
querying 17001-18000...done.
querying 18001-19000...done.
querying 19001-20000...done.
querying 20001-20659...done.
