## translate to protein and blast

In [209]:
import subprocess
def translate(seq, frame=1):
    """Translate a DNA sequence into a protein string in one of six frames.

    Frames 1-3 read ``seq`` as given, starting at offset 0, 1 or 2.
    Frames 4-6 read the reverse complement of ``seq``, again starting at
    offset 0, 1 or 2.

    Translation stops at the first codon that is incomplete or is not in
    the codon table (for example one containing ``N`` or lowercase letters);
    the residues translated so far are returned. Stop codons are emitted as
    ``X`` and do not end translation.

    Args:
        seq: DNA sequence made of the uppercase letters A, C, G and T.
        frame: Reading frame, an integer from 1 to 6.

    Returns:
        The translated amino-acid string (possibly empty).

    Raises:
        ValueError: If ``frame`` is not one of 1-6.
    """
    table = {
        'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
        'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
        'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
        'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
        'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
        'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
        'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
        'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
        'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
        'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
        'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
        'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
        'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
        'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
        'TAC':'Y', 'TAT':'Y', 'TAA':'X', 'TAG':'X',
        'TGC':'C', 'TGT':'C', 'TGA':'X', 'TGG':'W',
    }
    comp_dict = {'C':'G',
                 'G':'C',
                 'A':'T',
                 'T':'A'
                }
    # frame -> offset of the first codon on the strand being read
    frame_offsets = {1: 0, 2: 1, 3: 2, 4: 0, 5: 1, 6: 2}

    start_val = frame_offsets.get(frame)
    if start_val is None:
        raise ValueError(f'frame must be an integer from 1 to 6, got {frame!r}')

    if frame > 3:
        # Frames 4-6 are read on the opposite strand, which runs in the
        # reverse direction: complement every base AND reverse the order.
        # Bases without a complement become 'N', which ends translation at
        # that codon instead of crashing the join.
        seq = ''.join(comp_dict.get(base, 'N') for base in reversed(seq))

    residues = []
    for i in range(start_val, len(seq), 3):
        residue = table.get(seq[i:i + 3])
        if residue is None:
            # Incomplete trailing codon or a codon outside the table.
            break
        residues.append(residue)
    return ''.join(residues)

def fastq2protein(input_file, output_file, trim_n = 0):
    """Translate every read of a FASTQ file into three protein FASTA entries.

    The input is consumed in groups of four lines. Line 1 of each group
    supplies the read name (text after the first character, up to the first
    whitespace) and line 2 supplies the sequence. For each read, three FASTA
    records named ``<name>_fr1``, ``<name>_fr2`` and ``<name>_fr3`` are
    written, holding the forward-frame translations from ``translate``.
    A trailing group with fewer than four lines is ignored.

    Args:
        input_file: Path of the FASTQ file to read.
        output_file: Path of the FASTA file to write; it is created or
            truncated once, even when the input holds no complete record.
        trim_n: Number of bases removed from the 3' end of every read before
            translation. Reads no longer than ``trim_n`` become empty.

    Returns:
        None.
    """
    with open(input_file) as fin, open(output_file, 'w') as fout:
        record = []
        for line in fin:
            record.append(line.strip())
            if len(record) < 4:
                continue

            header, bases = record[0], record[1]
            record = []

            name = header[1:].split()[0]
            # Drop exactly trim_n trailing bases; clamp at zero so a short
            # read yields an empty sequence rather than a negative slice bound.
            sequence = bases[:max(len(bases) - trim_n, 0)]

            entries = []
            for frame in (1, 2, 3):
                entries.append(f'>{name}_fr{frame}' + '\n')
                entries.append(translate(sequence, frame=frame) + '\n')
            fout.writelines(entries)

def run_blastp(input_fn, prot_ref, n_threads, ouput_fn):
    """Run ``blastp`` on a protein FASTA file and write tabular results.

    The search uses the ``blastp-fast`` task, the BLOSUM45 matrix, an
    e-value cutoff of 1000, at most 5 target sequences per query, and
    tabular output (format 6) with the columns
    ``evalue qseqid sseqid staxids sscinames sskingdoms``.

    The command is passed to the process as an argument list (no shell), so
    paths containing spaces or shell metacharacters are handled literally.
    A shell-quoted copy of the command is printed before it runs.

    Args:
        input_fn: Path of the query FASTA file.
        prot_ref: BLAST protein database to search.
        n_threads: Number of threads given to ``-num_threads``.
        ouput_fn: Path of the results file given to ``-out``.

    Raises:
        subprocess.CalledProcessError: If ``blastp`` exits with a non-zero
            status; its combined stdout/stderr is in ``exc.output``.
    """
    import shlex

    blast_cmd_list = ['blastp',
                      '-query',
                      str(input_fn),
                      '-db',
                      str(prot_ref),
                      '-task',
                      'blastp-fast',
                      '-matrix',
                      'BLOSUM45',
                      '-evalue',
                      '1000',
                      '-outfmt',
                      # one argument; no embedded quotes needed without a shell
                      '6 evalue qseqid sseqid staxids sscinames sskingdoms',
#                       '-num_descriptions',
#                       '1',
#                       '-num_alignments',
#                       '1',
                      '-max_target_seqs',
                      '5',
                      '-num_threads',
                      f'{n_threads}',
                      '-out',
                      str(ouput_fn),
                     ]
    # Printed form is shell-quoted so it can be pasted into a terminal as-is.
    print(' '.join(shlex.quote(arg) for arg in blast_cmd_list))
    subprocess.check_output(blast_cmd_list, stderr=subprocess.STDOUT)

In [161]:
# Translate the raw reads of one barcode into a protein FASTA file,
# passing trim_n=100 to fastq2protein.
input_fq = '/mnt/ibm_lg/daniel_le/data/acoel/test_c3poa_pre/output_rev/1/OligodT_ISPCR_Cell_Barcode_11_UMI/R2C2_raw_reads.fastq'
output_fa = '/mnt/ibm_lg/daniel_le/data/acoel/test_c3poa_pre/blastout/OligodT_ISPCR_Cell_Barcode_11_UMI_translate.fa'
fastq2protein(input_fq, output_fa, trim_n=100)

In [163]:
# Search the test FASTA against the nr protein database with 2 threads.
prot_ref = '/mnt/ibm_lg/daniel_le/data/blastdb/nr'
# prot_ref = '/mnt/ibm_lg/daniel_le/data/longreads/proteome/UP000005640_9606.fasta'
input_fn = '/mnt/ibm_lg/daniel_le/data/acoel/test_c3poa_pre/blastout/test.fa'
ouput_fn = '/mnt/ibm_lg/daniel_le/data/acoel/test_c3poa_pre/blastout/test_blastout.tsv'
n_threads = 2
run_blastp(
    input_fn=input_fn,
    prot_ref=prot_ref,
    n_threads=n_threads,
    ouput_fn=ouput_fn,
)

blastp -query /mnt/ibm_lg/daniel_le/data/acoel/test_c3poa_pre/blastout/test.fa -db /mnt/ibm_lg/daniel_le/data/blastdb/nr -task blastp-fast -matrix BLOSUM45 -evalue 1000 -outfmt "6 evalue qseqid sseqid staxids sscinames sskingdoms" -max_target_seqs 5 -num_threads 2 -out /mnt/ibm_lg/daniel_le/data/acoel/test_c3poa_pre/blastout/test_blastout.tsv


## cat c3poa outputs

In [None]:
# cat results of c3poa
import glob, tqdm, os, shutil
output_dir = '/mnt/ibm_lg/daniel_le/data/acoel/test_c3poa_pre/output_rev/merged/'

# Start from an empty output directory: the loop below appends, so files
# left over from an earlier run would otherwise be duplicated into.
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)
os.mkdir(output_dir)

# The glob also matches output_dir itself (it lives next to the partition
# directories), so drop it explicitly instead of relying on it being empty.
partitions = [
    p for p in glob.glob('/mnt/ibm_lg/daniel_le/data/acoel/test_c3poa_pre/output_rev/*/')
    if os.path.normpath(p) != os.path.normpath(output_dir)
]
BC_dict = {}
for partition in tqdm.tqdm(partitions):
    BCs = glob.glob(f'{partition}*/')
    for BC in BCs:
        # Sub-directory name without the trailing slash.
        target = BC.split(partition)[1][:-1]
        source = f'{BC}R2C2_raw_reads.fastq'
        # Append so reads from the same sub-directory name in different
        # partitions end up in one merged file.
        with open(f'{output_dir}{target}.fq', 'a') as outfile:
            try:
                infile = open(source, 'r')
            except FileNotFoundError:
                # A sub-directory without a reads file contributes nothing;
                # any other error (permissions, failed write) propagates.
                continue
            with infile:
                outfile.writelines(infile)


 41%|████      | 693/1695 [08:31<13:23,  1.25it/s]

## create consensus sequence

In [62]:
import subprocess, glob, multiprocessing, os
def run_spoa(fq):
    """Run ``spoa`` on one FASTQ file and save its output next to the input.

    The standard output of ``spoa <fq>`` is written to ``cns.txt`` in the
    directory that contains ``fq``. If that file already exists the function
    does nothing, so interrupted batch runs can be resumed. When ``spoa``
    fails or cannot be started, a ``Status : FAIL`` line is printed and no
    output file is written.

    Args:
        fq: Path of the FASTQ file to process.

    Returns:
        None.
    """
    dest_root = os.path.join(os.path.dirname(fq), 'cns.txt')
    if os.path.exists(dest_root):
        return

    try:
        # Argument list, no shell: the path is passed through literally.
        output = subprocess.check_output(['spoa', fq])
    except subprocess.CalledProcessError as exc:
        print("Status : FAIL", exc.returncode, exc.output)
        return
    except OSError as exc:
        # e.g. the spoa executable is not installed / not on PATH
        print("Status : FAIL", exc)
        return

    with open(dest_root, 'w') as fout:
        fout.write(output.decode("utf-8"))

In [63]:
# run_spoa(fq_list[50])

In [28]:
# get all fq paths for parallel spoa
# One directory per glob match; each is expected to hold a corrected_reads.fastq.
fq_dirs = glob.glob(
    '/mnt/ibm_lg/daniel_le/data/acoel/test_c3poa_pre/onclust_output/correction/*/'
)
fq_list = [f'{fq_dir}corrected_reads.fastq' for fq_dir in fq_dirs]

In [70]:
def parallel_spoa(fq_list, n_proc=1):
    """Apply ``run_spoa`` to every path in ``fq_list`` using a process pool.

    Args:
        fq_list: FASTQ paths, each handed to ``run_spoa``.
        n_proc: Number of worker processes in the pool.
    """
    pool = multiprocessing.Pool(processes=n_proc)
    try:
        pool.map(run_spoa, fq_list)
    finally:
        # Stop accepting work, then wait for the workers to exit,
        # whether or not the map raised.
        pool.close()
        pool.join()
        

In [None]:
# Run run_spoa over every path in fq_list with a pool of 32 worker processes.
parallel_spoa(fq_list, n_proc=32)

### trim rattle fasta

In [99]:
fastaf = '/mnt/ibm_lg/daniel_le/data/acoel/test_rattle/cns/transcriptome.fa'
trimmedfa = '/mnt/ibm_lg/daniel_le/data/acoel/test_rattle/cns/transcriptome_10plus.fa'

# Copy the entries whose read count exceeds 10. The input is read as
# consecutive (header, sequence) line pairs; the count is the value after
# '=' in the third whitespace-separated header field.
# NOTE(review): the output name says "10plus" but the test is strictly > 10 - confirm.
with open(trimmedfa, 'w') as fout, open(fastaf, 'r') as fin:
    # zip over the same handle yields two successive lines per step and
    # drops an unpaired trailing line.
    for header_line, seq_line in zip(fin, fin):
        header = header_line.strip()
        seq = seq_line.strip()
        tot_read = int(header.split()[2].split('=')[1])
        if tot_read > 10:
            fout.write(f'{header}\n{seq}\n')

### stats of aln

In [210]:
import pysam
import pandas as pd
import numpy as np

# Collect (query name, reference name) for every record in the alignment file.
samfile = pysam.AlignmentFile("/mnt/ibm_lg/daniel_le/data/acoel/test_minimap2/aln/tri_aln.sorted.bam", "r")
name_pairs = [
    (read.query_name, samfile.get_reference_name(read.reference_id))
    for read in samfile
]
samfile.close()

query_list = [query for query, _ in name_pairs]
ref_list = [ref for _, ref in name_pairs]
pysam_df = pd.DataFrame({'cluster': query_list, 'ref': ref_list})
# Rows with a missing reference name are dropped from this count.
print('mapped clusters', pysam_df.dropna().shape[0])

# Give missing references a placeholder so the prefix test below works on strings.
pysam_df = pysam_df.replace(np.nan, '-')
# References whose name starts with 'T' are flagged as algae.
pysam_df['algae'] = [ref_name.startswith('T') for ref_name in pysam_df['ref']]
print('algae reads', int(pysam_df['algae'].sum()))

mapped clusters 2800
algae reads 1984


In [216]:
# Per-contig counts taken from the BAM index statistics.
samfile = pysam.AlignmentFile("/mnt/ibm_lg/daniel_le/data/acoel/test_minimap2/aln/tri_aln.sorted.bam", "rb")
index_stats = list(samfile.get_index_statistics())
contigs = [stat[0] for stat in index_stats]
mappeds = [stat[1] for stat in index_stats]
unmappeds = [stat[2] for stat in index_stats]
totals = [stat[3] for stat in index_stats]

df = pd.DataFrame({
    'contig': contigs,
    'mapped': mappeds,
    'unmapped': unmappeds,
    'total': totals,
})

samfile.close()

# The first two characters of the contig name select the genome.
genome_dict = {'sc':'acoel',
               'SM':'worm',
               'Te':'algae'
              }
df['genome'] = [contig[:2] for contig in df['contig']]
df['genome'] = df['genome'].map(genome_dict)

# Mapped counts per genome, then the same counts as a fraction of the total.
mapped_by_genome = df.groupby('genome')['mapped'].sum()
display(mapped_by_genome, mapped_by_genome / mapped_by_genome.sum())

genome
acoel     274
algae    1984
worm      542
Name: mapped, dtype: int64

genome
acoel    0.097857
algae    0.708571
worm     0.193571
Name: mapped, dtype: float64

### parse blast results

In [239]:
# Load the tab-separated BLAST output (no header row) and name its ten columns.
blast_in = '/mnt/ibm_lg/daniel_le/data/acoel/test_rattle/blast/blast_transcriptome_10plus.out'
blast_df = pd.read_csv(blast_in, delimiter='\t', header = None)
blast_df.columns = ['evalue','qcov','cluster','ssid', 'taxid', 'sci_name', 'common_name', 'blast_name', 'kingdom', 'stitle']
# For each cluster, count how often each blast_name occurs among its hits and
# keep the single most frequent one (sort by count, then first row per cluster).
# NOTE(review): the rename/reset_index step relies on how this pandas version
# names the value_counts result - re-check after a pandas upgrade.
blast_df = (pd.DataFrame(blast_df.groupby('cluster')['blast_name'].value_counts())
      .rename(columns = {'blast_name':'count'})
      .reset_index()
      .sort_values('count', ascending=False)
      .groupby('cluster')
      .head(1)
     )
# A cluster is flagged as algae when its most frequent blast_name is 'green algae'.
blast_df['algae'] = [x=='green algae' for x in blast_df['blast_name']]

print('proportion of algae')
# Fraction of clusters flagged / not flagged as algae.
display(blast_df['algae'].value_counts() / blast_df['algae'].value_counts().sum())

proportion of algae


False    0.979523
True     0.020477
Name: algae, dtype: float64

### overlap between blast and minimap2 algae clusters

In [234]:
# Left-join the BLAST calls onto the alignment table by cluster; the two
# 'algae' columns come out as algae_x (alignment) and algae_y (BLAST).
merge_df = pd.merge(pysam_df, blast_df, how='left', on='cluster')
algae_by_aln = merge_df['algae_x'] == True
algae_by_blast = merge_df['algae_y'] == True  # missing BLAST calls compare as False
merge_df['both_true'] = algae_by_aln & algae_by_blast
merge_df['any_true'] = algae_by_aln | algae_by_blast

merge_df[merge_df['both_true']]

Unnamed: 0,cluster,ref,algae_x,blast_name,count,algae_y,both_true,any_true
818,cluster_18586,Tetraselmis_0004,True,green algae,2.0,True,True,True
841,cluster_5766,Tetraselmis_0072,True,green algae,12.0,True,True,True
859,cluster_18586,Tetraselmis_0099,True,green algae,2.0,True,True,True
943,cluster_4233,Tetraselmis_0296,True,green algae,12.0,True,True,True
977,cluster_7202,Tetraselmis_0388,True,green algae,4.0,True,True,True
...,...,...,...,...,...,...,...,...
2632,cluster_6069,Tetraselmis_3175,True,green algae,12.0,True,True,True
2659,cluster_5817,Tetraselmis_3236,True,green algae,5.0,True,True,True
2731,cluster_4639,Tetraselmis_3457,True,green algae,12.0,True,True,True
2733,cluster_12665,Tetraselmis_3459,True,green algae,8.0,True,True,True


### trim transcriptome using union of blast and minimap2

In [235]:
fastaf = '/mnt/ibm_lg/daniel_le/data/acoel/test_rattle/cns/transcriptome_10plus.fa'
trimmedfa = '/mnt/ibm_lg/daniel_le/data/acoel/test_rattle/cns/transcriptome_10plus_omitAlgae.fa'

# Clusters flagged as algae by either method. Built once as a set so each
# FASTA entry costs one hash lookup instead of re-filtering merge_df.
algae_clusters = set(merge_df.loc[merge_df['any_true'] == True, 'cluster'])

# Copy every entry whose cluster id is not in the algae set. The input is
# read as consecutive (header, sequence) line pairs; the cluster id is the
# first header token without its leading character.
with open(trimmedfa, 'w') as fout, open(fastaf, 'r') as fin:
    # zip over the same handle yields two successive lines per step and
    # drops an unpaired trailing line.
    for header_line, seq_line in zip(fin, fin):
        header = header_line.strip()
        seq = seq_line.strip()
        cluster_id = header.split()[0][1:]
        if cluster_id not in algae_clusters:
            fout.write(header + '\n' + seq + '\n')

### annotate based on BLAST hits

In [250]:
# Reload the raw BLAST table (tab-separated, no header row) and peek at the
# subject titles of the first five hits.
blast_in = '/mnt/ibm_lg/daniel_le/data/acoel/test_rattle/blast/blast_transcriptome_10plus.out'
blast_df = pd.read_csv(blast_in, sep='\t', header=None)
blast_df.columns = [
    'evalue', 'qcov', 'cluster', 'ssid', 'taxid',
    'sci_name', 'common_name', 'blast_name', 'kingdom', 'stitle',
]

# blast_df = (pd.DataFrame(blast_df.groupby('cluster')['blast_name'].value_counts())
#       .rename(columns = {'blast_name':'count'})
#       .reset_index()
#       .sort_values('count', ascending=False)
#       .groupby('cluster')
#       .head(1)
#      )
blast_df['stitle'].head().tolist()

['Drosophila yakuba uncharacterized protein, transcript variant A (Dyak\\GE20585), mRNA',
 'Drosophila yakuba uncharacterized protein, transcript variant H (Dyak\\GE20585), mRNA',
 'Drosophila yakuba uncharacterized protein, transcript variant G (Dyak\\GE20585), mRNA',
 'Drosophila yakuba uncharacterized protein, transcript variant F (Dyak\\GE20585), mRNA',
 'Drosophila yakuba uncharacterized protein, transcript variant E (Dyak\\GE20585), mRNA']

### map raw reads to CNS