## Building profile HMMs (called PSSMs throughout this notebook) from alignments

In [1]:
%%bash

## Build one profile per Pfam RdRp alignment, then merge and press them into a
## single database that hmmscan can search.
base_path=/Users/evogytis/Documents/manuscripts/skeeters/data/darkmatter/RdRp_profiles ## where alignments are kept
## Stop if the folder is missing, so nothing below runs in the wrong directory.
cd "$base_path" || exit 1

## The glob and the suffix stripped below use the same "_full.fasta" pattern, so
## the output name can never equal the input name (which would let hmmbuild
## overwrite the alignment it is reading).
for msa in *_full.fasta; do ## iterate over alignments of each Pfam group
    [[ -e "$msa" ]] || continue ## glob matched nothing: skip the literal pattern
    echo "$msa"
    out=${msa%_full.fasta}_pssm.txt
    hmmbuild "$out" "$msa" ## build PSSM
done

cat ./*_pssm.txt > RdRp_profiles.pssm.txt ## concatenate Pfam PSSMs into a single file
hmmpress -f RdRp_profiles.pssm.txt ## compress for search

ArenaRdRp_PF06317_full.fasta
# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.1b2 (February 2015); http://hmmer.org/
# Copyright (C) 2015 Howard Hughes Medical Institute.
# Freely distributed under the GNU General Public License (GPLv3).
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# input alignment file:             ArenaRdRp_PF06317_full.fasta
# output HMM file:                  ArenaRdRp_PF06317_pssm.txt
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# idx name                  nseq  alen  mlen eff_nseq re/pos description
#---- -------------------- ----- ----- ----- -------- ------ -----------
1     ArenaRdRp_PF06317_full    16  2640  1096     1.62  0.591 

# CPU time: 2.29u 0.03s 00:00:02.31 Elapsed: 00:00:02.79
BirnaRdRp_PF04197_full.fasta
# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.1b2 (February 2015); http://hmmer.org/
# Copyright (C) 2015 Howard Hugh

## Extract representative contigs of each cluster

In [2]:
import pandas as pd
from collections import defaultdict, namedtuple
import os,re,glob
from Bio import SeqIO

# Record holding the fields parsed from one member line of the cluster file.
Member = namedtuple(
    'Member',
    'contig length percent_id percent_id_sign sample coverage uuid reads',
)

# Root folder of the data files used by the cells below.
base_path='/Users/evogytis/Documents/manuscripts/skeeters/data/'

# Abundance information from mapping reads back to contigs.
# Each row is identified by "<sample>~<contig>", stored in the new 'uuid' column.
count_df = pd.read_csv('/Users/evogytis/Downloads/contig_counts.csv')
count_df['uuid'] = count_df['sample'] + '~' + count_df['contig']
# uuid -> read count
count_lookup = dict(zip(count_df['uuid'], count_df['counts']))

def parse_cdhit_row(row):
    """Parse one member line of the cluster file into a ``Member``.

    Two layouts are recognised, told apart by the last whitespace-separated
    field of the line:

    * representative (4 fields, last one is ``*``)
    * any other member (5 fields, last one looks like ``<sign>/<identity>%``)

    Parameters
    ----------
    row : str
        A single member line (not a ``>Cluster`` header).

    Returns
    -------
    Member
        For the representative, ``percent_id_sign`` is ``'0'`` and
        ``percent_id`` is ``100``; otherwise they are the text before the
        ``/`` and the identity as a float. ``name`` must have the form
        ``<sample>~<contig>``; ``coverage`` is the number after the last
        ``_`` of the contig name. ``reads`` comes from the module-level
        ``count_lookup`` and is 0 when the contig is not listed there.

    Raises
    ------
    ValueError
        If the line does not have the expected number of fields.
    """
    fields = row.split()
    # Look only at the trailing field: a '*' elsewhere in the line (e.g. inside
    # a name) must not turn an ordinary member into a representative.
    if fields and fields[-1] == '*':
        index, length, name, _ = fields
        percent_id_sign, percent_id = '0', 100
    else:
        index, length, name, _, identity = fields
        percent_id_sign, percent_id = identity.strip('%').split('/')
        percent_id = float(percent_id)

    length = int(length.strip(',nt'))       # e.g. "18277nt," -> 18277
    name = name.strip('>').strip('.')       # drop leading '>' and trailing dots
    uuid = name
    sample, contig = name.split('~')
    coverage = float(contig.split('_')[-1])
    reads = count_lookup.get(uuid, 0)

    return Member(contig=contig, sample=sample, length=length,
                  percent_id=percent_id, percent_id_sign=percent_id_sign, coverage=coverage,
                  uuid=uuid, reads=reads)

# Group parsed members by the ID of the cluster they are listed under,
# leaving out every member whose sample name contains "water".
clusters = defaultdict(list)
clstr_path = os.path.join(base_path,'500_contigs_cluster.clstr')
with open(clstr_path, 'r') as clstr_handle:
    for line in clstr_handle:
        if line.startswith('>Cluster'):
            # Header line: remember the ID for the member lines that follow.
            cluster_id = line.split()[-1]
            continue
        member = parse_cdhit_row(line)
        if 'water' not in member.sample.lower():
            clusters[cluster_id].append(member)
            
def get_representative(cluster):
    """Return the first member of ``cluster`` marked as its representative.

    A member counts as the representative when its ``percent_id_sign`` equals
    ``'0'``. Raises ``IndexError`` if no member of the cluster is marked.
    """
    for candidate in cluster:
        if candidate.percent_id_sign == '0':
            return candidate
    raise IndexError('list index out of range')

## Map the contig name of each cluster's representative to its cluster ID.
## NOTE(review): keys are contig names without the sample - confirm that contig
## names cannot repeat across samples.
fetch={} ## cluster ID of representative contig
for clustID in clusters:
    try:
        representative_contig=get_representative(clusters[clustID])
    except IndexError:
        ## No member of this cluster is marked as representative. Only this
        ## case is caught, so interrupts and genuine errors still surface.
        print(clustID,'failed, probably because longest contig is from water')
    else:
        fetch[representative_contig.contig]=clustID

## Copy every contig listed in `fetch` from the per-sample assemblies into one
## FASTA file, with headers of the form ">cluster_id|sample|contig_name".
## Both files are handled by `with`, so they are closed even if parsing fails.
with open(os.path.join(base_path,'darkmatter/representative_contigs.fasta'),'w') as store_representatives:
    for contig_file in glob.glob(os.path.join(base_path,'s3/contigs','*','contigs.fasta')):
        sample=os.path.basename(os.path.dirname(contig_file)) ## sample name = parent folder name
        with open(contig_file,'r') as contig_handle:
            for seq in SeqIO.parse(contig_handle,format='fasta'):
                if seq.id in fetch:
                    seq_name=seq.id
                    cluster_id=fetch[seq_name]
                    store_representatives.write('>%s|%s|%s\n%s\n'%(cluster_id,sample,seq_name,str(seq.seq)))

3857 failed, probably because longest contig is from water
44862 failed, probably because longest contig is from water
68543 failed, probably because longest contig is from water


In [3]:
## Translate every representative contig in all six reading frames and keep
## the contigs (and their proteins) that contain a long stop-free stretch.
longestORF=re.compile('[A-Z]+') ## matches each stop-free stretch of a translation (stops are *)

RdRp_motifs=re.compile('GDD|GDN|SDD|ADN') ## RdRp catalytic domain residues (only used by the disabled filter below)

min_protein_length=200 ## a stretch must be longer than this many amino acids to be kept

repr_contig_path=os.path.join(base_path,'darkmatter/representative_contigs.fasta') ## path to representative contigs
candidates_path=os.path.join(base_path,'darkmatter/RdRp_candidates.fasta') ## file where contigs likely to be polymerases will be stored
candidate_proteins_path=os.path.join(base_path,'darkmatter/RdRp_candidates_prot.fasta') ## file where proteins likely to be polymerases will be stored

c=0 ## contigs with at least one protein that passed the length filter
t=0 ## contigs examined

## `with` closes all three files even if translation fails part-way through.
with open(repr_contig_path,'r') as repr_contigs, \
     open(candidates_path,'w') as RdRp_candidates, \
     open(candidate_proteins_path,'w') as RdRp_candidate_proteins:
    for seq in SeqIO.parse(repr_contigs,format='fasta'): ## iterate over representative contigs
        candidateProt=[]

        for sequence in (seq.seq,seq.seq.reverse_complement()): ## fwd+rev
            for i in range(3): ## reading frames
                frame=sequence[i:]
                ## Drop the trailing partial codon: the translation is the same,
                ## but Biopython no longer warns about an incomplete codon.
                frame=frame[:len(frame)-len(frame)%3]
                translation=str(frame.translate()) ## translate direction+frame

                for protein in longestORF.findall(translation): ## find all proteins
#                     if RdRp_motifs.search(protein) and len(protein)>200: ## protein has RdRp-like motif and is longer than 200 amino acids
                    if len(protein)>min_protein_length:
                        candidateProt.append(protein)

        if candidateProt:
            c+=1
            RdRp_candidates.write('>%s\n%s\n'%(seq.id,seq.seq))
            for p,protein in enumerate(candidateProt): ## p numbers the proteins within a contig
                RdRp_candidate_proteins.write('>%s|%s\n%s\n'%(seq.id,p,protein))
        t+=1

## Report candidates, total and fraction; nan instead of a crash when the input was empty.
print(c,t,c/t if t else float('nan'))



18340 82003 0.2236503542553321


In [4]:
%%bash

## Scan the candidate proteins against the pressed RdRp profile database.
profiledb=/Users/evogytis/Documents/manuscripts/skeeters/data/darkmatter/RdRp_profiles/RdRp_profiles.pssm.txt ## PSSMs of RdRps
candidates=/Users/evogytis/Documents/manuscripts/skeeters/data/darkmatter/RdRp_candidates_prot.fasta ## proteins to be scanned with PSSMs

## Results go next to the input; only a trailing ".fasta" is replaced.
out=${candidates%.fasta}.pssm.out

hmmscan --noali -o "$out" "$profiledb" "$candidates" ## scan away


In [5]:
from Bio.SearchIO import HmmerIO

## Read the hmmscan report, print the profile hits of every query protein and
## collect the IDs of clusters with at least one significant hit.
hitfile=os.path.join(base_path,'darkmatter/RdRp_candidates_prot.pssm.out')
RdRp_clusters=set() ## cluster IDs with a hit below the E-value cutoff

evalue_cutoff=1e-5 ## a hit must have an E-value below this to count

## Parse and sort (by the leading cluster number of the query ID) while the
## file is open; `with` then closes it.
with open(hitfile,'r') as hit_handle:
    hits=sorted(HmmerIO.Hmmer3TextParser(hit_handle),key=lambda k: int(k.id.split('|')[0]))

total=0 ## number of query proteins in the report
for query in hits:
    if len(query.hits)>0:
        ## query IDs look like "cluster|sample|contig|protein_index"
        cluster_name,sample,contig_name,_=query.id.split('|')

        print('%s'%('|'.join(query.id.split('|')[:-1])))

        ## Use .get() rather than indexing: `clusters` is a defaultdict, so
        ## indexing would insert unknown IDs and make the membership test
        ## further down always succeed.
        if len(clusters.get(cluster_name,[]))<2:
            print('contig %s does not belong to a cluster'%(contig_name))

        for hit in query.hits:
            if hit.evalue<evalue_cutoff:
                print('%s: %s'%(hit.id,hit.evalue))
                if cluster_name in clusters:
                    RdRp_clusters.add(cluster_name)
            else:
                print('not significant',hit.id,hit.evalue)
        print()

    total+=1

count=len(RdRp_clusters)
print(len(RdRp_clusters))
## NOTE(review): `count` is a number of clusters while `total` is a number of
## query proteins, so the "no hits" and "fraction" figures mix the two units.
## The fraction is nan instead of a crash when the report holds no queries.
print('%s hits out of %s (no hits: %s; fraction: %s)'%(count,total,total-count,count/total if total else float('nan')))
# 163 hits out of 21177 (no hits: 21014; fraction: 0.0076970297964773105) ## all proteins >200 aa, unfiltered by catalytic motif


1|CMS001_039_Ra_S9|NODE_1_length_18277_cov_48.170824
FlaviRdRp_PF00972_full: 5.9e-27

2|CMS002_032a_Rb_S166_L004|NODE_1_length_13844_cov_454.512457
MononegaRdRp_PF00946_full: 1.4e-216

3|CMS001_039_Ra_S9|NODE_2_length_13816_cov_21.969794
contig NODE_2_length_13816_cov_21.969794 does not belong to a cluster
MononegaRdRp_PF00946_full: 6.9e-217

4|CMS002_053a_Rb_S7_L004|NODE_1_length_13637_cov_31.429499
MononegaRdRp_PF00946_full: 2e-222

5|CMS002_017c_Rb_S124_L004|NODE_1_length_12898_cov_39.149754
MononegaRdRp_PF00946_full: 9.1e-163

6|CMS001_008_Ra_S3|NODE_1_length_12382_cov_11.752133
MononegaRdRp_PF00946_full: 6.8e-155

7|CMS002_020c_Rb_S133_L004|NODE_1_length_12368_cov_173.470507
MononegaRdRp_PF00946_full: 6.8e-155

8|CMS002_035a_Rb_S169_L004|NODE_1_length_11789_cov_864.832650
MononegaRdRp_PF00946_full: 1.3e-264

9|CMS002_047g_Rb_S200_L004|NODE_1_length_11763_cov_406.947715
BromoTobamoTogaRdRp_PF00978_full: 6.4e-77
PicornaRdRp_PF00680_full: 1.3e-07

11|CMS002_029d_Rb_S162_L004|NODE_1_l