This notebook extracts all viral contigs from the CD-HIT-EST hits, and concats them together for domain finding.

In [12]:
%load_ext autoreload
%autoreload 2

In [13]:
from glob import glob
import pandas as pd
from Bio import SeqIO

import util

In [51]:
lca_df = util.load_lca()
clusters = util.load_cdhit_clusters('../data/500_contigs_cluster.clstr')



In [15]:
df = util.merge_clusters_lca(clusters, lca_df)

In [16]:
# Get Counts

In [17]:
counts = pd.read_csv('~/src/skeeters/data/contig_quality_concat/bowtie_csp_counts_1000.txt',
                     sep = '\t', header=None)
counts.columns = ['contig', 'count', 'sample']
counts['contig_key'] = counts['sample'] + '~' + counts['contig']

TODO:
    * dataframe with each contig and kingdom
    * total counts for each cluster

In [18]:
clust2kingdom = dict(zip(df['cluster'], df['kingdom']))
df[['cluster', 'kingdom']].to_csv('/Users/josh/src/skeeters/data/cluster_kingdoms.csv')

In [21]:
viral_df = df[df['kingdom'] == 'Viruses']
viral_df.to_csv('../data/annotation/viral_clusters.csv', index=None)
len(viral_df)

340

In [7]:
bacteria_df = df[df['kingdom'] == 'Bacteria']
bacteria_df.to_csv('../data/annotation/bacteria_clusters.csv')
len(bacteria_df)

1564

In [8]:
recs = [record for record in SeqIO.parse("../data/s3/contigs/500_contigs_cluster.fasta", "fasta")]

In [9]:
from copy import deepcopy
key_to_cluster = dict(zip(df.contig_key, df.cluster.map(str)))
def clean_rec(record):
    record = deepcopy(record)
    record.id = key_to_cluster.get(record.id)
    record.name = key_to_cluster.get(record.id)
    record.description = ''
    return record

In [98]:
viral_records = [clean_rec(record) for record in recs if record.id in viral_df.contig_key.values]

print("There are ", len(viral_records), " viral clusters.")

SeqIO.write(viral_records, open('../data/annotation/viral_clusters.fasta', 'w'), 'fasta')

There are  340  viral clusters.


340

In [99]:
bacteria_records = [clean_rec(record) for record in recs if record.id in bacteria_df.contig_key.values]

print("There are ", len(bacteria_records), " bacteria clusters.")

SeqIO.write(bacteria_records, open('../data/annotation/bacterial_clusters.fasta', 'w'), 'fasta')

There are  1564  bacteria clusters.


1564

In [100]:
all_records = [clean_rec(record) for record in recs]

print("There are ", len(all_records), "  clusters.")

SeqIO.write(viral_records, open('../data/annotation/all_clusters.fasta', 'w'), 'fasta')

There are  82161   clusters.


340

# Chimera Hunter

In [32]:
any(df[df['cluster'] == int(id)].kingdom == "Viruses")

False

In [33]:
for id, cluster in clusters.items():
    lengths = sorted([member['length'] for member in cluster])
    if len(lengths) > 2:
        if lengths[-1] > 1.5*lengths[-2]:
            if (df[df['cluster'] == int(id)].kingdom == 'Viruses').any():
                print(lengths)

[502, 515, 524, 578, 594, 627, 629, 684, 707, 708, 795, 2245]
[505, 523, 544, 603, 612, 612, 617, 626, 647, 654, 712, 765, 778, 1614, 2571]
[520, 550, 703, 2273]
[525, 1103, 1753]
[535, 745, 1511]
[561, 577, 992, 2240]
[524, 529, 591, 724, 746, 13844]
[676, 923, 992, 2041]
[650, 739, 2041, 2051, 2116, 3729, 7060]
[636, 661, 1843]
[3867, 3907, 3909, 3912, 3918, 3931, 3946, 7067]
[737, 794, 1408]
[559, 568, 3034]
[503, 528, 550, 554, 586, 587, 617, 659, 664, 675, 695, 709, 783, 841, 1449, 1486, 1497, 2068, 2197, 3766]
[562, 596, 643, 9781]
[507, 626, 645, 823, 1504]
[509, 519, 535, 568, 623, 698, 905, 988, 1083, 1380, 1772, 7146]
[823, 849, 854, 867, 867, 884, 1697]
[507, 518, 542, 546, 550, 586, 650, 678, 691, 729, 744, 769, 798, 837, 918, 1119, 1552, 4969]
[507, 523, 564, 2052, 3174]
[805, 845, 3196]
[501, 509, 521, 529, 530, 542, 545, 573, 576, 577, 600, 627, 635, 644, 678, 718, 753, 808, 829, 902, 946, 1102, 1430, 1475, 1532, 1535, 1995, 2059, 2845, 7155]


# Scratch

For each cluster, we want the most specific mapping.

In [466]:
from statistics import mode
def mode(lst):
    if len(lst) > 0:
        return max(set(lst), key=lst.count)
    else:
        return None
def get_cluster_leaf(cluster):
    taxid_list = [member.taxid for member in cluster]
    if all([taxid is None for taxid in taxid_list]):
        return None
    leaf = None
    lineage = []
    for taxid in taxid_list:
        if taxid in lineage or taxid is None:
            continue
        else:
            new_lineage = ncbi.get_lineage(taxid)
            if leaf is None or leaf in new_lineage:
                leaf = taxid
                lineage = new_lineage
            else:
                return -1
    return leaf

def get_cluster_mode(cluster):
    taxid_list = [member.taxid for member in cluster]
    return mode(taxid_list)

def get_cluster_rep(cluster):
    for member in cluster:
        if member.is_ref:
            return member

# Merge Viral DF With Annotations

In [1]:
import pandas as pd

In [43]:
annotated = pd.read_csv('/Users/josh/src/skeeters/data/annotation/viral_clusters_annotated_amy.csv')

In [44]:
annotated = annotated[['cluster', 'manual_blastn', 'manual_blastx', 'manual_blast_notes', 'follow_up_notes']]

In [46]:
annotations = annotated[annotated.drop('cluster', axis = 1).isna().sum(axis = 1) < 4]
annotations.to_csv('../data/annotation/riboviria_annotations.csv')

In [47]:
viral_df = pd.read_csv('../data/annotation/viral_clusters.csv')

In [50]:
viral_df_annotated = viral_df.merge(annotations, how='left', on='cluster')
viral_df_annotated.to_csv('../data/annotation/viral_clusters_annotated.csv')

In [53]:
rdrps = pd.read_csv('../data/annotation/hmm_rdrp.tsv', sep='\t',
                    columns=['contig_key', ])

In [54]:
rdrps

Unnamed: 0,CMS002_026d_Rb_S149_L004|NODE_3_length_2417_cov_14.070085,OrthomyxoRdRp_PF00602_full
0,CMS002_045f_Rb_S189_L004|NODE_1_length_6613_co...,BunyaRdRp_PF04196_full
1,CMS002_045f_Rb_S189_L004|NODE_2_length_2972_co...,"PicornaRdRp_PF00680_full,LuteoTotiRotaRdRp_PF0..."
2,CMS002_045f_Rb_S189_L004|NODE_4_length_2425_co...,OrthomyxoRdRp_PF00602_full
3,CMS002_045d_Rb_S186_L004|NODE_2_length_3060_co...,"PicornaRdRp_PF00680_full,LuteoTotiRotaRdRp_PF0..."
4,CMS002_026a_Rb_S146_L004|NODE_1_length_6613_co...,BunyaRdRp_PF04196_full
5,CMS001_042_Ra_S23|NODE_1_length_7464_cov_55.51...,"BunyaRdRp_PF04196_full,ArenaRdRp_PF06317_full,..."
6,CMS001_042_Ra_S23|NODE_4_length_3148_cov_206.4...,MitoRdRp_PF05919_full
7,CMS002_029c_Rb_S161_L004|NODE_1_length_7463_co...,"BunyaRdRp_PF04196_full,ArenaRdRp_PF06317_full,..."
8,CMS002_029c_Rb_S161_L004|NODE_2_length_7155_co...,"LuteoTotiRotaRdRp_PF02123_full,PicornaRdRp_PF0..."
9,CMS002_029c_Rb_S161_L004|NODE_4_length_4950_co...,"BromoTobamoTogaRdRp_PF00978_full,PicornaRdRp_P..."
