# Merge LCA and Curated virus calls

In [1]:
import pandas as pd

In [2]:
# Load the contig-level BLAST/LCA calls and tag every row as having a BLAST result.
lca = pd.read_csv('../data/s3/contig_quality_concat/contig_stats_lca.tsv', sep='\t')
lca['blasted'] = True
# Suffix the BLAST-derived columns so they stay distinguishable from the curated
# calls once the two tables are merged.
lca = lca.rename(columns={'taxid': 'taxid_blast', 'taxon_group': 'taxon_group_blast'})
# Contigs flagged `hexapoda` are assigned to the 'Hexapoda' group regardless of their LCA group.
# Vectorised replacement for a row-wise apply; astype(bool) keeps Python truthiness semantics.
lca['taxon_group_blast'] = lca['taxon_group_blast'].mask(lca['hexapoda'].astype(bool), 'Hexapoda')

# Contig statistics table; joined to the curated contigs on sample/contig_name further down.
read_counts = pd.read_csv('../data/s3/contig_quality_concat/contig_stats_all.tsv', sep='\t')

In [3]:
lca.head()

Unnamed: 0,sample,contig_name,contig_length,read_count,nt,nr,hexapoda,nt_or_nr,taxid_blast,bitscore,identity,align_length,taxon_group_blast,blasted
0,CMS001_001_Ra_S1,NODE_5_length_5116_cov_2209.828736,5116,186698,True,True,False,nt,2559587.0,5016.0,89.666,3948.0,Viruses,True
1,CMS001_001_Ra_S1,NODE_2_length_6399_cov_55.991142,6399,6127,False,True,False,nr,10239.0,1869.7,45.48,2137.0,Viruses,True
2,CMS001_001_Ra_S1,NODE_304_length_858_cov_335.376440,858,5024,True,False,False,nt,5654.0,944.0,91.180835,703.0,Eukaryota,True
3,CMS001_001_Ra_S1,NODE_11_length_3224_cov_1346.461710,3224,74963,True,True,False,nt,131567.0,2052.0,79.313,3055.0,Ambiguous,True
4,CMS001_001_Ra_S1,NODE_1088_length_508_cov_3732.148492,508,25176,True,False,False,nt,2304480.0,871.0,98.971019,486.0,Viruses,True


In [4]:
# Cluster -> virus annotation table; joined to the cluster membership table on 'cluster' below.
cluster_to_virus = pd.read_csv('../data/darkmatter/clusters_to_virus.csv')

In [5]:
# Cluster membership table; 'contig' is renamed so it shares the 'contig_name' key
# used by the other contig tables.
cluster_to_contigs = (
    pd.read_csv('../data/darkmatter/cluster_contig.csv')
    .rename(columns={'contig': 'contig_name'})
)

In [6]:
# One row per contig belonging to a curated virus cluster, with its contig statistics attached.
# `read_counts` here must be the contig_stats_all table loaded at the top of the notebook.
curated_viruses = (
    cluster_to_contigs
    .merge(cluster_to_virus, on='cluster', how='right', validate='many_to_one')
    .merge(read_counts, on=['sample', 'contig_name'], how='left', validate='one_to_one')
    .assign(curated=True)
)
# A contig counts as polymerase-bearing when its segment label mentions 'rdrp'
# (case-insensitive) or the segment is the whole 'genome'.
curated_viruses['rdrp'] = (
    curated_viruses['segment'].str.lower().str.contains('rdrp')
    | (curated_viruses['segment'] == 'genome')
)

curated_viruses = curated_viruses.rename(columns={'submission_taxid': 'taxid_curated'})

In [7]:
curated_viruses['rdrp'].sum()

663

In [8]:
# Outer-join BLAST calls with curated calls so contigs present in only one table are kept.
# Rows missing from one side get explicit defaults: not blasted / not curated / not RdRp,
# and contigs without a BLAST group are labelled 'Dark'.
contig_calls = (
    lca
    .merge(curated_viruses,
           how='outer',
           on=['sample', 'contig_name', 'contig_length', 'read_count'])
    .fillna({'blasted': False,
             'curated': False,
             'taxon_group_blast': 'Dark',
             'rdrp': False})
)

In [9]:
# Curated calls take precedence over BLAST calls: curated contigs are 'Viruses' with the
# curated taxid, everything else keeps its BLAST group and taxid.
# Vectorised with mask/where instead of two row-wise DataFrame.apply passes over the full table.
contig_calls['taxon_group'] = contig_calls['taxon_group_blast'].mask(
    contig_calls['curated'].astype(bool), 'Viruses')
contig_calls['taxid'] = contig_calls['taxid_curated'].where(
    contig_calls['curated'].astype(bool), contig_calls['taxid_blast'])

In [10]:
contig_calls.head()

Unnamed: 0,sample,contig_name,contig_length,read_count,nt,nr,hexapoda,nt_or_nr,taxid_blast,bitscore,...,cluster,name,poly_group,provisional_name,segment,taxid_curated,curated,rdrp,taxon_group,taxid
0,CMS001_001_Ra_S1,NODE_5_length_5116_cov_2209.828736,5116,186698,True,True,False,nt,2559587.0,5016.0,...,128.0,Hubei mosquito virus 4,128.0,Hubei mosquito virus 4,genome,1922928.0,True,True,Viruses,1922928.0
1,CMS001_001_Ra_S1,NODE_2_length_6399_cov_55.991142,6399,6127,False,True,False,nr,10239.0,1869.7,...,76.0,TBD,76.0,76|Phasma-like,RdRp_L,2546221.0,True,True,Viruses,2546221.0
2,CMS001_001_Ra_S1,NODE_304_length_858_cov_335.376440,858,5024,True,False,False,nt,5654.0,944.0,...,,,,,,,False,False,Eukaryota,5654.0
3,CMS001_001_Ra_S1,NODE_11_length_3224_cov_1346.461710,3224,74963,True,True,False,nt,131567.0,2052.0,...,,,,,,,False,False,Ambiguous,131567.0
4,CMS001_001_Ra_S1,NODE_1088_length_508_cov_3732.148492,508,25176,True,False,False,nt,2304480.0,871.0,...,,,,,,,False,False,Viruses,2304480.0


Total reads per taxonomic group

In [11]:
contig_calls.groupby('taxon_group')['read_count'].sum()

taxon_group
Ambiguous      572414
Archaea           168
Bacteria       690251
Eukaryota      641345
Hexapoda      4090187
Metazoa        219258
Viruses      10938605
Name: read_count, dtype: int64

Sources of curated reads.

In [12]:
contig_calls[contig_calls['curated']].groupby(['taxon_group_blast'])['read_count'].sum()

taxon_group_blast
Ambiguous     595798
Bacteria       95171
Dark          384433
Eukaryota      37772
Hexapoda      547910
Viruses      8902247
Name: read_count, dtype: int64

Dark matter reads reclassified as virus: some are polymerase, some are other segments.

In [13]:
contig_calls[contig_calls['taxon_group_blast'] == 'Dark'].groupby(['rdrp'])['read_count'].sum()

rdrp
False    338472
True      45961
Name: read_count, dtype: int64

In [14]:
# Persist the merged per-contig calls (BLAST + curated) as TSV, without the index column.
contig_calls.to_csv('../data/s3/contig_quality_concat/contig_calls.tsv',
                    sep='\t', index = False)

# Decontamination

We compute the average concentration of each taxon in the water samples. Markov's inequality says that for any nonnegative random variable $X$ and any $k > 0$, the probability that $X$ is at least $k$ is at most $\mathbb{E}X/k$. Thus for any false-discovery rate $r$, the probability that $X \geq \mathbb{E}X/r$ is at most $\mathbb{E}X/(\mathbb{E}X/r) = r$. If we treat the concentration of a contaminating taxon in each well as a random variable, we can estimate its mean from the water samples and use that estimate (via Markov's inequality) to bound the amount of that taxon in any other sample that can be explained by contamination.

In [15]:
# Per-sample metadata; the columns used below are sample, total_reads, nonhost_reads,
# total_ercc_reads and compression_ratio.
sample_table = pd.read_csv('../data/metadata/idseq_metadata.csv')

In [16]:
# Flag water samples by name ('ater' matches both 'Water' and 'water'), keep only the
# columns needed downstream, then derive two per-sample ratios:
#   input_conc   - non-ERCC reads per ERCC read
#   nonhost_frac - nonhost reads, scaled by the compression ratio, as a fraction of total reads
sample_table = (
    sample_table
    .assign(water=sample_table['sample'].str.contains('ater'))
    [['sample',
      'total_reads',
      'nonhost_reads',
      'total_ercc_reads',
      'compression_ratio',
      'water']]
    .assign(
        input_conc=lambda t: t.eval('(total_reads - total_ercc_reads)/total_ercc_reads'),
        nonhost_frac=lambda t: t.eval('nonhost_reads*compression_ratio/total_reads'),
    )
)
# Number of water samples; decontam() divides the summed water levels by this to get a mean.
n_water = sum(sample_table['water'])

In [17]:
def sum_over(df, category):
    """Total `read_count` for every (category, sample) pair.

    Parameters
    ----------
    df : DataFrame containing the columns `category`, 'sample' and 'read_count'.
    category : name of the grouping column (e.g. 'poly_group' or 'taxid').

    Returns
    -------
    DataFrame with columns [category, 'sample', 'read_count'], one row per
    (category, sample) pair and a fresh integer index.
    """
    keys = [category, 'sample']
    totals = df[keys + ['read_count']].groupby(keys).sum()
    return totals.reset_index()

In [18]:
# Per-(poly_group, sample) read totals over all contigs.
# NOTE(review): this rebinds `read_counts`, which earlier held the contig_stats_all table that
# the curated_viruses cell merges against; re-running that cell after this one would pick up
# the wrong frame. The value assigned here does not appear to be used by the cells that follow
# (decontam() takes its own `read_counts` parameter) - confirm and consider renaming.
read_counts = sum_over(contig_calls, 'poly_group')

In [19]:
def decontam(read_counts, category, fdr):
    """Flag per-sample counts that the water-sample background could explain.

    Relies on the module-level `sample_table` and `n_water`.

    Parameters
    ----------
    read_counts : DataFrame with columns `category`, 'sample' and 'read_count'
        (the shape produced by `sum_over`).
    category : name of the grouping column in `read_counts`.
    fdr : divisor applied to the expected background; a count is flagged when it is
        strictly below ``background_reads / fdr``.

    Returns
    -------
    DataFrame with columns ['sample', category, 'is_contam', 'read_count'],
    restricted to non-water samples.
    """
    # Background level per category: reads per ERCC read (scaled by the compression
    # ratio) summed over the water samples and divided by the number of water samples.
    water_rows = read_counts.merge(sample_table[sample_table['water']],
                                   on='sample', validate='many_to_one')
    water_rows = water_rows.eval('r_per_ercc = read_count*compression_ratio/total_ercc_reads')
    background_levels = water_rows[['r_per_ercc', category]].groupby(category).sum() / n_water

    # Attach sample metadata and keep only the non-water samples.
    annotated = read_counts.merge(sample_table, on='sample', how='left')
    non_water = annotated[~annotated['water']]

    # Convert the background level back into an expected read count for each sample;
    # categories never seen in water get a background of 0.
    scored = (non_water
              .merge(background_levels, on=category, how='left')
              .fillna({'r_per_ercc': 0})
              .eval('background_reads = total_ercc_reads*r_per_ercc/compression_ratio'))
    scored['is_contam'] = scored['read_count'] < scored['background_reads'] / fdr

    return scored[['sample', category, 'is_contam', 'read_count']]

## Curated Viruses

In [20]:
# Per-(poly_group, sample) read totals over curated contigs only, screened against the
# water background with fdr = 0.01.
viral_df = decontam(
    sum_over(contig_calls[contig_calls['curated']], 'poly_group'),
    'poly_group',
    0.01,
)

In [21]:
viral_df.groupby('is_contam')['read_count'].sum()

is_contam
False    10555472
True         1417
Name: read_count, dtype: int64

Viral contamination: 1417 reads.

In [22]:
# Write contam and decontam files
# Reads that survive decontamination, one row per (sample, poly_group).
viral_decontam = (viral_df[~viral_df['is_contam']]
                  .filter(['sample', 'poly_group', 'read_count'])
                  .rename(columns={'read_count': 'reads'}))
viral_decontam.to_csv('../data/s3/contig_quality_concat/viral_decontam.tsv',
                      sep='\t', index = False)

# Reads flagged as contamination go to their own file. This call previously reused the
# viral_decontam.tsv path and so overwrote the decontaminated table written just above.
viral_contamination = (viral_df[viral_df['is_contam']]
                       .filter(['sample', 'poly_group', 'read_count'])
                       .rename(columns={'read_count': 'reads'}))
viral_contamination.to_csv('../data/s3/contig_quality_concat/viral_contam.tsv',
                           sep='\t', index = False)

In [23]:
viral_contamination.groupby('poly_group').sum()

Unnamed: 0_level_0,reads
poly_group,Unnamed: 1_level_1
44.0,706
458.0,544
493.0,167


## Other Taxa

Compute contaminants for all taxa, but only use for uncurated counts.

In [24]:
# Drop contigs flagged as hexapoda unless they were curated.
# The index is reset to 0..n-1 because the contamination masks built later come from
# merges (which produce a fresh integer index) and are applied to this frame by label.
contig_calls_no_hex = (
    contig_calls[~(contig_calls['hexapoda'] == True) | contig_calls['curated']]
    .reset_index(drop=True)
)

In [25]:
# Screen every taxid (curated and uncurated contigs together) against the water
# background with fdr = 0.01.
all_decontam = decontam(sum_over(contig_calls_no_hex, 'taxid'), 'taxid', 0.01)

In [26]:
# Uncurated contigs from non-water samples, summed per (taxid, sample) and annotated
# with the contamination flag computed over all taxa.
lca_df = contig_calls_no_hex[
    ~contig_calls_no_hex['curated']
    & ~contig_calls_no_hex['sample'].str.contains('ater')
]
lca_df = sum_over(lca_df, 'taxid').merge(
    all_decontam[['sample', 'taxid', 'is_contam']],
    on=['sample', 'taxid'],
    how='left',
)

In [27]:
lca_df.groupby('is_contam')['read_count'].sum()

is_contam
False    2459342
True       27720
Name: read_count, dtype: int64

In [28]:
# Write contam and decontam files
# Uncurated reads that survive decontamination, one row per (sample, taxid).
lca_decontam = (lca_df[~lca_df['is_contam']]
                .filter(['sample', 'taxid', 'read_count'])
                .rename(columns={'read_count': 'reads'}))
lca_decontam.to_csv('../data/s3/contig_quality_concat/lca_decontam.tsv',
                    sep='\t', index = False)

# Reads flagged as contamination go to their own file. This call previously reused the
# lca_decontam.tsv path and so overwrote the decontaminated table written just above.
lca_contamination = (lca_df[lca_df['is_contam']]
                     .filter(['sample', 'taxid', 'read_count'])
                     .rename(columns={'read_count': 'reads'}))
lca_contamination.to_csv('../data/s3/contig_quality_concat/lca_contam.tsv',
                         sep='\t', index = False)

Output all contigs not part of contamination.

In [29]:
lca_contamination.groupby('taxid').sum()

Unnamed: 0_level_0,reads
taxid,Unnamed: 1_level_1
1.0,13316
2.0,4134
137.0,7
469.0,7
953.0,197
...,...
1783272.0,8
1925501.0,128
2057741.0,4
2304480.0,5


## Final decontam contigs

Strip contigs with taxa removed during decontam, and hexapoda.

In [30]:
# Boolean mask over contig_calls_no_hex: True where the contig's (sample, poly_group) was
# flagged as contamination. The mask lines up with contig_calls_no_hex only if the left
# merge keeps exactly one output row per input row, so `validate` enforces that the
# right-hand keys are unique instead of letting duplicates misalign the mask silently.
viral_contam_idx = (contig_calls_no_hex.merge(viral_df[['sample', 'poly_group', 'is_contam']],
                          on = ['sample', 'poly_group'], how = 'left',
                          validate = 'many_to_one')['is_contam'] == True)

In [31]:
# Attach the per-(sample, taxid) contamination flag to every contig. `validate` guarantees one
# output row per contig so the resulting mask stays aligned with contig_calls_no_hex.
lca_contam_loaded = contig_calls_no_hex.merge(lca_df[['sample', 'taxid', 'is_contam']],
                          on = ['sample', 'taxid'], how = 'left',
                          validate = 'many_to_one')

# The taxid-level flag is only applied to uncurated contigs; curated contigs are judged
# by the poly_group screen instead.
lca_contam_idx = (lca_contam_loaded['is_contam'] == True) & (lca_contam_loaded['curated'] == False)

In [32]:
# Rows flagged by neither the curated (poly_group) screen nor the uncurated (taxid) screen.
# NOTE(review): the filtering cell that follows rebuilds this mask from its two inputs
# instead of using this name - confirm whether this variable is still needed.
not_contam_idx = ~(viral_contam_idx | lca_contam_idx)

In [33]:
# Keep only the contigs flagged by neither contamination screen.
contig_calls_decontam = contig_calls_no_hex.loc[~viral_contam_idx & ~lca_contam_idx]

In [34]:
# Remove the water samples themselves from the final table.
contig_calls_decontam = contig_calls_decontam.loc[
    lambda calls: ~calls['sample'].str.contains('ater')
]

In [35]:
# Persist the final contig table (hexapoda, flagged contamination and water samples removed);
# the Validation section reloads it from this path.
contig_calls_decontam.to_csv('../data/s3/contig_quality_concat/contig_calls_decontam.tsv',
                             sep='\t', index = False)

In [36]:
contig_calls.groupby('taxon_group')['read_count'].sum()

taxon_group
Ambiguous      572414
Archaea           168
Bacteria       690251
Eukaryota      641345
Hexapoda      4090187
Metazoa        219258
Viruses      10938605
Name: read_count, dtype: int64

In [37]:
contig_calls_decontam.groupby('taxon_group')['read_count'].sum()

taxon_group
Ambiguous      553300
Archaea           138
Bacteria       680711
Eukaryota      636288
Metazoa        214209
Viruses      10930168
Name: read_count, dtype: int64

Contamination

In [38]:
 contig_calls_no_hex.loc[(viral_contam_idx | lca_contam_idx)].groupby('taxon_group')['read_count'].sum()

taxon_group
Ambiguous    15679
Bacteria      4820
Eukaryota     4383
Metazoa       2623
Viruses       1632
Name: read_count, dtype: int64

# Validation

In [39]:
from ete3 import NCBITaxa
ncbi = NCBITaxa()
def taxid2name(taxid):
    """Return the name the module-level `ncbi` taxonomy object maps `taxid` to.

    Raises KeyError when the translator returns no entry for `taxid`.
    """
    names_by_taxid = ncbi.get_taxid_translator([taxid])
    return names_by_taxid[taxid]

In [40]:
# Reload the decontaminated contig table from the TSV written above, so the validation
# cells can be run from the file on disk.
contig_calls_decontam = pd.read_csv('../data/s3/contig_quality_concat/contig_calls_decontam.tsv',
                                    sep='\t')

In [41]:
contig_calls_decontam.head()

Unnamed: 0,sample,contig_name,contig_length,read_count,nt,nr,hexapoda,nt_or_nr,taxid_blast,bitscore,...,cluster,name,poly_group,provisional_name,segment,taxid_curated,curated,rdrp,taxon_group,taxid
0,CMS001_001_Ra_S1,NODE_5_length_5116_cov_2209.828736,5116,186698,True,True,False,nt,2559587.0,5016.0,...,128.0,Hubei mosquito virus 4,128.0,Hubei mosquito virus 4,genome,1922928.0,True,True,Viruses,1922928.0
1,CMS001_001_Ra_S1,NODE_2_length_6399_cov_55.991142,6399,6127,False,True,False,nr,10239.0,1869.7,...,76.0,TBD,76.0,76|Phasma-like,RdRp_L,2546221.0,True,True,Viruses,2546221.0
2,CMS001_001_Ra_S1,NODE_304_length_858_cov_335.376440,858,5024,True,False,False,nt,5654.0,944.0,...,,,,,,,False,False,Eukaryota,5654.0
3,CMS001_001_Ra_S1,NODE_11_length_3224_cov_1346.461710,3224,74963,True,True,False,nt,131567.0,2052.0,...,,,,,,,False,False,Ambiguous,131567.0
4,CMS001_001_Ra_S1,NODE_1088_length_508_cov_3732.148492,508,25176,True,False,False,nt,2304480.0,871.0,...,,,,,,,False,False,Viruses,2304480.0


In [42]:
# Map the taxid of each clade of interest to its name: {taxid: clade name}.
# get_name_translator returns a list of taxids per name; the first one is used.
partition = {
    taxids[0]: clade_name
    for clade_name, taxids in ncbi.get_name_translator(
        "Apicomplexa Crithidia Pecora Carnivora Homininae Rodentia Leporidae Aves".split()
    ).items()
}

def get_category(taxid):
    """Name of the `partition` clade that `taxid` belongs to.

    Uses the module-level `ncbi` taxonomy object and the `partition` dict
    ({clade taxid: clade name}).

    Returns
    -------
    None  when `taxid` is falsy (None or 0);
    the clade name of the first `partition` key found in the taxid's lineage;
    'NA'  when no `partition` clade is in the lineage, or when the lineage lookup
          rejects the taxid (e.g. NaN, or a taxid the taxonomy database does not know).
    """
    if not taxid:
        return None
    try:
        lineage = ncbi.get_lineage(taxid)
    except (ValueError, TypeError, OverflowError):
        # Only "bad taxid" failures are treated as 'NA': NaN / non-numeric values fail the
        # integer conversion and unknown taxids are reported as ValueError by the lookup.
        # Anything else (e.g. a broken taxonomy database) now propagates instead of
        # silently labelling every row 'NA'.
        return 'NA'
    for clade_taxid, clade_name in partition.items():
        if clade_taxid in lineage:
            return clade_name
    return 'NA'

In [43]:
partition

{5655: 'Crithidia',
 5794: 'Apicomplexa',
 8782: 'Aves',
 9979: 'Leporidae',
 9989: 'Rodentia',
 33554: 'Carnivora',
 35500: 'Pecora',
 207598: 'Homininae'}

In [44]:
# Label each contig with the first `partition` clade found in its taxid's lineage
# ('NA' when none matches or the lookup fails).
category = contig_calls_decontam['taxid'].apply(get_category)
contig_calls_decontam['category'] = category



In [45]:
contig_calls_decontam[['category', 'read_count']].groupby('category').sum()

Unnamed: 0_level_0,read_count
category,Unnamed: 1_level_1
Apicomplexa,6022
Aves,50572
Carnivora,5304
Crithidia,144563
Homininae,6313
Leporidae,903
,12771133
Pecora,17306
Rodentia,12698


In [46]:
# Same clade breakdown on the LCA table as loaded from disk, for comparison with the
# decontaminated totals above.
# Note: this rebinds `lca` (now without the renamed *_blast columns) and `category`.
lca = pd.read_csv('../data/s3/contig_quality_concat/contig_stats_lca.tsv', sep='\t')
category = lca['taxid'].apply(get_category)
lca['category'] = category
lca.groupby('category')[['read_count']].sum()



Unnamed: 0_level_0,read_count
category,Unnamed: 1_level_1
Apicomplexa,6047
Aves,52469
Carnivora,5310
Crithidia,144563
Homininae,8255
Leporidae,903
,16519487
Pecora,17306
Rodentia,13455


In [57]:
# Contigs assigned to Apicomplexa; .copy() so the column added next does not touch
# contig_calls_decontam.
apicomplexa = contig_calls_decontam[contig_calls_decontam['category'] == 'Apicomplexa'].copy()

In [58]:
# Replace the existing 'name' column (carried over from the curated-virus table) with the
# taxonomy name of each contig's taxid.
apicomplexa['name'] = apicomplexa['taxid'].apply(taxid2name)

In [60]:
apicomplexa.groupby(['sample', 'name'])['read_count'].sum()

sample                  name                           
CMS001_001_Ra_S1        Eimeria maxima                     12
                        Plasmodium chabaudi chabaudi       20
                        Plasmodium cynomolgi               14
                        Plasmodium ovale                   22
CMS001_003_Ra_S2        Eimeria acervulina                  4
                                                           ..
CMS002_053a_Rb_S7_L004  Plasmodium ovale                   10
                        Plasmodium sp. gorilla clade G2     4
                        Plasmodium vivax                    9
                        Plasmodium yoelii killicki          3
CMS002_056a_Rb_S9_L004  Plasmodium yoelii                   6
Name: read_count, Length: 223, dtype: int64