This notebook looks at the differences between Jaccard similarity and Jaccard containment.

In [1]:
# Move the working directory up one level so that the relative "outputs/..."
# and "inputs/..." paths read below resolve from the parent directory.
# NOTE(review): this is relative, so every re-run of this cell climbs one more
# level; restart the kernel before re-executing the notebook from the top.
setwd("..")

In [3]:
library(readr)
library(dplyr)
library(tidyr)

In [6]:
# Read the pairwise Jaccard similarity matrix for HSM67VF9 and reshape it to
# long format: one row per (sample, sample2) pair.
# The column headers are reused as row labels.
# NOTE(review): this assumes the matrix is square, with rows in the same order
# as the columns -- confirm against the step that writes this CSV.
comp <- read_csv(
  "outputs/orpheum_compare/HSM67VF9_comp.csv",
  show_col_types = FALSE # `FALSE`, not `F`: `F` is a reassignable variable
) %>%
  mutate(sample = colnames(.)) %>%
  pivot_longer(
    cols = -sample,
    names_to = "sample2",
    values_to = "jaccard_similarity"
  )

In [22]:
# Preview the first rows of the long-format Jaccard similarity table.
head(comp)

sample,sample2,jaccard_similarity
<chr>,<chr>,<dbl>
s__Phocaeicola_vulgatus,s__Phocaeicola_vulgatus,1.0
s__Phocaeicola_vulgatus,s__Bacteroides_uniformis,0.09333552
s__Phocaeicola_vulgatus,s__Clostridium_Q_symbiosum,5.823433e-05
s__Phocaeicola_vulgatus,s__Parabacteroides_distasonis,0.04755351
s__Phocaeicola_vulgatus,s__Ruminococcus_B_gnavus,0.0001747335
s__Phocaeicola_vulgatus,s__Parabacteroides_merdae,0.04365497


In [8]:
# Build an accession -> species lookup from the GTDB rs202 taxonomy table,
# restricted to accessions and species that both appear as labels in `comp`.
lineages <- read_csv(
  "inputs/gtdb-rs202.taxonomy.v2.csv",
  show_col_types = FALSE # `FALSE`, not `F`: `F` is a reassignable variable
) %>%
  # Keep only genomes whose accession is one of the compared samples.
  filter(ident %in% comp$sample) %>%
  # Replace spaces with underscores so species names can be matched against
  # the labels in `comp`; the pattern is a literal space, hence fixed = TRUE.
  mutate(species = gsub(" ", "_", species, fixed = TRUE)) %>%
  # Keep only species that are themselves one of the compared samples.
  filter(species %in% comp$sample) %>%
  select(gtdb_accession = ident, gtdb_species = species)

In [23]:
# Preview the first rows of the accession-to-species lookup table.
head(lineages)

gtdb_accession,gtdb_species
<chr>,<chr>
GCF_009020325.1,s__Bacteroides_uniformis
GCF_003458955.1,s__Bacteroides_fragilis
GCA_000162535.1,s__Parabacteroides_distasonis
GCF_009025805.1,s__Phocaeicola_vulgatus
GCF_002865385.1,s__Ruminococcus_B_gnavus
GCF_003475305.1,s__Parabacteroides_merdae


In [9]:
# Attach the GTDB species to every row of `comp` whose `sample` is a genome
# accession, then keep only the comparison of each accession against its own
# species label. Rows whose `sample` has no match in `lineages` get an NA
# species and are dropped by the filter.
tmp <- comp %>%
  left_join(lineages, by = c(sample = "gtdb_accession")) %>%
  filter(gtdb_species == sample2)

In [13]:
# Display the Jaccard similarity of each accession against its own GTDB
# species label.
tmp

sample,sample2,jaccard_similarity,gtdb_species
<chr>,<chr>,<dbl>,<chr>
GCF_009025805.1,s__Phocaeicola_vulgatus,0.842259,s__Phocaeicola_vulgatus
GCF_009020325.1,s__Bacteroides_uniformis,0.8039602,s__Bacteroides_uniformis
GCF_000189615.1,s__Clostridium_Q_symbiosum,0.1020408,s__Clostridium_Q_symbiosum
GCA_000162535.1,s__Parabacteroides_distasonis,0.9079019,s__Parabacteroides_distasonis
GCF_002865385.1,s__Ruminococcus_B_gnavus,0.1428571,s__Ruminococcus_B_gnavus
GCF_003475305.1,s__Parabacteroides_merdae,0.8922791,s__Parabacteroides_merdae
GCF_900537995.1,s__Roseburia_intestinalis,0.2511416,s__Roseburia_intestinalis
GCF_003458955.1,s__Bacteroides_fragilis,0.9056434,s__Bacteroides_fragilis
GCF_003433765.1,s__Enterocloster_bolteae,0.3205742,s__Enterocloster_bolteae


In [29]:
# Read the pairwise containment matrix for HSM67VF9 and reshape it to long
# format: one row per (sample, sample2) pair.
# The column headers are reused as row labels.
# NOTE(review): this assumes the matrix is square, with rows in the same order
# as the columns -- confirm against the step that writes this CSV.
containment_comp <- read_csv(
  "outputs/orpheum_compare/HSM67VF9_comp_containment.csv",
  show_col_types = FALSE # `FALSE`, not `F`: `F` is a reassignable variable
) %>%
  mutate(sample = colnames(.)) %>%
  pivot_longer(
    cols = -sample,
    names_to = "sample2",
    values_to = "containment"
  )

In [30]:
# Preview the first rows of the long-format containment table.
head(containment_comp)

sample,sample2,containment
<chr>,<chr>,<dbl>
s__Phocaeicola_vulgatus,s__Phocaeicola_vulgatus,1.0
s__Phocaeicola_vulgatus,s__Bacteroides_uniformis,0.23874346
s__Phocaeicola_vulgatus,s__Clostridium_Q_symbiosum,0.06666667
s__Phocaeicola_vulgatus,s__Parabacteroides_distasonis,0.11159272
s__Phocaeicola_vulgatus,s__Ruminococcus_B_gnavus,0.21428571
s__Phocaeicola_vulgatus,s__Parabacteroides_merdae,0.10368902


In [37]:
# Put containment and Jaccard similarity side by side for each ordered
# (sample, sample2) pair, keeping only pairs of distinct labels where both
# labels contain "s__".
containment_comp %>%
  left_join(comp, by = c("sample", "sample2")) %>%
  filter(
    sample != sample2, # drop self-vs-self comparisons
    grepl("s__", sample),
    grepl("s__", sample2)
  )

sample,sample2,containment,jaccard_similarity
<chr>,<chr>,<dbl>,<dbl>
s__Phocaeicola_vulgatus,s__Bacteroides_uniformis,0.23874346,9.333552e-02
s__Phocaeicola_vulgatus,s__Clostridium_Q_symbiosum,0.06666667,5.823433e-05
s__Phocaeicola_vulgatus,s__Parabacteroides_distasonis,0.11159272,4.755351e-02
s__Phocaeicola_vulgatus,s__Ruminococcus_B_gnavus,0.21428571,1.747335e-04
s__Phocaeicola_vulgatus,s__Parabacteroides_merdae,0.10368902,4.365497e-02
s__Phocaeicola_vulgatus,s__Roseburia_intestinalis,0.03636364,1.162048e-04
s__Phocaeicola_vulgatus,s__Bacteroides_fragilis,0.11875754,6.197866e-02
s__Phocaeicola_vulgatus,s__Enterocloster_bolteae,0.08955224,3.484523e-04
s__Bacteroides_uniformis,s__Phocaeicola_vulgatus,0.23874346,9.333552e-02
s__Bacteroides_uniformis,s__Clostridium_Q_symbiosum,0.06666667,1.045588e-04
