This notebook looks at the BLAST results.

In [1]:
setwd("..")

In [12]:
library(dplyr)
library(readr)
library(tidyr)
library(purrr)
library(rentrez)

## Functions

In [7]:
read_blast <- function(blast_path){
  read_csv(blast_path, col_names = c("qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", 
                                     "qstart", "qend", "sstart", "send", "evalue", "bitscore"),
          show_col_types = F)
}

## Read and format BLAST results

In [13]:
blast_results <- Sys.glob("outputs/orpheum_species_unmapped/04_blast/*csv") %>%
  set_names() %>%
  map_dfr(read_blast, .id = "species") %>%
  mutate(species = basename(species)) %>%
  separate(species, into = c("accession", "species", "blast"), sep = "-")

blast_results_filtered <- blast_results %>%
  group_by(species, qseqid) %>%
  arrange(desc(bitscore)) %>%
  slice_head(n = 1)

“Expected 3 pieces. Additional pieces discarded in 2482 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].”


In [14]:
#  remove those that don't have matches to rentrez
sseqid <- unique(blast_results_filtered$sseqid)
sseqid <- sseqid[!sseqid %in% c("EHA9991508.1", "MCE9000919.1")]

In [15]:
sseqid_species_all <- data.frame()
for(i in sseqid){
  print(i)
  sseqid_species <- entrez_summary(db="protein", id = i)
  sseqid_species <- sseqid_species[27]
  sseqid_species_df <- data.frame(sseqid = i, species = sseqid_species)
  sseqid_species_all <- bind_rows(sseqid_species_all, sseqid_species_df)
}

[1] "EXY10881.1"
[1] "WP_045899403.1"
[1] "EEZ26295.1"
[1] "KAB4180210.1"
[1] "BCI62864.1"
[1] "EEX46026.1"
[1] "WP_195578804.1"
[1] "KAB3874055.1"
[1] "WP_005648621.1"
[1] "WP_234076625.1"
[1] "RJW86473.1"
[1] "EDP16705.1"
[1] "CUN93128.1"
[1] "SCH31575.1"
[1] "WP_121961437.1"
[1] "WP_032934790.1"
[1] "CCZ69406.1"
[1] "WP_005868532.1"
[1] "OKZ29873.1"
[1] "RGR09210.1"
[1] "WP_230595604.1"
[1] "ALK83770.1"
[1] "WP_032539464.1"
[1] "WP_195655936.1"
[1] "KHS58642.1"


In [16]:
tmp <- data.frame(sseqid = c("EHA9991508.1", "MCE9000919.1"),
                  species = c("Escherichia coli", "Bacteroides fragilis"))

In [17]:
sseqid_species_all <- bind_rows(sseqid_species_all, tmp)

In [18]:
blast_results_filtered <- left_join(blast_results_filtered, sseqid_species_all, by = "sseqid")

In [19]:
blast_results_filtered %>% select(species.x, organism)

[1m[22mAdding missing grouping variables: `qseqid`


qseqid,species.x,organism
<chr>,<chr>,<chr>
HSM67VF9_00001,s__Bacteroides_fragilis,Bacteroides fragilis str. 1007-1-F #8
HSM67VF9_00002,s__Bacteroides_fragilis,Bacteroides
HSM67VF9_00004,s__Bacteroides_fragilis,Bacteroides fragilis
HSM67VFJ_00001,s__Bacteroides_fragilis,Bacteroides fragilis
HSM67VFJ_00002,s__Bacteroides_fragilis,Bacteroides fragilis str. 1007-1-F #8
HSM67VFJ_00003,s__Bacteroides_fragilis,Bacteroides
HSM67VFJ_00004,s__Bacteroides_fragilis,
HSM6XRQO_00001,s__Bacteroides_fragilis,Bacteroides fragilis str. 1007-1-F #8
HSM7CYY7_00001,s__Bacteroides_fragilis,Bacteroides
HSM7CYY7_00002,s__Bacteroides_fragilis,Bacteroides fragilis
