From d95e689980485978ff3f7711082e2ff3b9e78044 Mon Sep 17 00:00:00 2001 From: Gordon Getzinger Date: Thu, 9 Jan 2020 14:15:58 -0500 Subject: [PATCH 1/2] closes #196 Checks html table for missing nodes. If no node found, NA is returned. --- R/chemid.R | 72 ++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 56 insertions(+), 16 deletions(-) diff --git a/R/chemid.R b/R/chemid.R index c8e604cf..ca54bba5 100644 --- a/R/chemid.R +++ b/R/chemid.R @@ -149,22 +149,62 @@ ci_query <- function(query, type = c('name', 'rn', 'inchikey'), source_url <- gsub('^(.*)\\?.*', '\\1', qurl) } - name <- xml_text(xml_find_all(ttt, "//h3[contains(., 'Name of Substance')]/following-sibling::div[1]//li")) - synonyms <- xml_text(xml_find_all(ttt, "//h3[contains(., 'Synonyms')]/following-sibling::div[1]//li")) - cas <- xml_text(xml_find_all(ttt, "//h3[contains(., 'CAS Registry')]/following-sibling::ul[1]//li")) - inchi <- gsub('\\n|\\t', '', - xml_text(xml_find_all(ttt, "//h3[contains(., 'InChI')]/following-sibling::text()[1]"))[1] - ) - inchikey <- gsub('\\n|\\t|\\r', '', - xml_text(xml_find_all(ttt, "//h3[contains(., 'InChIKey')]/following-sibling::text()[1]")) - ) - smiles <- gsub('\\n|\\t|\\r', '', - xml_text(xml_find_all(ttt, "//h3[contains(., 'Smiles')]/following-sibling::text()[1]")) - ) - toxicity <- html_table(xml_find_all(ttt, "//h2[contains(., 'Toxicity')]/following-sibling::div//table"))[[1]] - physprop <- html_table(xml_find_all(ttt, "//h2[contains(., 'Physical Prop')]/following-sibling::div//table"))[[1]] - physprop[ , 'Value'] <- as.numeric(physprop[ , 'Value']) - #= same as physprop + if(is.na(xml_find_first(ttt, "//h3[contains(., 'Name of Substance')]/following-sibling::div[1]//li"))){ + name <- NA + }else{ + name <- xml_text(xml_find_all(ttt, "//h3[contains(., 'Name of Substance')]/following-sibling::div[1]//li")) + } + + if(is.na(xml_find_first(ttt, "//h3[contains(., 'Synonyms')]/following-sibling::div[1]//li"))){ + synonyms <- NA + }else{ + synonyms <- xml_text(xml_find_all(ttt, "//h3[contains(., 'Synonyms')]/following-sibling::div[1]//li")) + } + + if(is.na(xml_find_first(ttt, "//h3[contains(., 'CAS Registry')]/following-sibling::ul[1]//li"))){ + cas <- NA + } else { + cas <- xml_text(xml_find_all(ttt, "//h3[contains(., 'CAS Registry')]/following-sibling::ul[1]//li")) + } + + if(is.na(xml_find_first(ttt, "//h3[contains(., 'InChI')]/following-sibling::text()[1]"))){ + inchi <- NA + } else { + inchi <- gsub('\\n|\\t', '', + xml_text(xml_find_all(ttt, "//h3[contains(., 'InChI')]/following-sibling::text()[1]"))[1] + ) + } + + if(is.na(xml_find_first(ttt, "//h3[contains(., 'InChIKey')]/following-sibling::text()[1]"))){ + inchikey <- NA + } else { + inchikey <- gsub('\\n|\\t|\\r', '', + xml_text(xml_find_all(ttt, "//h3[contains(., 'InChIKey')]/following-sibling::text()[1]")) + ) + } + + if(is.na(xml_find_first(ttt, "//h3[contains(., 'Smiles')]/following-sibling::text()[1]"))){ + smiles <- NA + } else { + smiles <- gsub('\\n|\\t|\\r', '', + xml_text(xml_find_all(ttt, "//h3[contains(., 'Smiles')]/following-sibling::text()[1]")) + ) + } + + if(is.na(xml_find_first(ttt, "//h2[contains(., 'Toxicity')]/following-sibling::div//table"))){ + toxicity <- NA + } else { + toxicity <- html_table(xml_find_all(ttt, "//h2[contains(., 'Toxicity')]/following-sibling::div//table"))[[1]] + } + + if(is.na(xml_find_first(ttt, "//h2[contains(., 'Physical Prop')]/following-sibling::div//table"))){ + physprop <- NA + } else { + physprop <- html_table(xml_find_all(ttt, "//h2[contains(., 'Physical Prop')]/following-sibling::div//table"))[[1]] + physprop[ , 'Value'] <- as.numeric(physprop[ , 'Value']) + #= same as physprop + } + out <- list(name = name, synonyms = synonyms, cas = cas, inchi = inchi, inchikey = inchikey, smiles = smiles, toxicity = toxicity, From 620b713cb42de064682a22aa921ca2df117bcf59 Mon Sep 17 00:00:00 2001 From: Gordon Getzinger Date: Thu, 9 Jan 2020 14:15:58 -0500 Subject: [PATCH 2/2] update news --- NEWS | 2 +- R/chemid.R | 72 ++++++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 57 insertions(+), 17 deletions(-) diff --git a/NEWS b/NEWS index 466b2b8f..3e805afd 100644 --- a/NEWS +++ b/NEWS @@ -14,7 +14,7 @@ BUG FIXES * cs_prop() failed with duplicated return values [issue #148, reported and fixed by @stanstrup] * pp_query() failed when compound present, but no properties [issue #151, reported and fixed by @stanstrup] - +* ci_query() failed when missing table [issue #196, reported and fixed by @gjgetzinger] DEPRECATED FUNCTIONS diff --git a/R/chemid.R b/R/chemid.R index c8e604cf..ca54bba5 100644 --- a/R/chemid.R +++ b/R/chemid.R @@ -149,22 +149,62 @@ ci_query <- function(query, type = c('name', 'rn', 'inchikey'), source_url <- gsub('^(.*)\\?.*', '\\1', qurl) } - name <- xml_text(xml_find_all(ttt, "//h3[contains(., 'Name of Substance')]/following-sibling::div[1]//li")) - synonyms <- xml_text(xml_find_all(ttt, "//h3[contains(., 'Synonyms')]/following-sibling::div[1]//li")) - cas <- xml_text(xml_find_all(ttt, "//h3[contains(., 'CAS Registry')]/following-sibling::ul[1]//li")) - inchi <- gsub('\\n|\\t', '', - xml_text(xml_find_all(ttt, "//h3[contains(., 'InChI')]/following-sibling::text()[1]"))[1] - ) - inchikey <- gsub('\\n|\\t|\\r', '', - xml_text(xml_find_all(ttt, "//h3[contains(., 'InChIKey')]/following-sibling::text()[1]")) - ) - smiles <- gsub('\\n|\\t|\\r', '', - xml_text(xml_find_all(ttt, "//h3[contains(., 'Smiles')]/following-sibling::text()[1]")) - ) - toxicity <- html_table(xml_find_all(ttt, "//h2[contains(., 'Toxicity')]/following-sibling::div//table"))[[1]] - physprop <- html_table(xml_find_all(ttt, "//h2[contains(., 'Physical Prop')]/following-sibling::div//table"))[[1]] - physprop[ , 'Value'] <- as.numeric(physprop[ , 'Value']) - #= same as physprop + if(is.na(xml_find_first(ttt, "//h3[contains(., 'Name of Substance')]/following-sibling::div[1]//li"))){ + name <- NA + }else{ + name <- xml_text(xml_find_all(ttt, "//h3[contains(., 'Name of Substance')]/following-sibling::div[1]//li")) + } + + if(is.na(xml_find_first(ttt, "//h3[contains(., 'Synonyms')]/following-sibling::div[1]//li"))){ + synonyms <- NA + }else{ + synonyms <- xml_text(xml_find_all(ttt, "//h3[contains(., 'Synonyms')]/following-sibling::div[1]//li")) + } + + if(is.na(xml_find_first(ttt, "//h3[contains(., 'CAS Registry')]/following-sibling::ul[1]//li"))){ + cas <- NA + } else { + cas <- xml_text(xml_find_all(ttt, "//h3[contains(., 'CAS Registry')]/following-sibling::ul[1]//li")) + } + + if(is.na(xml_find_first(ttt, "//h3[contains(., 'InChI')]/following-sibling::text()[1]"))){ + inchi <- NA + } else { + inchi <- gsub('\\n|\\t', '', + xml_text(xml_find_all(ttt, "//h3[contains(., 'InChI')]/following-sibling::text()[1]"))[1] + ) + } + + if(is.na(xml_find_first(ttt, "//h3[contains(., 'InChIKey')]/following-sibling::text()[1]"))){ + inchikey <- NA + } else { + inchikey <- gsub('\\n|\\t|\\r', '', + xml_text(xml_find_all(ttt, "//h3[contains(., 'InChIKey')]/following-sibling::text()[1]")) + ) + } + + if(is.na(xml_find_first(ttt, "//h3[contains(., 'Smiles')]/following-sibling::text()[1]"))){ + smiles <- NA + } else { + smiles <- gsub('\\n|\\t|\\r', '', + xml_text(xml_find_all(ttt, "//h3[contains(., 'Smiles')]/following-sibling::text()[1]")) + ) + } + + if(is.na(xml_find_first(ttt, "//h2[contains(., 'Toxicity')]/following-sibling::div//table"))){ + toxicity <- NA + } else { + toxicity <- html_table(xml_find_all(ttt, "//h2[contains(., 'Toxicity')]/following-sibling::div//table"))[[1]] + } + + if(is.na(xml_find_first(ttt, "//h2[contains(., 'Physical Prop')]/following-sibling::div//table"))){ + physprop <- NA + } else { + physprop <- html_table(xml_find_all(ttt, "//h2[contains(., 'Physical Prop')]/following-sibling::div//table"))[[1]] + physprop[ , 'Value'] <- as.numeric(physprop[ , 'Value']) + #= same as physprop + } + out <- list(name = name, synonyms = synonyms, cas = cas, inchi = inchi, inchikey = inchikey, smiles = smiles, toxicity = toxicity,