Skip to content

Translating Between Chemical Identifiers

Dmitry Grapov edited this page Jul 17, 2013 · 12 revisions

Translating between chemical identifiers using the Chemical Translation Service (CTS) and the Chemical Identifier Resolver (CIR):

Install necessary packages

# Install and load the background packages.
# devtools supplies install_github(). Each CRAN package is installed only when
# it is not already available, so re-running this script does not reinstall it.
if (!requireNamespace("devtools", quietly = TRUE)) install.packages("devtools")
if (!requireNamespace("RJSONIO", quietly = TRUE)) install.packages("RJSONIO")
library(devtools)
library(RJSONIO)

# Install the translation packages from GitHub.
# The repository is given as "owner/repo": the separate `username` argument
# was deprecated in devtools and is not accepted by current versions.

# The Chemical Translation Service (CTS)
install_github("dgrapov/CTSgetR")
library(CTSgetR)

# The Chemical Identifier Resolver (CIR)
install_github("dgrapov/CIRgetR")
library(CIRgetR)

Define starting identifier

Here we use the InChIKey, a hashed form of the universal molecular identifier InChI.

# Example InChIKeys used throughout this walkthrough
id <- c(
  "ZKHQWZAMYRWXGA-KQYNXXCUSA-N",
  "BAWFJGJZGIEFAR-NNYOXOHSSA-O",
  "QNAYBMKLOCPYGJ-REOHCLBHSA-N"
)

# Write the keys to a .csv in the current working directory (see getwd())
# to simulate the usual case of starting from a file on disk
write.csv(
  data.frame(InchiKey = id),
  file = "InchIKeys.csv",
  row.names = FALSE
)

# Read the keys back in; `id` is now a one-column data frame named InchiKey
id <- read.csv(file = "InchIKeys.csv", header = TRUE)

Goal: translate from InChIKeys to ChemSpider IDs

Use Chemical Identifier Resolver (CIR)

# Query CIR for the ChemSpider ID of every key in `id`
results <- CIRgetR(id = id, to = "chemspider_id", return.all = FALSE)

Use the Chemical Translation Service (CTS)

# Same translation through CTS, run serially (parallel = FALSE)
results2 <- CTSgetR(
  id = id,
  from = "InChIKey",
  to = "ChemSpider",
  parallel = FALSE
)

Compare the two results

# Flag entries where the two services disagree: a position is a mismatch when
# the CTS value does not occur anywhere in the CIR results, or the CIR value
# does not occur anywhere in the CTS results.
miss.match <- !as.matrix(results2) %in% as.matrix(results) |
  !as.matrix(results) %in% as.matrix(results2)
paste(sum(miss.match), "difference(s) between results", sep = " ")
# Show the disagreeing pairs side by side
data.frame(CIR = results[, 1], CTS = results2[, 1])[miss.match, ] # two different records for both are Alanine

# CTS (but not CIR) can be used to generate InChI key/code from identifier,
# so translate the disputed CIR ChemSpider IDs back to InChIKeys and check
# them against the keys we started from.
# Only done when there is at least one mismatch: with none, the comparison
# below would be empty and `if` cannot take a zero-length condition.
if (any(miss.match)) {
  CSid <- results[miss.match, ] # convert CIR ChemSpider Id to InChIKey
  results3 <- CTSgetR(CSid, from = "ChemSpider", to = "InChIKey", parallel = FALSE)

  # compare keys
  # all() collapses the element-wise comparison to one value (a vector
  # condition is an error in `if`); isTRUE() treats an NA result as no match.
  if (isTRUE(all(as.matrix(results3) == as.matrix(id[miss.match, , drop = FALSE])))) {
    cat("codes match!", "\n")
  } else {
    cat("codes DO NOT match!", "\n")
  }
}

Here is a more advanced example for translating from one ID to many

Translate from InChIKey to all options available in CIR: "smiles", "names", "iupac_name", "cas", "inchi", "stdinchi", "inchikey", "stdinchikey", "ficts", "ficus", "uuuuu", "image", "file", "mw", "monoisotopic_mass", "chemspider_id", "pubchem_sid", "chemnavigator_sid", "formula"

# Output representations to request from CIR. Each option appears once:
# a repeated entry would issue the same query twice and yield a duplicated
# column name in the final data frame.
CIR.options <- c(
  "smiles", "names", "iupac_name", "cas", "inchi", "stdinchi", "inchikey",
  "stdinchikey", "ficts", "ficus", "uuuuu", "image", "file", "mw",
  "monoisotopic_mass", "chemspider_id", "pubchem_sid", "chemnavigator_sid",
  "formula"
)

# Translate `id` into every option in turn, printing the option name as a
# progress indicator. seq_along() (rather than 1:length()) keeps the loop
# empty when there are no options.
all.results.CIR <- sapply(seq_along(CIR.options), function(i) {
  cat(CIR.options[i], "\n")
  CIRgetR(id = id, to = CIR.options[i], return.all = FALSE)
})
# Label each result with the option that produced it
# NOTE(review): assumes sapply() yields one element per option -- confirm
# against the return type of CIRgetR()
names(all.results.CIR) <- CIR.options
all.results.CIR <- data.frame(all.results.CIR) # one column per option

Translate to all possible options from CTS (the code below starts from the ChemSpider IDs obtained from CTS above):

"Chemical Name","InChIKey","InChI Code","PubChem CID","Pubchem SID","ChemDB","ZINC","Southern Research Institute","Specs","MolPort","ASINEX","ChemBank","MLSMR","Emory University Molecular Libraries Screening Center","ChemSpider","DiscoveryGate","Ambinter","Vitas-M Laboratory","ChemBlock"

# Fetch the list of identifier types CTS can translate to.
# Note: this variable deliberately keeps the same name as the CTS.options()
# function because later steps refer to it as `CTS.options`.
CTS.options <- CTS.options()
CTS.options # see options

# From here on `id` holds the ChemSpider IDs returned by CTS (results2),
# replacing the InChIKeys it held before.
id <- results2

# Translate the ChemSpider IDs into every CTS option in turn, printing the
# option name as a progress indicator. seq_along() (rather than 1:length())
# keeps the loop empty when there are no options.
all.results.CTS <- sapply(seq_along(CTS.options), function(i) {
  cat(CTS.options[i], "\n")
  CTSgetR(id = id, to = CTS.options[i], from = "ChemSpider")
})
# Label each result with the option that produced it
names(all.results.CTS) <- CTS.options
all.results.CTS <- data.frame(all.results.CTS) # one column per option

Compare the two results and save the one with lowest % error

# Error rate per service: failed translations as a percentage of all
# translations requested (number of ids x number of options).
# CIR marks a failure with its 404 page text, CTS with the string "error".
CIR.error <- round(
  sum(unlist(all.results.CIR) == "<h1>Page not found (404)</h1>") /
    length(unlist(id)) / length(CIR.options) * 100,
  digits = 0
)
CTS.error <- round(
  sum(unlist(all.results.CTS) == "error") /
    length(unlist(id)) / length(CTS.options) * 100,
  digits = 0
)
data.frame(CIR.error = CIR.error, CTS.error = CTS.error)

# Pick the name of the result set with the smaller error rate
# ([1] keeps the first entry on a tie)
best <- c("all.results.CIR", "all.results.CTS")[
  which.min(c(CIR.error, CTS.error))[1]
]
cat("Best results: ", best, "\n")

# Look the winning object up by name and write it to a .csv
write.csv(get(best), file = "best translation.csv")

Get image for query

Get image URL from InChIKey using CIR.

# Get a structure image for a single query using CIR
# Step 1: resolve the InChIKey to an image URL
id <- "ZKHQWZAMYRWXGA-KQYNXXCUSA-N"
image.url <- as.character(unlist(CIRgetR(id, to = "image", return.all = FALSE)))

# Step 2: save the image locally. mode = "wb" forces a binary transfer;
# the default text mode can corrupt the GIF on Windows.
download.file(image.url, "image.gif", mode = "wb")

# Step 3: read and plot the saved file (caTools is installed only if missing).
# Reading the local copy avoids fetching the same URL a second time.
if (!requireNamespace("caTools", quietly = TRUE)) install.packages("caTools")
library(caTools)
gif <- read.gif("image.gif", verbose = TRUE, flip = TRUE)
par(pin = c(2, 2)) # change this for multiple gif results
image(gif$image, col = gif$col, main = gif$comment, frame.plot = FALSE, xaxt = "n", yaxt = "n")

example image