-
Notifications
You must be signed in to change notification settings - Fork 6
/
classyfire_functions.py
223 lines (209 loc) · 12.5 KB
/
classyfire_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
import requests
import json
import time
import rdflib
from rdflib.namespace import RDF, VOID, DCTERMS, XSD, RDFS
import sys
import subprocess
import signal
import glob
import datetime
import gzip
import csv
import os
from urllib3.util import Retry
from urllib3 import PoolManager
# Exception type raised by the SIGALRM handler (alarm_handler) defined below
class TimeOutException(Exception):
    """Raised by alarm_handler when the alarm signal is received."""
def alarm_handler(signum, frame):
    """
    Signal handler that interrupts the running code by raising a TimeOutException.
    - signum: number of the received signal (not used)
    - frame: stack frame that was active when the signal arrived (not used)
    """
    raise TimeOutException
def classify_df(df_index, df, g_direct_parent, g_alternative_parent, path_direct_p, path_alternative_p, path_out):
    """
    Retrieve all ChemOnt classes associated to each molecule in df and export them as two gzipped Turtle files.
    As these processes are run in parallel, the size of each created graph is computed and returned by this function.
    - df_index: index of the processed table; output files are numbered with df_index + 1
    - df: table iterated row by row, read through its 'CID' and 'INCHIKEY' columns (both stripped)
    - g_direct_parent: graph receiving the direct-parent triples
    - g_alternative_parent: graph receiving the alternative-parent triples
    - path_direct_p: output directory of the direct-parent file
    - path_alternative_p: output directory of the alternative-parent file
    - path_out: directory where the log files are written
    Return a list of 4 values: nb. triples in direct_parent graph, nb. subjects in direct_parent graph, nb. triples in alternative_parent graph, nb. subjects in alternative_parent graph.
    The process exits with status 3 if a file cannot be compressed.
    """
    print("Treating df " + str(df_index))
    # One connection pool for the whole table; requests answered with 429/5xx are retried with a backoff
    retries = Retry(total=10, backoff_factor=0.1, connect=5, read=2, redirect=5, status_forcelist=[429, 500, 502, 503, 504])
    http = PoolManager(retries=retries, timeout=30)
    for _, row in df.iterrows():
        CID = row['CID'].strip()
        INCHIKEY = row['INCHIKEY'].strip()
        classif = get_entity_from_ClassyFire(CID, INCHIKEY, path_out, http)
        if not classif:
            continue
        chemont_ids = parse_entities(CID, classif, path_out)
        if not chemont_ids:
            continue
        add_triples(CID, chemont_ids, g_direct_parent, g_alternative_parent)
    print("Serialyze graphs")
    direct_parent_file = os.path.join(path_direct_p, "classyfire_direct_parent_" + str(df_index + 1) + ".ttl")
    alternative_parent_file = os.path.join(path_alternative_p, "classyfire_alternative_parent_" + str(df_index + 1) + ".ttl")
    g_direct_parent.serialize(destination=direct_parent_file, format='turtle')
    g_alternative_parent.serialize(destination=alternative_parent_file, format='turtle')
    # Compress files:
    try:
        for ttl_file in (direct_parent_file, alternative_parent_file):
            # Argument list without a shell: the path reaches gzip verbatim, so spaces or
            # shell metacharacters in the output directories cannot break or alter the command
            subprocess.run(["gzip", ttl_file], check=True, stderr=subprocess.PIPE)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a gzip executable that is missing or cannot be started
        print("Error while trying to compress files")
        print(e)
        gzip_stderr = getattr(e, "stderr", None)
        if gzip_stderr:
            print(gzip_stderr.decode(errors="replace"))
        sys.exit(3)
    return [
        len(g_direct_parent),
        len({str(s) for s in g_direct_parent.subjects()}),
        len(g_alternative_parent),
        len({str(s) for s in g_alternative_parent.subjects()}),
    ]
def get_entity_from_ClassyFire(CID, InchiKey, path_out, http):
    """
    Send a query to classyfire.wishartlab.com/entities/INCHIKEY.json to retrieve the classification of a compound, given its InchiKey.
    Return the classification parsed from JSON, or False if the request failed, timed out, or returned an unreadable or empty classification.
    Failures are reported in classyFire.log and classyFire_error_ids.log; compounds without classification are reported in ids_no_classify.log.
    - CID: PubChem compound identifier (used for logs)
    - InchiKey: input inchikey
    - path_out: directory where the log files are written
    - http: object used to send the request through http.request(method, url, headers=...); classify_df passes a urllib3 PoolManager
    """
    # Imported locally: the module itself only imports Retry and PoolManager from urllib3
    from urllib3.exceptions import HTTPError

    def report_failure(message):
        # Append one line to the error log and record the CID among the failed identifiers
        with open(os.path.join(path_out, "classyFire.log"), "a") as f_log:
            f_log.write("CID " + CID + " - " + message + "\n")
        with open(os.path.join(path_out, "classyFire_error_ids.log"), "a") as id_log:
            id_log.write(CID + "\n")

    signal.signal(signal.SIGALRM, alarm_handler)
    signal.alarm(60)
    # NOTE(review): presumably a pause to avoid flooding the ClassyFire server - confirm
    time.sleep(1)
    try:
        try:
            r = http.request("GET", 'http://classyfire.wishartlab.com/entities/%s.json' % (InchiKey), headers={"Content-Type": "application/json"})
        finally:
            # Disarm the alarm as soon as the request is over, so it cannot fire later
            # while the response is parsed or the logs are written
            signal.alarm(0)
    # Check timeout:
    except TimeOutException:
        print("Request timeout was reached (60s)!")
        report_failure("Request Timeout")
        return False
    # Check if there was an error while sending request. The request goes through urllib3,
    # whose errors (exhausted retries, connection failures, ...) derive from HTTPError
    except (HTTPError, requests.exceptions.RequestException) as e:
        print("Error while trying to retrieve classification for CID: " + CID + ", Check logs.")
        report_failure("HTTP response status codes: " + str(e))
        return False
    print(InchiKey, ":", r.status)
    # A body that is not valid JSON (an HTML error page for instance) is reported like any other failure
    try:
        classif = json.loads(r.data)
    except ValueError as e:
        print("Error while trying to retrieve classification for CID: " + CID + ", Check logs.")
        report_failure("Invalid JSON in response (HTTP status " + str(r.status) + "): " + str(e))
        return False
    # Test if the element is classified
    if not classif:
        with open(os.path.join(path_out, "ids_no_classify.log"), "a") as no_classif_log:
            no_classif_log.write(CID + "\t" + InchiKey + "\n")
        return False
    return classif
def parse_entities(CID, classif, path_out):
    """
    Parse a classification returned by ClassyFire and extract the direct parent and the alternative parents.
    Return the list of CHEMONTID numbers (the part after ':' in each 'chemont_id'). The first is always the direct_parent and the remaining are alternative parents.
    Return False if the classification does not have the expected structure; the error is then reported in classyFire.log and the CID in classyFire_error_ids.log.
    - CID: PubChem compound identifier (used for logs)
    - classif: the classification, read through its "direct_parent" and "alternative_parents" entries
    - path_out: directory where the log files are written
    """
    try:
        direct_parent = classif["direct_parent"]['chemont_id'].split(':')[1]
        alternative_parents = [alt_p['chemont_id'].split(':')[1] for alt_p in classif["alternative_parents"]]
    # Only the errors a malformed classification can produce: missing key, null or wrongly typed
    # entry, identifier without ':'. Anything else (KeyboardInterrupt included) is propagated
    except (KeyError, TypeError, IndexError, AttributeError) as e:
        print("Error while trying to parse response for CID: " + CID + ", Check logs.")
        with open(os.path.join(path_out, "classyFire.log"), "a") as f_log:
            f_log.write("CID " + CID + " - Error while parsing response: ")
            # repr() keeps both the exception type and its message
            f_log.write(repr(e) + "\n")
        with open(os.path.join(path_out, "classyFire_error_ids.log"), "a") as id_log:
            id_log.write(CID + "\n")
        return False
    return [direct_parent] + alternative_parents
def add_triples(CID, chemont_ids, g_direct_parent, g_alternative_parent):
    """
    Turn the ChemOnt classification of one compound into rdf:type triples.
    The first identifier (direct parent) is added to the direct-parent graph; every following identifier (alternative parents) is added, separately, to the alternative-parent graph.
    - CID: PubChem compound identifier, appended to the PubChem compound URI prefix
    - chemont_ids: list of CHEMONTID numbers, direct parent first
    - g_direct_parent: graph receiving the direct-parent triple
    - g_alternative_parent: graph receiving the alternative-parent triples
    """
    compound_uri = rdflib.URIRef("http://rdf.ncbi.nlm.nih.gov/pubchem/compound/CID" + CID)
    rdf_type = RDF["type"]
    chemont_prefix = "http://purl.obolibrary.org/obo/CHEMONTID_"
    g_direct_parent.add((compound_uri, rdf_type, rdflib.URIRef(chemont_prefix + chemont_ids[0])))
    # The slice is empty when only the direct parent is known, so nothing more is added then
    for alternative_id in chemont_ids[1:]:
        g_alternative_parent.add((compound_uri, rdf_type, rdflib.URIRef(chemont_prefix + alternative_id)))
def extract_CID_InchiKey(pmids_cids_graph_list, inchikeys_graph_list, path_out):
# Inti output file
with open(path_out, "w") as out:
out_writer = csv.writer(out, delimiter = ',')
m = out_writer.writerow(['CID', 'INCHIKEY'])
# Init variables
available_cids = set()
for pmid_cid_f_input in pmids_cids_graph_list:
# release memory
g_pmid_cid = None
# Import pmid_cid graph
print("Importing " + pmid_cid_f_input + " ...", end = '')
g_pmid_cid = rdflib.ConjunctiveGraph()
with gzip.open(pmid_cid_f_input, "rb") as f:
g_pmid_cid.parse(f, format = "turtle")
# Get all objects
extracted_objects = [uri.toPython().split('http://rdf.ncbi.nlm.nih.gov/pubchem/compound/CID')[1] for uri in list(g_pmid_cid.objects())]
available_cids = available_cids.union(extracted_objects)
print(" Ok")
# Then, we browse inchikey files to select CID - inchikey association for which the CID has an associated corpus
for inchikey_f_input in inchikeys_graph_list:
g_inchikey = None
g_inchikey = rdflib.Graph()
print("treating file " + inchikey_f_input + " ...", end = '')
# Add InchiKeys triples to the graph
with gzip.open(inchikey_f_input, "rb") as f:
g_inchikey.parse(f, format = "turtle")
# Get cid - inchikey associations
cids_inchikeys = list(g_inchikey.subject_objects(rdflib.URIRef("http://semanticscience.org/resource/SIO_000011")))
inchikeys = [cid_inchikey[0].toPython().split("http://rdf.ncbi.nlm.nih.gov/pubchem/inchikey/")[1] for cid_inchikey in cids_inchikeys]
cids = [cid_inchikey[1].toPython().split("http://rdf.ncbi.nlm.nih.gov/pubchem/compound/CID")[1] for cid_inchikey in cids_inchikeys]
with open(path_out, "a") as out:
out_writer = csv.writer(out, delimiter = ',')
for cid_index in range(0, len(cids)):
if cids[cid_index] in available_cids:
m = out_writer.writerow([cids[cid_index], inchikeys[cid_index]])
print(" Ok")
# Release all memory
g_inchikey = None
g_pmid_cid = None
print("End procedure CID - InchiKeys associations !")
def get_CID_InchiKeys(url, graph_from, out_file):
    """
    Placeholder: the three arguments are ignored and None is always returned.
    """
    return
def ask_for_graph(url, graph_uri):
    """
    Test whether a graph contains at least one triple, using a SPARQL ASK query (nothing is erased).
    Return True if the endpoint answers exactly "true", False for any other answer.
    The process exits with status 3 if the HTTP status of the response is not 200.
    - url: Virtuoso SPARQL endpoint url
    - graph_uri: the graph uri to be tested
    """
    ask_query = "ASK WHERE { GRAPH <" + graph_uri + "> { ?s ?p ?o } }"
    response = requests.post(
        url=url,
        headers={"Content-Type": "application/x-www-form-urlencoded", "Accept": "text/html"},
        data={"format": "html", "query": ask_query},
    )
    if response.status_code == 200:
        # The body of the answer is compared with the text "true"
        return response.text == "true"
    print("Error in request while trying to check if graph " + graph_uri + " exists.\nImpossible to continue, exit.\n")
    print(response.text)
    sys.exit(3)
def export_ressource_metadata(ClassyFire_direct_p, ClassyFire_alternative_p, graph_sizes, uri_targeted_ressources, path_direct_p, path_alternative_p):
    """
    Attach metadata to the two built resources and serialize each version graph in a void.ttl file.
    - ClassyFire_direct_p: direct-parent resource, used through add_version_attribute() and version_graph
    - ClassyFire_alternative_p: alternative-parent resource, used the same way
    - graph_sizes: sequence of 4-item entries: nb. triples and nb. subjects of a direct-parent graph, then nb. triples and nb. subjects of an alternative-parent graph (presumably the lists returned by classify_df - verify against caller); each position is summed over all entries
    - uri_targeted_ressources: values added to both resources as void:target
    - path_direct_p: output directory of the direct-parent void.ttl
    - path_alternative_p: output directory of the alternative-parent void.ttl
    """
    resources = (ClassyFire_direct_p, ClassyFire_alternative_p)

    def total(position):
        # Sum of one position over all entries of graph_sizes, as an xsd:long literal
        return rdflib.Literal(sum(sizes[position] for sizes in graph_sizes), datatype=XSD.long)

    # Attributes shared by both resources
    for resource in resources:
        resource.add_version_attribute(RDF["type"], VOID["Linkset"])
        resource.add_version_attribute(DCTERMS["source"], rdflib.URIRef("http://classyfire.wishartlab.com"))
        resource.add_version_attribute(RDFS["seeAlso"], rdflib.URIRef("https://doi.org/10.1186/s13321-016-0174-y"))
    for uri_targeted_ressource in uri_targeted_ressources:
        for resource in resources:
            resource.add_version_attribute(VOID["target"], uri_targeted_ressource)
    # Description and title of the direct-parent resource
    direct_description = "This subset contains RDF triples providing links between PubChem compounds and their class according to ChemOnt ontology from ClassyFire. The provided class correspond to the Direct Parent, representing the dominant class in the molecule"
    ClassyFire_direct_p.add_version_attribute(DCTERMS["description"], rdflib.Literal(direct_description))
    ClassyFire_direct_p.add_version_attribute(DCTERMS["title"], rdflib.Literal("ChemOnt Classification - Direct parent"))
    # Description and title of the second resource (alternative parents)
    alternative_description = "This subset contains RDF triples providing links between PubChem compounds and their classes according to ChemOnt ontology from ClassyFire. The provided classes correspond to the Alternative Parents, representing classes describing the molecule but which not have an ancestor–descendant relationship with each other or with the Direct Parent"
    ClassyFire_alternative_p.add_version_attribute(DCTERMS["description"], rdflib.Literal(alternative_description))
    ClassyFire_alternative_p.add_version_attribute(DCTERMS["title"], rdflib.Literal("ChemOnt Classification - Alternative parents"))
    # Add the size statistics, then export the metadata graphs
    print("Export version graph with metadata ... ", end='')
    ClassyFire_direct_p.add_version_attribute(VOID["triples"], total(0))
    ClassyFire_direct_p.add_version_attribute(VOID["distinctSubjects"], total(1))
    ClassyFire_alternative_p.add_version_attribute(VOID["triples"], total(2))
    ClassyFire_alternative_p.add_version_attribute(VOID["distinctSubjects"], total(3))
    for resource, out_dir in zip(resources, (path_direct_p, path_alternative_p)):
        resource.version_graph.serialize(destination=os.path.join(out_dir, "void.ttl"), format='turtle')
    print("Ok")