forked from GarmanGroup/RIDL
-
Notifications
You must be signed in to change notification settings - Fork 0
/
getPDBSeries.py
334 lines (291 loc) · 13.9 KB
/
getPDBSeries.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
import os.path
import shutil
class getSeries():
# retrieve pdb codes from pdb or pdb_redo
def __init__(self,
PDBcode='', pdb_redo='initial',
outputDir=''):
# PDBcode is 4 letter pdb code. pdb_redo takes 'false',
# 'initial' (= from pdb_redo without refinement),
# or 'final' (= from pdb_redo with refinement)
self.datasetName = str(PDBcode).lower()
self.outputDir = outputDir + self.datasetName
self.pdbFile = self.datasetName+'.pdb'
self.mmcifFile = self.datasetName+'-sf.cif'
self.mtzFile = self.datasetName+'.mtz'
# make directory for this dataset
if not os.path.exists(self.outputDir):
os.makedirs(self.outputDir)
if pdb_redo in ('initial', 'final'):
self.refineType = pdb_redo
# downloading pdb and mtz files from pdb_redo
self.downloadSuccess = self.downloadFromPDBredo()
if not self.downloadSuccess:
print 'Cannot continue processing current PDB code files..'
return
self.parseRfactorFromPDB(self.pdbFile)
# copy input files to directory
for fileName in (self.mtzFile, self.pdbFile):
shutil.move('{}'.format(fileName),
'{}/{}'.format(self.outputDir, fileName))
elif pdb_redo == 'false':
# downloading pdb and mmcif files from PDB
print 'Downloading from PDB...'
self.downloadSuccess = self.downloadFromPDB()
if not self.downloadSuccess:
print 'Cannot continue processing current PDB code files..'
return
# copy input files to directory
for fileName in (self.mmcifFile, self.pdbFile):
shutil.move(fileName, '{}/{}'.format(self.outputDir, fileName))
print 'running cif2mtz...'
self.runCif2MTZ()
vars = [self.outputDir, self.refmacPDBout]
# run rigid body refinement refmac jobs
print 'running rigid body refmac... 0 cycles'
self.runREFMAC('RIGID', 0)
self.parseRfactorFromPDB('{}/{}'.format(*vars))
doExtraRefinement = False
if doExtraRefinement:
print 'running rigid body refmac... 10 cycles'
self.runREFMAC('RIGID', 10)
self.parseRfactorFromPDB('{}/{}'.format(*vars))
# run restrained refinement refmac jobs
print 'running restrained refmac... 0 cycles'
self.runREFMAC('REST', 0)
self.parseRfactorFromPDB('{}/{}'.format(*vars))
print 'running restrained refmac... 10 cycles'
self.runREFMAC('REST', 10)
self.parseRfactorFromPDB('{}/{}'.format(*vars))
# compare Rfactors to those in downloaded PDB header
print 'values stated in downloaded .pdb header'
self.parseRfactorFromPDB('{}/{}'.format(self.outputDir,
self.pdbFile))
else:
print 'pdb_redo parameter takes one of values ' +\
' ("false","initial","final") only'
def downloadFromPDB(self):
# download the coordinate pdb file and mmcif
# structure factor files from PDB
loc = 'https://files.rcsb.org/download/'
os.system('curl "{}{}.pdb" -k -o "{}"'.format(loc,
self.datasetName.upper(),
self.pdbFile))
os.system('curl "{}{}" -k -o "{}"'.format(loc, self.mmcifFile,
self.mmcifFile))
# check if files downloaded
if (not self.checkFileExists('{}'.format(self.pdbFile)) or
not self.checkFileExists('{}'.format(self.mmcifFile))):
print 'One or more files failed to download successfully from PDB'
return False
else:
return True
def downloadFromPDBredo(self):
# download the coordinate pdb file and mtz file
# generated by pdb_redo to initially attempt
# to recalculate R(-free) values stated in header
# of pdb file from PDB (this structure is
# after 0 cycles of rigid body refinement
# and pdb curation)
pdb = self.datasetName
if self.refineType == 'final':
refType = 'final'
fileFmt = ''
else:
refType = '0cyc'
fileFmt = '.gz'
for fileType in ('mtz', 'pdb'):
print 'Downloading {} file..'.format(fileType)
loc = 'www.cmbi.ru.nl/pdb_redo/'
args = [loc, pdb[1:3], pdb, pdb, refType, fileType,
fileFmt, pdb, fileType, fileFmt]
os.system('curl "{}{}/{}/{}_{}.{}{}" -o "{}.{}{}"'.format(*args))
# check if file downloaded successfully
args = [pdb, fileType, fileFmt]
if not self.checkFileExists('{}.{}{}'.format(*args)):
print 'Failed to download {} file '.format(fileType) +\
'successfully from PDB'
return False
if self.refineType == 'initial':
os.system('gunzip {}.{}.gz -f'.format(pdb, fileType))
# want to change label names in mtz file to make unique to pdb code
self.changePDBredoMtzLabelInfo(pdb)
try:
shutil.move(pdb+'_editedLabels.mtz', pdb+'.mtz')
except IOError:
print 'Error moving file to correct location'
return False
return True
def changePDBredoMtzLabelInfo(self, pdbcode):
# default mtz column labels are FP and SIGFP. Wish to append
# pdb code to these to make them unique (important later on
# for CAD merging of mtz files)
self.jobName = 'sftools'
# run sftools from comman line
self.commandInput1 = 'sftools'
self.commandInput2 = 'mode batch\n' +\
'read {}.mtz mtz\n'.format(pdbcode) +\
'set types col "FP"\n' +\
'F\n' +\
'set labels col "FP"\n' +\
'"FP_{}"\n'.format(pdbcode) +\
'set types col "SIGFP"\n' +\
'Q\n' +\
'set labels col "SIGFP"\n' +\
'"SIGFP_{}"\n'.format(pdbcode) +\
'set types col "PHIC_ALL"\n' +\
'P\n' +\
'set labels col "PHIC_ALL"\n' +\
'"PHIC_{}"\n'.format(pdbcode) +\
'set types col "FOM"\n' +\
'W\n' +\
'set labels col "FOM"\n' +\
'"FOM_{}"\n'.format(pdbcode) +\
'set types col "FREE"\n' +\
'I\n' +\
'set labels col "FREE"\n' +\
'"FreeR_flag"\n'.format(pdbcode) +\
'write {}_editedLabels.mtz mtz\n'.format(pdbcode) +\
'EXIT\n' +\
'YES'
self.outputLogfile = 'sftoolslogfile.txt'
# run sftools job
self.runCCP4program()
def runCif2MTZ(self):
# run cif2mtz job to convert mmcif structure factor file to mmcif file
self.jobName = 'cif2mtz'
# check if input mmcif file exist
args = [self.outputDir, self.mmcifFile]
if not self.checkFileExists('{}/{}'.format(*args)):
print 'could not find correct cif file to convert to mtz'
return
# run Cif2Mtz from command line
args2 = [self.outputDir, self.mtzFile]
self.commandInput1 = 'cif2mtz ' +\
'HKLIN {}/{} '.format(*args) +\
'HKLOUT {}/{} '.format(*args2)
self.commandInput2 = 'END'
self.outputLogfile = 'Cif2Mtzlogfile.txt'
# run Cif2Mtz job
self.runCCP4program()
def runREFMAC(self, refineType, numCycles):
# run n = 'numCycles' cycles of 'refineType'
# refinement in refmac to get phases
self.jobName = 'refmac'
if refineType == 'RIGID':
bref = 'over'
numCycString = 'rigid ncycle {}'.format(numCycles)
elif refineType == 'REST':
bref = 'ISOT'
numCycString = 'ncyc {}'.format(numCycles)
else:
print 'Unreadable refinement type.. selecting ' +\
'0 cycles of rigid body refinement'
bref = 'over'
numCycString = 'rigid ncycle 0'
refineType = 'RIGID'
numCycles = 0
# make a refinement type id to append to file names from current job
fileInd = '_{}_{}cycles'.format(refineType, numCycles)
# check if input files exist
args1 = [self.outputDir, self.pdbFile]
args2 = [self.outputDir, self.mtzFile]
if (not self.checkFileExists('{}/{}'.format(*args1)) or
not self.checkFileExists('{}/{}'.format(*args2))):
return
args3 = [self.datasetName, fileInd]
self.refmacPDBout = '{}_refmac{}.pdb'.format(*args3)
self.refmacMTZout = '{}_refmac{}.mtz'.format(*args3)
self.refmacLIBout = '{}_refmac{}.cif'.format(*args3)
args4 = [self.outputDir, self.refmacPDBout]
args5 = [self.outputDir, self.refmacMTZout]
args6 = [self.outputDir, self.refmacLIBout]
self.commandInput1 = 'refmac5 ' +\
'XYZIN {}/{} '.format(*args1) +\
'XYZOUT {}/{} '.format(*args4) +\
'HKLIN {}/{} '.format(*args2) +\
'HKLOUT {}/{} '.format(*args5) +\
'LIBOUT {}/{} '.format(*args6)
self.commandInput2 = 'make check NONE\n' +\
'make -\n' +\
' hydrogen ALL -\n' +\
' hout NO -\n' +\
' peptide NO -\n' +\
' cispeptide YES -\n' +\
' ssbridge YES -\n' +\
' symmetry YES -\n' +\
' sugar YES -\n' +\
' connectivity NO -\n' +\
' link NO\n' +\
'refi -\n' +\
' type {} -\n'.format(refineType) +\
' resi MLKF -\n' +\
' meth CGMAT -\n' +\
' bref {}\n'.format(bref) +\
'{}\n'.format(numCycString) +\
'scal -\n' +\
' type SIMP -\n' +\
' LSSC -\n' +\
' ANISO -\n' +\
' EXPE\n' +\
'solvent YES\n' +\
'weight -\n' +\
' AUTO\n' +\
'monitor MEDIUM -\n' +\
' torsion 10.0 -\n' +\
' distance 10.0 -\n' +\
' angle 10.0 -\n' +\
' plane 10.0 -\n' +\
' chiral 10.0 -\n' +\
' bfactor 10.0 -\n' +\
' bsphere 10.0 -\n' +\
' rbond 10.0 -\n' +\
' ncsr 10.0\n' +\
'labin FP=FP SIGFP=SIGFP -\n' +\
' FREE=FREE\n' +\
'labout FC=FC FWT=FWT PHIC=PHIC ' +\
'PHWT=PHWT DELFWT=DELFWT PHDELWT=PHDELWT ' +\
'FOM=FOM\n' +\
'PNAME {}\n'.format(self.datasetName) +\
'DNAME 1\n' +\
'RSIZE 80\n' +\
'EXTERNAL WEIGHT SCALE 10.0\n' +\
'EXTERNAL USE MAIN\n' +\
'EXTERNAL DMAX 4.2\n' +\
'END'
self.outputLogfile = 'REFMAClogfile{}.txt'.format(fileInd)
# run REFMAC job
self.runCCP4program()
def runCCP4program(self):
# generic method to run a ccp4 program on command line
# write commandInput2 to txt file
textinput = open('{}/{}inputfile.txt'.format(
self.outputDir, self.jobName), 'w')
textinput.write(self.commandInput2)
textinput.close()
# run ccp4 program job
os.system('{} < {}/{}inputfile.txt > {}/{}'.format(
self.commandInput1, self.outputDir, self.jobName,
self.outputDir, self.outputLogfile))
def checkFileExists(self, filename):
# method to check if file exists
if not os.path.isfile(filename):
print 'File {} not found'.format(filename)
return False
else:
return True
def parseRfactorFromPDB(self, fileName):
# read a pdb file and print the Rwork and Rfree values out
RvalFlag = 'R VALUE (WORKING + TEST SET) :'
RfreeFlag = 'FREE R VALUE :'
fileOpen = open(fileName, 'r')
Rvalue = 'N/A'
Rfree = 'N/A'
for line in fileOpen.readlines():
if RvalFlag in line:
Rvalue = float(line.split(RvalFlag)[1])
elif RfreeFlag in line:
Rfree = float(line.split(RfreeFlag)[1])
print '---------------------------------------------'
print 'Rfactor: {}\nRfree: {}'.format(Rvalue, Rfree)
print '---------------------------------------------'