-
Notifications
You must be signed in to change notification settings - Fork 63
/
get_pdb_assembly.py
118 lines (106 loc) · 3.53 KB
/
get_pdb_assembly.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import os, sys
import glob
import json
from tqdm import tqdm
from multiprocessing import Pool
import requests
import time
input_dir = sys.argv[1]
output_dir = sys.argv[2]
input_files = glob.glob(input_dir + "*.cif") + glob.glob(input_dir + "*.cif.gz")
os.system("mkdir -p " + output_dir)
pdb_chain_mapper = json.load(open(sys.argv[3]))
rot_keys = [
"matrix11",
"matrix12",
"matrix13",
"matrix21",
"matrix22",
"matrix23",
"matrix31",
"matrix32",
"matrix33",
]
trans_keys = ["vector1", "vector2", "vector3"]
def get_oper(cont):
cont = cont["pdbx_struct_oper_list"]
ret = {}
for c in cont:
id = c["id"]
rot = []
trans = []
for k in rot_keys:
rot.append(c[k])
for k in trans_keys:
trans.append(c[k])
rot, trans = tuple(rot), tuple(trans)
if rot == (1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0) and trans == (
0.0,
0.0,
0.0,
):
ret[id] = "I"
else:
ret[id] = (rot, trans)
ret["I"] = "I"
return ret
# refer to https://data.rcsb.org/redoc/index.html
def get_pdb_meta_info(mmcif_path):
name = os.path.split(mmcif_path)[-1].split(".")[0]
url = f"https://data.rcsb.org/rest/v1/core/assembly/{name}/1"
max_try_time = 10
for i in range(max_try_time):
try:
out_path = os.path.join(output_dir, name + ".json")
load = False
if os.path.isfile(out_path):
cont = json.load(open(out_path, "r"))
load = True
else:
r = requests.get(url)
cont = json.loads(r.text)
json.dump(cont, open(out_path, "w"))
load = r.ok
if load:
if "rcsb_struct_symmetry" not in cont:
break
cur_mapper = pdb_chain_mapper[name]["to_auth_id"]
for tt in cont["rcsb_struct_symmetry"]:
if tt["kind"] == "Global Symmetry":
symbol = tt["symbol"]
stoi = tt["stoichiometry"]
all_opers = get_oper(cont)
chains = []
opers = []
for c in tt["clusters"]:
for m in c["members"]:
chain_id = cur_mapper[m["asym_id"]]
if "pdbx_struct_oper_list_ids" in m:
for op_idx in m["pdbx_struct_oper_list_ids"]:
chains.append(chain_id)
opers.append(all_opers[op_idx])
else:
chains.append(chain_id)
opers.append("I")
return name, {
"symbol": symbol,
"stoi": stoi,
"chains": chains,
"opers": opers,
}
break
elif cont["status"] == "404":
break
except Exception as e:
print(name, e)
time.sleep(2)
return name, None
file_cnt = len(input_files)
meta_dict = {}
# failed = []
with Pool(64) as pool:
for ret in tqdm(pool.imap(get_pdb_meta_info, input_files), total=file_cnt):
name, res = ret
if res:
meta_dict[name] = res
json.dump(meta_dict, open("pdb_assembly.json", "w"), indent=2)