In [1]:
from artem import *

In [2]:
def artem(m, n):
    """Score the superposition seeded by reference residue m and query residue n.

    The query is first superimposed with the transform obtained from the
    primary coordinates of the seed pair (``r_prim[m]``, ``q_prim[n]``).
    Mutually nearest residues within ``matchrange`` are then matched, the
    transform is refitted on the secondary coordinates of the matched pairs,
    and the RMSD is measured on their evaluation coordinates.

    Reads the module-level names r_prim, q_prim, q_avg, r_avg_tree,
    r_scnd, q_scnd, r_eval, q_eval, matchrange and the size / rmsd /
    rmsdsize limits, so they must be defined before the call (and before a
    worker pool is forked).

    Parameters:
        m: index into the reference per-residue sequences.
        n: index into the query per-residue sequences.

    Returns:
        ``(neighbors, rmsd)`` where ``neighbors`` is a sorted tuple of the
        matched pairs, or ``None`` when there is no match or the match
        violates one of the size / rmsd / rmsdsize limits.
    """
    prim_transform = get_transform(r_prim[m], q_prim[n])

    # Distances between reference residues and the transformed query residues,
    # limited to pairs closer than matchrange.
    q_avg_tree = KDTree(apply_transform(q_avg, prim_transform))
    dist = r_avg_tree.sparse_distance_matrix(
        q_avg_tree,
        matchrange,
        p=2,
        output_type='ndarray'
    )

    neighbors = mutual_nearest_neighbors(dist)
    size = len(neighbors)
    # An empty match cannot be refitted or normalised by size (rmsd / 0),
    # even when sizemin <= 0 would let it through the range check.
    if size == 0:
        return None
    if not sizemin <= size <= sizemax:
        return None

    # Refit the transform on the secondary coordinates of the matched pairs.
    X, Y = vstack([[r_scnd[i], q_scnd[j]] for i, j in neighbors])
    scnd_transform = get_transform(X, Y)

    # Evaluate the refitted transform on the evaluation coordinates.
    X, Y = vstack([[r_eval[i], q_eval[j]] for i, j in neighbors])

    rmsd = RMSD(X, apply_transform(Y, scnd_transform))
    if not rmsdmin <= rmsd <= rmsdmax:
        return None

    rmsdsize = rmsd / size
    if not rmsdsizemin <= rmsdsize <= rmsdsizemax:
        return None

    # A sorted tuple is hashable and order-independent, so identical matches
    # reached from different seeds can be merged later.
    neighbors = tuple(sorted(neighbors))
    return (neighbors, rmsd)

In [3]:
# Command-line style arguments used for this notebook run.
# Every entry is `key=value`; keys must match the names read by the
# option-parsing cell (e.g. rmsdsizemin / rmsdsizemax — a bare `rmsdsize`
# key is not read anywhere and would be silently ignored).
argv = [
    'r=data/1xjr.cif',
    'q=data/1xjr.pdb',
    'rmsdmax=2',
    'rmsdsizemax=0.2',
    'threads=-1',
    'saveto=results'
]

In [4]:
# Print the README and stop when the first argument asks for help;
# otherwise turn the `key=value` arguments into the kwargs dict.
if argv and argv[0] in {'--H', '-H', '--h', '-h', '--help', '-help'}:
    with open('README.txt', 'r') as rdme:
        # Print the text verbatim; print(*rdme) would put a space in front
        # of every line after the first.
        print(rdme.read())
    exit()
else:
    # Split on the first '=' only so values may themselves contain '='.
    kwargs = dict(arg.split('=', 1) for arg in argv)

In [5]:
# Resolve run options: values given in kwargs override the defaults that are
# already bound to the same names.

threads = int(kwargs.get('threads', threads))
if threads != 1:
    # ARTEM multiprocessing is available only for UNIX-like systems.
    # force=True keeps this cell re-runnable in the same kernel; without it
    # a second set_start_method call raises RuntimeError.
    mp.set_start_method('fork', force=True)
    if threads <= 0:
        threads = mp.cpu_count()
    else:
        threads = min(threads, mp.cpu_count())

# Reference structure options
r       = kwargs.get('r')
rres    = kwargs.get('rres', rres)
rresneg = kwargs.get('rresneg', rresneg)
rseed   = kwargs.get('rseed', rseed)
rformat = kwargs.get('rformat', None)
# basename + splitext copes with file names that have no dot or several
# dots, where split('.') could not be unpacked into exactly two parts.
rname, rext = os.path.splitext(os.path.basename(r))
rext = rext[1:].upper()
if not rformat:
    # Fall back to the file extension, and to PDB when it is not a known format.
    if rext in pdb.formats:
        rformat = rext
    else:
        rformat = 'PDB'

# Query structure options
q       = kwargs.get('q')
qres    = kwargs.get('qres',    qres)
qresneg = kwargs.get('qresneg', qresneg)
qseed   = kwargs.get('qseed',   qseed)
qformat = kwargs.get('qformat', None)
qname, qext = os.path.splitext(os.path.basename(q))
qext = qext[1:].upper()
if not qformat:
    if qext in pdb.formats:
        qformat = qext
    else:
        qformat = 'PDB'

# Filtering limits applied to each candidate match
sizemin     = float(kwargs.get('sizemin', sizemin))
sizemax     = float(kwargs.get('sizemax', sizemax))

rmsdmin     = float(kwargs.get('rmsdmin', rmsdmin))
rmsdmax     = float(kwargs.get('rmsdmax', rmsdmax))

rmsdsizemin = float(kwargs.get('rmsdsizemin', rmsdsizemin))
rmsdsizemax = float(kwargs.get('rmsdsizemax', rmsdsizemax))

# Maximum distance used when matching residues between the two structures
matchrange  = float(kwargs.get('matchrange', matchrange))

In [6]:
# Model preprocessing: reference structure

rstruct = pdb.parser(r, rformat, rname)
rstruct.drop_duplicates_alt_id(keep=keep)

# A non-empty rresneg takes precedence over rres and is passed on together
# with the rneg flag.
rneg = bool(rresneg)
_r_sel_key = 'rresneg' if rneg else 'rres'
_r_sel     = rresneg if rneg else rres

rresstuct = rstruct.get_res_substruct(_r_sel, rneg)
rdata, rnoise = describe(rresstuct)
if not rdata:
    raise Exception(
        'No {}={} nucleotides in the {} for seed'.format(_r_sel_key, _r_sel, r)
    )

# Unpack the per-residue records column-wise; r_avg becomes one stacked array.
r_code, r_prim, r_avg, r_scnd, r_eval = zip(*rdata)
r_avg = np.vstack(r_avg)

rseed_code = set(rresstuct.get_res_code(rseed))
if not rseed_code:
    raise Exception(
        'No rseed={} nucleotides in the {}={} for seed {}'.format(
            rseed, _r_sel_key, _r_sel, r
        )
    )

# Split the seed residues into those listed in rnoise and those that have a
# usable description in r_code.
rseed_npc  = set(rnoise) & rseed_code
rseed_code = set(r_code) & rseed_code

# Positions (in r_code order) of the residues that may act as seeds.
r_ind = [pos for pos in range(len(r_code)) if r_code[pos] in rseed_code]


# Model preprocessing: query structure

qstruct = pdb.parser(q, qformat, qname)
qstruct.drop_duplicates_alt_id(keep=keep)

# A non-empty qresneg takes precedence over qres and is passed on together
# with the qneg flag.
qneg = bool(qresneg)
_q_sel_key = 'qresneg' if qneg else 'qres'
_q_sel     = qresneg if qneg else qres

qresstuct = qstruct.get_res_substruct(_q_sel, qneg)
qrres, qures = describe(qresstuct)
if not qrres:
    raise Exception(
        'No {}={} nucleotides in the {} for seed'.format(_q_sel_key, _q_sel, q)
    )

# Unpack the per-residue records column-wise; q_avg becomes one stacked array.
q_code, q_prim, q_avg, q_scnd, q_eval = zip(*qrres)
q_avg = np.vstack(q_avg)

qseed_code = set(qresstuct.get_res_code(qseed))
if not qseed_code:
    raise Exception(
        'No qseed={} nucleotides in the {}={} for seed {}'.format(
            qseed, _q_sel_key, _q_sel, q
        )
    )

# Split the seed residues into those listed in qures and those that have a
# usable description in q_code.
qseed_npc  = set(qures) & qseed_code
qseed_code = set(q_code) & qseed_code

# Positions (in q_code order) of the residues that may act as seeds.
q_ind = [pos for pos in range(len(q_code)) if q_code[pos] in qseed_code]
q_count = len(q_code)

In [7]:
# Preparing a saved structure
saveto = kwargs.get('saveto', saveto)
if saveto:
    os.makedirs(saveto, exist_ok=True)
    saveres    = kwargs.get('saveres', saveres)
    # The output format defaults to the query format.
    saveformat = kwargs.get('saveformat', qformat).upper()

    if saveformat not in pdb.formats:
        # MMCIF is accepted as an alias of CIF; anything else is rejected.
        if saveformat != 'MMCIF':
            msg = '''Invalid saveformat value
            \rAcceptable values for saveformat are PDB, CIF or MMCIF'''
            raise TypeError(msg)
        saveformat = 'CIF'

    # Save either the explicitly requested residues or the query selection.
    sstruct = qstruct.get_res_substruct(saveres) if saveres else qresstuct

In [8]:
# Debug view: display the atom table of the selected query substructure.
qresstuct.tab

Unnamed: 0,group_PDB,id,auth_atom_id,label_alt_id,auth_comp_id,auth_asym_id,auth_seq_id,pdbx_PDB_ins_code,Cartn_x,Cartn_y,Cartn_z,occupancy,B_iso_or_equiv,type_symbol,pdbx_formal_charge,pdbx_PDB_model_num
0,HETATM,1,PG,,GTP,A,1,,68.682,36.983,39.043,1.0,101.62,P,,1
1,HETATM,2,O1G,,GTP,A,1,,68.400,38.469,38.944,1.0,101.65,O,,1
2,HETATM,3,O2G,,GTP,A,1,,68.127,36.367,37.772,1.0,102.06,O,,1
3,HETATM,4,O3G,,GTP,A,1,,67.953,36.397,40.238,1.0,102.53,O,,1
4,HETATM,5,O3B,,GTP,A,1,,70.282,36.691,39.242,1.0,100.71,O,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1032,HETATM,1034,O,,HOH,A,208,,48.024,35.550,49.378,1.0,69.46,O,,1
1033,HETATM,1035,O,,HOH,A,209,,47.819,32.214,46.865,1.0,69.42,O,,1
1034,HETATM,1036,O,,HOH,A,210,,46.309,34.859,47.093,1.0,68.29,O,,1
1035,HETATM,1037,O,,HOH,A,211,,49.513,33.043,49.027,1.0,67.87,O,,1


In [9]:
# ARTEM Computations

# Every (reference seed, query seed) index pair is tried once; the position
# of a pair in indx_pairs is also the position of its outcome in result.
indx_pairs = list(itertools.product(r_ind, q_ind))
# Must exist before artem() is called (and before workers are forked),
# because artem() reads it as a module-level name.
r_avg_tree = KDTree(r_avg)
if threads == 1:
    result = [artem(m, n) for m, n in indx_pairs]
else:
    # Submit the pairs in batches of 15 per worker.
    delta  = 15 * threads
    result = []
    # The context manager shuts the workers down once all batches are done;
    # the original never closed the pool. `pool` stays bound afterwards.
    with mp.Pool(threads) as pool:
        for cnt in range(0, len(indx_pairs), delta):
            result.extend(
                pool.starmap(artem, indx_pairs[cnt:cnt + delta])
            )

In [10]:
# Merge results that produced the same set of matched pairs.
# rows maps neighbors -> [best rmsd, best seed id, other seed ids...],
# where a seed id is the position of the seed pair in result / indx_pairs.
rows = {}
for i, rslt in enumerate(result):
    if not rslt:
        continue
    nb, rmsd = rslt

    if nb in rows:
        row = rows[nb]
        if rmsd < row[0]:
            # New best: update the rmsd and put this seed first.
            row[0] = rmsd
            row.insert(1, i)
        else:
            row.append(i)
    else:
        rows[nb] = [rmsd, i]
rows = [[i, *v] for i, v in rows.items()]

tabrows = []
if sizemin <= 0:
    # Zero-size entries for seed pairs in which at least one residue comes
    # from the rseed_npc / qseed_npc sets.
    # NOTE(review): pairs with both residues in the npc sets are produced by
    # both loops and so appear twice — confirm whether that is intended.
    for pair in itertools.product(rseed_npc, qseed_npc | qseed_code):
        tabrows.append((None, 0, None, None, '='.join(pair), None))
    for pair in itertools.product(rseed_npc | rseed_code, qseed_npc):
        tabrows.append((None, 0, None, None, '='.join(pair), None))

for i, row in enumerate(rows):
    nb   = row[0]
    size = len(nb)
    rmsd = row[1]
    rmsdsize = rmsd / size
    seed_id  = row[2:]

    # A seed id indexes indx_pairs (the product of r_ind and q_ind), so the
    # residue indices are looked up there. Decoding it as
    # (s // q_count, s % q_count) is only right when every residue is a seed.
    prim = ','.join(
        [
            '='.join([r_code[m], q_code[n]])
            for m, n in (indx_pairs[s] for s in seed_id)
        ]
    )

    scnd = ','.join(
        [
            '='.join([r_code[m], q_code[n]])
            for m, n in nb
        ]
    )

    tabrows.append((nb, size, rmsd, rmsdsize, prim, scnd))

In [11]:
# Results table: ordered by ascending SIZE, then descending RMSDSIZE,
# and numbered from 1 in that order under the index name 'ID'.
columns = ['neighbors', 'SIZE', 'RMSD', 'RMSDSIZE', 'PRIM', 'SCND']
tab = pd.DataFrame(tabrows, columns=columns).sort_values(
    by=['SIZE', 'RMSDSIZE'],
    ascending=[True, False],
)
tab.index = pd.Index(list(range(1, len(tab) + 1)), name='ID')

In [14]:
# Write one superimposed structure per row of the results table.
# NOTE: save_superimpose must already be defined when this cell runs (and
# before the pool is created, since forked workers copy the current state).
if saveto:
    # Materialise the rows once so both branches iterate the same Series.
    superimposes = [tab.iloc[k] for k in range(len(tab))]
    # Decide on `threads` rather than on whether a `pool` name happens to be
    # left in the namespace, which depends on earlier kernel state.
    if threads != 1:
        # The context manager shuts the workers down after the map finishes.
        with mp.Pool(threads) as pool:
            pool.map(save_superimpose, superimposes)
    else:
        for superimpose in superimposes:
            save_superimpose(superimpose)

Process ForkPoolWorker-21:
Process ForkPoolWorker-23:
Process ForkPoolWorker-20:
Process ForkPoolWorker-19:
Process ForkPoolWorker-22:
Process ForkPoolWorker-17:
Process ForkPoolWorker-24:
Process ForkPoolWorker-18:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/mult

In [13]:
def save_superimpose(superimpose:'pd.Series') -> 'None':
    """Save the structure superimposed according to one results-table row.

    The transform is refitted from the secondary coordinates of the matched
    pairs in ``superimpose['neighbors']``, applied to the module-level
    ``sstruct``, and the renamed result is written to ``saveto`` in
    ``saveformat``.

    Parameters:
        superimpose: a row of the results table; only its 'neighbors' entry
            and its ``name`` (the row ID) are read.

    Rows without matched pairs (the zero-size rows carry ``neighbors=None``)
    have no transform to fit and are skipped instead of raising TypeError.
    """
    neighbors = superimpose['neighbors']
    if neighbors is None or len(neighbors) == 0:
        return

    X, Y = vstack([[r_scnd[i], q_scnd[j]] for i, j in neighbors])
    scnd_transform = get_transform(X, Y)
    struct = sstruct.apply_transform(scnd_transform)
    # NOTE(review): this formats the sstruct object itself, so the file name
    # depends on its string form — confirm it yields the structure name.
    struct.rename('{}_{}'.format(sstruct, superimpose.name))
    struct.saveto(saveto, saveformat)

In [21]:
# Scratch check: a DataFrame that has columns but no rows reports .empty as True.
pd.DataFrame([], columns=[1, 2 ]).empty

True

In [22]:
# Scratch: parse data/1xjr.cif as CIF into a structure object `s`.
s = pdb.parser('data/1xjr.cif', 'CIF')

In [25]:
# Scratch: work on a copy of the parsed table.
# NOTE(review): this rebinds `tab`, which the results cell also uses for the
# ARTEM results table; re-running the saving cell afterwards would see this frame.
tab = s.tab.copy()

In [28]:
# Relabel every row of the copy as model number 2.
tab['pdbx_PDB_model_num']=2

In [31]:
# Stack the original rows and the relabelled copy into one table
# (row labels are kept as-is, so they repeat).
tab2 = pd.concat([s.tab, tab])

In [32]:
# Replace the structure's table with the stacked one.
s.tab = tab2

In [33]:
# Give the modified structure a new name.
s.rename('1xjr_2mod')

In [34]:
# Save the renamed structure under data/ (no format passed here).
s.saveto('data')

In [40]:
# Residue codes for the '#1' selection with the second argument set to True.
# NOTE(review): presumably True negates the selection — the codes displayed
# for `ss` all start with '2.'; confirm against get_res_code.
ss = s.get_res_code('#1', True)

In [42]:
# Display the selected residue codes.
ss

['2.A.GTP.1.',
 '2.A.G.2.',
 '2.A.A.3.',
 '2.A.G.4.',
 '2.A.U.5.',
 '2.A.U.6.',
 '2.A.C.7.',
 '2.A.A.8.',
 '2.A.C.9.',
 '2.A.C.10.',
 '2.A.G.11.',
 '2.A.A.12.',
 '2.A.G.13.',
 '2.A.G.14.',
 '2.A.C.15.',
 '2.A.C.16.',
 '2.A.A.17.',
 '2.A.C.18.',
 '2.A.G.19.',
 '2.A.C.20.',
 '2.A.G.21.',
 '2.A.G.22.',
 '2.A.A.23.',
 '2.A.G.24.',
 '2.A.U.25.',
 '2.A.A.26.',
 '2.A.C.27.',
 '2.A.G.28.',
 '2.A.A.29.',
 '2.A.U.30.',
 '2.A.C.31.',
 '2.A.G.32.',
 '2.A.A.33.',
 '2.A.G.34.',
 '2.A.G.35.',
 '2.A.G.36.',
 '2.A.U.37.',
 '2.A.A.38.',
 '2.A.C.39.',
 '2.A.A.40.',
 '2.A.G.41.',
 '2.A.U.42.',
 '2.A.G.43.',
 '2.A.A.44.',
 '2.A.A.45.',
 '2.A.U.46.',
 '2.A.U.47.',
 '2.A.MG.101.',
 '2.A.MG.201.',
 '2.A.HOH.202.',
 '2.A.HOH.203.',
 '2.A.HOH.204.',
 '2.A.HOH.205.',
 '2.A.HOH.206.',
 '2.A.HOH.207.',
 '2.A.HOH.208.',
 '2.A.HOH.209.',
 '2.A.HOH.210.',
 '2.A.HOH.211.',
 '2.A.HOH.212.']

python3 artem.py -h --h -H --H -help --help

python3 artem.py r=data/1xjr.cif q=data/1xjr.cif
python3 artem.py r=data/1xjr.cif q=data/1xjr.pdb
python3 artem.py r=data/1xjr.pdb q=data/1xjr.pdb
python3 artem.py r=data/1xjr.pdb q=data/1xjr.cif

python3 artem.py r=data/1xjr.pdb q=data/1xjr.pdb rformat=pdb
    python3 artem.py r=data/1xjr.cif q=data/1xjr.pdb rformat=pdb 
python3 artem.py r=data/1xjr.cif q=data/1xjr.pdb qformat=pdb 
python3 artem.py r=data/1xjr.cif q=data/1xjr.pdb qformat=cif 

python3 artem.py r=data/1xjr.cif q=data/1xjr.cif rres=#
python3 artem.py r=data/1xjr.cif q=data/1xjr.cif rres=#1
python3 artem.py r=data/1xjr.cif q=data/1xjr.cif rres=#2
python3 artem.py r=data/1xjr.cif q=data/1xjr.cif rres=
python3 artem.py r=data/1xjr.cif q=data/1xjr.cif rres=/
python3 artem.py r=data/1xjr.cif q=data/1xjr.cif rres=fadsf
python3 artem.py r=data/1xjr.cif q=data/1xjr.cif rres=:
python3 artem.py r=data/1xjr.cif q=data/1xjr.cif rres=:_4

    python3 artem.py r=data/1xjr.cif q=data/1xjr.cif rseed=_2