In [None]:
from artem import *

: 

In [None]:
print(1)

: 

In [None]:
def artem(m, n):
    prim_transform  = get_transform(r_prim[m], q_prim[n])
    
    q_avg_tree = KDTree(apply_transform(q_avg, prim_transform))
    dist = r_avg_tree.sparse_distance_matrix(
        q_avg_tree,
        matchrange,
        p=2,
        output_type='ndarray'
    )

    neighbors = mutual_nearest_neighbors(dist)
    size = len(neighbors)
    if not sizemin <= size <= sizemax:
        return None
    
    X, Y = vstack([[r_scnd[i], q_scnd[j]] for i, j in neighbors])
    scnd_transform = get_transform(X, Y)
    
    X, Y = vstack([[r_eval[i], q_eval[j]] for i, j in neighbors])
    
    rmsd = RMSD(X, apply_transform(Y, scnd_transform))
    if not rmsdmin <= rmsd <= rmsdmax:
        return None
    
    rmsdsize = rmsd / size
    if not rmsdsizemin <= rmsdsize <= rmsdsizemax:
        return None
    
    
    neighbors = sorted([i*q_count + j for i, j in neighbors])
    neighbors.append(round(rmsd, 3))
    return tuple(neighbors)

: 

In [96]:
argv = [
    'r=data/7o7y.cif',
    'q=data/7o7y.cif',
    'threads=-1',
    'saveto=results'
]

In [97]:
if argv[0] in {'--H', '-H', '--h', '-h', '--help', '-help'}:
    with open('README.md', 'r') as rdme:
        print(*rdme)
    exit()
else:
    kwargs = dict([arg.split('=') for arg in argv])

In [98]:
# Processing inputs

threads = int(kwargs.get('threads', threads))
if threads != 1:
    mp.set_start_method('fork')     # ARTEM multiprocessing is available only for UNIX-like systems
    if threads <= 0:
        threads = mp.cpu_count()
    else:
        threads = min(threads, mp.cpu_count())

r       = kwargs.get('r')
rres    = kwargs.get('rres', rres)

rresneg = kwargs.get('rresneg', rresneg)
rneg    = bool(rresneg)
rseed   = kwargs.get('rseed', '#')

rformat = kwargs.get('rformat', None)
rname, rext = r.split(os.sep)[-1].split('.')
rext = rext.upper()
if not rformat:
    if rext in pdb.formats:
        rformat = rext
    else:
        rformat = 'PDB'
else:
    rformat = rformat.upper()

q       = kwargs.get('q')
qres    = kwargs.get('qres', qres)
qresneg = kwargs.get('qresneg', qresneg)
qneg    = bool(qresneg)
qseed   = kwargs.get('qseed', '#')
qformat = kwargs.get('qformat', None)
qname, qext = q.split(os.sep)[-1].split('.')
qext = qext.upper()
if not qformat:
    if qext in pdb.formats:
        qformat = qext
    else:
        qformat = 'PDB'
else:
    qformat = qformat.upper()

sizemin     = float(kwargs.get('sizemin', sizemin))
sizemax     = float(kwargs.get('sizemax', sizemax))

rmsdmin     = float(kwargs.get('rmsdmin', rmsdmin))
rmsdmax     = float(kwargs.get('rmsdmax', rmsdmax))

rmsdsizemin = float(kwargs.get('rmsdsizemin', rmsdsizemin))
rmsdsizemax = float(kwargs.get('rmsdsizemax', rmsdsizemax))

matchrange  = float(kwargs.get('matchrange', matchrange))

RuntimeError: context has already been set

In [99]:
# Model preprocessing

rstruct  = pdb.parser(r, rformat, rname)
rstruct.drop_duplicates_alt_id(keep=keep)

rresstruct = rstruct.get_res_substruct(
    (rres, rresneg)[rneg],
    rneg
)

rdata, rnoise = describe(rresstruct)
if not rdata:
    msg = 'No {}={} nucleotides in the r={} for rseed={}'.format(
        ('rres', 'rresneg')[rneg],
        (rres, rresneg)[rneg],
        r,
        rseed
    )
    raise Exception(msg)
else:
    r_code, r_prim, r_avg, r_scnd, r_eval = zip(*rdata)
    r_avg = np.vstack(r_avg)

rseed_code = set(rstruct.get_res_code(rseed))
if not rseed_code:
    msg = 'No rseed={} nucleotides in the {}={} for r={}'.format(
        rseed,
        ('rres', 'rresneg')[rneg],
        (rres, rresneg)[rneg],
        r
    )
    raise Exception(msg)
else:
    rseed_npc  = set(rnoise) & rseed_code
    rseed_code = set(r_code) & rseed_code

r_ind = [i for i, code in enumerate(r_code) if code in rseed_code]


qstruct  = pdb.parser(q, qformat, qname)
qstruct.drop_duplicates_alt_id(keep=keep)

qneg = bool(qresneg)
qresstruct = qstruct.get_res_substruct(
    (qres, qresneg)[qneg],
    qneg
)
qrres, qures = describe(qresstruct)
if not qrres:
    msg = 'No {}={} nucleotides in the q={} for qseed={}'.format(
        ('qres', 'qresneg')[qneg],
        (qres, qresneg)[qneg],
        q,
        qseed
    )
    raise Exception(msg)
else:
    q_code, q_prim, q_avg, q_scnd, q_eval = zip(*qrres)
    q_avg = np.vstack(q_avg)

qseed_code = set(qstruct.get_res_code(qseed))
if not qseed_code:
    msg = 'No qseed={} nucleotides in the {}={} for q={}'.format(
        qseed,
        ('qres', 'qresneg')[qneg],
        (qres, qresneg)[qneg],
        q
    )
    raise Exception(msg)
else:
    qseed_npc  = set(qures)  & qseed_code
    qseed_code = set(q_code) & qseed_code

q_ind = [i for i, code in enumerate(q_code) if code in qseed_code]
q_count = len(q_code)

2826

In [7]:
# Preparing a saved structure
saveto = kwargs.get('saveto', saveto)
if saveto:
    os.makedirs(saveto, exist_ok=True)
    saveres    = kwargs.get('saveres', saveres)
    saveformat = kwargs.get('saveformat', qformat).upper()
    
    if saveformat not in pdb.formats:
        if saveformat == 'MMCIF':
            saveformat = 'CIF'
        else:
            msg = '''Invalid saveformat value
            \rAcceptable values for saveformat are PDB, CIF or MMCIF'''
            raise TypeError(msg)
    
    if saveres:
        ssmask = qstruct.get_res_mask(saveres)
    else:
        ssmask = qresstruct.tab['id'].astype(bool)

In [100]:
indx_pairs = list(itertools.product(r_ind, q_ind))

In [103]:
max([i*j for i, j in indx_pairs])

7980625

In [39]:
# ARTEM Computations 


result = {}
indx_pairs = list(itertools.product(r_ind, q_ind))
r_avg_tree = KDTree(r_avg)
if threads == 1:
    i = 0 
    for out in (artem(m, n) for m, n in indx_pairs):
        if out:
            if out in result:
                result[out].append(i)
            else:
                result[out] = [i]
        i += 1
else:
    pool = mp.Pool(threads)
    
    delta   = 15 * threads
    cnt     = 0
    cnt_max = len(indx_pairs)
    while cnt < cnt_max:
        i = cnt 
        for out in pool.starmap(artem, indx_pairs[cnt:cnt + delta]):
            if out:
                if out in result:
                    result[out].append(i)
                else:
                    result[out] = [i]
            i += 1 
        cnt += delta


In [40]:
tabrows = []
if sizemin <= 0:
    for pair in itertools.product(rseed_npc, qseed_npc | qseed_code):
        tabrows.append((None, 0, None, None, '='.join(pair), None))
    for pair in itertools.product(rseed_npc | rseed_code, qseed_npc):
        tabrows.append((None, 0, None, None, '='.join(pair), None))

In [41]:
items = result.items()
del result
for k, v in items:
    size = len(k) - 1
    rmsd = k[-1]
    rmsdsize = rmsd / size

    prim = ','.join(
        [
            '='.join([r_code[s // q_count], q_code[s % q_count]])
            for s in v
        ]
    )
    scnd = ','.join(
        [
            '='.join([r_code[s // q_count], q_code[s % q_count]])
            for s in k[:-1]
        ]
    )
    
    tabrows.append((k[:-1], size, rmsd, rmsdsize, prim, scnd))

In [54]:
a = 2
b = 1023423
c = 2.
d = 1023423.234
a.__sizeof__(), b.__sizeof__(), c.__sizeof__(), d.__sizeof__()

(28, 28, 24, 24)

In [89]:
tuple([*[i for i in range(10000, 100000)]])

(10000,
 10001,
 10002,
 10003,
 10004,
 10005,
 10006,
 10007,
 10008,
 10009,
 10010,
 10011,
 10012,
 10013,
 10014,
 10015,
 10016,
 10017,
 10018,
 10019,
 10020,
 10021,
 10022,
 10023,
 10024,
 10025,
 10026,
 10027,
 10028,
 10029,
 10030,
 10031,
 10032,
 10033,
 10034,
 10035,
 10036,
 10037,
 10038,
 10039,
 10040,
 10041,
 10042,
 10043,
 10044,
 10045,
 10046,
 10047,
 10048,
 10049,
 10050,
 10051,
 10052,
 10053,
 10054,
 10055,
 10056,
 10057,
 10058,
 10059,
 10060,
 10061,
 10062,
 10063,
 10064,
 10065,
 10066,
 10067,
 10068,
 10069,
 10070,
 10071,
 10072,
 10073,
 10074,
 10075,
 10076,
 10077,
 10078,
 10079,
 10080,
 10081,
 10082,
 10083,
 10084,
 10085,
 10086,
 10087,
 10088,
 10089,
 10090,
 10091,
 10092,
 10093,
 10094,
 10095,
 10096,
 10097,
 10098,
 10099,
 10100,
 10101,
 10102,
 10103,
 10104,
 10105,
 10106,
 10107,
 10108,
 10109,
 10110,
 10111,
 10112,
 10113,
 10114,
 10115,
 10116,
 10117,
 10118,
 10119,
 10120,
 10121,
 10122,
 10123,
 10124,


In [112]:
'7980625'.__sizeof__()

56

In [114]:
int(7980625).__sizeof__()

28

In [153]:
sys.getsizeof(tuple([*[i for i in range(7980625-200, 7980625)], 2.432]))

1648

In [154]:
sys.getsizeof(' '.join([str(i) for i in range(7980625-200, 7980625)]) + ' ' + str(2.432))

1654

In [165]:
k = '24:31:20 01/03/2022 127.0.0.1 54321.hlm out of memory and rewrite done.brhl.'
s = k[30:]
# if i != len(a) - 1:
s = s[: len(s) - 1]
s2 = s.split(" ")
for j in range(len(s2)):
    if s2[j].islower() and s2[j][0] != ".":
        s3 = s2[j].split(".hlm")
        s4 = s2[j].split(".brhl")
        if len(s3) == 2:
            if s3[1] == "." or s3[1] == "":
                if s3[1] == ".":
                    print(s2[j][: len(s2[j]) - 1] + '\n', end='')
                else:
                    print(s2[j] + '\n',end='')
        if len(s3) == 1:
            if s3[0] != s2[j]:
                print(s2[j] + '\n',end='')
        if len(s4) == 2:
            if s4[1] == "." or s4[1] == "":
                if s4[1] == ".":
                    print(s2[j][: len(s2[j]) - 1] + '\n',end='')
                else:
                    print(s2[j] + '\n',end='')
        if len(s4) == 1:
            if s4[0] != s2[j]:
                print(s2[j] + '\n',end='')

54321.hlm
done.brhl


In [169]:
f = open("input.txt")
a = f.readlines()
b = open("output.txt", "w")
for i in range(len(a)):
    k = a[i]
    s = k[30:]
    if i != len(a) - 1:
        s = s[: len(s) - 1]
    s2 = s.split(" ")
    for j in range(len(s2)):
        if s2[j].islower() and s2[j][0] != ".":
            s3 = s2[j].split(".hlm")
            s4 = s2[j].split(".brhl")
            if len(s3) == 2:
                if s3[1] == "." or s3[1] == "":
                    if s3[1] == ".":
                        b.write(s2[j][: len(s2[j]) - 1] + '\n')
                    else:
                        b.write(s2[j] + '\n')
            if len(s3) == 1:
                if s3[0] != s2[j]:
                    b.write(s2[j] + '\n')
            if len(s4) == 2:
                if s4[1] == "." or s4[1] == "":
                    if s4[1] == ".":
                        b.write(s2[j][: len(s2[j]) - 1] + '\n')
                    else:
                        b.write(s2[j] + '\n')
            if len(s4) == 1:
                if s4[0] != s2[j]:
                    b.write(s2[j] + '\n')
b.close()

In [170]:
with open('d.txt', 'r') as file:
    lns = file.readlines()

In [171]:
lns

['sfdfsd\n', 'sdfsdfsdf\n', 'fsdfsdfsdf\n']

In [19]:
from ast import pattern


'string'.endswith(pattern)
'string'.startswith(pattern)
'string'.islower()

ValueError: invalid literal for int() with base 2: '2147483647'

In [14]:
(28-24)/ 28

0.14285714285714285

In [23]:
columns = ['neighbors', 'SIZE', 'RMSD', 'RMSDSIZE', 'PRIM', 'SCND']
tab = pd.DataFrame(tabrows, columns=columns)
tab.sort_values(
    ['SIZE', 'RMSDSIZE'], 
    ascending=[True, False], 
    inplace=True
)
tab.index = list(range(1, len(tab) + 1))
tab.index.name = 'ID'

In [24]:
tab

Unnamed: 0_level_0,neighbors,SIZE,RMSD,RMSDSIZE,PRIM,SCND
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,"(1082,)",1,0.907,0.907000,1.A.U.25.=1.A.A.26.,1.A.U.25.=1.A.A.26.
2,"(1085,)",1,0.907,0.907000,1.A.U.25.=1.A.A.29.,1.A.U.25.=1.A.A.29.
3,"(1096,)",1,0.907,0.907000,1.A.U.25.=1.A.A.40.,1.A.U.25.=1.A.A.40.
4,"(1127,)",1,0.907,0.907000,1.A.A.26.=1.A.U.25.,1.A.A.26.=1.A.U.25.
5,"(1265,)",1,0.907,0.907000,1.A.A.29.=1.A.U.25.,1.A.A.29.=1.A.U.25.
...,...,...,...,...,...,...
1955,"(1, 48, 95, 142, 189, 236, 283, 330, 424, 471,...",24,2.236,0.093167,1.A.A.3.=1.A.G.4.,"1.A.G.2.=1.A.A.3.,1.A.A.3.=1.A.G.4.,1.A.G.4.=1..."
1956,"(46, 93, 140, 187, 234, 281, 328, 375, 469, 51...",24,2.236,0.093167,1.A.G.4.=1.A.A.3.,"1.A.A.3.=1.A.G.2.,1.A.G.4.=1.A.A.3.,1.A.U.5.=1..."
1957,"(46, 93, 140, 187, 234, 281, 469, 563, 610, 65...",26,2.347,0.090269,1.A.U.46.=1.A.U.47.,"1.A.A.3.=1.A.G.2.,1.A.G.4.=1.A.A.3.,1.A.U.5.=1..."
1958,"(1, 48, 95, 142, 189, 236, 424, 518, 565, 612,...",26,2.347,0.090269,1.A.U.47.=1.A.U.46.,"1.A.G.2.=1.A.A.3.,1.A.A.3.=1.A.G.4.,1.A.G.4.=1..."


AttributeError: module 'os' has no attribute '__version__'

In [2]:
_5_atomic_representation = {
    # Purine
    'A':   ('P', "C1' C2' O2' C3' O3' C4' O4' C5' O5'", 'N9', 'C2', 'C6'),
    'G':   ('P', "C1' C2' O2' C3' O3' C4' O4' C5' O5'", 'N9', 'C2', 'C6'),
    
    '1MA': ('P', "C1' C2' O2' C3' O3' C4' O4' C5' O5'", 'N9', 'C2', 'C6'),
    '2MG': ('P', "C1' C2' O2' C3' O3' C4' O4' C5' O5'", 'N9', 'C2', 'C6'),
    '6MZ': ('P', "C1' C2' O2' C3' O3' C4' O4' C5' O5'", 'N9', 'C2', 'C6'),
    '7MG': ('P', "C1' C2' O2' C3' O3' C4' O4' C5' O5'", 'N9', 'C2', 'C6'),
    'A2M': ('P', "C1' C2' O2' C3' O3' C4' O4' C5' O5'", 'N9', 'C2', 'C6'),
    'MA6': ('P', "C1' C2' O2' C3' O3' C4' O4' C5' O5'", 'N9', 'C2', 'C6'),
    'OMG': ('P', "C1' C2' O2' C3' O3' C4' O4' C5' O5'", 'N9', 'C2', 'C6'),
    'YYG': ('P', "C1' C2' O2' C3' O3' C4' O4' C5' O5'", 'N9', 'C2', 'C6'),
    'SAM': ('P', "C1' C2' O2' C3' O3' C4' O4' C5' O5'", 'N9', 'C2', 'C6'),
    
    # Pyrimidine
    'C':   ('P', "C1' C2' O2' C3' O3' C4' O4' C5' O5'", 'N1', 'C2', 'C4'),
    'U':   ('P', "C1' C2' O2' C3' O3' C4' O4' C5' O5'", 'N1', 'C2', 'C4'),
    
    '4AC': ('P', "C1' C2' O2' C3' O3' C4' O4' C5' O5'", 'N1', 'C2', 'C4'),
    '4SU': ('P', "C1' C2' O2' C3' O3' C4' O4' C5' O5'", 'N1', 'C2', 'C4'),
    '5MC': ('P', "C1' C2' O2' C3' O3' C4' O4' C5' O5'", 'N1', 'C2', 'C4'),
    '5MU': ('P', "C1' C2' O2' C3' O3' C4' O4' C5' O5'", 'N1', 'C2', 'C4'),
    'LV2': ('P', "C1' C2' O2' C3' O3' C4' O4' C5' O5'", 'N1', 'C2', 'C4'),
    'OMC': ('P', "C1' C2' O2' C3' O3' C4' O4' C5' O5'", 'N1', 'C2', 'C4'),
    'OMU': ('P', "C1' C2' O2' C3' O3' C4' O4' C5' O5'", 'N1', 'C2', 'C4'),
    'SSU': ('P', "C1' C2' O2' C3' O3' C4' O4' C5' O5'", 'N1', 'C2', 'C4'),
    'UR3': ('P', "C1' C2' O2' C3' O3' C4' O4' C5' O5'", 'N1', 'C2', 'C4'),
    
    # Pyrimidine, C-glycosidic bond
    'PSU': ('P', "C1' C2' O2' C3' O3' C4' O4' C5' O5'", 'C5', 'C4', 'C2'),
    'B8N': ('P', "C1' C2' O2' C3' O3' C4' O4' C5' O5'", 'C5', 'C4', 'C2'),
    '3TD': ('P', "C1' C2' O2' C3' O3' C4' O4' C5' O5'", 'C5', 'C4', 'C2'),
    'UY1': ('P', "C1' C2' O2' C3' O3' C4' O4' C5' O5'", 'C5', 'C4', 'C2'),
}


_3_atomic_representation = {
    # Purine
    'A':   ('N9', 'C2', 'C6'),
    'G':   ('N9', 'C2', 'C6'),
    
    '1MA': ('N9', 'C2', 'C6'),
    '2MG': ('N9', 'C2', 'C6'),
    '6MZ': ('N9', 'C2', 'C6'),
    '7MG': ('N9', 'C2', 'C6'),
    'A2M': ('N9', 'C2', 'C6'),
    'MA6': ('N9', 'C2', 'C6'),
    'OMG': ('N9', 'C2', 'C6'),
    'YYG': ('N9', 'C2', 'C6'),
    'SAM': ('N9', 'C2', 'C6'),
    
    # Pyrimidine
    'C':   ('N1', 'C2', 'C4'),
    'U':   ('N1', 'C2', 'C4'),
    
    '4AC': ('N1', 'C2', 'C4'),
    '4SU': ('N1', 'C2', 'C4'),
    '5MC': ('N1', 'C2', 'C4'),
    '5MU': ('N1', 'C2', 'C4'),
    'LV2': ('N1', 'C2', 'C4'),
    'OMC': ('N1', 'C2', 'C4'),
    'OMU': ('N1', 'C2', 'C4'),
    'SSU': ('N1', 'C2', 'C4'),
    'UR3': ('N1', 'C2', 'C4'),
    
    # Pyrimidine, C-glycosidic bond
    'PSU': ('C5', 'C4', 'C2'),
    'B8N': ('C5', 'C4', 'C2'),
    '3TD': ('C5', 'C4', 'C2'),
    'UY1': ('C5', 'C4', 'C2'),
}


seed_atomic_representations = (
    _5_atomic_representation,   # For primary alignment
    _5_atomic_representation,   # To calculate centers of mass
    _3_atomic_representation,   # For secondary alignment
    _3_atomic_representation,   # To calculate the RMSD
)

In [3]:
prepared = []
for atomic_representation in seed_atomic_representations:
    if atomic_representation in prepared:
        continue
    
    for res, atoms in atomic_representation.items():
        atomic_representation[res] = [pd.Index(atom.split()) for atom in atoms]
    
    prepared.append(atomic_representation)

In [7]:
any((1, 2 ,3, 0))

True

In [9]:
pool = 13423

In [10]:
%%timeit
'pool' in dir()

165 ns ± 0.6 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)


In [11]:
%%timeit
'pool' in globals()

25.2 ns ± 0.143 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)


In [7]:
for atomic_representation in pd.unique(seed_atomic_representations):
    for res, atoms in atomic_representation.items():
        atomic_representation[res] = [pd.Index(val.split()) for val in atoms]

TypeError: unhashable type: 'dict'

In [1]:
from lib.nar import *