In [2]:
import os, pickle, time, gzip, sys
sys.path.append('..')
from src.feature import composition_to_feature
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score
from pymatgen.core import Element
from src.utils import composit_parser, MetalElements, LigandElements, ActiveElements

# Load the collection of unique precursor records from the gzip-compressed pickle.
# NOTE(review): pickle can run arbitrary code while loading; only open trusted files.
with gzip.open('../data/unique_precursor.pkl.gz', mode='rb') as fh:
    unique_precursor = pickle.load(fh)

# from VAE_test

## classification

In [60]:
# Score every classification run found under `root`.
# record[model] collects one row per run directory:
#   [valid micro-F1, valid macro-F1, valid reaction accuracy,
#    test micro-F1,  test macro-F1,  test reaction accuracy]
# Reaction accuracy = share of reactions whose precursor labels are all
# predicted correctly, restricted to reactions whose most frequent metal
# element occurs in exactly one precursor.
record = {}
root = '/home/jhyang/WORKSPACES/MODELS/isyn/VAE_test/cls'
for run_dir in os.listdir(root):
    path = os.path.join(root, run_dir)
    if len(os.listdir(path)) < 2:
        # Fewer than two entries in the run directory: skip it.
        continue
    model = run_dir.split('_')[0]
    record.setdefault(model, []).append([])
    for ds in ['valid', 'test']:
        # NOTE(review): pickle executes code on load; only read trusted outputs.
        with open(os.path.join(path, f'output.{ds}.pkl'), 'rb') as fh:
            output = pickle.load(fh)
        pred, label = output['pred'].argmax(1), output['label']
        f1i = f1_score(label, pred, average='micro')
        f1a = f1_score(label, pred, average='macro')
        record[model][-1].extend([f1i, f1a])
        rxn = {}
        if model != 'fcnn':
            # Here each info entry carries a list of precursor compositions; the
            # prediction/label rows are walked in the same order via offset i.
            # (assumes rows are stored reaction by reaction -- TODO confirm)
            i = 0
            for info in output['info']:
                n_prec = len(info['precursor_comp'])
                metals = {}
                for precursor_comp in info['precursor_comp']:
                    for ele in precursor_comp.keys():
                        if ele in MetalElements:
                            metals[ele] = metals.get(ele, 0) + 1
                rxn[info['id']] = [
                    [label[i + j] for j in range(n_prec)],
                    [pred[i + j] for j in range(n_prec)],
                    # default=0: a reaction without any metal precursor is
                    # excluded below instead of raising on an empty max().
                    max(metals.values(), default=0),
                ]
                i += n_prec
            result = [np.sum(np.array(l) != np.array(p)) == 0 for l, p, m in rxn.values() if m == 1]
        else:
            # fcnn outputs have one info entry per precursor; group them by reaction id.
            for info, l, p in zip(output['info'], label, pred):
                rxn_id = info['id']
                if rxn_id not in rxn:
                    rxn[rxn_id] = [[], [], {}]
                rxn[rxn_id][0].append(l)
                rxn[rxn_id][1].append(p)
                metals = rxn[rxn_id][2]
                for ele in info['precursor_comp'].keys():
                    if ele in MetalElements:
                        metals[ele] = metals.get(ele, 0) + 1
            result = [
                np.sum(np.array(l) != np.array(p)) == 0
                for l, p, m in rxn.values()
                if max(m.values(), default=0) == 1
            ]
        record[model][-1].append(np.mean(result))

In [61]:
#record = pickle.load(open('../dump/vae_test_cls.pkl','rb'))
# For each model, show the run whose first column (validation micro-F1) is highest.
for model_name, runs in record.items():
    scores = np.array(runs)
    best_run = scores[:, 0].argmax()
    print(model_name, scores[best_run])
#pickle.dump(record, open('../dump/vae_test_cls.pkl','wb'))

fcnn [0.91305179 0.44238276 0.80159091 0.91160586 0.47782328 0.79559505]
convolution [0.91139563 0.49212555 0.79318182 0.91336934 0.51541477 0.80023184]


## regression 

In [121]:

# Precursor string -> position of that precursor in unique_precursor (used as class label).
pstr_to_label = {
    prec['precursor_str']: idx
    for idx, prec in enumerate(unique_precursor)
}
# Metal and ligand element symbols merged and ordered by atomic number.
legacy_active_elements = list(MetalElements + LigandElements)
legacy_active_elements.sort(key=lambda symbol: Element(symbol).number)

def legacy_composit_feature(comp):
    """Encode a composition as a fraction vector over `legacy_active_elements`.

    Each amount in `comp` (element symbol -> amount) is divided by the sum of
    all amounts and stored at the element's position in
    `legacy_active_elements`; every other entry stays zero. An element that
    is not in `legacy_active_elements` raises ValueError (from `list.index`).
    """
    total = np.sum(list(comp.values()))
    vec = np.zeros(len(legacy_active_elements))
    for element, amount in comp.items():
        fraction = amount / total
        slot = legacy_active_elements.index(element)
        vec[slot] = fraction
    return vec

# Reference matrix: one legacy composition vector per unique precursor, row
# order matching the labels in pstr_to_label.
ref = np.array([legacy_composit_feature(p['precursor_comp']) for p in unique_precursor])

# Score every regression run found under `root`.
# record[model] collects one row per run directory:
#   [valid micro-F1, valid macro-F1, valid reaction accuracy,
#    test micro-F1,  test macro-F1,  test reaction accuracy]
root = '/home/jhyang/WORKSPACES/MODELS/isyn/VAE_test/reg'
record = {}
for run_dir in os.listdir(root):
    path = os.path.join(root, run_dir)
    if len(os.listdir(path)) < 2:
        # Fewer than two entries in the run directory: skip it.
        continue
    model = run_dir.split('_')[0]
    record.setdefault(model, []).append([])
    is_fcnn = 'fcnn' in run_dir
    for ds in ['valid', 'test']:
        # NOTE(review): pickle executes code on load; only read trusted outputs.
        with open(os.path.join(path, f'output.{ds}.pkl'), 'rb') as fh:
            output = pickle.load(fh)
        # Map each predicted vector to the reference row with the smallest
        # squared Euclidean distance.
        # (assumes output['pred'] is (n_samples, n_features) -- TODO confirm)
        pred = np.square(ref[..., np.newaxis] - output['pred'].T[np.newaxis, ...]).sum(1).argmin(0)
        if is_fcnn:
            # One info entry per precursor -> flat label array.
            label = np.array([pstr_to_label[composit_parser(o['precursor_comp'])] for o in output['info']])
        else:
            # One info entry per reaction -> one label array per reaction.
            label = [np.array([pstr_to_label[composit_parser(p)] for p in o['precursor_comp']]) for o in output['info']]

        flat_label = np.hstack(label)
        f1i = f1_score(flat_label, pred, average='micro')
        f1a = f1_score(flat_label, pred, average='macro')
        record[model][-1].extend([f1i, f1a])
        if is_fcnn:
            # Group per-precursor rows by reaction id, then require all to match.
            rxn = {}
            for info, p, l in zip(output['info'], pred, label):
                rxn_id = info['id']
                if rxn_id not in rxn:
                    rxn[rxn_id] = [[], []]
                rxn[rxn_id][0].append(l)
                rxn[rxn_id][1].append(p)
            result = [np.sum(np.array(l) != np.array(p)) == 0 for l, p in rxn.values()]
            record[model][-1].append(np.mean(result))
        else:
            # Walk the flat predictions in reaction-sized slices.
            rxn = []
            i = 0
            for l in label:
                j = i + len(l)
                rxn.append(np.sum(l != pred[i:j]) == 0)
                i = j
            record[model][-1].append(np.mean(rxn))

In [125]:
# For each model, show the run whose first column (validation micro-F1) is highest.
for model_name, runs in record.items():
    scores = np.array(runs)
    best_run = scores[:, 0].argmax()
    print(model_name, scores[best_run])


convolution [0.50064608 0.258432   0.2294353  0.45767815 0.27659369 0.1958684 ]
attention [0. 0. 0. 0. 0. 0.]
fcnn [0.05808244 0.01478776 0.00022232 0.06546225 0.01584401 0.        ]


# VAE_FCNN

In [67]:
# Load the saved best valid/test outputs of every VAE_FCNN run into
# outputs[run_dir][split]; `c` counts the files loaded.
root = '/home/jhyang/WORKSPACES/MODELS/isyn/VAE_FCNN'
record = {}
outputs = {}
c = 0
for run_dir in os.listdir(root):
    path = os.path.join(root, run_dir)
    if len(os.listdir(path)) < 2:
        # Fewer than two entries in the run directory: skip it.
        continue
    model = run_dir.split('_')[0]
    record.setdefault(model, []).append([])
    outputs[run_dir] = {}
    for ds in ['valid', 'test']:
        # NOTE(review): pickle executes code on load; only read trusted outputs.
        with open(os.path.join(path, f'best.output.{ds}.pkl'), 'rb') as fh:
            outputs[run_dir][ds] = pickle.load(fh)
        c += 1

In [76]:
def legacy_ligand_feat(comp):
    """Build a 0/1 presence vector of length len(LigandElements) + 1 for `comp`.

    Slot 0 is set when any element of `comp` is in MetalElements; slot k+1 is
    set when the element at LigandElements[k] is present. An element found in
    neither collection raises ValueError (from `LigandElements.index`).
    """
    feat = np.zeros(len(LigandElements) + 1)
    for element, _amount in comp.items():
        slot = 0 if element in MetalElements else LigandElements.index(element) + 1
        feat[slot] = 1
    return feat

def ligand_feat_to_string(vecs):
    """Turn each vector into one string of its entries formatted with zero decimals.

    Returns a list with one concatenated string per vector in `vecs`.
    """
    rendered = []
    for vec in vecs:
        digits = (format(value, '.0f') for value in vec)
        rendered.append(''.join(digits))
    return rendered

# Ligand-pattern string of every unique precursor, reduced to the distinct
# patterns (np.unique returns them sorted).
precursor_patterns = ligand_feat_to_string(
    [legacy_ligand_feat(prec['precursor_comp']) for prec in unique_precursor]
)
unique_patterns = np.unique(precursor_patterns)
# Back to integer rows: one vector per distinct pattern, one digit per entry.
legacy_ligand_feat_ref = np.vstack(
    [np.array(list(pattern)).astype(int) for pattern in unique_patterns]
)
legacy_ligand_label = ligand_feat_to_string(legacy_ligand_feat_ref)
# Pattern string -> row index in legacy_ligand_feat_ref (used as class label).
ligand_feat_str_to_label = {pattern: idx for idx, pattern in enumerate(legacy_ligand_label)}

# Score every loaded VAE_FCNN run. For each split, in the iteration order of
# outputs[run_dir], two values are appended to the run's row in record[model]:
# micro-F1 over ligand-pattern labels, then reaction accuracy.
# Reaction accuracy = share of reactions whose precursor patterns are all
# predicted correctly, restricted to reactions whose most frequent metal
# element occurs in exactly one precursor.
record = {}
for run_dir, ds0 in outputs.items():
    model = run_dir.split('_')[0]
    record.setdefault(model, []).append([])
    for ds, output in ds0.items():
        # Map each predicted vector to the reference pattern with the smallest
        # squared Euclidean distance.
        # (assumes output['pred'] is (n_samples, n_features) -- TODO confirm)
        pred = np.square(legacy_ligand_feat_ref[..., np.newaxis] - output['pred'].T[np.newaxis, ...]).sum(1).argmin(0)
        label = np.array([ligand_feat_str_to_label[s] for s in ligand_feat_to_string(output['input'])])
        f1 = f1_score(label, pred, average='micro')
        record[model][-1].append(f1)
        # Group per-precursor rows by reaction id and count, per reaction, how
        # many precursors contain each metal element.
        rxn = {}
        for info, p, l in zip(output['info'], pred, label):
            rxn_id = info['id']
            if rxn_id not in rxn:
                rxn[rxn_id] = [[], [], {}]
            rxn[rxn_id][0].append(l)
            rxn[rxn_id][1].append(p)
            metals = rxn[rxn_id][2]
            for ele in info['precursor_comp'].keys():
                if ele in MetalElements:
                    metals[ele] = metals.get(ele, 0) + 1
        # default=0: a reaction without any metal precursor is excluded here
        # instead of raising on an empty max().
        result = [
            np.sum(np.array(l) != np.array(p)) == 0
            for l, p, m in rxn.values()
            if max(m.values(), default=0) == 1
        ]
        record[model][-1].append(np.mean(result))

In [77]:
# For each model, show the run with the highest first metric (the first split's micro-F1).
for model_name, runs in record.items():
    best_per_metric = np.argmax(runs, axis=0)
    print(model_name, runs[best_per_metric[0]])

oliynyk [0.9066430693382214, 0.7970687711386697, 0.8990975318038491, 0.7842987804878049]
mat2vec [0.906832690727514, 0.797294250281849, 0.9014896161791889, 0.788109756097561]
magpie [0.9077175905442134, 0.7988726042841037, 0.8927911275415896, 0.7648628048780488]
matscholar [0.9043044055369446, 0.7851183765501691, 0.8964879852125692, 0.7736280487804879]
elemnet [0.9086656974906769, 0.8, 0.899641187343699, 0.786204268292683]
megnet16 [0.9030402629416598, 0.7857948139797069, 0.8969229096444493, 0.7778201219512195]
cgcnn [0.9049364768345869, 0.786696730552424, 0.8971403718603893, 0.7808689024390244]
active [0.9059477909108148, 0.7952649379932356, 0.901272153963249, 0.7915396341463414]


In [73]:
# Reaction accuracy recomputed from whatever `rxn` dict is currently in the
# kernel: keep reactions whose largest metal count is 1, then require every
# label to equal its prediction.
single_source = [(l, p) for l, p, m in rxn.values() if np.max(list(m.values())) == 1]
result = [np.sum(np.array(l) != np.array(p)) == 0 for l, p in single_source]
np.mean(result)


0.7439024390243902

In [43]:
# Display the per-sample metadata of whichever `output` is currently in the
# kernel (the variable is assigned inside the evaluation loops of other cells).
output['info']

[{'id': 49,
  'raw_id': 38563,
  'year': 2020,
  'precursor_comp': {'Nb': 0.2857143, 'O': 0.71428573},
  'precursor_feat_type': ['ligand_composit', False],
  'target_comp': {'Mg': 0.0666, 'Nb': 0.1334, 'Pb': 0.2, 'O': 0.6},
  'target_feat_type': ['elemnet', True],
  'metal_comp': {'Nb': 0.2857143},
  'metal_feat_type': ['metal_composit', False]},
 {'id': 49,
  'raw_id': 38563,
  'year': 2020,
  'precursor_comp': {'Pb': 0.42857143, 'O': 0.5714286},
  'precursor_feat_type': ['ligand_composit', False],
  'target_comp': {'Mg': 0.0666, 'Nb': 0.1334, 'Pb': 0.2, 'O': 0.6},
  'target_feat_type': ['elemnet', True],
  'metal_comp': {'Pb': 0.42857143},
  'metal_feat_type': ['metal_composit', False]},
 {'id': 49,
  'raw_id': 38563,
  'year': 2020,
  'precursor_comp': {'Mg': 0.2, 'H': 0.08, 'C': 0.16, 'O': 0.56},
  'precursor_feat_type': ['ligand_composit', False],
  'target_comp': {'Mg': 0.0666, 'Nb': 0.1334, 'Pb': 0.2, 'O': 0.6},
  'target_feat_type': ['elemnet', True],
  'metal_comp': {'Mg': 0.2

# sequence

## urxn_v0

In [175]:
# Score every urxn_v0 sequence run in `path`.
# record[model] collects one row per run directory:
#   [valid micro-F1, valid sequence accuracy, test micro-F1, test sequence accuracy]
path = '/home/jhyang/WORKSPACES/MODELS/isyn/sequence/urxn_v0'
record = {}
for run_dir in os.listdir(path):
    model = run_dir.split('_')[0]
    record.setdefault(model, []).append([])
    for ds in ['valid', 'test']:
        # NOTE(review): pickle executes code on load; only read trusted outputs.
        with open(os.path.join(path, run_dir, f'output.{ds}.pkl'), 'rb') as fh:
            output = pickle.load(fh)
        pred = output['pred'].argmax(-1)
        label = output['label']
        nd, sl = label.shape
        # Position 0 always counts; position t > 0 counts only when the label at
        # t-1 is not 444, so the first 444 of a sequence is still scored and
        # everything after it is ignored.
        # (444 is presumably the end-of-sequence / padding label -- TODO confirm)
        mask = np.hstack([np.ones((nd, 1), dtype=bool), label != 444])[:, :sl]
        record[model][-1].append(f1_score(label[mask], pred[mask], average='micro'))
        # A sequence is correct when no scored position is mispredicted.
        result = [np.sum((p != l)[m]) == 0 for p, l, m in zip(pred, label, mask)]
        record[model][-1].append(np.mean(result))

In [185]:
# For each model, show the run with the highest first metric (validation micro-F1).
for model_name, runs in record.items():
    best_per_metric = np.argmax(runs, axis=0)
    print(model_name, runs[best_per_metric[0]])

cgcnn [0.6023215608792294, 0.09020284851100561, 0.5461405363837355, 0.053276047261009665]
oliynyk [0.5734255371696715, 0.0697022011221407, 0.5193694126694223, 0.04854994629430719]
elemnet [0.5699184983946654, 0.06668105308588693, 0.5599826972988561, 0.06337271750805586]
magpie [0.5599407261052112, 0.05826499784203712, 0.5419109872152263, 0.05198711063372718]
composit [0.6252901951099037, 0.11696158826068191, 0.5686821109295396, 0.07733619763694952]
megnet16 [0.5650284020745863, 0.061501942166594735, 0.5189849081995578, 0.04038668098818475]
mat2vec [0.6079525808841689, 0.09775571860164005, 0.5552244544842834, 0.05413533834586466]
matscholar [0.608150160533465, 0.09775571860164005, 0.5477266173219264, 0.06251342642320086]


## urxn_lstm

In [186]:
# Score every urxn_lstm sequence run in `path`.
# record[model] collects one row per run directory:
#   [valid micro-F1, valid sequence accuracy, test micro-F1, test sequence accuracy]
path = '/home/jhyang/WORKSPACES/MODELS/isyn/sequence/urxn_lstm'
record = {}
for run_dir in os.listdir(path):
    model = run_dir.split('_')[0]
    record.setdefault(model, []).append([])
    for ds in ['valid', 'test']:
        # NOTE(review): pickle executes code on load; only read trusted outputs.
        with open(os.path.join(path, run_dir, f'output.{ds}.pkl'), 'rb') as fh:
            output = pickle.load(fh)
        pred = output['pred'].argmax(-1)
        label = output['label']
        nd, sl = label.shape
        # Position 0 always counts; position t > 0 counts only when the label at
        # t-1 is not 444, so the first 444 of a sequence is still scored and
        # everything after it is ignored.
        # (444 is presumably the end-of-sequence / padding label -- TODO confirm)
        mask = np.hstack([np.ones((nd, 1), dtype=bool), label != 444])[:, :sl]
        record[model][-1].append(f1_score(label[mask], pred[mask], average='micro'))
        # A sequence is correct when no scored position is mispredicted.
        result = [np.sum((p != l)[m]) == 0 for p, l, m in zip(pred, label, mask)]
        record[model][-1].append(np.mean(result))

In [187]:
# For each model, show the run with the highest first metric (validation micro-F1).
for model_name, runs in record.items():
    best_per_metric = np.argmax(runs, axis=0)
    print(model_name, runs[best_per_metric[0]])

lstm [0.8742832480424194, 0.39749676305567544, 0.8653981893509284, 0.3804511278195489]


## urxn_transformer

In [188]:
# Score every urxn_transformer sequence run in `path`.
# record[model] collects one row per run directory:
#   [valid micro-F1, valid sequence accuracy, test micro-F1, test sequence accuracy]
path = '/home/jhyang/WORKSPACES/MODELS/isyn/sequence/urxn_transformer'
record = {}
for run_dir in os.listdir(path):
    model = run_dir.split('_')[0]
    record.setdefault(model, []).append([])
    for ds in ['valid', 'test']:
        # NOTE(review): pickle executes code on load; only read trusted outputs.
        with open(os.path.join(path, run_dir, f'output.{ds}.pkl'), 'rb') as fh:
            output = pickle.load(fh)
        pred = output['pred'].argmax(-1)
        label = output['label']
        nd, sl = label.shape
        # Position 0 always counts; position t > 0 counts only when the label at
        # t-1 is not 444, so the first 444 of a sequence is still scored and
        # everything after it is ignored.
        # (444 is presumably the end-of-sequence / padding label -- TODO confirm)
        mask = np.hstack([np.ones((nd, 1), dtype=bool), label != 444])[:, :sl]
        record[model][-1].append(f1_score(label[mask], pred[mask], average='micro'))
        # A sequence is correct when no scored position is mispredicted.
        result = [np.sum((p != l)[m]) == 0 for p, l, m in zip(pred, label, mask)]
        record[model][-1].append(np.mean(result))

In [189]:
# For each model, show the run with the highest first metric (validation micro-F1).
for model_name, runs in record.items():
    best_per_metric = np.argmax(runs, axis=0)
    print(model_name, runs[best_per_metric[0]])

transformer [0.8629693569270609, 0.3359948208890807, 0.8508516188430258, 0.3074113856068743]


## crxn_v0

In [196]:
# Score every crxn_v0 sequence run in `path`.
# record[model] collects one row per run directory:
#   [valid micro-F1, valid sequence accuracy, test micro-F1, test sequence accuracy]
path = '/home/jhyang/WORKSPACES/MODELS/isyn/sequence/crxn_v0'
record = {}
for run_dir in os.listdir(path):
    model = run_dir.split('_')[0]
    record.setdefault(model, []).append([])
    for ds in ['valid', 'test']:
        # NOTE(review): pickle executes code on load; only read trusted outputs.
        with open(os.path.join(path, run_dir, f'output.{ds}.pkl'), 'rb') as fh:
            output = pickle.load(fh)
        pred = output['pred'].argmax(-1)
        label = output['label']
        nd, sl = label.shape
        # Position 0 always counts; position t > 0 counts only when the label at
        # t-1 is not 444, so the first 444 of a sequence is still scored and
        # everything after it is ignored.
        # (444 is presumably the end-of-sequence / padding label -- TODO confirm)
        mask = np.hstack([np.ones((nd, 1), dtype=bool), label != 444])[:, :sl]
        record[model][-1].append(f1_score(label[mask], pred[mask], average='micro'))
        # A sequence is correct when no scored position is mispredicted.
        result = [np.sum((p != l)[m]) == 0 for p, l, m in zip(pred, label, mask)]
        record[model][-1].append(np.mean(result))

In [197]:
# For each model, show the run with the highest first metric (validation micro-F1).
for model_name, runs in record.items():
    best_per_metric = np.argmax(runs, axis=0)
    print(model_name, runs[best_per_metric[0]])

cgcnn [0.6084588644264195, 0.0997978777160182, 0.5341318757995116, 0.04311251314405889]
magpie [0.5677288528389339, 0.06114199090449722, 0.5020351203628329, 0.029705573080967402]
matscholar [0.6018539976825029, 0.09600808489135927, 0.5436097220607047, 0.04232386961093586]
oliynyk [0.5692352259559675, 0.06114199090449722, 0.5437841609489475, 0.04363827549947424]
elemnet [0.5721900347624566, 0.060384032339565435, 0.5450052331666473, 0.05783385909568875]
megnet16 [0.5683082271147161, 0.060131379484588174, 0.5475636701942086, 0.047055730809674026]
mat2vec [0.6097914252607184, 0.10409297625063163, 0.5466333294569136, 0.053364879074658256]
composit [0.6184241019698725, 0.10864072764022234, 0.5593673682986394, 0.0628286014721346]


In [35]:
# Load the unique reactions from the gzip-compressed pickle.
# NOTE(review): pickle can run arbitrary code while loading; only open trusted files.
with gzip.open('../data/unique_reaction.pkl.gz', mode='rb') as fh:
    urxn = pickle.load(fh)

# For every reaction, count how many of its precursors contain each metal
# element and keep the largest of those counts.
nmax = []
for rxn in urxn:
    metals = {}
    for pcomp in rxn['precursor_comp']:
        for ele in pcomp.keys():
            if ele in MetalElements:
                metals[ele] = metals.get(ele, 0) + 1
    nmax.append(np.max(list(metals.values())))

# Distinct largest-count values and how many reactions have each.
np.unique(nmax, return_counts=True)


(array([1, 2, 3, 4]), array([30320,   987,    74,     6]))

In [38]:
# Share of reactions per largest-metal-count value, as percentages with four decimals.
counts = np.unique(nmax, return_counts=True)[1]
[f'{v*100:.4f}' for v in counts / counts.sum()]

['96.6005', '3.1446', '0.2358', '0.0191']