In [36]:
import os, pickle, time, gzip, sys
sys.path.append('..')
from src.feature import composition_to_feature
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score
from pymatgen.core import Element
from src.utils import composit_parser, MetalElements, LigandElements, ActiveElements

# Load the pickled precursor table. Later cells iterate over it and read each
# entry's 'precursor_str' and 'precursor_comp' fields.
with gzip.open('../data/unique_precursor.pkl.gz','rb') as handle:
    unique_precursor = pickle.load(handle)

# from VAE_test

## classification

In [11]:
# Score every classification run stored under VAE_test/cls.
# record[model] receives one row per run, filled in this order:
#   [valid micro-F1, valid macro-F1, valid group accuracy,
#    test  micro-F1, test  macro-F1, test  group accuracy]
root = '/home/jhyang/WORKSPACES/MODELS/isyn/VAE_test/cls'
record = {}
for run_name in os.listdir(root):
    run_path = os.path.join(root, run_name)
    # Ignore stray files and run directories with fewer than two entries
    # (presumably unfinished runs -- TODO confirm).
    if not os.path.isdir(run_path) or len(os.listdir(run_path)) < 2:
        continue
    # The model name is the part of the directory name before the first '_'.
    model = run_name.split('_')[0]
    run_scores = []
    record.setdefault(model, []).append(run_scores)
    for ds in ['valid','test']:
        # Context manager so the pickle file is closed right after loading.
        with open(os.path.join(run_path, f'output.{ds}.pkl'), 'rb') as fh:
            output = pickle.load(fh)
        pred, label = output['pred'].argmax(1), output['label']
        f1i = f1_score(label, pred, average='micro')
        f1a = f1_score(label, pred, average='macro')
        run_scores.extend([f1i, f1a])
        # Group samples by info['id']; a group counts as correct only when
        # every one of its samples is predicted correctly.
        rxn = {}
        for info, p, l in zip(output['info'], pred, label):
            group_labels, group_preds = rxn.setdefault(info['id'], ([], []))
            group_labels.append(l)
            group_preds.append(p)
        result = [np.sum((np.array(l) != np.array(p))) == 0 for l, p in rxn.values()]
        run_scores.append(np.mean(result))

In [2]:
# Reload cached scores from disk (this replaces any `record` already in the
# session) and print, for each model, the run with the largest value in
# column 0 -- validation micro-F1 if the cache has the layout built by the
# classification cell; verify against the dump.
with open('../dump/vae_test_cls.pkl','rb') as fh:
    record = pickle.load(fh)
for model, runs in record.items():
    runs = np.array(runs)
    best = runs[:, 0].argmax()
    print(model, runs[best])
# NOTE(review): the cache was presumably written with the line below.
#pickle.dump(record, open('../dump/vae_test_cls.pkl','wb'))

fcnn [0.92104923 0.50989267 0.79035127 0.91825583 0.54290925 0.7972456 ]
attention [0.86742473 0.3551037  0.88083593 0.85613918 0.36042555 0.86036725]
convolution [0.9191756  0.53919154 0.89862161 0.91914799 0.5585939  0.89977047]


## regression 

In [121]:

# Map each precursor string to its position in `unique_precursor`.
pstr_to_label = dict(
    (entry['precursor_str'], position)
    for position, entry in enumerate(unique_precursor)
)
# Feature axis for the legacy composition vector: metal and ligand elements
# ordered by `Element(symbol).number`.
legacy_active_elements = list(MetalElements + LigandElements)
legacy_active_elements.sort(key=lambda symbol: Element(symbol).number)

def legacy_composit_feature(comp):
    """Return the normalized composition vector over `legacy_active_elements`.

    `comp` maps element symbol -> amount. Each amount is divided by the sum
    of all amounts and stored at the element's position in
    `legacy_active_elements`; positions of absent elements stay 0.
    A symbol that is not in `legacy_active_elements` raises ValueError
    (from `list.index`).
    """
    total = np.sum(list(comp.values()))
    vec = np.zeros(len(legacy_active_elements))
    for symbol, amount in comp.items():
        position = legacy_active_elements.index(symbol)
        vec[position] = amount / total
    return vec

# Reference matrix: one legacy composition vector per entry of
# `unique_precursor`, so row i corresponds to label i in `pstr_to_label`.
ref = np.array([legacy_composit_feature(p['precursor_comp']) for p in unique_precursor])

# Score every regression run stored under VAE_test/reg.
# record[model] receives one row per run, filled in this order:
#   [valid micro-F1, valid macro-F1, valid group accuracy,
#    test  micro-F1, test  macro-F1, test  group accuracy]
root = '/home/jhyang/WORKSPACES/MODELS/isyn/VAE_test/reg'
record = {}
for run_name in os.listdir(root):
    run_path = os.path.join(root, run_name)
    # Ignore stray files and run directories with fewer than two entries
    # (presumably unfinished runs -- TODO confirm).
    if not os.path.isdir(run_path) or len(os.listdir(run_path)) < 2:
        continue
    model = run_name.split('_')[0]
    run_scores = []
    record.setdefault(model, []).append(run_scores)
    # fcnn runs store one composition per info entry; every other run stores
    # a sequence of compositions per entry (see the label construction below).
    is_fcnn = 'fcnn' in run_name
    for ds in ['valid','test']:
        # Context manager so the pickle file is closed right after loading.
        with open(os.path.join(run_path, f'output.{ds}.pkl'), 'rb') as fh:
            output = pickle.load(fh)
        # Turn each regressed vector into a class: the index of the row of
        # `ref` with the smallest squared Euclidean distance.
        # Assumes output['pred'] is (n_samples, n_features) with the same
        # feature axis as `ref` -- TODO confirm. The broadcast builds a
        # (len(ref), n_features, n_samples) temporary.
        pred = np.square(ref[..., np.newaxis] - output['pred'].T[np.newaxis, ...]).sum(1).argmin(0)
        if is_fcnn:
            label = np.array([pstr_to_label[composit_parser(o['precursor_comp'])] for o in output['info']])
        else:
            label = [np.array([pstr_to_label[composit_parser(p)] for p in o['precursor_comp']]) for o in output['info']]

        # Flatten once; in the non-fcnn case `label` is a list of arrays.
        flat_label = np.hstack(label)
        f1i = f1_score(flat_label, pred, average='micro')
        f1a = f1_score(flat_label, pred, average='macro')
        run_scores.extend([f1i, f1a])
        if is_fcnn:
            # Group samples by info['id']; a group is correct only when all
            # of its samples are predicted correctly.
            rxn = {}
            for info, p, l in zip(output['info'], pred, label):
                group_labels, group_preds = rxn.setdefault(info['id'], ([], []))
                group_labels.append(l)
                group_preds.append(p)
            result = [np.sum((np.array(l) != np.array(p))) == 0 for l, p in rxn.values()]
            run_scores.append(np.mean(result))
        else:
            # `pred` is flat while `label` is grouped per info entry: walk
            # through `pred` in consecutive slices of matching length.
            rxn = []
            start = 0
            for l in label:
                stop = start + len(l)
                rxn.append(np.sum(l != pred[start:stop]) == 0)
                start = stop
            run_scores.append(np.mean(rxn))

In [125]:
# For each model, print the run with the largest value in column 0
# (validation micro-F1 as filled by the regression scoring cell).
for model, runs in record.items():
    score_table = np.array(runs)
    best_row = score_table[:, 0].argmax()
    print(model, score_table[best_row])


convolution [0.50064608 0.258432   0.2294353  0.45767815 0.27659369 0.1958684 ]
attention [0. 0. 0. 0. 0. 0.]
fcnn [0.05808244 0.01478776 0.00022232 0.06546225 0.01584401 0.        ]


# VAE_FCNN

In [27]:
# Load the saved best-checkpoint outputs of every VAE_FCNN run.
# outputs[run_name][split] holds the unpickled 'best.output.<split>.pkl'.
# Scoring is done in a later cell, which builds its own `record`.
root = '/home/jhyang/WORKSPACES/MODELS/isyn/VAE_FCNN'
outputs = {}
for run_name in os.listdir(root):
    run_path = os.path.join(root, run_name)
    # Ignore stray files and run directories with fewer than two entries
    # (presumably unfinished runs -- TODO confirm).
    if not os.path.isdir(run_path) or len(os.listdir(run_path)) < 2:
        continue
    outputs[run_name] = {}
    for ds in ['valid','test']:
        # Context manager so the pickle file is closed right after loading.
        with open(os.path.join(run_path, f'best.output.{ds}.pkl'), 'rb') as fh:
            outputs[run_name][ds] = pickle.load(fh)

In [97]:
def legacy_ligand_feat(comp):
    """Return a 0/1 presence vector of length ``len(LigandElements) + 1``.

    Slot 0 is set when `comp` contains any element listed in `MetalElements`;
    slot ``i + 1`` is set when it contains ``LigandElements[i]``. Amounts are
    ignored. An element found in neither list raises ValueError (from
    `list.index`).
    """
    flags = np.zeros(len(LigandElements)+1)
    for element, _ in comp.items():
        slot = 0 if element in MetalElements else LigandElements.index(element)+1
        flags[slot] = 1
    return flags

def ligand_feat_to_string(vecs):
    """Encode each vector as a string of its entries formatted with '.0f'.

    Returns a list with one concatenated string per vector in `vecs`.
    """
    encoded = []
    for vec in vecs:
        encoded.append(''.join(format(value, '.0f') for value in vec))
    return encoded

# Distinct ligand-presence patterns over all precursors, as digit strings
# (np.unique returns them sorted).
unique_patterns = np.unique(
    ligand_feat_to_string([legacy_ligand_feat(p['precursor_comp']) for p in unique_precursor])
)
# One integer row per pattern; the row index serves as the class label.
legacy_ligand_feat_ref = np.vstack([np.array(list(pattern)).astype(int) for pattern in unique_patterns])
legacy_ligand_label = ligand_feat_to_string(legacy_ligand_feat_ref)
ligand_feat_str_to_label = dict(zip(legacy_ligand_label, range(len(legacy_ligand_label))))

# Score every loaded VAE_FCNN run against the ligand-pattern classes.
# record[model] receives one row per run; for each split, in the iteration
# order of outputs[run] ('valid' then 'test' as filled by the loading cell),
# two values are appended: micro-F1 and group accuracy.
record = {}
for run_name, splits in outputs.items():
    # The model name is the part of the run name before the first '_'.
    model = run_name.split('_')[0]
    run_scores = []
    record.setdefault(model, []).append(run_scores)
    for ds, output in splits.items():
        # Predicted class: index of the row of `legacy_ligand_feat_ref` with
        # the smallest squared Euclidean distance to each predicted vector.
        # Assumes output['pred'] is (n_samples, n_features) -- TODO confirm.
        pred = np.square(legacy_ligand_feat_ref[..., np.newaxis, ] - output['pred'].T[np.newaxis, ...]).sum(1).argmin(0)
        # True class: look up the string encoding of output['input'].
        label = np.array([ligand_feat_str_to_label[s] for s in ligand_feat_to_string(output['input'])])
        f1 = f1_score(label, pred, average='micro')
        run_scores.append(f1)
        # Group samples by info['id']; a group is correct only when all of
        # its samples are predicted correctly.
        rxn = {}
        for info, p, l in zip(output['info'], pred, label):
            group_labels, group_preds = rxn.setdefault(info['id'], ([], []))
            group_labels.append(l)
            group_preds.append(p)
        result = [np.sum((np.array(l) != np.array(p))) == 0 for l, p in rxn.values()]
        run_scores.append(np.mean(result))

In [126]:
# For each model, print the run (row) with the largest first score, i.e. the
# validation micro-F1 appended first by the scoring cell.
for model, runs in record.items():
    best_per_column = np.argmax(runs, axis=0)
    print(model, runs[best_per_column[0]])

oliynyk [0.9066430693382214, 0.7661971830985915, 0.8990975318038491, 0.7648588410104011]
mat2vec [0.906832690727514, 0.7666305525460455, 0.9014896161791889, 0.7685735512630015]
magpie [0.9077175905442134, 0.7677139761646804, 0.8927911275415896, 0.7455423476968797]
matscholar [0.9043044055369446, 0.7544962080173347, 0.8964879852125692, 0.7540861812778603]
elemnet [0.9086656974906769, 0.7687973997833153, 0.899641187343699, 0.7663447251114414]
megnet16 [0.9030402629416598, 0.7551462621885157, 0.8969229096444493, 0.7581723625557206]
cgcnn [0.9049364768345869, 0.7562296858071506, 0.8971403718603893, 0.7611441307578009]
active [0.9059477909108148, 0.7644637053087757, 0.901272153963249, 0.7719167904903418]


# sequence

## urxn_v0

In [175]:
# Score every sequence-model run stored under sequence/urxn_v0.
# record[model] receives one row per run:
#   [valid micro-F1, valid sequence accuracy, test micro-F1, test sequence accuracy]
path = '/home/jhyang/WORKSPACES/MODELS/isyn/sequence/urxn_v0'
record = {}
for run_name in os.listdir(path):
    # The model name is the part of the directory name before the first '_'.
    model = run_name.split('_')[0]
    run_scores = []
    record.setdefault(model, []).append(run_scores)
    for ds in ['valid','test']:
        # Context manager so the pickle file is closed right after loading.
        with open(os.path.join(path, run_name, f'output.{ds}.pkl'),'rb') as fh:
            output = pickle.load(fh)
        pred = output['pred'].argmax(-1)
        label = output['label']
        nd, sl = label.shape
        # Position 0 is always scored; position i > 0 is scored only when the
        # label at i-1 is not 444, so the first 444 of a row is still scored.
        # NOTE(review): 444 is presumably the end/padding token id -- confirm.
        mask = np.hstack([np.ones((nd, 1), dtype=bool), output['label'] != 444])[:, :sl]
        run_scores.append(f1_score(label[mask], pred[mask], average='micro'))
        # A sequence is correct only when every scored position matches.
        result = [np.sum((p != l)[m]) == 0 for p, l, m in zip(pred, label, mask)]
        run_scores.append(np.mean(result))

In [185]:
# Print, per model, the run whose first score (validation micro-F1) is largest.
for model, runs in record.items():
    winner = np.argmax(runs, axis=0)[0]
    print(model, runs[winner])

cgcnn [0.6023215608792294, 0.09020284851100561, 0.5461405363837355, 0.053276047261009665]
oliynyk [0.5734255371696715, 0.0697022011221407, 0.5193694126694223, 0.04854994629430719]
elemnet [0.5699184983946654, 0.06668105308588693, 0.5599826972988561, 0.06337271750805586]
magpie [0.5599407261052112, 0.05826499784203712, 0.5419109872152263, 0.05198711063372718]
composit [0.6252901951099037, 0.11696158826068191, 0.5686821109295396, 0.07733619763694952]
megnet16 [0.5650284020745863, 0.061501942166594735, 0.5189849081995578, 0.04038668098818475]
mat2vec [0.6079525808841689, 0.09775571860164005, 0.5552244544842834, 0.05413533834586466]
matscholar [0.608150160533465, 0.09775571860164005, 0.5477266173219264, 0.06251342642320086]


## urxn_lstm

In [186]:
# Score every sequence-model run stored under sequence/urxn_lstm.
# record[model] receives one row per run:
#   [valid micro-F1, valid sequence accuracy, test micro-F1, test sequence accuracy]
path = '/home/jhyang/WORKSPACES/MODELS/isyn/sequence/urxn_lstm'
record = {}
for run_name in os.listdir(path):
    # The model name is the part of the directory name before the first '_'.
    model = run_name.split('_')[0]
    run_scores = []
    record.setdefault(model, []).append(run_scores)
    for ds in ['valid','test']:
        # Context manager so the pickle file is closed right after loading.
        with open(os.path.join(path, run_name, f'output.{ds}.pkl'),'rb') as fh:
            output = pickle.load(fh)
        pred = output['pred'].argmax(-1)
        label = output['label']
        nd, sl = label.shape
        # Position 0 is always scored; position i > 0 is scored only when the
        # label at i-1 is not 444, so the first 444 of a row is still scored.
        # NOTE(review): 444 is presumably the end/padding token id -- confirm.
        mask = np.hstack([np.ones((nd, 1), dtype=bool), output['label'] != 444])[:, :sl]
        run_scores.append(f1_score(label[mask], pred[mask], average='micro'))
        # A sequence is correct only when every scored position matches.
        result = [np.sum((p != l)[m]) == 0 for p, l, m in zip(pred, label, mask)]
        run_scores.append(np.mean(result))

In [187]:
# Print, per model, the run whose first score (validation micro-F1) is largest.
for model, runs in record.items():
    column_argmax = np.argmax(runs, axis=0)
    top_run = runs[column_argmax[0]]
    print(model, top_run)

lstm [0.8742832480424194, 0.39749676305567544, 0.8653981893509284, 0.3804511278195489]


## urxn_transformer

In [188]:
# Score every sequence-model run stored under sequence/urxn_transformer.
# record[model] receives one row per run:
#   [valid micro-F1, valid sequence accuracy, test micro-F1, test sequence accuracy]
path = '/home/jhyang/WORKSPACES/MODELS/isyn/sequence/urxn_transformer'
record = {}
for run_name in os.listdir(path):
    # The model name is the part of the directory name before the first '_'.
    model = run_name.split('_')[0]
    run_scores = []
    record.setdefault(model, []).append(run_scores)
    for ds in ['valid','test']:
        # Context manager so the pickle file is closed right after loading.
        with open(os.path.join(path, run_name, f'output.{ds}.pkl'),'rb') as fh:
            output = pickle.load(fh)
        pred = output['pred'].argmax(-1)
        label = output['label']
        nd, sl = label.shape
        # Position 0 is always scored; position i > 0 is scored only when the
        # label at i-1 is not 444, so the first 444 of a row is still scored.
        # NOTE(review): 444 is presumably the end/padding token id -- confirm.
        mask = np.hstack([np.ones((nd, 1), dtype=bool), output['label'] != 444])[:, :sl]
        run_scores.append(f1_score(label[mask], pred[mask], average='micro'))
        # A sequence is correct only when every scored position matches.
        result = [np.sum((p != l)[m]) == 0 for p, l, m in zip(pred, label, mask)]
        run_scores.append(np.mean(result))

In [189]:
# Print, per model, the run whose first score (validation micro-F1) is largest.
for model in record:
    runs = record[model]
    print(model, runs[np.argmax(runs, axis=0)[0]])

transformer [0.8629693569270609, 0.3359948208890807, 0.8508516188430258, 0.3074113856068743]


## crxn_v0

In [196]:
# Score every sequence-model run stored under sequence/crxn_v0.
# record[model] receives one row per run:
#   [valid micro-F1, valid sequence accuracy, test micro-F1, test sequence accuracy]
path = '/home/jhyang/WORKSPACES/MODELS/isyn/sequence/crxn_v0'
record = {}
for run_name in os.listdir(path):
    # The model name is the part of the directory name before the first '_'.
    model = run_name.split('_')[0]
    run_scores = []
    record.setdefault(model, []).append(run_scores)
    for ds in ['valid','test']:
        # Context manager so the pickle file is closed right after loading.
        with open(os.path.join(path, run_name, f'output.{ds}.pkl'),'rb') as fh:
            output = pickle.load(fh)
        pred = output['pred'].argmax(-1)
        label = output['label']
        nd, sl = label.shape
        # Position 0 is always scored; position i > 0 is scored only when the
        # label at i-1 is not 444, so the first 444 of a row is still scored.
        # NOTE(review): 444 is presumably the end/padding token id -- confirm.
        mask = np.hstack([np.ones((nd, 1), dtype=bool), output['label'] != 444])[:, :sl]
        run_scores.append(f1_score(label[mask], pred[mask], average='micro'))
        # A sequence is correct only when every scored position matches.
        result = [np.sum((p != l)[m]) == 0 for p, l, m in zip(pred, label, mask)]
        run_scores.append(np.mean(result))

In [197]:
# Print, per model, the run whose first score (validation micro-F1) is largest.
for model, runs in record.items():
    best_index = np.argmax(runs, axis=0)[0]
    best_scores = runs[best_index]
    print(model, best_scores)

cgcnn [0.6084588644264195, 0.0997978777160182, 0.5341318757995116, 0.04311251314405889]
magpie [0.5677288528389339, 0.06114199090449722, 0.5020351203628329, 0.029705573080967402]
matscholar [0.6018539976825029, 0.09600808489135927, 0.5436097220607047, 0.04232386961093586]
oliynyk [0.5692352259559675, 0.06114199090449722, 0.5437841609489475, 0.04363827549947424]
elemnet [0.5721900347624566, 0.060384032339565435, 0.5450052331666473, 0.05783385909568875]
megnet16 [0.5683082271147161, 0.060131379484588174, 0.5475636701942086, 0.047055730809674026]
mat2vec [0.6097914252607184, 0.10409297625063163, 0.5466333294569136, 0.053364879074658256]
composit [0.6184241019698725, 0.10864072764022234, 0.5593673682986394, 0.0628286014721346]
