In [1]:
from tgnn.model.TGNN import TGNN as TGNNR
from tgnn.model.TGNN import TGNNX
from tgnn.model.CGCNN import CGCNN as CGCNNX
from tgnn.model.CGCNN import CGCNNR 
import tgnn.util.trainer as tr
import tgnn.util.crystal_conv as cc
from torch.utils.data import DataLoader
import os, torch, gc
import pandas as pd

In [2]:
# Where the trained baseline models live and where the dataset files are read from.
model_root = 'C:/WORKSPACE_KRICT/MODELS/202204/baseline'
root_data = 'C:/WORKSPACE_KRICT/DATA/data_snu/with_metal'

# Start from a clean slate: collect unreachable Python objects first, then
# hand the cached CUDA blocks back to the driver.
gc.collect()
torch.cuda.empty_cache()


In [3]:

cc.load_mat_atom_feats()


def _latest_checkpoint(output_root):
    """Return ``(path, epoch)`` of the last '*.pt' file in ``output_root``.

    Checkpoints are ordered by file name, so zero-padded names of the form
    'model.<epoch>.pt' sort by epoch. The epoch is parsed from the second
    dot-separated field of the file name.

    Returns ``None`` when the directory holds no '*.pt' file.
    """
    pts = sorted(pt for pt in os.listdir(output_root) if pt.endswith('.pt'))
    if not pts:
        return None
    return os.path.join(output_root, pts[-1]), int(pts[-1].split('.')[1])


def _evaluate_latest(ModelObject, n_edge_feats, output_root, data_loader):
    """Test the newest checkpoint in ``output_root`` and save its predictions.

    The predictions are written to 'test.<epoch>.csv' inside ``output_root``
    with the columns icsd_id, target and prediction, sorted by icsd_id.

    Args:
        ModelObject: model class, called as
            ``ModelObject(cc.num_atom_feats, n_edge_feats, 1)``.
        n_edge_feats: second constructor argument of the model.
        output_root: directory containing the '*.pt' checkpoints.
        data_loader: loader handed to ``tr.test``.

    Returns:
        Path of the written CSV, or ``None`` if no checkpoint was found.
    """
    found = _latest_checkpoint(output_root)
    if found is None:
        # Previously this case crashed with IndexError on pts[-1].
        print('no checkpoint in {}; skipped'.format(output_root))
        return None
    model_path, epoch = found

    model = ModelObject(cc.num_atom_feats, n_edge_feats, 1)
    # Deserialize onto the CPU, then move the whole model to the GPU once.
    model.load_state_dict(torch.load(model_path, map_location='cpu'))
    model.cuda()

    _, _, idxs, targets, preds = tr.test(model, data_loader, torch.nn.L1Loss())
    # Kept local on purpose: other cells use the global name `df` for the
    # id_target.csv frame.
    df = pd.DataFrame(dict(
        icsd_id=idxs.astype(int).squeeze(),
        target=targets.squeeze(),
        prediction=preds.squeeze()))
    csv_path = os.path.join(output_root, 'test.{:05d}.csv'.format(epoch))
    df.sort_values('icsd_id').to_csv(csv_path, index=False)

    # Drop the last reference to the model *before* clearing the cache;
    # otherwise its GPU memory is still in use and cannot be released.
    del model
    gc.collect()
    torch.cuda.empty_cache()
    return csv_path


# --- TGNN baselines -------------------------------------------------------
dataset = cc.load_dataset(root_data, fn='id_target.test.baseline.csv', target_idx=3, ref_idx=1,
                radius=4, test_only=True, model_type='tgnn')
data_loader = DataLoader(dataset=dataset, batch_size=1, collate_fn=tr.collate)

for mtype, ModelObject in [('tgnn_x', TGNNX), ('tgnn_r', TGNNR)]:
    model_type_path = os.path.join(model_root, mtype)
    for model_name in os.listdir(model_type_path):
        output_root = os.path.join(model_type_path, model_name)
        if not os.path.isdir(output_root):
            continue  # stray file sitting next to the model folders
        # cc.num_bond_feats is read here, after the dataset has been loaded,
        # exactly as in the original per-model constructor call.
        _evaluate_latest(ModelObject, cc.num_bond_feats, output_root, data_loader)

# --- CGCNN baselines ------------------------------------------------------
dataset = cc.load_dataset(root_data, fn='id_target.test.baseline.csv', target_idx=3, ref_idx=1,
                radius=8, test_only=True, model_type='cgcnn')
data_loader = DataLoader(dataset=dataset, batch_size=2048, collate_fn=tr.collate_cgcnn)

for mtype, ModelObject in [('cgcnn_x', CGCNNX), ('cgcnn_r', CGCNNR)]:
    model_type_path = os.path.join(model_root, mtype)
    for model_name in os.listdir(model_type_path):
        output_root = os.path.join(model_type_path, model_name)
        if not os.path.isdir(output_root):
            continue  # stray file sitting next to the model folders
        _evaluate_latest(ModelObject, cc.num_edge_feats, output_root, data_loader)



100%|██████████| 2019/2019 [01:10<00:00, 28.80it/s]
100%|██████████| 2019/2019 [00:14<00:00, 141.58it/s]


In [13]:
# Show, for every model folder under the current `model_type_path`, the
# checkpoint files in directory order and in sorted order, to check that
# sorting by name puts the newest checkpoint last.
# NOTE: `model_type_path` is whatever an earlier cell left bound.
for mn in os.listdir(model_type_path):
    mp = os.path.join(model_type_path, mn)
    pts = []
    for fn in os.listdir(mp):
        if fn.endswith('.pt'):
            pts.append(fn)
    print(pts)
    print(sorted(pts))

['model.00000.pt', 'model.00020.pt', 'model.00040.pt', 'model.00060.pt', 'model.00080.pt', 'model.00100.pt', 'model.00120.pt', 'model.00140.pt', 'model.00160.pt', 'model.00180.pt', 'model.00200.pt', 'model.00220.pt', 'model.00240.pt', 'model.00260.pt', 'model.00280.pt']
['model.00000.pt', 'model.00020.pt', 'model.00040.pt', 'model.00060.pt', 'model.00080.pt', 'model.00100.pt', 'model.00120.pt', 'model.00140.pt', 'model.00160.pt', 'model.00180.pt', 'model.00200.pt', 'model.00220.pt', 'model.00240.pt', 'model.00260.pt', 'model.00280.pt']
['model.00020.pt', 'model.00040.pt', 'model.00060.pt', 'model.00080.pt', 'model.00100.pt', 'model.00120.pt', 'model.00140.pt', 'model.00160.pt', 'model.00180.pt', 'model.00200.pt', 'model.00220.pt', 'model.00240.pt', 'model.00260.pt', 'model.00280.pt', 'model.00300.pt']
['model.00020.pt', 'model.00040.pt', 'model.00060.pt', 'model.00080.pt', 'model.00100.pt', 'model.00120.pt', 'model.00140.pt', 'model.00160.pt', 'model.00180.pt', 'model.00200.pt', 'model

In [7]:
# Build the dataset once per target file with save_ids=True, then move the
# resulting ids.json to a per-dataset name before the next pass.
# (ids.json is presumably written by load_dataset(save_ids=True) - confirm.)
# NOTE(review): radius is 0.8 / 1 here, versus 4 and 8 in the evaluation
# cell - confirm these values are intentional.
for fn, radius, renamed in [('id_target.csv', 0.8, 'ids.metal.json'),
                            ('id_target.ins.csv', 1, 'ids.ins.json')]:
    _ = cc.load_dataset(root_data, fn=fn, target_idx=3, ref_idx=1,
                        radius=radius, model_type='cgcnn', save_ids=True)
    jn = 'C:/WORKSPACE_KRICT/MODELS/202204/baseline/ids.json'
    os.rename(jn, jn.replace('ids.json', renamed))

100%|██████████| 18605/18605 [05:53<00:00, 52.62it/s] 
100%|██████████| 10387/10387 [03:55<00:00, 44.03it/s]


In [11]:
import pandas as pd
import numpy as np
import json
# ICSD numbers are used directly as positions in fixed-length boolean masks,
# so `ndim` must exceed the largest id that occurs.
ndim = 800000

df = pd.read_csv('C:/WORKSPACE_KRICT/DATA/data_snu/with_metal/id_target.csv')

# Saved id splits; only their 'train' and 'valid' entries are read below.
with open('C:/WORKSPACE_KRICT/MODELS/202204/baseline/ids.metal.json') as metal_file:
    mids = json.load(metal_file)
with open('C:/WORKSPACE_KRICT/MODELS/202204/baseline/ids.ins.json') as ins_file:
    iids = json.load(ins_file)

# amask: ids listed in id_target.csv
# mmask: ids in the train/valid split of ids.metal.json
# imask: ids in the train/valid split of ids.ins.json
amask, mmask, imask = (np.zeros(ndim, dtype=bool) for _ in range(3))

amask[np.array(df.icsd_number)] = True
for split in ('train', 'valid'):
    mmask[np.array(mids[split]).astype(int)] = True
    imask[np.array(iids[split]).astype(int)] = True

# Listed in id_target.csv but in neither train/valid split.
not_seen_mask = amask & ~(mmask | imask)

In [13]:
# Boolean mask over ICSD numbers marking the ids listed in id_target.test.csv.
# Relies on `ndim` from the mask-building cell.
dft = pd.read_csv('C:/WORKSPACE_KRICT/DATA/data_snu/with_metal/id_target.test.csv')
test_ids = np.array(dft.icsd_number)
tmask = np.zeros(ndim, dtype=bool)
tmask[test_ids] = True


In [17]:
# Masks over ICSD numbers for the rows of `df` whose gap is exactly zero
# in the gap_gga and gap_hse columns respectively.
gga_metal, hse_metal = np.zeros(ndim, dtype=bool), np.zeros(ndim, dtype=bool)
gga_metal[df.loc[df.gap_gga == 0, 'icsd_number'].to_numpy()] = True
hse_metal[df.loc[df.gap_hse == 0, 'icsd_number'].to_numpy()] = True

In [25]:
# Write the baseline test set: rows of id_target.csv whose ICSD number is in
# neither the train nor the valid split of ids.metal.json / ids.ins.json.
#
# `df` must be the id_target.csv frame and `not_seen_mask` / `ndim` must come
# from the mask-building cell.
#
# NOTE(review): `icsd_ids` and `df_mask` were only present as commented-out
# code, so this cell raised NameError on a fresh kernel. They are restored
# here from those commented lines - confirm this is the selection that
# produced the existing id_target.test.baseline.csv.

# Optional diagnostic counts (kept for reference):
#np.sum(not_seen_mask & gga_metal), np.sum(not_seen_mask & hse_metal), np.sum(not_seen_mask & gga_metal & ~hse_metal), np.sum(not_seen_mask & gga_metal & ~hse_metal & tmask)

icsd_ids = np.arange(ndim)[not_seen_mask]

# Vectorised membership test; same rows as `[icsd in icsd_ids for icsd in
# df.icsd_number]` without rescanning the id array for every row.
df_mask = df.icsd_number.isin(icsd_ids)
df[df_mask].to_csv('C:/WORKSPACE_KRICT/DATA/data_snu/with_metal/id_target.test.baseline.csv', index=False)

In [4]:
# Load the small subset file with the same settings as the TGNN test run.
# NOTE: this rebinds `dataset`, the name the evaluation cell also uses.
dataset = cc.load_dataset(
    root_data,
    fn='id_target.subset.csv',
    target_idx=3,
    ref_idx=1,
    radius=4,
    test_only=True,
    model_type='tgnn',
)


100%|██████████| 9/9 [00:00<00:00, 13.44it/s]


'16'