diff --git a/dpgen/database/run.py b/dpgen/database/run.py index 5416bcfb9..7a524a459 100644 --- a/dpgen/database/run.py +++ b/dpgen/database/run.py @@ -4,6 +4,7 @@ import os import time +import json from uuid import uuid4 from threading import Thread from glob import glob @@ -13,31 +14,43 @@ from dpgen.database.vasp import VaspInput from dpdata import System,LabeledSystem from monty.serialization import loadfn,dumpfn +import numpy as np +import traceback OUTPUT=SHORT_CMD+'_db.json' -SUPPORTED_CACULATOR=['vasp','pwscf','siesta','gaussian'] +SUPPORTED_CACULATOR=['vasp','pwscf','gaussian'] ITERS_PAT="iter.*/02.fp/task*" INIT_PAT="init/*/02.md/sys-*/scale-*/*" def db_run(args): dlog.info ("collecting data") - print(args.ID_PREFIX) - _main(args.PATH, args.CALCULATOR, args.OUTPUT,args.ID_PREFIX) + #print(args.ID_PREFIX) + _main(args.PARAM) dlog.info ("finished") -def _main(path,calculator,output,id_prefix): +def _main(param): + with open(param, "r") as fp: + jdata = json.load(fp) + calculator = jdata["calculator"] + path = jdata["path"] + calulator = jdata["calculator"] + output = jdata["output"] + config_info_dict = jdata["config_info_dict"] + id_prefix = jdata["id_prefix"] + skip_init = False + if "skip_init" in jdata: + skip_init = jdata["skip_init"] + ## The mapping from sys_info to sys_configs assert calculator.lower() in SUPPORTED_CACULATOR dlog.info('data collection from: %s'%path) if calculator == "vasp": - parsing_vasp(path,output,id_prefix) + parsing_vasp(path,config_info_dict,skip_init, output,id_prefix) elif calculator == 'gaussian': parsing_gaussian(path,output) - elif calculator == "siesta": - parsing_siesta(path, output) else: parsing_pwscf(path,output) -def parsing_vasp(path,output=OUTPUT,id_prefix=None): +def parsing_vasp(path,config_info_dict, skip_init, output=OUTPUT,id_prefix=None): fp_iters=os.path.join(path,ITERS_PAT) dlog.debug(fp_iters) @@ -46,54 +59,103 @@ def parsing_vasp(path,output=OUTPUT,id_prefix=None): fp_init=os.path.join(path,INIT_PAT) 
dlog.debug(fp_init) f_fp_init=glob(fp_init) - dlog.info("len initialization data: %s"%len(f_fp_init)) - entries=_parsing_vasp(f_fp_init,id_prefix,iters=False) - entries.extend(_parsing_vasp(f_fp_iters,id_prefix)) - dlog.info("len collected data: %s"%len(entries)) - + if skip_init: + entries = _parsing_vasp(f_fp_iters,config_info_dict, id_prefix) + dlog.info("len collected data: %s"%len(entries)) + else: + dlog.info("len initialization data: %s"%len(f_fp_init)) + entries=_parsing_vasp(f_fp_init,config_info_dict, id_prefix,iters=False) + entries.extend(_parsing_vasp(f_fp_iters,config_info_dict, id_prefix)) + dlog.info("len collected data: %s"%len(entries)) + #print(output) + #print(entries) dumpfn(entries,output,indent=4) -def _parsing_vasp(paths,id_prefix,iters=True): +def _parsing_vasp(paths,config_info_dict, id_prefix,iters=True): entries=[] icount=0 + if iters: + iter_record = [] + iter_record_new = [] + try: + with open ("record.database", "r") as f_record: + iter_record = [i.split()[0] for i in f_record.readlines()] + iter_record.sort() + dlog.info("iter_record") + dlog.info(iter_record) + except: + pass for path in paths: + try: f_outcar = os.path.join(path,'OUTCAR') f_job = os.path.join(path,'job.json') - - try: - vi = VaspInput.from_directory(path) - if os.path.isfile(f_job): - attrib=loadfn(f_job) - else: - attrib={} + tmp_iter = path.split('/')[-3] + if (tmp_iter in iter_record) and (tmp_iter != iter_record[-1]): + continue + if tmp_iter not in iter_record_new: + iter_record_new.append(tmp_iter) + vi = VaspInput.from_directory(path) + if os.path.isfile(f_job): + attrib=loadfn(f_job) + else: + attrib={} - if iters and attrib: - tmp_=path.split('/')[-1] - iter_info=tmp_.split('.')[1] - task_info=tmp_.split('.')[-1] - attrib['iter_info']=iter_info - attrib['task_info']=task_info - else: - pass - comp=vi['POSCAR'].structure.composition - ls = LabeledSystem(f_outcar) - lss=ls.to_list() - for ls in lss: - if id_prefix: - eid=id_prefix+"_"+str(icount) - else: - 
eid = str(uuid4()) - entry=Entry(comp,'vasp',vi.as_dict(),ls.as_dict(),attribute=attrib,entry_id=eid) - entries.append(entry) - icount+=1 - except: - dlog.info("failed here : %s"%path) + if iters and attrib: + # generator/Cu/iter.000031/02.fp/task.007.000000 + tmp_=path.split('/')[-1] + #config_info=tmp_.split('.')[1] + task_info=tmp_.split('.')[-1] + tmp_iter = path.split('/')[-3] + iter_info = tmp_iter.split('.')[-1] + sys_info = path.split('/')[-4] + config_info_int = int(tmp_.split('.')[1]) + for (key, value) in config_info_dict.items(): + if config_info_int in value: + config_info = key + attrib['config_info']=config_info + attrib['task_info']=task_info + attrib['iter_info']=iter_info + attrib['sys_info']=sys_info + with open(f_outcar , "r") as fin_outcar: + infile_outcar = fin_outcar.readlines() + for line in infile_outcar: + if "running on" in line: + attrib["core"] = int(line.split()[2]) + if "Elapse" in line: + attrib["wall_time"] = float(line.split()[-1]) + if "executed on" in line: + attrib["date"] = line.split()[-2] + attrib["clocktime"] = line.split()[-1] + dlog.info("Attrib") + dlog.info(attrib) + comp=vi['POSCAR'].structure.composition + ls = LabeledSystem(f_outcar) + lss=ls.to_list() + for ls in lss: + if id_prefix: + eid=id_prefix+"_"+str(icount) + else: + eid = str(uuid4()) + entry=Entry(comp,'vasp',vi.as_dict(),ls.as_dict(),attribute=attrib,entry_id=eid) + entries.append(entry) + icount+=1 + except Exception: + #dlog.info(str(Exception)) + dlog.info("failed for %s"%(path)) + #pass + if iters: + iter_record.sort() + iter_record_new.sort() + with open("record.database" , "w") as fw: + for line in iter_record: + fw.write(line + "\n") + for line in iter_record_new: + fw.write(line + "\n") return entries def parsing_pwscf(path,output=OUTPUT): pass -def parsing_siesta(path,output=OUTPUT): - pass + def parsing_gaussian(path,output=OUTPUT): pass diff --git a/dpgen/main.py b/dpgen/main.py index 5a339fbd7..283e6d89e 100644 --- a/dpgen/main.py +++ 
b/dpgen/main.py @@ -104,16 +104,10 @@ def main(): # db parser_db = subparsers.add_parser( "db", - help="Collecting data from Deep Generator.") - parser_db.add_argument('PATH', type=str, - help="root path for dpgen modeling") - parser_db.add_argument('ENGINE', type=str, - help="engine used for labeling: vasp/pwscf/cp2k/gaussian/siesta") - parser_db.add_argument('OUTPUT', type=str, - help="output filename : file.json/file.yaml") - parser_db.add_argument("ID_PREFIX", type=str, default=None, - nargs="?", - help="prefix of an entry id") + help="Collecting data from DP-GEN.") + + parser_db.add_argument('PARAM', type=str, + help="parameter file, json format") parser_db.set_defaults(func=db_run) diff --git a/examples/database/param_Ti.json b/examples/database/param_Ti.json new file mode 100644 index 000000000..5b222f30a --- /dev/null +++ b/examples/database/param_Ti.json @@ -0,0 +1,19 @@ +{ + "path" : "/path/to/Ti", + "calculator" : "vasp", + "_comment" : "Currently only VASP is supported", + "output" : "./db_Ti.json", + "id_prefix" : "", + "config_info_dict" : { + "fcc-bulk" : [0,1,2,3,4,5,6,7], + "hcp-bulk" : [8,9,10,11,12,13,14,15], + "bcc-bulk" : [16,17,18,19,20,21,22,23], + "fcc-surf-100" : [24,25,26,27,28,29,30,31], + "fcc-surf-111" : [32,33,34,35,36,37,38,39], + "fcc-surf-110" : [40,41,42,43,44,45,46,47], + "hcp-surf-001" : [48,49,50,51,52,53,54,55], + "hcp-surf-100" : [56,57,58,59,60,61,62,63], + "hcp-surf-110" : [64,65,66,67,68,69,70,71] + }, + "skip_init" : true +} diff --git a/tests/database/data.tar.gz b/tests/database/data.tar.gz index 2f733dd0f..ad7e54d6b 100644 Binary files a/tests/database/data.tar.gz and b/tests/database/data.tar.gz differ diff --git a/tests/database/param_Al.json b/tests/database/param_Al.json new file mode 100644 index 000000000..3295d8ce8 --- /dev/null +++ b/tests/database/param_Al.json @@ -0,0 +1,19 @@ +{ + "path" : "./", + "calculator" : "vasp", + "_comment" : "vasp/pwscf/gaussian", + "output" : "dpgen_db.json", + "id_prefix" : "", + 
"config_info_dict" : { + "fcc-bulk" : [0,1,2,3,4,5,6,7], + "hcp-bulk" : [8,9,10,11,12,13,14,15], + "bcc-bulk" : [16,17,18,19,20,21,22,23], + "fcc-surf-100" : [24,25,26,27,28,29,30,31], + "fcc-surf-111" : [32,33,34,35,36,37,38,39], + "fcc-surf-110" : [40,41,42,43,44,45,46,47], + "hcp-surf-001" : [48,49,50,51,52,53,54,55], + "hcp-surf-100" : [56,57,58,59,60,61,62,63], + "hcp-surf-110" : [64,65,66,67,68,69,70,71] + }, + "skip_init" : true +} diff --git a/tests/database/test_db_vasp.py b/tests/database/test_db_vasp.py index 3023ab5d0..577d2f7da 100644 --- a/tests/database/test_db_vasp.py +++ b/tests/database/test_db_vasp.py @@ -1,5 +1,6 @@ import os,sys,shutil import unittest +import json import numpy as np import tarfile from glob import glob @@ -42,6 +43,11 @@ def setUp(self): self.ref_entries=loadfn(os.path.join(self.cwd,'data/entries.json')) self.init_path=sorted(glob(os.path.join(self.r_init_path,init_pat))) self.iter_path=sorted(glob(os.path.join(self.r_iter_path,iter_pat))) + with open("param_Al.json", "r") as fr: + jdata = json.load(fr) + self.config_info_dict = jdata["config_info_dict"] + self.skip_init = jdata["skip_init"] + self.output = jdata["output"] def testDPPotcar(self): @@ -111,12 +117,15 @@ def testEntry(self): self.assertEqual(ret0.entry_id,'pku-0') def testParsingVasp(self): - parsing_vasp(self.cwd,id_prefix=dpgen.SHORT_CMD) - try: - Potcar(['Al']) - ref=os.path.join(self.cwd,'data/all_data_pp.json') - except: - ref=os.path.join(self.cwd,'data/all_data.json') + parsing_vasp(self.cwd, self.config_info_dict, self.skip_init,self.output, id_prefix=dpgen.SHORT_CMD ) + #try: + # Potcar(['Al']) + # ref=os.path.join(self.cwd,'data/all_data_pp.json') + #except: + # ref=os.path.join(self.cwd,'data/all_data.json') + #Potcar(['Al']) + ref=os.path.join(self.cwd,'data/all_data_pp.json') + ret=os.path.join(self.cwd,'dpgen_db.json') retd=loadfn(ret) @@ -134,10 +143,14 @@ def testParsingVasp(self): self.assertEqual(len(i.composition),len(j.composition)) 
self.assertEqual(len(i.attribute),len(j.attribute)) os.remove(os.path.join(self.cwd,'dpgen_db.json')) - + def tearDown(self): for path in [self.r_init_path, self.r_iter_path, self.data]: if os.path.isdir(path) : shutil.rmtree(path) + if os.path.isfile("dpgen.log"): + os.remove("dpgen.log") + if os.path.isfile("record.database"): + os.remove("record.database")