## Core code: generate robocrys text descriptions for each structure

In [1]:
import _thread
import os
import threading
import warnings
# import signal

import inflect
import pandas as pd
from kgcnn.data.utils import load_json_file
from pymatgen.core import Structure
from robocrys import StructureCondenser, StructureDescriber




In [2]:
warnings.filterwarnings('ignore')

# --- Dataset location ---------------------------------------------------------
# Structures (JSON) and labels (CSV) are read from the offline kgcnn dataset
# folder. The home directory is resolved at run time: the previous hard-coded
# absolute path made `os.path.expanduser` a no-op and tied the notebook to a
# single Windows user profile.
# NOTE(review): assumes the data lives in "<home>/.kgcnn/datasets", which is
# what the former path C:\Users\dhw059\.kgcnn\datasets resolved to - confirm.
data_main_dir = os.path.join(os.path.expanduser("~"), ".kgcnn", "datasets")
data_directory_name = "jarvis_dft_3d_max_efg"
file_name_download = "max_efg.pymatgen.json"
label_file = 'max_efg.csv'
# Alternative dataset kept for reference:
# data_directory_name = "matbench_phonons"
# file_name_download = "matbench_phonons.json"

# Label column name = label file name up to its first dot.
prefix = label_file.split('.')[0]

df_dict = {}  # filled with the output columns once all structures are processed
data_directory = os.path.join(data_main_dir, data_directory_name)

# Serialized structures, one entry per structure.
data = load_json_file(os.path.join(data_directory, file_name_download))
py_mat_list = data

# Label table: the 'index' column and the label column named by `prefix`.
data_label = pd.read_csv(os.path.join(data_directory, label_file))
index = data_label['index'].tolist()
labels = data_label[prefix].tolist()

# Structures are matched to their index/label purely by position, so a length
# mismatch would silently attach labels to the wrong structures (or fail with
# an IndexError only at the very end of a long run). Fail fast instead.
if len(index) != len(py_mat_list):
    raise ValueError(
        f"{file_name_download} has {len(py_mat_list)} structures but "
        f"{label_file} has {len(index)} rows; they must align one-to-one."
    )

# Per-structure accumulators filled by the description loop.
desc = []  # original scratch note: 1500-1540  1700   2500-2549
formula_list = []
structure_list = []
label_list = []
index_list = []

def timeout_handler(signum=None):
    """Announce a timeout on stdout, then raise ``TimeoutError``.

    The exception is raised in whichever thread calls this function.

    Args:
        signum: Accepted but not used by the body.

    Raises:
        TimeoutError: Always.
    """
    print("Task timed out!")
    message = "The operation has timed out."
    raise TimeoutError(message)

# Time budget (seconds) for parsing, condensing and describing one structure.
DESCRIBE_TIMEOUT_S = 30


def _call_with_timeout(seconds, func, *args):
    """Call ``func(*args)`` in the main thread, giving up after ``seconds``.

    A ``threading.Timer`` callback runs in its own thread, so raising an
    exception there never reaches the caller. Instead the callback records that
    the deadline passed and asks the interpreter to interrupt the main thread;
    the resulting ``KeyboardInterrupt`` is turned into a ``TimeoutError`` here.

    NOTE(review): this must run in the main thread (assumed to be the case for a
    notebook cell). The interruption is delivered only once the main thread is
    back in Python bytecode, so a long call inside compiled code can overrun.

    Args:
        seconds: Time budget in seconds.
        func: Callable to run.
        *args: Positional arguments forwarded to ``func``.

    Returns:
        Whatever ``func(*args)`` returns.

    Raises:
        TimeoutError: If the budget is exceeded (raised via ``timeout_handler``).
        KeyboardInterrupt: If the interrupt did not come from the timer.
    """
    expired = threading.Event()

    def _expire():
        # Executed in the timer thread: flag the timeout first so the main
        # thread can tell it apart from a genuine user interrupt.
        expired.set()
        _thread.interrupt_main()

    timer = threading.Timer(seconds, _expire)
    timer.start()
    try:
        return func(*args)
    except KeyboardInterrupt:
        if not expired.is_set():
            raise  # a real Ctrl-C / kernel interrupt, not our timer
        timeout_handler()  # prints and raises TimeoutError, now in the main thread
    finally:
        # Always stop the timer so it cannot fire during a later iteration.
        timer.cancel()


def _describe_structure(struct_dict):
    """Turn one serialized structure into a robocrys text description.

    The description is first attempted with ``fmt="unicode"``; if that fails it
    is retried with ``fmt="raw"`` on the *same* condensed structure.

    Args:
        struct_dict: Dict form of a pymatgen ``Structure`` (one entry of
            ``py_mat_list``).

    Returns:
        ``(structure, description)``: the ``Structure`` after
        ``add_oxidation_state_by_guess()`` and the text from
        ``StructureDescriber.describe``.

    Raises:
        Exception: Whatever is raised while parsing or condensing the
            structure, or while describing it in both formats.
    """
    # Parse the JSON entry into a pymatgen Structure and condense it.
    structure = Structure.from_dict(struct_dict)
    structure.add_oxidation_state_by_guess()
    condensed = StructureCondenser().condense_structure(structure)
    try:
        # Preferred output: Unicode formatting.
        describer = StructureDescriber(
            bond_length_decimal_places=3,
            fmt="unicode",
        )
        return structure, describer.describe(condensed)
    except Exception as e:
        print(f"An error occurred: {e}")
    # Fallback: plain ("raw") formatting.
    describer = StructureDescriber(
        bond_length_decimal_places=3,
        fmt="raw",
    )
    return structure, describer.describe(condensed)


for num, struct in enumerate(py_mat_list):
    if num % 100 == 0:
        print(num)

    try:
        # `ss` and `description` are only bound when this structure itself was
        # processed successfully, so a failure can never reuse values left over
        # from an earlier iteration.
        ss, description = _call_with_timeout(
            DESCRIBE_TIMEOUT_S, _describe_structure, struct
        )
    except TimeoutError:
        # Took too long: skip this structure.
        continue
    except Exception as e:
        # Parsing, condensing, or both description formats failed: skip it.
        print(f"Failed to describe structure {num}: {e}")
        continue

    desc.append(description)
    index_list.append(index[num])
    formula_list.append(ss.composition.reduced_formula)
    structure_list.append(ss.as_dict())
    label_list.append(labels[num])
    
# Assemble the output table; every list holds one entry per successfully
# described structure, in the same order.
df_dict["index"] = index_list
df_dict["formula"] = formula_list
df_dict["structure"] = structure_list
df_dict["description"] = desc
df_dict[prefix] = label_list

# Save as CSV with columns: index, formula, structure, description, <prefix>.
# `exist_ok=True` creates the folder only when needed, without the
# check-then-create race of a separate `os.path.exists` test.
output_dir = f'data/{data_directory_name}'
os.makedirs(output_dir, exist_ok=True)
file_path = f'data/{data_directory_name}/{prefix}.csv'
df = pd.DataFrame(df_dict)
df.to_csv(file_path, index=False)


0


Reading file d:\ProgramData\envs\LLM\lib\site-packages\robocrys\condense\mineral_db.json.gz: 0it [00:00, ?it/s]#####1| 165/180 [00:00<00:00, 1633.70it/s]
Decoding objects from d:\ProgramData\envs\LLM\lib\site-packages\robocrys\condense\mineral_db.json.gz: 100%|##########| 180/180 [00:00<00:00, 1666.27it/s]


10

In [None]:
description

'Zr(HPO3)2. crystallizes in the trigonal P-3m1 space group. The structure is two-dimensional and consists of one Zr(HPO3)2. sheet oriented in the (0, 0, 1) direction. Zr2+ is bonded to six equivalent O2- atoms to form ZrO6 octahedra that share corners with six equivalent PHO3 tetrahedra. All Zr-O bond lengths are 2.08 Å. P5+ is bonded to one H and three equivalent O2- atoms to form distorted PHO3 tetrahedra that share corners with three equivalent ZrO6 octahedra. The corner-sharing octahedral tilt angles are 18°. The P-H bond length is 1.40 Å. All P-O bond lengths are 1.53 Å. H is bonded in a single-bond geometry to one P5+ atom. O2- is bonded in a distorted bent 150 degrees geometry to one Zr2+ and one P5+ atom.'

In [None]:
from inflect import engine

# Scratch check of how an "oriented in the ... direction" phrase is assembled.
# `s_direction` has to be a string (e.g. "direction"); convert it first if it
# is anything else.
s_direction = "direction"  # or any other direction word
orientations = [(0, 0, 1)]
en = engine()
# Render every orientation tuple as text, e.g. "(0, 0, 1)".
orientations_str = [
    "({})".format(", ".join(str(component) for component in orientation))
    for orientation in orientations
]
# Let inflect combine the rendered tuples into a single phrase.
orientations_joined = en.join(orientations_str)

# Format the phrase and append it to comp_desc:
# comp_desc += " oriented in the {} {}".format(orientations_joined, s_direction)

## test

In [None]:
description

'Na₃Bi(P₂O₇)₂ crystallizes in the triclinic P̅1 space group. There are three inequivalent Na¹⁺ sites. In the first Na¹⁺ site, Na¹⁺ is bonded to five O²⁻ atoms to form NaO₅ square pyramids that share corners with five PO₄ tetrahedra. There are a spread of Na-O bond distances ranging from 2.2967-2.4126 Å. In the second Na¹⁺ site, Na¹⁺ is bonded in a 5-coordinate geometry to five O²⁻ atoms. There are a spread of Na-O bond distances ranging from 2.3059-2.5941 Å. In the third Na¹⁺ site, Na¹⁺ is bonded in a 8-coordinate geometry to eight O²⁻ atoms. There are a spread of Na-O bond distances ranging from 2.2914-3.0838 Å. Bi⁵⁺ is bonded in a 7-coordinate geometry to seven O²⁻ atoms. There are a spread of Bi-O bond distances ranging from 2.2680-2.7539 Å. There are four inequivalent P⁵⁺ sites. In the first P⁵⁺ site, P⁵⁺ is bonded to four O²⁻ atoms to form PO₄ tetrahedra that share  a cornercorner with one NaO₅ square pyramid and  a cornercorner with one PO₄ tetrahedra. There are a spread of P-O b

In [None]:
description

'Na₃Bi(P₂O₇)₂ crystallizes in the triclinic P̅1 space group. There are three inequivalent Na¹⁺ sites. In the first Na¹⁺ site, Na¹⁺ is bonded to five O²⁻ atoms to form NaO₅ square pyramids that share corners with five PO₄ tetrahedra. There are a spread of Na-O bond distances ranging from 2.2967-2.4126 Å. In the second Na¹⁺ site, Na¹⁺ is bonded in a 5-coordinate geometry to five O²⁻ atoms. There are a spread of Na-O bond distances ranging from 2.3059-2.5941 Å. In the third Na¹⁺ site, Na¹⁺ is bonded in a 8-coordinate geometry to eight O²⁻ atoms. There are a spread of Na-O bond distances ranging from 2.2914-3.0838 Å. Bi⁵⁺ is bonded in a 7-coordinate geometry to seven O²⁻ atoms. There are a spread of Bi-O bond distances ranging from 2.2680-2.7539 Å. There are four inequivalent P⁵⁺ sites. In the first P⁵⁺ site, P⁵⁺ is bonded to four O²⁻ atoms to form PO₄ tetrahedra that share  a cornercorner with one NaO₅ square pyramid and  a cornercorner with one PO₄ tetrahedra. There are a spread of P-O b

In [None]:
"Na₃Bi(P₂O₇)₂ crystallizes in the triclinic P̅1 space group. There are three inequivalent Na¹⁺ sites. In the first Na¹⁺ site, Na¹⁺ is bonded to five O²⁻ atoms to form NaO₅ square pyramids that share corners with five PO₄ tetrahedra. There are a spread of Na–O bond distances ranging from 2.30–2.41 Å. In the second Na¹⁺ site, Na¹⁺ is bonded in a 5-coordinate geometry to five O²⁻ atoms. There are a spread of Na–O bond distances ranging from 2.31–2.59 Å. In the third Na¹⁺ site, Na¹⁺ is bonded in a 8-coordinate geometry to eight O²⁻ atoms. There are a spread of Na–O bond distances ranging from 2.29–3.08 Å. Bi⁵⁺ is bonded in a 7-coordinate geometry to seven O²⁻ atoms. There are a spread of Bi–O bond distances ranging from 2.27–2.75 Å. There are four inequivalent P⁵⁺ sites. In the first P⁵⁺ site, P⁵⁺ is bonded to four O²⁻ atoms to form PO₄ tetrahedra that share  a cornercorner with one NaO₅ square pyramid and  a cornercorner with one PO₄ tetrahedra. There are a spread of P–O bond distances ranging from 1.52–1.64 Å. In the second P⁵⁺ site, P⁵⁺ is bonded to four O²⁻ atoms to form PO₄ tetrahedra that share corners with two equivalent NaO₅ square pyramids and  a cornercorner with one PO₄ tetrahedra. There are a spread of P–O bond distances ranging from 1.52–1.62 Å. In the third P⁵⁺ site, P⁵⁺ is bonded to four O²⁻ atoms to form PO₄ tetrahedra that share  a cornercorner with one NaO₅ square pyramid and  a cornercorner with one PO₄ tetrahedra. There are a spread of P–O bond distances ranging from 1.53–1.64 Å. In the fourth P⁵⁺ site, P⁵⁺ is bonded to four O²⁻ atoms to form PO₄ tetrahedra that share  a cornercorner with one NaO₅ square pyramid and  a cornercorner with one PO₄ tetrahedra. There are a spread of P–O bond distances ranging from 1.51–1.63 Å. There are fourteen inequivalent O²⁻ sites. In the first O²⁻ site, O²⁻ is bonded in a distorted single-bond geometry to one Bi⁵⁺ and one P⁵⁺ atom. 
In the second O²⁻ site, O²⁻ is bonded in a 2-coordinate geometry to one Na¹⁺, one Bi⁵⁺, and one P⁵⁺ atom. In the third O²⁻ site, O²⁻ is bonded in a distorted single-bond geometry to one Bi⁵⁺ and one P⁵⁺ atom. In the fourth O²⁻ site, O²⁻ is bonded in a 3-coordinate geometry to one Na¹⁺, one Bi⁵⁺, and one P⁵⁺ atom. In the fifth O²⁻ site, O²⁻ is bonded in a distorted trigonal planar geometry to one Na¹⁺, one Bi⁵⁺, and one P⁵⁺ atom. In the sixth O²⁻ site, O²⁻ is bonded in a distorted trigonal planar geometry to two Na¹⁺ and one P⁵⁺ atom. In the seventh O²⁻ site, O²⁻ is bonded in a bent 120 degrees geometry to two P⁵⁺ atoms. In the eighth O²⁻ site, O²⁻ is bonded in a 3-coordinate geometry to three Na¹⁺ and one P⁵⁺ atom. In the ninth O²⁻ site, O²⁻ is bonded in a 4-coordinate geometry to three Na¹⁺ and one P⁵⁺ atom. In the tenth O²⁻ site, O²⁻ is bonded in a bent 120 degrees geometry to two P⁵⁺ atoms. In the eleventh O²⁻ site, O²⁻ is bonded in a 3-coordinate geometry to one Na¹⁺, one Bi⁵⁺, and one P⁵⁺ atom. In the twelfth O²⁻ site, O²⁻ is bonded in a distorted single-bond geometry to one Na¹⁺, one Bi⁵⁺, and one P⁵⁺ atom. In the thirteenth O²⁻ site, O²⁻ is bonded in a 3-coordinate geometry to three Na¹⁺ and one P⁵⁺ atom. In the fourteenth O²⁻ site, O²⁻ is bonded in a 3-coordinate geometry to two equivalent Na¹⁺ and one P⁵⁺ atom."


In [None]:
description

'Na₃Bi(P₂O₇)₂ crystallizes in the triclinic P̅1 space group. There are three inequivalent Na sites. In the first Na site, Na is bonded to five O atoms to form NaO₅ square pyramids that share corners with five PO₄ tetrahedra. There are a spread of Na-O bond distances ranging from 2.2967-2.4126 Å. In the second Na site, Na is bonded in a 5-coordinate geometry to five O atoms. There are a spread of Na-O bond distances ranging from 2.3059-2.5941 Å. In the third Na site, Na is bonded in a 6-coordinate geometry to six O atoms. There are a spread of Na-O bond distances ranging from 2.2914-2.7846 Å. Bi is bonded in a 7-coordinate geometry to seven O atoms. There are a spread of Bi-O bond distances ranging from 2.2680-2.7539 Å. There are four inequivalent P sites. In the first P site, P is bonded to four O atoms to form PO₄ tetrahedra that share  a cornercorner with one NaO₅ square pyramid and  a cornercorner with one PO₄ tetrahedra. There are a spread of P-O bond distances ranging from 1.5247-

In [None]:
# describe_symmetry_labels
description

'Na₃Bi(P₂O₇)₂ crystallizes in the triclinic P̅1 space group. There are three inequivalent Na sites. In the first Na site, Na is bonded to five O atoms to form NaO₅ square pyramids that share corners with five PO₄ tetrahedra. There are a spread of Na-O bond distances ranging from 2.2967-2.4126 Å. In the second Na site, Na is bonded in a 5-coordinate geometry to five O atoms. There are a spread of Na-O bond distances ranging from 2.3059-2.5941 Å. In the third Na site, Na is bonded in a 6-coordinate geometry to six O atoms. There are a spread of Na-O bond distances ranging from 2.2914-2.7846 Å. Bi is bonded in a 7-coordinate geometry to seven O atoms. There are a spread of Bi-O bond distances ranging from 2.2680-2.7539 Å. There are four inequivalent P sites. In the first P site, P is bonded to four O atoms to form PO₄ tetrahedra that share  a cornercorner with one NaO₅ square pyramid and  a cornercorner with one PO₄ tetrahedra. There are a spread of P-O bond distances ranging from 1.5247-

In [None]:
# describe_components
description

'Na₃Bi(P₂O₇)₂ crystallizes in the triclinic P̅1 space group.'

In [None]:
# describe_mineral
description


'There are three inequivalent Na sites. In the first Na site, Na(1) is bonded to one O(2), one O(4), one O(6), one O(8), and one O(9) atom to form NaO₅ square pyramids that share  a cornercorner with one P(1)O₄ tetrahedra,  a cornercorner with one P(3)O₄ tetrahedra,  a cornercorner with one P(4)O₄ tetrahedra, and corners with two equivalent P(2)O₄ tetrahedra. The Na(1)-O(2) bond length is 2.3943 Å. The Na(1)-O(4) bond length is 2.3800 Å. The Na(1)-O(6) bond length is 2.2967 Å. The Na(1)-O(8) bond length is 2.4126 Å. The Na(1)-O(9) bond length is 2.3202 Å. In the second Na site, Na(2) is bonded in a 5-coordinate geometry to one O(5), one O(8), one O(9), and two equivalent O(13) atoms. The Na(2)-O(5) bond length is 2.3779 Å. The Na(2)-O(8) bond length is 2.3059 Å. The Na(2)-O(9) bond length is 2.5746 Å. There is one shorter (2.3128 Å) and one longer (2.5941 Å) Na(2)-O(13) bond length. In the third Na site, Na(3) is bonded in a 6-coordinate geometry to one O(11), one O(12), one O(6), one 

In [None]:
# only_describe_cation_polyhedra_connectivity
description

'Na₃Bi(P₂O₇)₂ crystallizes in the triclinic P̅1 space group. There are three inequivalent Na sites. In the first Na site, Na(1) is bonded in a square pyramidal geometry to one O(2), one O(4), one O(6), one O(8), and one O(9) atom. The Na(1)-O(2) bond length is 2.3943 Å. The Na(1)-O(4) bond length is 2.3800 Å. The Na(1)-O(6) bond length is 2.2967 Å. The Na(1)-O(8) bond length is 2.4126 Å. The Na(1)-O(9) bond length is 2.3202 Å. In the second Na site, Na(2) is bonded in a 5-coordinate geometry to one O(5), one O(8), one O(9), and two equivalent O(13) atoms. The Na(2)-O(5) bond length is 2.3779 Å. The Na(2)-O(8) bond length is 2.3059 Å. The Na(2)-O(9) bond length is 2.5746 Å. There is one shorter (2.3128 Å) and one longer (2.5941 Å) Na(2)-O(13) bond length. In the third Na site, Na(3) is bonded in a 6-coordinate geometry to one O(11), one O(12), one O(6), one O(9), and two equivalent O(14) atoms. The Na(3)-O(11) bond length is 2.4383 Å. The Na(3)-O(12) bond length is 2.5996 Å. The Na(3)-O

In [None]:
# describe_oxidation_states
description

'Na₃Bi(P₂O₇)₂ crystallizes in the triclinic P̅1 space group. There are three inequivalent Na sites. In the first Na site, Na(1) is bonded to one O(2), one O(4), one O(6), one O(8), and one O(9) atom to form NaO₅ square pyramids that share  a cornercorner with one P(1)O₄ tetrahedra,  a cornercorner with one P(3)O₄ tetrahedra,  a cornercorner with one P(4)O₄ tetrahedra, and corners with two equivalent P(2)O₄ tetrahedra. The Na(1)-O(2) bond length is 2.3943 Å. The Na(1)-O(4) bond length is 2.3800 Å. The Na(1)-O(6) bond length is 2.2967 Å. The Na(1)-O(8) bond length is 2.4126 Å. The Na(1)-O(9) bond length is 2.3202 Å. In the second Na site, Na(2) is bonded in a 5-coordinate geometry to one O(5), one O(8), one O(9), and two equivalent O(13) atoms. The Na(2)-O(5) bond length is 2.3779 Å. The Na(2)-O(8) bond length is 2.3059 Å. The Na(2)-O(9) bond length is 2.5746 Å. There is one shorter (2.3128 Å) and one longer (2.5941 Å) Na(2)-O(13) bond length. In the third Na site, Na(3) is bonded in a 6

In [None]:
# describe_component_makeup
description


'Na₃Bi(P₂O₇)₂ crystallizes in the triclinic P̅1 space group. There are three inequivalent Na sites. In the first Na site, Na(1) is bonded to one O(2), one O(4), one O(6), one O(8), and one O(9) atom to form NaO₅ square pyramids that share  a cornercorner with one P(1)O₄ tetrahedra,  a cornercorner with one P(3)O₄ tetrahedra,  a cornercorner with one P(4)O₄ tetrahedra, and corners with two equivalent P(2)O₄ tetrahedra. The Na(1)-O(2) bond length is 2.3943 Å. The Na(1)-O(4) bond length is 2.3800 Å. The Na(1)-O(6) bond length is 2.2967 Å. The Na(1)-O(8) bond length is 2.4126 Å. The Na(1)-O(9) bond length is 2.3202 Å. In the second Na site, Na(2) is bonded in a 5-coordinate geometry to one O(5), one O(8), one O(9), and two equivalent O(13) atoms. The Na(2)-O(5) bond length is 2.3779 Å. The Na(2)-O(8) bond length is 2.3059 Å. The Na(2)-O(9) bond length is 2.5746 Å. There is one shorter (2.3128 Å) and one longer (2.5941 Å) Na(2)-O(13) bond length. In the third Na site, Na(3) is bonded in a 6

In [None]:
# all
description

'Na₃Bi(P₂O₇)₂ crystallizes in the triclinic P̅1 space group. There are three inequivalent Na sites. In the first Na site, Na(1) is bonded to one O(2), one O(4), one O(6), one O(8), and one O(9) atom to form NaO₅ square pyramids that share  a cornercorner with one P(1)O₄ tetrahedra,  a cornercorner with one P(3)O₄ tetrahedra,  a cornercorner with one P(4)O₄ tetrahedra, and corners with two equivalent P(2)O₄ tetrahedra. The Na(1)-O(2) bond length is 2.39 Å. The Na(1)-O(4) bond length is 2.38 Å. The Na(1)-O(6) bond length is 2.30 Å. The Na(1)-O(8) bond length is 2.41 Å. The Na(1)-O(9) bond length is 2.32 Å. In the second Na site, Na(2) is bonded in a 5-coordinate geometry to one O(5), one O(8), one O(9), and two equivalent O(13) atoms. The Na(2)-O(5) bond length is 2.38 Å. The Na(2)-O(8) bond length is 2.31 Å. The Na(2)-O(9) bond length is 2.57 Å. There is one shorter (2.31 Å) and one longer (2.59 Å) Na(2)-O(13) bond length. In the third Na site, Na(3) is bonded in a 6-coordinate geometry

In [None]:
description

'Na$_{3}$Bi(P$_{2}$O$_{7}$)$_{2}$ crystallizes in the triclinic P$\\overline{1}$ space group. There are three inequivalent Na sites. In the first Na site, Na(1) is bonded to one O(2), one O(4), one O(6), one O(8), and one O(9) atom to form NaO$_{5}$ square pyramids that share  a cornercorner with one P(1)O$_{4}$ tetrahedra,  a cornercorner with one P(3)O$_{4}$ tetrahedra,  a cornercorner with one P(4)O$_{4}$ tetrahedra, and corners with two equivalent P(2)O$_{4}$ tetrahedra. The Na(1)-O(2) bond length is 2.39 $\\AA$. The Na(1)-O(4) bond length is 2.38 $\\AA$. The Na(1)-O(6) bond length is 2.30 $\\AA$. The Na(1)-O(8) bond length is 2.41 $\\AA$. The Na(1)-O(9) bond length is 2.32 $\\AA$. In the second Na site, Na(2) is bonded in a 5-coordinate geometry to one O(5), one O(8), one O(9), and two equivalent O(13) atoms. The Na(2)-O(5) bond length is 2.38 $\\AA$. The Na(2)-O(8) bond length is 2.31 $\\AA$. The Na(2)-O(9) bond length is 2.57 $\\AA$. There is one shorter (2.31 $\\AA$) and one lo