In [None]:
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem.rdDistGeom import ETKDGv3, EmbedMolecule
from rdkit.Chem.rdForceFieldHelpers import MMFFOptimizeMolecule
import pandas as pd
import time
import timeout_decorator
from timeout_decorator import timeout, TimeoutError

import pandas as pd
from rdkit import Chem
from rdkit.Chem import PandasTools

# Load the dataset.
original_dataset = pd.read_csv("transvae_SMILES_Inhibitor1368_mose_data_tanimoto_similarity.csv")

# Drop rows without a SMILES string *before* casting to str.
# The previous `.astype(str).dropna()` could never remove anything: astype(str)
# turns NaN into the literal text 'nan', and assigning the result back to the
# column re-aligns on the index, so the row count stayed the same.
original_dataset = original_dataset.dropna(subset=['raw_smiles']).copy()
original_dataset['raw_smiles'] = original_dataset['raw_smiles'].astype(str)

# Add the RDKit molecule column (read back below as `ROMol`).
PandasTools.AddMoleculeColumnToFrame(frame=original_dataset, smilesCol='raw_smiles')

# Remove rows whose SMILES could not be parsed (ROMol is null).
# .copy() gives an independent frame so the 'MOL' flag column added below is
# not written onto a slice of the previous frame (SettingWithCopyWarning).
original_dataset = original_dataset[original_dataset.ROMol.notnull()].copy()

# original_dataset now only contains rows with a valid molecule object.
print(original_dataset.head())


# Flag and drop unreadable molecules. After the filter above this is a
# safety net; the 'MOL' column and `edited_dataset` are kept because later
# code reads them.
original_dataset['MOL'] = original_dataset.ROMol.map(lambda x: x is not None)
del_index = original_dataset[original_dataset.MOL == False].index
edited_dataset = original_dataset.drop(del_index)

# Resume support: skip every row whose index label is below last_data_ID.
# errors='ignore' because some of those labels may already have been removed
# by the filtering above, which would otherwise raise KeyError.
last_data_ID = 0
drop_list = list(range(0, last_data_ID))
calc_datasets = edited_dataset.drop(drop_list, errors='ignore')

# Number of samples that will be processed.
data_num = len(calc_datasets.index)
print('样本数为 {}'.format(data_num))
import os

# Folder that receives one <id>.mol file per molecule; created up front so
# Chem.MolToMolFile does not fail on a missing directory.
out_dir = "transvae/raw_SMILES"
os.makedirs(out_dir, exist_ok=True)

# Successfully embedded molecules (with explicit hydrogens and 3D coordinates).
mols = []
for ID, smile in zip(calc_datasets['id'], calc_datasets['raw_smiles']):
    # SMILES -> Mol object; skip entries RDKit cannot parse instead of
    # crashing in AddHs(None).
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        print(f"Skipping {ID}: SMILES could not be parsed")
        continue
    mol = Chem.AddHs(mol)  # explicit hydrogens are needed for a sensible 3D geometry

    params = ETKDGv3()
    # The attribute is spelled `randomSeed`; the former lower-case
    # `randomseed` is not the parameter RDKit reads, so the embedding was
    # not actually seeded.
    params.randomSeed = 1

    # EmbedMolecule returns -1 when no conformer could be generated; the
    # molecule then has no coordinates and MMFF optimisation would raise.
    if EmbedMolecule(mol, params) == -1:
        print(f"Skipping {ID}: 3D embedding failed")
        continue

    # Rough geometry clean-up with MMFF (Merck Molecular Force Field).
    # A return value of -1 means the force field could not be set up for this
    # molecule; the un-optimised ETKDG geometry is still written in that case.
    if MMFFOptimizeMolecule(mol) == -1:
        print(f"Warning for {ID}: MMFF could not be set up, saving un-optimised geometry")

    path = out_dir + "/" + str(ID) + ".mol"
    Chem.MolToMolFile(mol, path)
    mols.append(mol)

In [None]:
import psi4
# Folder holding the <i>.gjf inputs; Psi4 logs and the result table are written
# next to them. Named `work_dir` (not `dir`) so the builtin dir() is not
# shadowed for the rest of the kernel session.
work_dir = "Inhibitor1368_mose_data/gjf/rnnattn/reconstruct_smiles/"
calfile = work_dir + 'caluresults.txt'

# The basis set is the same for every molecule, so it is set once.
# NOTE: 6-31G* has no definition for some heavy elements (the recorded run
# failed with BasisSetNotFound for iodine); such molecules are reported by the
# except-branch below and skipped.
psi4.set_options({'basis': '6-31G*'})

for i in range(60, 71):
    try:
        # Per-molecule Psi4 log file (False = overwrite instead of append).
        outdatfile = work_dir + str(i) + '_output.dat'
        inputfile = work_dir + str(i) + '.gjf'
        psi4.core.set_output_file(outdatfile, False)

        # Read the structure file and keep everything from the 5th line on
        # (index 4), i.e. the header lines before it are discarded.
        # NOTE(review): assumes the .gjf header is exactly four lines - confirm.
        with open(inputfile, 'r') as file:
            lines = file.readlines()
        # readlines() keeps the line endings, so join with '' - joining with
        # '\n' inserted an empty line after every atom record.
        molecule_structure = ''.join(lines[4:])

        # Build the molecule object.
        molecule = psi4.geometry(molecule_structure)

        # Single-point energy calculation; the wavefunction is needed for the
        # orbital energies.
        scf_e, wfn = psi4.energy('B3LYP', return_wfn=True, molecule=molecule)

        # Alpha orbital energies; with nalpha occupied alpha orbitals the HOMO
        # is at index nalpha-1 and the LUMO at index nalpha.
        orbital_energies = wfn.epsilon_a_subset('AO', 'ALL').to_array()
        homo_energy = orbital_energies[wfn.nalpha() - 1]  # HOMO
        lumo_energy = orbital_energies[wfn.nalpha()]      # LUMO
        gap = lumo_energy - homo_energy  # HOMO-LUMO gap

        print(f"HOMO Energy: {homo_energy} a.u.")
        print(f"LUMO Energy: {lumo_energy} a.u.")
        print(f"Energy Gap: {gap} a.u.")

        # Append one tab-separated row per molecule.
        # NOTE(review): the first column is the literal text "filename", not
        # the input file name - confirm this is intended.
        with open(calfile, "a+") as f:
            f.write("\t".join(("filename", str(i), str(homo_energy), str(lumo_energy), str(gap))) + '\n')
    except Exception as e:
        # Best effort: report the failure and carry on with the next molecule.
        print(f"An error occurred in iteration {i}: {e}")
    finally:
        # Release Psi4 scratch files so they do not pile up across iterations.
        psi4.core.clean()


HOMO Energy: -0.20030964896190065 a.u.
LUMO Energy: -0.027514132259085312 a.u.
Energy Gap: 0.17279551670281534 a.u.
HOMO Energy: -0.2313476164521256 a.u.
LUMO Energy: -0.0005015593974529893 a.u.
Energy Gap: 0.2308460570546726 a.u.

QcdbException BasisSetNotFound: BasisSet::construct: Unable to find a basis set for atom 13 for key BASIS among:
  Shell Entries: ['I']
  Basis Sets: [('6-31G*', '6-31G*', None)]
  File Path: C:\Users\DELL\Desktop\TransVAE-important-master, C:\Users\DELL\Desktop\TransVAE-important-master, D:\tools\anaconda3\envs\my-rdkit\Library\share\psi4\basis
  Input Blocks: 



An error occurred in iteration 62: BasisSet::construct: Unable to find a basis set for atom 13 for key BASIS among:
  Shell Entries: ['I']
  Basis Sets: [('6-31G*', '6-31G*', None)]
  File Path: C:\Users\DELL\Desktop\TransVAE-important-master, C:\Users\DELL\Desktop\TransVAE-important-master, D:\tools\anaconda3\envs\my-rdkit\Library\share\psi4\basis
  Input Blocks: 

An error occurred in iteration 

In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors

# Result table to post-process: tab-separated rows of
# (filename, id, HOMO energy, LUMO energy, gap).
file_path = 'Inhibitor1368_mose_data/gjf/transvae/raw_SMILES/caluresults.txt'


def _looks_like_data_row(fields):
    """Return True when columns 3-5 of `fields` all parse as floats."""
    if len(fields) < 5:
        return False
    try:
        for value in fields[2:5]:
            float(value)
    except ValueError:
        return False
    return True


# Read the table.
data = []
header = None
with open(file_path, 'r') as file:
    for line_no, line in enumerate(file, start=1):
        stripped = line.strip()
        if not stripped:
            continue  # tolerate blank lines instead of failing on the unpack
        fields = stripped.split('\t')

        if header is None:
            if _looks_like_data_row(fields):
                # The file starts directly with a result row (no header line).
                # Use fallback column names and fall through so this row is
                # parsed as data rather than silently consumed as the header.
                # NOTE(review): fallback names are a guess - adjust if a
                # downstream reader expects different ones.
                header = ['filename', 'id', 'HOMO', 'LUMO', 'Gap']
            else:
                # Keep only the five base columns so that re-running this cell
                # does not append 'Hardness'/'Softness' a second time.
                header = fields[:5]
                continue

        if len(fields) < 5:
            raise ValueError(
                f"{file_path}, line {line_no}: expected at least 5 tab-separated fields, got {len(fields)}"
            )
        # Only the first five fields are read; hardness/softness columns left
        # by a previous run are recomputed below, which makes the cell
        # idempotent.
        filename, mol_id, homo_energy, lumo_energy, gap = fields[:5]
        data.append((filename, mol_id, float(homo_energy), float(lumo_energy), float(gap)))

if header is None:
    header = []  # empty file: same header value the original code produced

# Compute hardness (gap / 2) and softness (1 / gap) and extend every row.
for i, (filename, mol_id, homo_energy, lumo_energy, gap) in enumerate(data):
    hardness = gap / 2
    # A zero gap would raise ZeroDivisionError; report it as infinite softness.
    softness = 1 / gap if gap != 0 else float('inf')
    data[i] = (filename, mol_id, homo_energy, lumo_energy, gap, hardness, softness)

# Write the extended table back to the same file.
with open(file_path, 'w') as file:
    # Header line with the two new column names.
    file.write('\t'.join(header + ['Hardness', 'Softness']) + '\n')
    for row in data:
        file.write('\t'.join(map(str, row)) + '\n')

