## Data processing
EDBO 진행을 위한 데이터 전처리 과정입니다.
yh-qchem을 통해 계산된 결과를 Tag별로 조회하여 데이터셋을 만들고 csv 파일로 저장하는 과정입니다.

데이터셋 저장 조건
 - 'global' : 여러 conformer에 대해 Boltzmann average와 Lowest Energy Conformer 값을 저장 (solvent가 아닌 경우 Highest Energy Conformer, 산술 평균, 표준편차 값도 함께 저장)
 - 'min', 'max' : atom level descriptor.
 - 'transitions' : 모든 transition을 반영

In [1]:
### Set the TAG to query, the prefix to prepend to the dataset column names, and the output file name.
# NOTE(review): the star import presumably supplies both `descriptors` and `pd`
# used in later cells -- confirm against autoqchem.db_local_functions.
from autoqchem.db_local_functions import *

TAG = "GFTE_subs"
PREFIX = "substrate"   ## prefix for column name : {prefix}_{colnames}
out_file = './result/subs.csv'  ## May also point below a specific directory, e.g. ./result/project_name/sample.csv

In [2]:
# Query the 'global', 'min_max' and 'transitions' presets for the molecules
# carrying TAG, combining conformers with the 'boltzmann' option.
# NOTE(review): 'ALL' and the empty strings presumably disable the matching
# filters -- confirm against autoqchem's descriptors().
descriptor_query = dict(
    tags=[TAG],
    presets=['global', 'min_max', 'transitions'],
    conf_option='boltzmann',
    solvent='ALL',
    functional='ALL',
    basis_set='ALL',
    substructure="",
    smiles="",
)
data = descriptors(**descriptor_query)

# Show which tables came back (the recorded output lists 'global', 'min',
# 'max' and 'transitions').
data.keys()

dict_keys(['global', 'min', 'max', 'transitions'])

In [3]:
# Remove the X / Y / Z columns from both atom-level tables
# (presumably Cartesian coordinates -- confirm).
for extreme_key in ('min', 'max'):
    data[extreme_key].drop(columns=['X', 'Y', 'Z'], inplace=True)

# Text placed before / after every original column name, per table, so the
# columns stay distinguishable once the tables are merged side by side.
column_affixes = {
    'global': (PREFIX + '_', '_boltz'),
    'min': (PREFIX + '_min_', ''),
    'max': (PREFIX + '_max_', ''),
    'transitions': (PREFIX + '_', ''),
}
for table_key, (name_head, name_tail) in column_affixes.items():
    table = data[table_key]
    table.columns = [name_head + col_name + name_tail for col_name in table.columns]

# Conformer options and their long labels. The two lists appear to be parallel,
# i.e. 'max' is paired with 'Lowest Energy Conformer' and 'min' with
# 'Highest Energy Conformer'.
conf_options = ['boltzmann', 'max', 'min', 'mean', 'std', 'any']
conf_options_long = [
    'Boltzman Average',
    'Lowest Energy Conformer',
    'Highest Energy Conformer',
    'Arithmetic Average',
    'Standard Deviation',
    'Random',
]


In [4]:
### Skip this cell when processing a solvent.
# Maps each conf_option passed to descriptors() to the suffix appended to the
# resulting column names. The 'max' -> 'minE' / 'min' -> 'maxE' pairing looks
# swapped but matches conf_options_long, where 'max' is labelled
# 'Lowest Energy Conformer' and 'min' 'Highest Energy Conformer'.
# NOTE(review): presumably max/min refer to the Boltzmann weight rather than
# the energy -- confirm against autoqchem.
conf_dict = {'max': 'minE', 'min': 'maxE', 'mean': 'mean', 'std': 'std'}

# One 'global' descriptor query per conformer option, keyed by that option.
data_conf = {}
for conf, suffix in conf_dict.items():
    data_conf[conf] = descriptors(tags=[TAG],
                                  presets=['global'],
                                  conf_option=conf,
                                  solvent='ALL',
                                  functional='ALL',
                                  basis_set='ALL',
                                  substructure="",
                                  smiles="")
    # Rename to {PREFIX}_{column}_{suffix} so the tables can be merged
    # side by side without column-name clashes.
    cols = data_conf[conf]['global'].columns
    data_conf[conf]['global'].columns = [PREFIX + '_' + col_name + '_' + suffix for col_name in cols]

In [5]:
# Tables from the Boltzmann-weighted query, in output column order.
merge_frames = [data[table_key] for table_key in ('global', 'min', 'max', 'transitions')]

# Per-conformer-option 'global' tables. For a solvent, comment out the next statement.
merge_frames += [data_conf[option]['global'] for option in ('max', 'min', 'mean', 'std')]

# Place all tables side by side (axis=1), aligned on their index.
data_merge = pd.concat(merge_frames, axis=1)

In [6]:
# Display the merged table for a visual check before saving it.
data_merge

Unnamed: 0_level_0,substrate_E_boltz,substrate_ES_root_dipole_boltz,substrate_ES_root_electronic_spatial_extent_boltz,substrate_ES_root_molar_volume_boltz,substrate_E_scf_boltz,substrate_E_thermal_correction_boltz,substrate_E_zpe_boltz,substrate_G_boltz,substrate_G_thermal_correction_boltz,substrate_H_boltz,...,substrate_electronegativity_std,substrate_electronic_spatial_extent_std,substrate_hardness_std,substrate_homo_energy_std,substrate_lumo_energy_std,substrate_molar_mass_std,substrate_molar_volume_std,substrate_multiplicity_std,substrate_number_of_atoms_std,substrate_zero_point_correction_std
can,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
FC(C1=COCCN1CC2=C(C=CC=C3)C3=CC=C2)(F)F,-1046.26188,10.307235,5920.698827,2139.663989,-1046.536332,0.293724,-1046.278409,-1046.32372,0.231884,-1046.260936,...,0.004722,577.171644,0.002292,0.0068,0.002979,0.0,310.753389,0.0,0.0,0.00039


In [7]:
import os

# Create the destination folder first: to_csv does not create missing
# directories, and out_file may point to a nested path.
out_dir = os.path.dirname(out_file)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
data_merge.to_csv(out_file)

### Once every cell has run, the output file exists at the out_file path.
### Repeat this process for every material (ligand, solvent, ...) and then run 1_EDBO_process.ipynb