In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import os
import glob
from tqdm.auto import tqdm
from rdkit import Chem

In [23]:
# Adds dataset-level molecule statistics (largest atomic number, largest atom count) as columns for machine learning

def compute_max_atomic_num(df):
    """Annotate ``df`` with dataset-wide molecule size statistics.

    Every entry of the ``'neut-smiles'`` column is parsed with RDKit and two
    columns are written onto ``df``. Each column holds a single scalar that is
    repeated on every row:

    * ``'Largest atomic number'`` -- the highest atomic number found among all
      atoms of all molecules in the frame.
    * ``'# atoms'`` -- the atom count of the largest molecule in the frame.
      NOTE(review): despite the column name this is the maximum over all
      molecules, not a per-row atom count -- confirm this is what downstream
      consumers expect.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'neut-smiles'`` column. It is modified in place, so pass
        an independent frame (not a filtered view of another one) to avoid
        pandas' SettingWithCopyWarning.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the two columns added or overwritten. For a
        frame with no rows the columns are still created (and are empty).

    Raises
    ------
    ValueError
        If any entry is not a string or cannot be parsed as SMILES.
    """
    smiles = df['neut-smiles'].values

    largest_atomic_nums = []
    atom_counts = []
    unparsable = []

    for s in smiles:
        # MolFromSmiles signals a parse failure by returning None rather than
        # raising; non-string entries (e.g. NaN) are treated as failures too.
        mol = Chem.MolFromSmiles(s) if isinstance(s, str) else None
        if mol is None:
            unparsable.append(s)
            continue

        atomic_nums = [a.GetAtomicNum() for a in mol.GetAtoms()]
        # default=0 covers a molecule without atoms (e.g. an empty SMILES).
        largest_atomic_nums.append(max(atomic_nums, default=0))
        atom_counts.append(len(atomic_nums))

    if unparsable:
        preview = ', '.join(repr(s) for s in unparsable[:5])
        raise ValueError(
            f"{len(unparsable)} entr{'y' if len(unparsable) == 1 else 'ies'} in "
            f"'neut-smiles' could not be parsed as SMILES (first few: {preview})"
        )

    # default=0 keeps an empty frame from raising; the scalar is broadcast to
    # every row (zero rows in that case).
    df['Largest atomic number'] = max(largest_atomic_nums, default=0)
    df['# atoms'] = max(atom_counts, default=0)

    return df

In [2]:
# glob returns matches in arbitrary (filesystem-dependent) order; sort them so
# datasets are processed in the same order on every run and machine.
paths = sorted(glob.glob('retrieved/*'))

In [18]:
# Sanity check: how many entries the 'retrieved/*' glob matched.
len(paths)

60

In [None]:
# Root directory for the processed copies; one sub-directory per dataset.
OUT_DIR = 'retrieved_default_pXC50'

for p in tqdm(paths):
    # Last path component, independent of the OS separator or a trailing slash.
    ds_name = Path(p).name

    if '-' not in ds_name:
        # Names without '-' are read from SD.csv; only rows with a DR value are kept.
        df = pd.read_csv(os.path.join(p, 'SD.csv'))
        # .copy() gives an independent frame, so the column assignments done in
        # compute_max_atomic_num do not trigger SettingWithCopyWarning.
        df = df[df['DR'].notna()].copy()
    else:
        df = pd.read_csv(os.path.join(p, 'DR.csv'))

        dr_missing = df['DR'].isna()
        if dr_missing.all():
            # No measured value exists to derive the default from.
            raise ValueError(
                f"Dataset {ds_name!r} has no non-missing 'DR' value to use as default"
            )

        # Record which rows carry a measured value before the gaps are filled.
        df['DR determination'] = np.where(dr_missing, 'default', 'experimental')

        # Missing DR values default to the smallest observed DR of this dataset
        # (Series.min skips NaN).
        df['DR'] = df['DR'].fillna(df['DR'].min())

    df = compute_max_atomic_num(df)

    # Create the output directory only once there is something to write, so a
    # failed dataset does not leave an empty directory behind.
    out_dir = os.path.join(OUT_DIR, ds_name)
    Path(out_dir).mkdir(exist_ok=True, parents=True)
    df.to_csv(os.path.join(out_dir, 'DR.csv'), index=False)