In [5]:
import numpy as np
import pandas as pd
import glob
import json
import os
from pathlib import Path
from rdkit import Chem
from tqdm.auto import tqdm

In [2]:
# From https://stackoverflow.com/questions/38250710/how-to-split-data-into-3-sets-train-validation-and-test
def train_validate_test_split(df, train_percent=.6, validate_percent=.2, seed=None):
    """Randomly partition the rows of ``df`` into train, validation and test frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. Rows are selected by position, so any index is
        accepted (non-default, non-integer or containing duplicates).
    train_percent : float
        Fraction of rows (between 0 and 1) assigned to the training frame.
        The row count is ``int(train_percent * len(df))``, i.e. truncated.
    validate_percent : float
        Fraction of rows (between 0 and 1) assigned to the validation frame,
        truncated the same way.
    seed : int or None
        Value passed to ``np.random.seed`` before shuffling. ``None`` lets
        NumPy reseed itself, giving a different split on each call.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, validate, test)``. ``test`` receives every row not placed
        in the first two frames, so the three frames always cover ``df``
        exactly once.

    Raises
    ------
    ValueError
        If a fraction is negative or the two fractions add up to more than 1.
    """
    if train_percent < 0 or validate_percent < 0:
        raise ValueError('train_percent and validate_percent must be non-negative')
    # Small tolerance so that sums such as 0.7 + 0.3 are not rejected because
    # of floating-point rounding.
    if train_percent + validate_percent > 1 + 1e-9:
        raise ValueError('train_percent + validate_percent must not exceed 1')

    # Seeding the global NumPy generator is kept on purpose: it is what the
    # original recipe did, so splits produced from stored seeds stay the same.
    np.random.seed(seed)

    m = len(df)
    # Shuffle row *positions* rather than index labels. The result is fed to
    # ``.iloc``, which is positional; shuffling ``df.index`` only worked when
    # the index happened to be exactly 0..m-1.
    perm = np.random.permutation(m)

    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end

    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

In [3]:
# Parse the seed file in one step; the parsed object is indexed by assay
# name further down.
random_seeds = json.loads(Path('MF_PCBA_random_seeds.json').read_text())

In [8]:
# Assay identifier to process and the root folder that receives the splits.
NAME = '504329'
SAVE_PATH = 'train_val_test_splits'

# Table for this assay and the list of seeds stored for it.
df = pd.read_csv(f'retrieved/AID{NAME}/SD.csv')
seeds = random_seeds[NAME]

# One 80/10/10 split per seed, written to <SAVE_PATH>/<NAME>/<seed position>/.
for j, s in enumerate(seeds):
    train, validate, test = train_validate_test_split(
        df,
        train_percent=.8,
        validate_percent=.1,
        seed=s,
    )

    save_dirpath = os.path.join(SAVE_PATH, NAME, str(j))
    Path(save_dirpath).mkdir(parents=True, exist_ok=True)

    # Write the three frames in a fixed order, without the index column.
    for split_name, split_df in (('train', train), ('validate', validate), ('test', test)):
        split_df.to_csv(f'{save_dirpath}/{split_name}.csv', index=False)