In [1]:
import pandas as pd
import os
from tqdm import tqdm
import yaml

def make_yaml(seq, lig, msa=False):
    """Build the input dictionary for one protein/ligand pair.

    The result has two top-level keys:

    * ``"sequences"`` - a protein entry (chain id ``"A"``, sequence ``seq``)
      followed by a ligand entry (chain id ``"B"``, SMILES ``lig``).
    * ``"properties"`` - a single ``"affinity"`` request naming chain ``"B"``
      as the binder.

    Args:
        seq: Protein sequence stored under ``protein.sequence``.
        lig: Ligand SMILES string stored under ``ligand.smiles``.
        msa: When falsy, the protein entry additionally gets
            ``"msa": "empty"``; when truthy, no ``"msa"`` key is written.

    Returns:
        dict: A freshly built, nested dict ready to be dumped as YAML.

    NOTE(review): the layout looks like the Boltz prediction-input schema,
    where ``msa: empty`` requests single-sequence mode - confirm against the
    tool that consumes these files.
    """
    protein_entry = {"id": "A", "sequence": seq}
    if not msa:
        # Explicit marker meaning "no alignment supplied" for this chain.
        protein_entry["msa"] = "empty"

    ligand_entry = {"id": "B", "smiles": lig}
    affinity_request = {"binder": "B"}

    return {
        "sequences": [
            {"protein": protein_entry},
            {"ligand": ligand_entry},
        ],
        "properties": [
            {"affinity": affinity_request},
        ],
    }

def write_yaml(data, name):
    """Dump ``data`` as YAML into the file at ``name``.

    The file is opened in text mode with UTF-8 encoding and truncated if it
    already exists. ``allow_unicode=True`` makes the dumper emit non-ASCII
    characters as-is instead of escaping them.

    Args:
        data: Object to serialize (here, the dict produced by ``make_yaml``).
        name: Destination path; it is formatted to a string before opening.
    """
    target = f"{name}"
    with open(target, mode='w', encoding='utf-8') as handle:
        yaml.dump(data, stream=handle, allow_unicode=True)

def process(out_path, df, lig, col='lmpnn', msa=False, NUM_FOLDERS=1):
    """Write one YAML input file per row of ``df``.

    For each row, the design name is read from column ``(col, 'tag')`` and the
    protein sequence from column ``(col, 'seq')``. The YAML payload is built
    with ``make_yaml`` and written to
    ``{out_path}/{row_position % NUM_FOLDERS}/{name}.yaml``, so files are
    distributed round-robin over ``NUM_FOLDERS`` numbered subfolders.

    Rows are visited by position, so the frame does not need a clean
    ``0..n-1`` index (a filtered frame without ``reset_index`` works too).

    Args:
        out_path: Root output directory; subfolders are created on demand.
        df: DataFrame providing the ``(col, 'tag')`` and ``(col, 'seq')``
            columns.
        lig: Ligand SMILES string forwarded to ``make_yaml``.
        col: First element of the column keys to read from.
        msa: Forwarded to ``make_yaml``.
        NUM_FOLDERS: Number of subfolders to spread the files over (>= 1).

    Raises:
        ValueError: If ``NUM_FOLDERS`` is smaller than 1.
        KeyError: If either expected column is missing from ``df``.
    """
    if NUM_FOLDERS < 1:
        raise ValueError(f"NUM_FOLDERS must be >= 1, got {NUM_FOLDERS}")

    # Pull both columns once; zipping them walks the rows positionally and
    # avoids a label-based lookup by loop counter.
    tags = df[(col, 'tag')]
    seqs = df[(col, 'seq')]

    created_folders = set()
    rows = tqdm(zip(tags, seqs), total=len(df))
    for i, (name, sequence) in enumerate(rows):
        data = make_yaml(sequence, lig, msa)
        folder_path = f'{out_path}/{i % NUM_FOLDERS}'
        # Create each subfolder at most once, and only if a file lands in it.
        if folder_path not in created_folders:
            os.makedirs(folder_path, exist_ok=True)
            created_folders.add(folder_path)
        write_yaml(data, f'{folder_path}/{name}.yaml')


In [2]:
# Driver cell: for every project batch, write one YAML input file per design
# row found in the filtered parquet table.

# Batch names to export. The text before the first '_' of each name is used
# below as the key into `mappings`.
projects = ['pht_demo']
# Ligand key -> SMILES string handed to process() as the ligand.
# NOTE(review): 'pht' presumably stands for phenytoin - confirm.
mappings = {
    'pht': 'C1=CC=C(C=C1)C2(C(=O)NC(=O)N2)C3=CC=CC=C3',
    }
# Input table; the path is relative to the notebook's working directory.
df = pd.read_parquet('../4_rscore_filter/lmpnn_filt_pht.parquet')
# Root output directory; process() creates numbered subfolders inside it.
out_path = 'yaml'
# First level of the column keys read by process(): (col_1, 'tag') / (col_1, 'seq').
col_1 = 'lmpnn'
# False -> make_yaml adds msa: "empty" to the protein entry.
msa = False
# Number of subfolders the YAML files are spread over (round-robin).
NUM_FOLDERS = 1
for project in projects:
    # Keep only this batch's rows and renumber them 0..n-1.
    df_ = df[df[('diffusion', 'batch')] == project].reset_index(drop=True)
    # Ligand key is the part of the batch name before the first underscore.
    lig_name = project.split('_')[0]
    # Raises KeyError if the ligand key has no entry in `mappings`.
    lig = mappings[lig_name]
    process(out_path, df_, lig, col=col_1, msa=msa, NUM_FOLDERS=NUM_FOLDERS)

100%|██████████| 36/36 [00:00<00:00, 1209.52it/s]
