In [1]:
import pandas as pd
import os
import json
from tqdm import tqdm

def make_json(name, sequence, lig, msa=True):
    """Build the input dictionary for one protein + ligand job.

    The result is tagged with ``"dialect": "alphafold3"`` and ``"version": 1``
    and always uses a single model seed (``[1]``).

    Args:
        name: Job name stored under the ``"name"`` key.
        sequence: Protein sequence, stored as chain ``"A"``.
        lig: Ligand description stored under ``"smiles"`` as chain ``"B"``.
        msa: When falsy, the protein entry additionally receives empty
            ``modifications``, ``unpairedMsa``, ``pairedMsa`` and ``templates``
            fields. When truthy, those keys are left out entirely.
            NOTE(review): the empty fields are presumably meant to make the
            downstream run skip MSA/template search -- confirm against the
            AlphaFold 3 input documentation.

    Returns:
        A freshly built ``dict`` ready to be serialized as JSON.
    """
    protein_entry = {"id": "A", "sequence": sequence}
    if not msa:
        # Key order is kept stable so the serialized file layout does not change.
        protein_entry["modifications"] = []
        protein_entry["unpairedMsa"] = ""
        protein_entry["pairedMsa"] = ""
        protein_entry["templates"] = []

    ligand_entry = {"id": "B", "smiles": lig}

    return {
        "name": name,
        "sequences": [
            {"protein": protein_entry},
            {"ligand": ligand_entry},
        ],
        "modelSeeds": [1],
        "dialect": "alphafold3",
        "version": 1,
    }

def write_json(data, name):
    """Serialize ``data`` as JSON into the file at path ``name``.

    The file is opened in text write mode, so existing content is replaced.
    """
    with open(name, 'w') as handle:
        json.dump(obj=data, fp=handle)

def process(out_path, df, lig, col='lmpnn', msa=False, NUM_FOLDERS=1):
    """Write one JSON input file per row of ``df``, spread across sub-folders.

    The row at position ``i`` is turned into a dictionary by ``make_json`` and
    written to ``{out_path}/{i % NUM_FOLDERS}/{tag}.json``, so consecutive rows
    are distributed round-robin over ``NUM_FOLDERS`` directories.

    Args:
        out_path: Root output directory; sub-folders are created on demand.
        df: DataFrame providing the columns ``(col, 'tag')`` (used as job and
            file name) and ``(col, 'seq')`` (used as protein sequence).
        lig: Ligand description forwarded unchanged to ``make_json``.
        col: First-level column key under which ``'tag'`` and ``'seq'`` live.
        msa: Forwarded unchanged to ``make_json``.
        NUM_FOLDERS: Number of sub-folders to distribute files over (>= 1).

    Raises:
        ValueError: If ``NUM_FOLDERS`` is smaller than 1.
    """
    if NUM_FOLDERS < 1:
        # Previously 0 surfaced as a bare ZeroDivisionError and negative values
        # silently produced folders such as "-3".
        raise ValueError(
            f'NUM_FOLDERS must be a positive integer, got {NUM_FOLDERS!r}'
        )

    # Look the columns up once and walk them by position. Indexing a Series
    # with ``[i]`` is label-based, so it only worked when the caller had reset
    # the index to 0..n-1; positional iteration works for any index.
    tags = df[(col, 'tag')]
    seqs = df[(col, 'seq')]

    for i, (name, sequence) in tqdm(enumerate(zip(tags, seqs)), total=len(df)):
        data = make_json(name, sequence, lig, msa)
        folder_path = f'{out_path}/{i % NUM_FOLDERS}'
        # exist_ok=True already makes this a no-op for existing folders.
        os.makedirs(folder_path, exist_ok=True)
        write_json(data, f'{folder_path}/{name}.json')


In [2]:
# Batch names to export. The text before the first '_' selects the ligand key.
projects = ['pht_demo']
# Ligand key -> SMILES string handed to process() / make_json().
mappings = {'pht': 'C1=CC=C(C=C1)C2(C(=O)NC(=O)N2)C3=CC=CC=C3'}
df = pd.read_parquet('../4_rscore_filter/lmpnn_filt_pht.parquet')

# Export settings.
out_path, col_1 = 'json', 'lmpnn'
msa, NUM_FOLDERS = False, 4

for project in projects:
    # Keep only the rows of this batch; the index is reset to 0..n-1.
    batch_mask = df[('diffusion', 'batch')] == project
    df_ = df[batch_mask].reset_index(drop=True)
    lig_name = project.partition('_')[0]
    lig = mappings[lig_name]
    process(out_path, df_, lig, col=col_1, msa=msa, NUM_FOLDERS=NUM_FOLDERS)

100%|██████████| 36/36 [00:00<00:00, 1917.66it/s]
