In [None]:
import pandas as pd
import shutil
import tqdm
import os
import re

In [None]:
# Pathogen identifiers to package; each one maps to a "<pathogen>_organism" data folder.
PATHOGENS = ["abaumannii"]

def clean_string_regex(input_string):
    """Return ``input_string`` with every character outside ``[a-zA-Z0-9_]`` removed.

    Only ASCII letters, ASCII digits and the underscore survive; everything
    else (spaces, punctuation, accented letters, ...) is dropped, not replaced.
    """
    disallowed = re.compile(r'[^a-zA-Z0-9_]')
    return disallowed.sub('', input_string)

for pathogen in tqdm.tqdm(PATHOGENS[:1]):

    # Define path to outputs and create the folder skeleton.
    # NOTE: PATH_TO_OUTPUT is also read by later cells of this notebook.
    PATH_TO_OUTPUT = f"/home/acomajuncosa/Documents/model_{pathogen}/"  # Modify as needed
    for subdir in ('checkpoints', 'framework/code', 'framework/columns', 'framework/examples'):
        os.makedirs(os.path.join(PATH_TO_OUTPUT, subdir), exist_ok=True)

    # Define path to data; every entry of that folder is treated as one task
    PATH_TO_DATA = os.path.join("..", "output", "03_baseline_models", f"{pathogen}_organism")
    tasks = sorted(os.listdir(PATH_TO_DATA))

    # Copy summary report
    PATH_TO_CAMT = f"/home/acomajuncosa/Documents/chembl-antimicrobial-tasks/output/{pathogen}_organism/018_selected_tasks_FINAL.csv"
    shutil.copyfile(PATH_TO_CAMT, os.path.join(PATH_TO_OUTPUT, "checkpoints", "018_selected_tasks_FINAL.csv"))

    # Create run_columns.csv file (one row per task)
    run_columns = pd.DataFrame({
        "name": tasks,
        "type": ['float'] * len(tasks),
        "direction": ['high'] * len(tasks),
        # Caution with %s and special characters: the task name is sanitised before use
        "description": ["Predicted probability of being active according to task " + clean_string_regex(i) for i in tasks]
    })
    run_columns.to_csv(os.path.join(PATH_TO_OUTPUT, "framework", "columns", "run_columns.csv"), index=False)

    # Create run_input.csv file with three example molecules.
    # The first SMILES contains backslashes (bond stereo markers), so it is written
    # as a raw string: "\C" in a normal literal is an invalid escape sequence and
    # triggers a SyntaxWarning on recent Python versions.
    run_input = pd.DataFrame({
        "smiles": [
            r"COc1ccc(\C=C\C(O)=O)cc1",
            "C[C@H](N[C@@H](CCc1ccccc1)C(O)=O)C(=O)N1CCC[C@H]1C(O)=O",
            "Cc1ccc(cc1C)N1CCN(Cc2nc3ccccc3[nH]2)CC1",
        ]
    })
    run_input.to_csv(os.path.join(PATH_TO_OUTPUT, "framework", "examples", "run_input.csv"), index=False)

    # Create empty main.py file
    with open(os.path.join(PATH_TO_OUTPUT, "framework", "code", "main.py"), 'w') as f:
        f.write("# This is a placeholder for the main.py file.\n")

    # Create the run.sh file
    with open(os.path.join(PATH_TO_OUTPUT, "framework", "run.sh"), 'w') as f:
        f.write("python $1/code/main.py $2 $3\n")

    # For each task
    for task in tasks:

        # Copy the RF model; the checkpoint file name uses the sanitised task name
        shutil.copyfile(os.path.join(PATH_TO_DATA, task, "RF.joblib"),
                        os.path.join(PATH_TO_OUTPUT, "checkpoints", clean_string_regex(task) + "_RF.joblib"))
        

In [None]:
### CONTENTS FOR THE MAIN.PY FILE ###

In [None]:
# imports
import os
import csv
import sys
import tqdm
import joblib
import numpy as np
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator

# In the final main.py, root is the directory of the script itself:
# root = os.path.abspath(os.path.dirname(__file__))
# sys.path.append(root)
# In the notebook, point at the generated framework/code folder instead.
root = os.path.join(PATH_TO_OUTPUT, "framework", "code")

# Checkpoints folder and task list, both resolved relative to root
checkpoints_dir = os.path.join(root, "..", "..", "checkpoints")
tasks = pd.read_csv(os.path.join(root, "..", "columns", "run_columns.csv"))['name'].tolist()

# parse arguments (used in the final main.py)
# input_file = sys.argv[1]
# output_file = sys.argv[2]

# In the notebook, use the example files instead of command-line arguments
input_file = os.path.join(root, "..", "examples", "run_input.csv")
output_file = os.path.join(root, "..", "examples", "run_output.csv")

# Read smiles: skip the header row and keep the first column of every row.
# newline="" lets the csv module handle line endings itself.
with open(input_file, "r", newline="") as f:
    reader = csv.reader(f)
    next(reader)
    smiles = [r[0] for r in reader]

# Get Morgan count fingerprints (radius 3, 2048 positions)
X = []
mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=3, fpSize=2048)
for smi in smiles:
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # MolFromSmiles returns None when parsing fails; stop with a clear
        # message instead of an opaque error from the fingerprint generator.
        raise ValueError(f"Could not parse SMILES: {smi!r}")
    mfp = mfpgen.GetCountFingerprint(mol)
    X.append(mfp.ToList())

# Convert to numpy array
X = np.array(X, dtype=np.int16)

# Create output DataFrame
OUTPUT = pd.DataFrame({"smiles": smiles})  # We will remove this column later

In [None]:
for task in tasks:

    # Load the model. Checkpoints were written as "<sanitised task name>_RF.joblib",
    # so the same sanitisation is applied here; using the raw task name would miss
    # the file whenever the name contains characters outside [a-zA-Z0-9_].
    # NOTE: joblib.load unpickles the file; only load checkpoints from a trusted source.
    model = joblib.load(os.path.join(checkpoints_dir, clean_string_regex(task) + "_RF.joblib"))

    # Save predictions: keep column 1 of predict_proba
    # (presumably the "active" class; confirm against the trained models)
    preds = model.predict_proba(X)[:, 1]
    # fitting power transformation?
    OUTPUT[task] = np.asarray(preds, dtype=float)

In [None]:
# Global score
