This notebook screens the top 5000 CO2-adsorbing MOFs (as predicted by the native Uni-MOF model) for water stability using the fine-tuned WS24-Uni-MOF model. The "mostly freeze weighted" variant of the fine-tuned model is used.

## Import Packages

In [None]:
import os
import subprocess
import pandas as pd
from tqdm import tqdm

## Clone the GitHub repository locally (run only once, at the start of each session)

In [None]:
token = "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"
username = "emd-aquila"
repo = "Xc51-MOFs"

if not os.path.exists(repo):
    !git clone https://{username}:{token}@github.com/{username}/{repo}.git
%cd {repo}

!git config --global user.name "emd-aquila"
!git config --global user.email "emduggan@mit.edu"
!git pull

## Configure Initial Values

In [None]:
# --- Inference tool -------------------------------------------------------
# Fine-tuned checkpoint passed to the inference command via --model.
MODEL_PATH = "models/ws24_mostly_freeze_weighted.pt"  # Adjust path to model if needed
# Executable launched once per MOF to obtain a prediction.
UNIMOF_INFER_CMD = "unimof-infer"  # Update if you use a wrapper or Docker

# --- Inputs ---------------------------------------------------------------
# Table of MOFs to screen; it is read with pandas and must have a "MOF" column.
TOP_MOFS_CSV = "MOF_screening/5000_top_co2_adsorbing_mofs.csv"
# Folder searched for one "<MOF name>.cif" structure file per table row.
MOF_DIR = "MOF_screening/MOFX_CIFs"

# --- Output ---------------------------------------------------------------
# Destination of the predictions table; this file is also committed to git.
OUTPUT_CSV = "MOF_screening/5000_top_mofs_water_stability.csv"

## Load top MOFs

In [None]:
# Read the table of candidate MOFs into memory. Each row is screened in the
# prediction cell, which looks up the "MOF" column and, when present, the
# "CO2_Uptake_mmol/g" column.
top_mofs = pd.read_csv(filepath_or_buffer=TOP_MOFS_CSV)

# Report the row count as a quick sanity check on the input file.
print(f"Loaded {len(top_mofs)} MOFs from top CO2 adsorption file")



## Run water stability predictions and save results.


In [None]:
# Label stored when inference fails or produces no output. It is deliberately
# different from "U", which is one of the classes the model itself can return,
# so failed runs are never counted as genuine "U" predictions.
# NOTE(review): failures used to be written as "U"; confirm that consumers of
# the output CSV tolerate this separate label.
FAILED_LABEL = "ERROR"

# One dict per screened MOF; rows whose CIF file is missing are skipped.
results = []

for _, row in tqdm(top_mofs.iterrows(), total=len(top_mofs), desc="Screening water stability"):
    mof_name = row["MOF"]
    # The uptake column is optional; it is carried through to the output as-is.
    co2_uptake = row.get("CO2_Uptake_mmol/g", None)
    cif_path = os.path.join(MOF_DIR, f"{mof_name}.cif")

    if not os.path.exists(cif_path):
        print(f"[WARN] Missing CIF for {mof_name}, skipping")
        continue

    # Argument list (no shell involved), so unusual characters in file names
    # cannot be interpreted as shell syntax.
    cmd = [
        UNIMOF_INFER_CMD,
        "--structure", cif_path,
        "--model", MODEL_PATH,
        "--property", "water_stability",
    ]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        # The tool ran but exited non-zero; its stderr usually explains why.
        print(f"[ERROR] {mof_name}: {e}")
        if e.stderr:
            print(f"        stderr: {e.stderr.strip()}")
        predicted_stability = FAILED_LABEL
    except OSError as e:
        # The tool could not be started at all (e.g. executable not found).
        print(f"[ERROR] {mof_name}: {e}")
        predicted_stability = FAILED_LABEL
    else:
        # The whole of stdout is taken as the predicted label.
        # NOTE(review): assumes the tool prints only the label -- confirm.
        predicted_stability = result.stdout.strip()
        if not predicted_stability:
            print(f"[ERROR] {mof_name}: inference produced no output")
            predicted_stability = FAILED_LABEL

    results.append({
        "MOF": mof_name,
        "CO2_Uptake_mmol/g": co2_uptake,
        "Predicted_Water_Stability": predicted_stability,
    })

# Organize results by stability: lower rank sorts first.
stability_order = {"TS": 0, "HK": 1, "LK": 2, "U": 3}

# Naming the columns explicitly keeps the frame well-formed even when no MOF
# was screened (empty `results`), instead of failing with a KeyError below.
result_columns = ["MOF", "CO2_Uptake_mmol/g", "Predicted_Water_Stability"]
results_df = pd.DataFrame(results, columns=result_columns)

# Labels outside the four known classes get rank 4 and therefore sort last.
results_df["Stability_Rank"] = results_df["Predicted_Water_Stability"].map(stability_order).fillna(4)
# A stable sort preserves the input order of the MOFs within each class.
results_df = results_df.sort_values(by="Stability_Rank", kind="mergesort")

# Print how many MOFs fall in each stability category.
stability_counts = results_df["Predicted_Water_Stability"].value_counts()
print("\nWater Stability Classification Summary:")
for label in stability_order:
    print(f"  {label}: {stability_counts.get(label, 0)} MOFs")
# Also report any label that is not one of the known classes, so the counts
# in the summary always add up to the number of rows written.
for label, count in stability_counts.items():
    if label not in stability_order:
        print(f"  {label!r} (unrecognized): {count} MOFs")

# Save predicted water stability to CSV; the helper rank column is not exported.
results_df = results_df.drop(columns=["Stability_Rank"])
results_df.to_csv(OUTPUT_CSV, index=False)
print(f"Water stability predictions saved to {OUTPUT_CSV}")

# Add/commit/push the results file to GitHub.
try:
    subprocess.run(["git", "add", OUTPUT_CSV], check=True)

    # `git diff --cached --quiet` exits with 0 when nothing is staged. In that
    # case `git commit` would exit non-zero and be reported as an error, which
    # is what happens when the notebook is re-run with unchanged results.
    has_staged_changes = subprocess.run(["git", "diff", "--cached", "--quiet"]).returncode != 0
    if has_staged_changes:
        subprocess.run(["git", "commit", "-m", "Add water stability predictions for top 5000 MOFs"], check=True)
    else:
        print("[GIT] Nothing new to commit; skipping commit")

    # Push even without a new commit so earlier unpushed commits still go out.
    subprocess.run(["git", "push"], check=True)
    print("✅ Results pushed to GitHub under MOF_screening/")
except (subprocess.CalledProcessError, OSError) as e:
    # OSError covers the case where the git executable cannot be started.
    print(f"[GIT ERROR] Could not commit/push results: {e}")