Copyright (c) 2023 Graphcore Ltd. All rights reserved.

# DFT dataset generation using PySCF IPU

## Dependencies and configuration

Install the experimental JAX build for IPU (and its add-ons).

In [None]:
import os

# Relative paths used below are resolved from the root directory of the repo:
# when the current directory holds notebook files, step one level up.
if any(entry.endswith(".ipynb") for entry in os.listdir()):
    os.chdir(os.path.join(os.getcwd(), ".."))
print("Working directory:", os.getcwd())

Install `pyscf-ipu`:

In [None]:
# Editable install of the package together with its `ipu` extras.
# NOTE(review): the `..` path is relative to the current working directory,
# which the first cell may already have moved one level up -- confirm that
# `..` still points at the directory containing the package.
%pip install -e "..[ipu]"

# Download and preprocess the GDB11 dataset

In [None]:
# Download and extract GDB11 dataset.
# The archive is fetched from Zenodo into ./gdb/gdb11.tgz, then unpacked
# into ./gdb/.
# NOTE(review): assumes the ./gdb/ directory already exists -- confirm.
!wget -p -O ./gdb/gdb11.tgz https://zenodo.org/record/5172018/files/gdb11.tgz\?download\=1
!tar -xvf ./gdb/gdb11.tgz --directory ./gdb/

In [None]:
import gdb.sortgdb as sortgdb

# Input: the size-9 slice of GDB11. Output: a CSV file next to it, with the
# `.smi` extension swapped for `_sorted.csv`.
gdb_filename = "./gdb/gdb11_size09.smi"
out_filename = gdb_filename.replace(".smi", "_sorted.csv")

# Filter & sort the molecules (keeping only those with 9 atoms), then write
# the result without index column and without header row.
gdb_sorted = sortgdb.sort_gdb(gdb_filename, keep_only_atoms_count=9)
gdb_sorted.to_csv(out_filename, header=False, index=False)

In [None]:
import subprocess
import sys
import time

import numpy as np
from dataclasses import dataclass

In [None]:
# PySCF IPU setup: use a single device per process.
os.environ["JAX_IPU_DEVICE_COUNT"] = "1"
# JAX/XLA IPU compilation cache, stored under /tmp/ipu-ef-cache.
os.environ['TF_POPLAR_FLAGS'] = """
  --executable_cache_path=/tmp/ipu-ef-cache
"""

# NOTE(review): both variables are set before the imports below, presumably so
# that they are picked up when JAX is first imported -- confirm.
# First import of JAX and TessellateIPU may take a few minutes...
import jax
import tessellate_ipu

# Create a DFT dataset using PySCF IPU

In just a couple of Python lines, we can launch a background process building a DFT dataset using PySCF IPU.
In the following example, we use only a single IPU. Multiple IPUs can be used by simply launching a collection of PySCF IPU processes instead of a single one.

In [None]:
# Some PySCF parameters.
# Both values are read as globals by `launch_dft_process`.

# Number of conformers per molecule (forwarded as `-num_conformers`).
num_conformers = 1000
# Dataset name (forwarded as `-fname`; also names the output folder under
# ./data/generated/).
dataset_name = "notebook_dataset"

In [None]:
@dataclass(frozen=True)
class DFTProcess:
    """Handle on a background process generating a DFT dataset.

    The wrapped process is killed (and reaped) when the handle is garbage
    collected, so dropping the last reference stops the dataset generation.
    """
    # Underlying process.
    process: subprocess.Popen
    # Path of the dataset generated.
    path: str

    @property
    def pid(self) -> int:
        """PID of the underlying process."""
        return self.process.pid

    def is_running(self) -> bool:
        """Return True while the underlying process has not exited."""
        return self.process.poll() is None

    def __del__(self):
        # `process` can be missing when construction failed part-way; a process
        # that already exited needs neither a kill nor a misleading message.
        process = getattr(self, "process", None)
        if process is None or process.poll() is not None:
            return
        print("Killing DFT dataset process with PID:", process.pid)
        process.kill()
        # Reap the child so that it does not linger as a zombie.
        process.wait()

        
def launch_dft_process() -> DFTProcess:
    """Launch an external PySCF IPU process building a DFT molecular dataset.

    Runs `density_functional_theory.py` as a background process, configured
    from the notebook-level `dataset_name` and `num_conformers` variables, and
    blocks until the process has created a new entry under
    `./data/generated/<dataset_name>/`.

    Returns:
        A `DFTProcess` wrapping the running process, with `path` pointing at
        the `data.csv` file inside the newly created dataset directory.

    Raises:
        RuntimeError: If the process has exited by the time the wait ends.
    """
    # Stays None until the process is started, so that the error handler never
    # touches an unbound name (e.g. when creating the root directory fails).
    raw_process = None
    try:
        # Make sure the root directory exists.
        rootpath = f"./data/generated/{dataset_name}/"
        os.makedirs(rootpath, exist_ok=True)
        # Snapshot of the existing entries, used to recognise the new one.
        existing = set(os.listdir(rootpath))

        def new_entries():
            return set(os.listdir(rootpath)) - existing

        # Launch DFT process...
        command_line = [
            "python", "density_functional_theory.py",
            "-generate",
            "-save",
            "-fname", dataset_name,
            "-level", "0",
            "-plevel", "0",
            "-num_conformers", str(num_conformers),
            "-gdb", "9",
            "-backend", "ipu",
            "-float32",
        ]
        raw_process = subprocess.Popen(command_line, env=os.environ.copy())
        print("Launching DFT dataset process with PID", raw_process.pid, "... Please wait...")
        # Wait until the new directory is created, but stop waiting as soon as
        # the process dies: a dead process will never create it.
        created = new_entries()
        while not created and raw_process.poll() is None:
            time.sleep(1.0)
            created = new_entries()
        # Failure while launching?
        if raw_process.poll() is not None:
            raise RuntimeError("Error while launching PySCF IPU process...")
        # Find the dataset path (most recently modified of the new entries).
        newest = max(created, key=lambda name: os.path.getmtime(os.path.join(rootpath, name)))
        filename = os.path.join(rootpath, newest, "data.csv")
        return DFTProcess(raw_process, filename)
    except BaseException:
        # Capture any issue (including an interrupted cell), and kill the
        # process in this case before propagating the original error.
        if raw_process is not None:
            raw_process.kill()
        raise


In [None]:
# Launching an external PySCF IPU process.
# The call blocks, polling once per second while it waits for the new dataset
# directory to appear on disk.
dft_process = launch_dft_process()

In [None]:
# Report where the dataset is being written; the last expression displays
# whether the background process is still alive.
print("PySCF IPU DFT computation on-going. Dataset saved in:", dft_process.path)
dft_process.is_running()

In [None]:
# Want to stop the process?
# dft_process.process.kill()

# Loading & visualizing generated data

As the dataset is being created in the background, we can load the data that has already been generated.

In [None]:
import pandas as pd

In [None]:
# Output DFT dataset is a compressed CSV file.
# NOTE: it may take a couple of minutes before the file is generated.
# The compression is given explicitly because the file name ends in `.csv`,
# so it cannot be inferred from the extension.
df = pd.read_csv(dft_process.path, compression="gzip")

In [None]:
df

In [None]:
# HLgap data.
df["hlgap"]