# Carbon allotrope dataset example for Vast DB

### Imports

In [1]:
import os
from pathlib import Path
from pickle import load
from time import time

from ase.io import iread
from dotenv import load_dotenv
from pyspark.sql import SparkSession

from colabfit.tools.configuration import AtomicConfiguration
from colabfit.tools.database import DataManager, VastDataLoader, generate_ds_id
from colabfit.tools.configuration_set import configuration_set_info
from colabfit.tools.property import PropertyMap, property_info
from colabfit.tools.property_definitions import (
    atomic_forces_pd,
    energy_pd,
)

### Import environment variables and set up connections to Vast DB

In [None]:
# The variables VAST_DB_KEY, VAST_DB_SECRET, and VAST_DB_ENDPOINT must be set in a .env file
# (or already be present in the process environment).
load_dotenv()

# Read and validate the credentials before starting Spark, so that a missing
# setting is reported immediately by name rather than being passed on as None
# to the Vast DB session setup.
access_key = os.getenv("VAST_DB_KEY")
access_secret = os.getenv("VAST_DB_SECRET")
endpoint = os.getenv("VAST_DB_ENDPOINT")
_missing_env = [
    env_name
    for env_name, env_value in (
        ("VAST_DB_KEY", access_key),
        ("VAST_DB_SECRET", access_secret),
        ("VAST_DB_ENDPOINT", endpoint),
    )
    if not env_value  # unset and empty values are both unusable
]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env)
    )

# Create (or reuse) the Spark session and hand it to the loader
spark_session = SparkSession.builder.appName("colabfit").getOrCreate()
loader = VastDataLoader()
loader.set_spark_session(spark_session)

# Give the loader the Vast DB endpoint and credentials read above
loader.set_vastdb_session(
    endpoint=endpoint,
    access_key=access_key,
    access_secret=access_secret,
)

In [None]:
# Directory in which metadata files will be saved
loader.metadata_dir = "test_md/MDtest"

# Override the table names when the default tables are not being used.
# All tables in this example live under one common prefix.
table_prefix = "ndb.colabfit.dev"
loader.config_table = table_prefix + ".co_test1"
loader.prop_object_table = table_prefix + ".po_test1"
loader.config_set_table = table_prefix + ".cs_test1"
loader.dataset_table = table_prefix + ".ds_test1"
loader.co_cs_map_table = table_prefix + ".cs_co_map_test1"

# Show the table names currently set on the loader, one per line
print(
    *(
        loader.config_table,
        loader.config_set_table,
        loader.dataset_table,
        loader.prop_object_table,
        loader.co_cs_map_table,
    ),
    sep="\n",
)

### Set dataset information

In [None]:
# --- Identity and descriptive fields ---
DATASET_NAME = "Carbon_allotrope_multilayer_graphene_graphite_PRB2019"
DATASET_ID = generate_ds_id()  # called again every time this cell is run
DESCRIPTION = "A description of the dataset"
AUTHORS = ["Mingjian Wen", "Ellad B. Tadmor"]
LICENSE = "CC-BY-4.0"
PUBLICATION_YEAR = "2024"

# --- Links ---
# URL to the publication
PUBLICATION_LINK = "https://doi.org/10.1103/PhysRevB.100.195419"
# URL to original downloadable data files
DATA_LINK = (
    "https://journals.aps.org/prb/supplemental/"
    "10.1103/PhysRevB.100.195419/dataset.tar"
)
# URLs to other relevant information, if applicable
OTHER_LINKS = ["https://www.example.com"]

### Define metadata to be included

In [7]:
# PropertyMap is a convenience class: it collects metadata and property details and
# produces them in the format needed to create Property Objects and Configuration Objects.
property_map = PropertyMap([atomic_forces_pd, energy_pd])

# Free-text note describing how the data files were generated
input_ = (
    "Details from the publication, README or data files "
    "about the process for generating the data files"
)

property_map.set_metadata_field("software", "VASP 5.x.x")  # required field
property_map.set_metadata_field("method", "DFT-PBE")  # required field
property_map.set_metadata_field("input", input_)

# A metadata field does not have to be a fixed value. Passing dynamic=True indicates that
# the value should instead be taken from a given field of each configuration's
# `config.info` dictionary.
# For example, if the basis set varies between configurations, the following would take
# the basis set value from `config.info["basis_set"]`:
# property_map.set_metadata_field("basis_set", "basis_set", dynamic=True)

### Define details for each property type to be included in the dataset.

In [None]:
# The PropertyMap object will check that required fields are populated for each property type.
#
# Meaning of the property_info arguments used below:
#   property_name     - corresponds to the property-name field in a property definition
#   field             - the field in the configuration's info dict that contains the
#                       property data; it may be either in config.info or in config.arrays
#                       (the latter is the default for ASE Atoms objects)
#   units             - units of the stored values
#   original_file_key - the key in the original file that contains the property data.
#                       May differ from "field". See the example reader function below.
#   additional        - extra (key, value) entries for the property, or None

energy_info = property_info(
    property_name="energy",
    field="Energy",
    units="eV",
    original_file_key="Energy",
    additional=[
        ("per-atom", {"value": False, "units": None}),
    ],
)

force_info = property_info(
    property_name="atomic-forces",
    field="force",
    units="eV/angstrom",
    original_file_key="force",
    additional=None,
)

property_map.set_properties([energy_info, force_info])

# Build the final property map and print it for inspection
PROPERTY_MAP = property_map.get_property_map()
print(PROPERTY_MAP)

### Define reader function

In [9]:
# Files may be downloaded from the DATA_LINK URL above and extracted to the DATASET_FP directory
DATASET_FP = Path("/path/to/data/files")


# Reader function should output a colabfit AtomicConfiguration object
def reader(fp: Path):
    """Yield one AtomicConfiguration per frame of an extended-XYZ file.

    Every configuration is tagged, via ``config.info["_name"]``, with a name
    built from the file's path components relative to ``DATASET_FP``, joined
    by ``"__"`` (e.g. ``bilayer/a.xyz`` -> ``"bilayer__a.xyz"``). Names and/or
    labels may be used later to define configuration sets.

    Args:
        fp: Path of the data file to read (a plain string is accepted too).

    Yields:
        The result of ``AtomicConfiguration.from_ase`` for each frame in the file.
    """
    fp = Path(fp)
    try:
        # Work on path components rather than on the raw string: this only
        # strips DATASET_FP when it really is a parent directory of fp, and it
        # does not depend on "/" being the path separator.
        name_parts = fp.relative_to(DATASET_FP).parts
    except ValueError:
        # fp is not located under DATASET_FP: name it after its full path
        name_parts = fp.parts
    # Drop empty components and the filesystem root/drive, if present
    name = "__".join(part for part in name_parts if part not in ("", fp.anchor))

    # In this dataset, there is only one configuration per file, but iterating
    # over all frames also handles files with multiple configurations
    for config in iread(fp, format="extxyz", index=":"):
        config.info["_name"] = name
        yield AtomicConfiguration.from_ase(config)


# Wrapper to apply reader function to directory
def read_directory(dir_path: "str | Path"):
    """Yield the configurations of every ``*.xyz`` file found under *dir_path*.

    The directory is searched recursively and the matching files are passed to
    ``reader`` in sorted path order.

    Args:
        dir_path: Root directory to search, as a string or a ``Path``.

    Yields:
        Every item produced by ``reader`` for each file.

    Warns:
        UserWarning: If *dir_path* does not exist. Nothing is yielded in that case.
    """
    import warnings  # local import so the function does not rely on a new file-level import

    root = Path(dir_path)
    if not root.exists():
        # Still yield nothing for a missing directory, but make it visible:
        # otherwise a wrong path just looks like an empty dataset.
        warnings.warn(
            f"Data directory '{root}' does not exist; no configurations were read",
            stacklevel=2,
        )
        return
    # sorted() accepts any iterable, so no intermediate list is needed
    for data_path in sorted(root.rglob("*.xyz")):
        yield from reader(data_path)

### Insert Property Objects and Configurations

In [None]:
# Generator over the configurations read from the dataset directory
config_generator = read_directory(DATASET_FP)

# The DataManager combines the configurations with the property definitions,
# the property map and the dataset id defined above
dm = DataManager(
    dataset_id=DATASET_ID,
    prop_map=PROPERTY_MAP,
    prop_defs=[energy_pd, atomic_forces_pd],
    configs=config_generator,
)

# Load Configurations (CO) and Property Objects (PO) through the loader
dm.load_co_po_to_vastdb(loader)

### Insert Configuration Sets if Desired

In [None]:
# Configuration Sets are organized groupings over Configurations.
# Create one configuration_set_info entry per Configuration Set. It is a convenience
# object that helps deliver the values in the proper order, which is:
#   1. co_name_match:  substring to match on configuration names (if applicable)
#   2. co_label_match: substring to match on labels (if applicable)
#   3. cs_name:        name of the configuration set
#   4. cs_description: description of the configuration set
CONFIGURATION_SETS = [
    configuration_set_info(
        co_name_match="bilayer__",
        co_label_match=None,
        cs_name=f"{DATASET_NAME}__bilayer_graphene",
        cs_description=f"Configurations from {DATASET_NAME} of bilayer graphene",
    ),
    configuration_set_info(
        co_name_match="graphite__",
        co_label_match=None,
        cs_name=f"{DATASET_NAME}__graphite",
        cs_description=f"Configurations from {DATASET_NAME} of graphite",
    ),
    configuration_set_info(
        co_name_match="monolayer__",
        co_label_match=None,
        cs_name=f"{DATASET_NAME}__monolayer_graphene",
        cs_description=f"Configurations from {DATASET_NAME} of monolayer graphene",
    ),
]

dm.create_configuration_sets(loader, CONFIGURATION_SETS)

### Insert Dataset

In [None]:
# Create the Dataset Object from the values defined in the
# "Set dataset information" cell
dm.create_dataset(
    loader,
    # identity and description
    name=DATASET_NAME,
    authors=AUTHORS,
    description=DESCRIPTION,
    # publication details
    publication_year=PUBLICATION_YEAR,
    data_license=LICENSE,
    # links
    publication_link=PUBLICATION_LINK,
    data_link=DATA_LINK,
    other_links=OTHER_LINKS,
    # doi= "New DOI of the dataset, if applicable",
)

In [None]:
# Stop the Spark instance
# Final cleanup step of the notebook: run it once all inserts above have finished.
# NOTE(review): presumably this stops the session that was handed to
# loader.set_spark_session() earlier — confirm against VastDataLoader.
loader.stop_spark()