# Imports

In [1]:
import datetime
import json
import os
import pickle
from itertools import chain, islice
from multiprocessing import Pool, cpu_count
from pathlib import Path
from pprint import pprint

import dateutil.parser
import findspark
import lmdb
import numpy as np
import pyspark
from ase.atoms import Atoms

from functools import partial
from ase.io.cfg import read_cfg
from dotenv import load_dotenv
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    BooleanType,
    DoubleType,
    FloatType,
    IntegerType,
    LongType,
    StringType,
    StructField,
    StructType,
    TimestampType,
)

from colabfit.tools.configuration import AtomicConfiguration

from colabfit.tools.database import DataManager, PGDataLoader
from colabfit.tools.property import Property, property_object_schema
from colabfit.tools.configuration import config_schema
from colabfit.tools.property_definitions import (
    atomic_forces_pd,
    cauchy_stress_pd,
    potential_energy_pd,
)

# Session setup: these three statements run for their side effects, in this order.
# findspark.init() is called before any SparkSession is built below
# (presumably to locate the local Spark installation — see the findspark docs).
findspark.init()
# NOTE(review): this rebinds the builtin ``format`` at module level; the name is
# not referenced again in the visible cells.
format = "jdbc"
# Load variables from ./.env; later cells read CLASSPATH, PGS_USER and PGS_PASS
# through os.environ.
load_dotenv("./.env")

True

# Set up MTPU and Carolina Materials readers and data

In [2]:
def convert_stress(keys, stress):
    """Arrange labelled stress components into a symmetric 3x3 nested list.

    ``keys`` and ``stress`` are paired positionally. The pairing must provide
    the labels "xx", "yy", "zz", "xy", "xz" and "yz"; a missing label raises
    ``KeyError``. Off-diagonal entries are mirrored across the diagonal.
    """
    by_label = dict(zip(keys, stress))
    # Label of the component stored at each (row, column) position.
    layout = (
        ("xx", "xy", "xz"),
        ("xy", "yy", "yz"),
        ("xz", "yz", "zz"),
    )
    return [[by_label[label] for label in row] for row in layout]


# Maps the string found in the "type" column of each atom row to a chemical
# symbol; reader() looks every atom up here, so any other type raises KeyError.
SYMBOL_DICT = {"0": "Si", "1": "O"}


def reader(filepath):
    """Yield one ``AtomicConfiguration`` per ``END_CFG``-terminated block of a cfg file.

    The file is scanned line by line. A header line selects what the following
    line(s) contain:

    * ``Size``       -> number of atom rows in the next ``AtomData:`` table
    * ``Supercell``  -> three lattice-vector rows (header matched case-insensitively)
    * ``Energy``     -> a single float
    * ``PlusStress`` -> six components, labelled by the last six header tokens
    * ``AtomData:``  -> ``Size`` rows whose columns are named by the header

    Args:
        filepath: Path of the cfg file. Must provide ``.stem`` (e.g. a
            ``pathlib.Path``) because the stem is used in configuration names.

    Yields:
        AtomicConfiguration: built from ``cartes_*`` columns (passed as
        ``positions``) or ``direct_*`` columns (passed as ``scaled_positions``).
        ``config.info`` receives ``energy``, ``stress``, ``forces`` (only when
        force columns were present) and, depending on which of Si/O occur,
        hard-coded ``input`` settings and a ``_name``.

    Raises:
        ValueError: if a block ends before any ``AtomData:`` table with
            ``cartes_x`` or ``direct_x`` columns has been read.
    """
    with open(filepath, "rt") as f:
        energy = None
        forces = None
        # ``stress`` and ``keys`` are pre-bound so that a first block lacking a
        # PlusStress or AtomData section does not hit an unbound local at
        # END_CFG. ``stress`` starts at the same empty value it is reset to
        # after every block.
        stress = []
        keys = []
        coords = []
        cell = []
        symbols = []
        config_count = 0
        # Section payloads are pulled with f.readline() inside the loop, which
        # advances the same text stream the ``for`` statement iterates over.
        for line in f:
            stripped = line.strip()
            if stripped.startswith("Size"):
                size = int(f.readline().strip())
            elif stripped.lower().startswith("supercell"):
                for _ in range(3):
                    cell.append([float(x) for x in f.readline().strip().split()])
            elif stripped.startswith("Energy"):
                energy = float(f.readline().strip())
            elif stripped.startswith("PlusStress"):
                stress_keys = stripped.split()[-6:]
                stress = [float(x) for x in f.readline().strip().split()]
                stress = convert_stress(stress_keys, stress)
            elif stripped.startswith("AtomData:"):
                keys = stripped.split()[1:]
                has_forces = "fx" in keys
                if has_forces:
                    forces = []
                for _ in range(size):
                    li = dict(zip(keys, f.readline().strip().split()))
                    symbols.append(SYMBOL_DICT[li["type"]])
                    if "cartes_x" in keys:
                        coords.append(
                            [
                                float(li[col])
                                for col in ("cartes_x", "cartes_y", "cartes_z")
                            ]
                        )
                    elif "direct_x" in keys:
                        coords.append(
                            [
                                float(li[col])
                                for col in ("direct_x", "direct_y", "direct_z")
                            ]
                        )

                    if has_forces:
                        forces.append([float(li[col]) for col in ("fx", "fy", "fz")])

            elif line.startswith("END_CFG"):
                if "cartes_x" in keys:
                    config = AtomicConfiguration(
                        positions=coords, symbols=symbols, cell=cell
                    )
                elif "direct_x" in keys:
                    config = AtomicConfiguration(
                        scaled_positions=coords, symbols=symbols, cell=cell
                    )
                else:
                    # Without this guard the previous block's ``config`` object
                    # (or an unbound name) would be used below.
                    raise ValueError(
                        f"{filepath}: block {config_count} has no AtomData table "
                        "with cartes_x or direct_x columns"
                    )
                config.info["energy"] = energy
                if forces:
                    config.info["forces"] = forces
                config.info["stress"] = stress

                # The input settings are hard-coded per composition.
                has_si = "Si" in symbols
                has_o = "O" in symbols
                if has_si and has_o:
                    label, kpoints, cutoff = "SiO2", "11x11x11", 1224
                elif has_si:
                    label, kpoints, cutoff = "Si", "8x8x8", 884
                elif has_o:
                    label, kpoints, cutoff = "O", "gamma-point", 1224
                else:
                    label = None
                if label is not None:
                    config.info["input"] = {
                        "kpoint-scheme": "Monkhorst-Pack",
                        "kpoints": kpoints,
                        "kinetic-energy-cutoff": {
                            "val": cutoff,
                            "units": "eV",
                        },
                    }
                    config.info["_name"] = f"{filepath.stem}_{label}_{config_count}"
                config_count += 1
                yield config
                # Reset the per-block accumulators. ``keys`` and ``size`` are
                # intentionally left as they were, matching prior behaviour.
                forces = None
                stress = []
                coords = []
                cell = []
                symbols = []
                energy = None

In [3]:
# Parse the whole MTPU training set eagerly, then summarise the first configuration.
mtpu_configs = reader(Path("data/mtpu_2023/Unified_training_set.cfg"))
data = list(mtpu_configs)
data[0].configuration_summary()

{'nsites': 4,
 'elements': ['Si'],
 'nelements': 1,
 'elements_ratios': [1.0],
 'chemical_formula_anonymous': 'A',
 'chemical_formula_reduced': 'Si',
 'chemical_formula_hill': 'Si4',
 'dimension_types': [0, 0, 0],
 'nperiodic_dimensions': 0}

In [27]:
# Constants describing the Carolina Materials data; they are passed to
# DataManager (CM_PROPERTY_MAP) and to set_metadata (CO_MD) in later cells.
SOFTWARE = "VASP"
METHODS = "DFT-PBE"
# Metadata referenced from CM_PROPERTY_MAP under the "_metadata" key.
CM_PI_METADATA = {
    "software": {"value": SOFTWARE},
    "method": {"value": METHODS},
    "input": {"value": {"IBRION": 6, "NFREE": 4}},
}

# Property map with a single "formation-energy" entry whose "energy" component
# points at the "energy" field.
# NOTE(review): presumably {"field": ...} means "read from the configuration's
# info" and {"value": ...} is a literal — confirm against colabfit's DataManager.
CM_PROPERTY_MAP = {
    "formation-energy": [
        {
            "energy": {"field": "energy", "units": "eV"},
            "per-atom": {"value": False, "units": None},
        }
    ],
    "_metadata": CM_PI_METADATA,
}
# Configuration metadata map: each listed key maps to {"field": <same key>}.
CO_MD = {
    key: {"field": key}
    for key in [
        "_symmetry_space_group_name_H-M",
        "_symmetry_Int_Tables_number",
        "_chemical_formula_structural",
        "_chemical_formula_sum",
        "_cell_volume",
        "_cell_formula_units_Z",
        "symmetry_dict",
        "formula_pretty",
    ]
}


def load_row(txn, row):
    """Fetch and unpickle the record stored under the ASCII key ``str(row)``.

    Args:
        txn: Object exposing ``get(key: bytes)`` that returns the stored bytes,
            or ``None`` when the key is absent (e.g. an LMDB read transaction).
        row: Row identifier; formatted with ``f"{row}"`` to build the key.

    Returns:
        The unpickled object, or ``False`` when no record exists for ``row``.
    """
    raw = txn.get(f"{row}".encode("ascii"))
    # Test for the missing key explicitly instead of catching TypeError from
    # pickle.loads(None): a TypeError raised while unpickling a real record
    # must not be mistaken for "end of data".
    if raw is None:
        return False
    # NOTE: pickle.loads can execute arbitrary code; only read trusted databases.
    return pickle.loads(raw)


def config_from_row(row: dict, row_num: int):
    """Build an ``AtomicConfiguration`` from one unpickled database row.

    The coordinate, atomic-number and six cell-parameter entries are popped
    from ``row``; whatever remains becomes the configuration's ``info`` dict.

    Args:
        row: Row mapping. It is mutated in place (keys are popped) and then
            used directly as ``config.info``, so the caller's dict is changed.
        row_num: Row index, used only to build the ``name`` info entry.

    Returns:
        The result of ``AtomicConfiguration.from_ase`` on the assembled Atoms.

    Raises:
        KeyError: if any of the popped keys is missing from ``row``.
    """
    coords = row.pop("cart_coords")
    a_num = row.pop("atomic_numbers")
    # Three lengths followed by three angles, handed to Atoms as ``cell``.
    cell = [
        row.pop(x)
        for x in (
            "_cell_length_a",
            "_cell_length_b",
            "_cell_length_c",
            "_cell_angle_alpha",
            "_cell_angle_beta",
            "_cell_angle_gamma",
        )
    ]
    # NOTE(review): the source key is named "cart_coords" but the values are
    # passed as ``scaled_positions`` — confirm which convention the data uses.
    config = Atoms(scaled_positions=coords, numbers=a_num, cell=cell)
    # Rebuild the symmetry mapping with string keys.
    symmetry_dict = {str(key): val for key, val in row.pop("symmetry_dict").items()}
    config.info = row
    config.info["symmetry_dict"] = symmetry_dict
    config.info["name"] = f"carolina_materials_{row_num}"
    return AtomicConfiguration.from_ase(config)


def carmat_reader(fp: Path, max_rows: int = 10001):
    """Yield configurations read sequentially from an LMDB environment.

    Rows are looked up by consecutive integer keys starting at 0. Iteration
    stops at the first missing key or after ``max_rows`` rows, whichever comes
    first.

    Args:
        fp: Path to a file inside the LMDB directory; the environment is
            opened on ``fp.parent``.
        max_rows: Maximum number of rows to read. The default of 10001 matches
            the previously hard-coded bound (rows 0 through 10000).

    Yields:
        One configuration per row, as produced by ``config_from_row``.

    Returns:
        ``False`` when the generator finishes; this is only visible as
        ``StopIteration.value``.
    """
    env = lmdb.open(str(fp.parent))
    try:
        txn = env.begin()
        for row_num in range(max_rows):
            row = load_row(txn, row_num)
            if row is False:
                break
            yield config_from_row(row, row_num)
    finally:
        # Runs on exhaustion, on error, and when the generator is closed
        # early, so the environment is never left open.
        env.close()
    return False

In [28]:
# NOTE(review): this rebinds ``reader`` — the name of the cfg-parsing function
# defined in an earlier cell — to a generator object, so ``reader(path)`` can
# no longer be called after this cell runs. The generator created here is not
# consumed in the visible cells.
reader = carmat_reader(Path("data/carolina_matdb/base/all/data.mdb"))

In [29]:
# Materialise every configuration yielded by carmat_reader into a list.
carmat_configs = list(carmat_reader(Path("data/carolina_matdb/base/all/data.mdb")))
# Property metadata whose "input" entry is taken from the "input" field.
# NOTE(review): PI_METADATA / PROPERTY_MAP are not passed to anything in the
# visible cells; presumably they belong to the MTPU data — confirm.
PI_METADATA = {
    "software": {"value": "Quantum ESPRESSO"},
    "method": {"value": "DFT-PBE"},
    "input": {"field": "input"},
}

# Property map covering energy, forces and stress; the field names match the
# info keys written by the cfg reader ("energy", "forces", "stress").
PROPERTY_MAP = {
    "potential-energy": [
        {
            "energy": {"field": "energy", "units": "eV"},
            "per-atom": {"value": False, "units": None},
            # "_metadata": PI_METADATA,
        }
    ],
    "atomic-forces": [
        {
            "forces": {"field": "forces", "units": "eV/angstrom"},
            # "_metadata": PI_METADATA,
        },
    ],
    "cauchy-stress": [
        {
            "stress": {"field": "stress", "units": "GPa"},
            "volume-normalized": {"value": True, "units": None},
        }
    ],
    "_metadata": PI_METADATA,
}

In [30]:
# carmat_aconfigs = [
#     AtomicConfiguration(names=f"{i}").from_ase(c) for i, c in enumerate(carmat_configs)
# ]
# Apply the CO_MD metadata map to every Carolina Materials configuration,
# then display how many configurations were loaded.
for c in carmat_configs:
    c.set_metadata(CO_MD)
len(carmat_configs)

10001

# Connect to DB and run loader

In [11]:
# Path(s) of the JDBC driver jar, taken from the CLASSPATH environment variable.
# os.environ.get returns None when the variable is unset.
JARFILE = os.environ.get("CLASSPATH")
# Create (or reuse) a Spark session with the jar registered via "spark.jars".
spark = (
    SparkSession.builder.appName("PostgreSQL Connection with PySpark")
    .config("spark.jars", JARFILE)
    .getOrCreate()
)
# JDBC connection settings; credentials come from the environment.
# These are not passed to PGDataLoader here — they are used by the later
# ``co_df.write.jdbc`` cell.
url = "jdbc:postgresql://localhost:5432/colabfit"
user = os.environ.get("PGS_USER")
password = os.environ.get("PGS_PASS")
properties = {
    "user": user,
    "password": password,
    "driver": "org.postgresql.Driver",
}
# The loader is given the same .env file that load_dotenv read earlier.
loader = PGDataLoader(appname="colabfit", env="./.env")

24/04/16 13:07:36 WARN Utils: Your hostname, arktos resolves to a loopback address: 127.0.1.1; using 172.24.21.25 instead (on interface enp5s0)
24/04/16 13:07:36 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/04/16 13:07:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/16 13:07:38 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [31]:
# Read the formation-energy property definition from a local JSON file.
with open("formation_energy.json", "r") as f:
    formation_energy_pd = json.load(f)
# Build the DataManager for the Carolina Materials configurations using the
# single property definition and the CM_PROPERTY_MAP defined above.
dm = DataManager(
    nprocs=4,
    configs=carmat_configs,
    prop_defs=[formation_energy_pd],
    prop_map=CM_PROPERTY_MAP,
)

In [13]:
# Materialise all batches from the DataManager and inspect the size of the first one.
co_po = list(dm.gather_co_po_in_batches())
co_po0 = co_po[0]
len(co_po0)

type config chunks <class 'generator'>
number of chunks 2
10000


10001

In [32]:
# Hand the loader to the DataManager (presumably writes the batches to
# PostgreSQL — confirm against colabfit's DataManager).
dm.load_data_to_pg_in_batches(loader)

type config chunks <class 'generator'>
number of chunks 2
10000


In [14]:
# Split the first batch's two-element items into two parallel tuples;
# zip(*...) raises ValueError on unpacking if the batch is empty.
cos, pos = zip(*co_po0)

In [25]:
# Pretty-print the first element of ``cos`` from the first batch.
pprint(cos[0])

{'atomic_numbers': '[35 20  1  1  1  1  1  1 45 45]',
 'cell': '[[5.39874426, 0.0, 0.0], [2.6993721300000004, 4.67544967769542, '
         '0.0], [2.6993721300000004, 1.5584832258984738, 4.4080562295931855]]',
 'chemical_formula_anonymous': 'A6B2CD',
 'chemical_formula_hill': 'H6BrCaRh2',
 'chemical_formula_reduced': 'BrCaH6Rh2',
 'dataset_ids': None,
 'dimension_types': '[0, 0, 0]',
 'elements': "['Br', 'Ca', 'H', 'Rh']",
 'elements_ratios': '[0.1, 0.1, 0.6, 0.2]',
 'hash': 1639958031759710933,
 'id': 'CO_1639958031759710933',
 'last_modified': datetime.datetime(2024, 4, 16, 17, 7, 39, tzinfo=tzutc()),
 'metadata': '{"_symmetry_space_group_name_H-M": "Fm-3m", '
             '"_symmetry_Int_Tables_number": "225", '
             '"_chemical_formula_structural": "CaH6Rh2Br", '
             '"_chemical_formula_sum": "\'Ca1 H6 Rh2 Br1\'", "_cell_volume": '
             '"111.26620291", "_cell_formula_units_Z": "1", "symmetry_dict": '
             '{"1": "\'x, y, z\'", "2": "\'-x, -y, -z\'"

In [26]:
# Pretty-print the first element of ``pos`` from the first batch.
pprint(pos[0])

{'adsorption_energy': None,
 'adsorption_energy_per_atom': None,
 'adsorption_energy_property_id': None,
 'adsorption_energy_reference': None,
 'adsorption_energy_reference_unit': None,
 'adsorption_energy_unit': None,
 'atomic_forces': None,
 'atomic_forces_property_id': None,
 'atomic_forces_unit': None,
 'atomization_energy': None,
 'atomization_energy_per_atom': None,
 'atomization_energy_property_id': None,
 'atomization_energy_reference': None,
 'atomization_energy_reference_unit': None,
 'atomization_energy_unit': None,
 'band_gap': None,
 'band_gap_property_id': None,
 'band_gap_unit': None,
 'cauchy_stress': None,
 'cauchy_stress_property_id': None,
 'cauchy_stress_unit': None,
 'cauchy_stress_volume_normalized': None,
 'chemical_formula_hill': 'H6BrCaRh2',
 'configuration_ids': "['CO_1639958031759710933']",
 'dataset_ids': None,
 'formation_energy': -0.574,
 'formation_energy_per_atom': False,
 'formation_energy_property_id': 'tag:staff@noreply.colabfit.org,2022-11-18:propert

In [24]:
# Format the current UTC time as a "Z"-suffixed string (second resolution) and
# parse it back with dateutil to check the round trip.
dateutil.parser.parse(
    datetime.datetime.now(tz=datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
)

datetime.datetime(2024, 4, 16, 17, 15, 23, tzinfo=tzutc())

In [None]:
"""
Can we make the configuration and the property instance/data object at the same time?
In this way, we would only have to pass through the data one time.

Workflow:
Create the database access object.
Create the data reader, possibly as a function of the database access object.
The reader returns ase.Atoms-style objects (AtomicConfiguration).
Data objects (DOs) and property instances (PIs) are now a single object.
Each DO points to a configuration.
The configuration may already exist in the database, so we keep track of its hash on the DO.


"""

In [None]:
# Load the sample configuration records. read_text() opens and closes the file
# itself, so no file handle is left open (the previous form never closed the
# handle returned by Path.open).
cos = json.loads(Path("sample_db/co_ds1.json").read_text())

In [None]:
# Load the sample JSON file and distribute its contents through the Spark
# context (assumes the file's top level is a JSON array — TODO confirm).
with open(Path("sample_db/co_ds1.json"), "r") as f:
    co_json = spark.sparkContext.parallelize(json.load(f))

In [None]:
# Map each record through two helpers, then build a DataFrame with config_schema.
# NOTE(review): ``_parse_config`` and ``stringify_lists`` are neither defined
# nor imported in the visible cells; this raises NameError unless they are
# provided elsewhere.
co = co_json.map(_parse_config).map(stringify_lists)
co_df = spark.createDataFrame(co, config_schema)

In [None]:
def parse_configs(co_path, spark):
    """Load a JSON file of configuration records into a Spark DataFrame.

    Args:
        co_path: Path of the JSON file to read.
        spark: Active Spark session providing ``sparkContext`` and
            ``createDataFrame``.

    Returns:
        A DataFrame created with ``config_schema`` from the records after they
        have been mapped through ``_parse_config`` and ``stringify_lists``.
    """
    # NOTE(review): ``_parse_config`` and ``stringify_lists`` are not defined
    # in the visible cells; they must be supplied elsewhere.
    with open(co_path, "r") as handle:
        records = json.load(handle)
    distributed = spark.sparkContext.parallelize(records)
    parsed = distributed.map(_parse_config).map(stringify_lists)
    return spark.createDataFrame(parsed, config_schema)

In [None]:
# Parse the sample file and print the resulting DataFrame's leading rows.
parse_configs("sample_db/co_ds1.json", spark).show()

In [None]:
# Append the configuration DataFrame to the "co" table over JDBC.
table_name = "co"

# "append" adds rows to the existing table instead of replacing it.
mode = "append"
# ``url`` and ``properties`` repeat the values set in the Spark setup cell;
# ``user`` and ``password`` must already be defined by that cell.
url = "jdbc:postgresql://localhost:5432/colabfit"
properties = {"user": user, "password": password, "driver": "org.postgresql.Driver"}
co_df.write.jdbc(url=url, table=table_name, mode=mode, properties=properties)

In [None]:
# co_df.write.