# Imports

In [1]:
import datetime
import json
import os
import pickle
from functools import partial
from itertools import chain, islice
from multiprocessing import Pool, cpu_count
from pathlib import Path
from pprint import pprint

import dateutil.parser
import findspark
import lmdb
import numpy as np
import pyspark
import pyspark.sql.functions as sf
from ase.atoms import Atoms
from ase.io.cfg import read_cfg
from dotenv import load_dotenv
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    ArrayType,
    BooleanType,
    DoubleType,
    FloatType,
    IntegerType,
    LongType,
    StringType,
    StructField,
    StructType,
    TimestampType,
)

from colabfit.tools.configuration import AtomicConfiguration, config_schema
from colabfit.tools.database import DataManager, PGDataLoader
from colabfit.tools.property import Property, property_object_schema
from colabfit.tools.property_definitions import (
    atomic_forces_pd,
    cauchy_stress_pd,
    potential_energy_pd,
)
from colabfit.tools.dataset import Dataset, dataset_schema

with open("formation_energy.json", "r") as f:
    formation_energy_pd = json.load(f)
findspark.init()
format = "jdbc"
load_dotenv("./.env")

True

# Set up MTPU and Carolina Materials readers and data

In [2]:
def convert_stress(keys, stress):
    stresses = {k: s for k, s in zip(keys, stress)}
    return [
        [stresses["xx"], stresses["xy"], stresses["xz"]],
        [stresses["xy"], stresses["yy"], stresses["yz"]],
        [stresses["xz"], stresses["yz"], stresses["zz"]],
    ]


SYMBOL_DICT = {"0": "Si", "1": "O"}


def reader(filepath):
    with open(filepath, "rt") as f:
        energy = None
        forces = None
        coords = []
        cell = []
        symbols = []
        config_count = 0
        for line in f:
            if line.strip().startswith("Size"):
                size = int(f.readline().strip())
            elif line.strip().lower().startswith("supercell"):
                cell.append([float(x) for x in f.readline().strip().split()])
                cell.append([float(x) for x in f.readline().strip().split()])
                cell.append([float(x) for x in f.readline().strip().split()])
            elif line.strip().startswith("Energy"):
                energy = float(f.readline().strip())
            elif line.strip().startswith("PlusStress"):
                stress_keys = line.strip().split()[-6:]
                stress = [float(x) for x in f.readline().strip().split()]
                stress = convert_stress(stress_keys, stress)
            elif line.strip().startswith("AtomData:"):
                keys = line.strip().split()[1:]
                if "fx" in keys:
                    forces = []
                for i in range(size):
                    li = {
                        key: val for key, val in zip(keys, f.readline().strip().split())
                    }
                    symbols.append(SYMBOL_DICT[li["type"]])
                    if "cartes_x" in keys:
                        coords.append(
                            [
                                float(c)
                                for c in [
                                    li["cartes_x"],
                                    li["cartes_y"],
                                    li["cartes_z"],
                                ]
                            ]
                        )
                    elif "direct_x" in keys:
                        coords.append(
                            [
                                float(c)
                                for c in [
                                    li["direct_x"],
                                    li["direct_y"],
                                    li["direct_z"],
                                ]
                            ]
                        )

                    if "fx" in keys:
                        forces.append(
                            [float(f) for f in [li["fx"], li["fy"], li["fz"]]]
                        )

            elif line.startswith("END_CFG"):
                if "cartes_x" in keys:
                    config = AtomicConfiguration(
                        positions=coords, symbols=symbols, cell=cell
                    )
                elif "direct_x" in keys:
                    config = AtomicConfiguration(
                        scaled_positions=coords, symbols=symbols, cell=cell
                    )
                config.info["energy"] = energy
                if forces:
                    config.info["forces"] = forces
                config.info["stress"] = stress

                if "Si" in symbols and "O" in symbols:
                    config.info["input"] = {
                        "kpoint-scheme": "Monkhorst-Pack",
                        "kpoints": "11x11x11",
                        "kinetic-energy-cutoff": {
                            "val": 1224,
                            "units": "eV",
                        },
                    }
                    config.info["_name"] = f"{filepath.stem}_SiO2_{config_count}"
                elif "Si" in symbols:
                    config.info["input"] = {
                        "kpoint-scheme": "Monkhorst-Pack",
                        "kpoints": "8x8x8",
                        "kinetic-energy-cutoff": {
                            "val": 884,
                            "units": "eV",
                        },
                    }
                    config.info["_name"] = f"{filepath.stem}_Si_{config_count}"
                elif "O" in symbols:
                    config.info["input"] = {
                        "kpoint-scheme": "Monkhorst-Pack",
                        "kpoints": "gamma-point",
                        "kinetic-energy-cutoff": {
                            "val": 1224,
                            "units": "eV",
                        },
                    }
                    config.info["_name"] = f"{filepath.stem}_O_{config_count}"
                config_count += 1
                yield config
                forces = None
                stress = []
                coords = []
                cell = []
                symbols = []
                energy = None

In [3]:
mtpu_configs = reader(Path("data/mtpu_2023/Unified_training_set.cfg"))
data = [x for x in mtpu_configs]
data[0].configuration_summary()

{'nsites': 4,
 'elements': ['Si'],
 'nelements': 1,
 'elements_ratios': [1.0],
 'chemical_formula_anonymous': 'A',
 'chemical_formula_reduced': 'Si',
 'chemical_formula_hill': 'Si4',
 'dimension_types': [0, 0, 0],
 'nperiodic_dimensions': 0}

In [4]:
# data[0]

In [5]:
SOFTWARE = "VASP"
METHODS = "DFT-PBE"
CM_PI_METADATA = {
    "software": {"value": SOFTWARE},
    "method": {"value": METHODS},
    "input": {"value": {"IBRION": 6, "NFREE": 4}},
}

CM_PROPERTY_MAP = {
    "formation-energy": [
        {
            "energy": {"field": "energy", "units": "eV"},
            "per-atom": {"value": False, "units": None},
        }
    ],
    "_metadata": CM_PI_METADATA,
}
CO_MD = {
    key: {"field": key}
    for key in [
        "_symmetry_space_group_name_H-M",
        "_symmetry_Int_Tables_number",
        "_chemical_formula_structural",
        "_chemical_formula_sum",
        "_cell_volume",
        "_cell_formula_units_Z",
        "symmetry_dict",
        "formula_pretty",
    ]
}


def load_row(txn, row):
    try:
        data = pickle.loads(txn.get(f"{row}".encode("ascii")))
        return data
    except TypeError:
        return False


def config_from_row(row: dict, row_num: int):
    coords = row.pop("cart_coords")
    a_num = row.pop("atomic_numbers")
    cell = [
        row.pop(x)
        for x in [
            "_cell_length_a",
            "_cell_length_b",
            "_cell_length_c",
            "_cell_angle_alpha",
            "_cell_angle_beta",
            "_cell_angle_gamma",
        ]
    ]
    config = AtomicConfiguration(scaled_positions=coords, numbers=a_num, cell=cell)
    symmetry_dict = {str(key): val for key, val in row.pop("symmetry_dict").items()}
    for key in symmetry_dict:
        key = str(key)
    config.info = row
    config.info["symmetry_dict"] = symmetry_dict
    config.info["_name"] = f"carolina_materials_{row_num}"
    return config
    # return AtomicConfiguration.from_ase(config)


def carmat_reader(fp: Path):
    parent = fp.parent
    env = lmdb.open(str(parent))
    txn = env.begin()
    row_num = 0
    rows = []
    while row_num <= 10000:
        row = load_row(txn, row_num)
        if row is False:
            env.close()
            break
        rows.append(row)
        yield config_from_row(row, row_num)
        row_num += 1
    env.close()
    return False
    # return rows

In [6]:
reader = carmat_reader(Path("data/carolina_matdb/base/all/data.mdb"))

In [7]:
carmat_configs = list(carmat_reader(Path("data/carolina_matdb/base/all/data.mdb")))
carmat_config_gen = carmat_reader(Path("data/carolina_matdb/base/all/data.mdb"))
PI_METADATA = {
    "software": {"value": "Quantum ESPRESSO"},
    "method": {"value": "DFT-PBE"},
    "input": {"field": "input"},
}

PROPERTY_MAP = {
    "potential-energy": [
        {
            "energy": {"field": "energy", "units": "eV"},
            "per-atom": {"value": False, "units": None},
            # "_metadata": PI_METADATA,
        }
    ],
    "atomic-forces": [
        {
            "forces": {"field": "forces", "units": "eV/angstrom"},
            # "_metadata": PI_METADATA,
        },
    ],
    "cauchy-stress": [
        {
            "stress": {"field": "stress", "units": "GPa"},
            "volume-normalized": {"value": True, "units": None},
        }
    ],
    "_metadata": PI_METADATA,
}

In [8]:
print(carmat_configs[0])

AtomicConfiguration(name=carolina_materials_0, symbols='BrCaH6Rh2', pbc=False, cell=[[5.39874426, 0.0, 0.0], [2.6993721300000004, 4.67544967769542, 0.0], [2.6993721300000004, 1.5584832258984738, 4.4080562295931855]])


In [9]:
from colabfit.tools.configuration import AtomicConfigurationOld

In [10]:
# carmat_aconfigs = [
#     AtomicConfiguration(co_md_map=CO_MD, names="test_name").from_ase(c)
#     for c in carmat_configs
# ]
# carmat_oconfigs = [
#     AtomicConfigurationOld(names=f"{i}").from_ase(c)
#     for i, c in enumerate(carmat_configs)
# ]
# for c in carmat_aconfigs:
#     c.set_metadata(CO_MD)
# len(carmat_aconfigs)

# Connect to DB and run loader

In [11]:
JARFILE = os.environ.get("CLASSPATH")
spark = (
    SparkSession.builder.appName("PostgreSQL Connection with PySpark")
    .config("spark.jars", JARFILE)
    .getOrCreate()
)
url = "jdbc:postgresql://localhost:5432/colabfit"
user = os.environ.get("PGS_USER")
password = os.environ.get("PGS_PASS")
properties = {
    "user": user,
    "password": password,
    "driver": "org.postgresql.Driver",
}
loader = PGDataLoader(appname="colabfit", env="./.env")

24/04/23 17:40:32 WARN Utils: Your hostname, arktos resolves to a loopback address: 127.0.1.1; using 172.24.21.25 instead (on interface enp5s0)
24/04/23 17:40:32 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/04/23 17:40:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/23 17:40:33 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [12]:
carmat_config_gen = carmat_reader(Path("data/carolina_matdb/base/all/data.mdb"))

dm = DataManager(
    nprocs=4,
    configs=carmat_config_gen,
    prop_defs=[formation_energy_pd],
    prop_map=CM_PROPERTY_MAP,
)

In [13]:
dm2 = DataManager(
    nprocs=4,
    configs=mtpu_configs,
    prop_defs=[potential_energy_pd, atomic_forces_pd, cauchy_stress_pd],
    prop_map=PROPERTY_MAP,
)

In [14]:
# carmat_configs = list(carmat_reader(Path("data/carolina_matdb/base/all/data.mdb")))
# dm = DataManager(
#     nprocs=4,
#     configs=carmat_configs,
#     prop_defs=[formation_energy_pd],
#     prop_map=CM_PROPERTY_MAP,
# )

YOU were making the data loader take dataset-id as an argument so that it would create configurations and properties with ds-id in advance

In [15]:
dm.load_data_to_pg_in_batches(loader)

number of chunks 4
1000


TypeError: Property.__init__() got an unexpected keyword argument 'dataset_ids'

In [None]:
# df = loader.spark.read.format("jdbc").options(
#     url=loader.url,
#     table="public.configurations",
#     properties=loader.properties,
# )

In [None]:
ds_id = "null"
df = loader.spark.read.jdbc(
    url=loader.url, table="public.configurations", properties=loader.properties
).where(f"dataset_ids is {ds_id}")
# .select("elements", "elements_ratios", "id", "dataset_ids")
#

In [None]:
prop_df = loader.spark.read.jdbc(
    url=loader.url, table="public.property_objects", properties=loader.properties
).where(f"dataset_ids is {ds_id}")

In [None]:
prop_df.select("formation_energy").where("formation_energy is not null").count()

10001

In [None]:
loader.prefix

AttributeError: 'PGDataLoader' object has no attribute 'prefix'

In [None]:
row_dict = {}
for prop in [
    "atomization_energy",
    "adsorption_energy",
    "band_gap",
    "formation_energy",
    "free_energy",
    "potential_energy",
    "atomic_forces",
    "cauchy_stress",
]:
    row_dict[f"{prop}_count"] = (
        prop_df.select(prop).where(f"{prop} is not null").count()
    )

In [None]:
row_dict

{'atomization_energy_count': 0,
 'adsorption_energy_count': 0,
 'band_gap_count': 0,
 'formation_energy_count': 10001,
 'free_energy_count': 0,
 'potential_energy_count': 0,
 'atomic_forces_count': 0,
 'cauchy_stress_count': 0}

In [None]:
df.agg(sf.collect_set("nperiodic_dimensions")).collect()[0][0]

[0, 1]

In [None]:
dims = (
    df.withColumn(
        "dims_unstrung",
        sf.from_json(sf.col("dimension_types"), sf.ArrayType(sf.StringType())),
    )
    .select("dims_unstrung")
    .agg(sf.collect_set("dims_unstrung"))
    .collect()[0][0]
)

In [None]:
df = loader.spark.read.jdbc(
    loader.url,
    table="public.configurations",
    properties=loader.properties,
).options(query="SELECT * FROM public.configurations")

AttributeError: 'DataFrame' object has no attribute 'options'

In [None]:
loader.spark.read.format("jdbc").options(
    url=loader.url,
    query="SELECT * FROM public.configurations",
).load()

Py4JJavaError: An error occurred while calling o207.load.
: java.lang.NoSuchMethodException: org.apache.spark.sql.execution.datasources.jdbc.DriverWrapper.<init>()
	at java.base/java.lang.Class.getConstructor0(Class.java:3349)
	at java.base/java.lang.Class.getConstructor(Class.java:2151)
	at org.apache.spark.sql.execution.datasources.jdbc.DriverRegistry$.register(DriverRegistry.scala:54)
	at org.apache.spark.sql.jdbc.JdbcDialect.$anonfun$createConnectionFactory$1(JdbcDialects.scala:157)
	at org.apache.spark.sql.jdbc.JdbcDialect.$anonfun$createConnectionFactory$1$adapted(JdbcDialects.scala:156)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD$.getQueryOutputSchema(JDBCRDD.scala:63)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD$.resolveTable(JDBCRDD.scala:58)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCRelation$.getSchema(JDBCRelation.scala:241)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:37)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:346)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:229)
	at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:211)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:172)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)


In [None]:
pyspark.sql.SQLContext(loader.spark).sql("SELECT * FROM configurations").show()

AnalysisException: [TABLE_OR_VIEW_NOT_FOUND] The table or view `configurations` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.; line 1 pos 14;
'Project [*]
+- 'UnresolvedRelation [configurations], [], false


# Hashing

# outer

In [None]:
"""
Can we make the configuration and the property instance/data object at the same time?
In this way, we would only have to pass through the data one time.

Workflow:
create database access object
create data reader as function? of the database access object
reader returns ase.Atoms-style objects (AtomicConfiguration)
DOs and PIs are now one object
These DOs point to a configuration
The configuration may already exist in the database, so we keep track of the hash added to the DO


"""

In [None]:
cos = json.load(Path("sample_db/co_ds1.json").open("r"))

In [None]:
with open(Path("sample_db/co_ds1.json"), "r") as f:
    co_json = spark.sparkContext.parallelize(json.load(f))

In [None]:
co = co_json.map(_parse_config).map(stringify_lists)
co_df = spark.createDataFrame(co, config_schema)

In [None]:
def parse_configs(co_path, spark):
    with open(co_path, "r") as f:
        co_json = spark.sparkContext.parallelize(json.load(f))
    co = co_json.map(_parse_config).map(stringify_lists)
    co_df = spark.createDataFrame(co, config_schema)
    return co_df

In [None]:
parse_configs("sample_db/co_ds1.json", spark).show()

In [None]:
table_name = "co"

mode = "append"
url = "jdbc:postgresql://localhost:5432/colabfit"
properties = {"user": user, "password": password, "driver": "org.postgresql.Driver"}
co_df.write.jdbc(url=url, table=table_name, mode=mode, properties=properties)

In [None]:
# co_df.write.