In [1]:
import os
import numpy as np
import numpy.linalg as la
import lmp_class as lmpc
import time
import matplotlib.pyplot as plt
import plotly.express as px
import pandas as pd
from dash import Dash, html, dcc, callback, Output, Input
import importlib
import kmeans
from utils import *
import polars as pl
import numba as nb
import mmap
from mpi4py import MPI
import psutil
import re
import plotly.graph_objects as go


def reload_utils():
    """Reload the local ``utils`` module and refresh its star-imported names.

    This notebook pulls helpers in with ``from utils import *``. Reloading the
    module object alone leaves those names bound to the *old* functions, so
    after the reload every public name of ``utils`` is re-bound in this
    module's globals, mirroring what ``from utils import *`` does
    (``__all__`` if defined, otherwise every name without a leading underscore).

    Returns:
        None
    """

    import utils

    reloaded = importlib.reload(utils)

    public_names = getattr(reloaded, "__all__", None)

    if public_names is None:

        public_names = [name for name in vars(reloaded) if not name.startswith("_")]

    # globals() here is the namespace this function was defined in, i.e. the
    # same namespace the original star import populated.
    globals().update({name: getattr(reloaded, name) for name in public_names})

    return None


# print(os.cpu_count())
# comm = MPI.COMM_WORLD
# rank = comm.Get_rank()
# size = comm.Get_size()
# print(comm, rank, size)

1. Open the dump file as a memory map and determine the file size, the timesteps, and the number of atoms.
2. Allocate NumPy array(s) (memory-mapped if too large to hold in RAM).
3. Read the data in chunks, one timestep at a time, to fill the shared NumPy arrays.

In [2]:
# One row per atom type used in the dump: (type id, element symbol, mass).
# Masses look like standard atomic weights (presumably amu / g/mol -- confirm).
# Types 1/2 are both Si and types 4/6 are both H.
_ATOM_TYPE_TABLE = (
    (1, "Si", 28.085),
    (2, "Si", 28.085),
    (3, "O", 15.999),
    (4, "H", 1.008),
    (5, "C", 12.011),
    (6, "H", 1.008),
    (7, "Al", 26.9815),
)

# type id -> mass; insertion order follows the table (ascending type id).
atomic_masses = {type_id: mass for type_id, _symbol, mass in _ATOM_TYPE_TABLE}

# type id -> element symbol.
atoms = {type_id: symbol for type_id, symbol, _mass in _ATOM_TYPE_TABLE}

# Dump file parsed by the cells below (path relative to the working directory).
data_path = "data/production_500K_a.dump"

In [3]:
# Compare the dump's on-disk size with the memory currently available so the
# parsing cell can decide whether the whole file may be memory-mapped.
file_size = os.path.getsize(data_path)  # bytes
available_memory = psutil.virtual_memory().available  # bytes
# Both values are divided by 1024**3, i.e. reported in GiB although labelled "GB".
print(f"file size: {file_size / 1024**3} GB")
print(f"available memory: {available_memory / 1024**3} GB")

file size: 0.8746490916237235 GB
available memory: 20.133846282958984 GB


In [4]:
# Parse the text dump at `data_path` (sections introduced by "ITEM: ..." header
# lines, as written by LAMMPS-style dumps) into dense NumPy arrays:
#   bounds    (3, 2)             box limits read from the first BOX BOUNDS section
#   fields    list of bytes      per-atom column names from the first ATOMS header
#   timesteps (Nt,)              value following each TIMESTEP header
#   all_data  (Nt, Natom, Nfield) per-atom rows, sorted by the first column per frame
# The file is only read, so it is opened and mapped read-only.
with open(data_path, mode="rb") as fi:

    # length=0 maps the whole file; otherwise only a prefix worth ~90% of the
    # currently available memory is mapped. mmap requires an *integer* length.
    if file_size < 0.9 * available_memory:

        length = 0

    else:

        length = int(0.9 * available_memory)

    with mmap.mmap(fi.fileno(), length=length, access=mmap.ACCESS_READ) as fii:

        map_end = len(fii)

        fields_sig = b"ITEM: ATOMS"
        Natom_sig = b"ITEM: NUMBER OF ATOMS"
        timestep_sig = b"ITEM: TIMESTEP\n"
        bounds_sig = b"ITEM: BOX BOUNDS"

        Natom_len = len(Natom_sig)
        timestep_len = len(timestep_sig)
        bounds_len = len(bounds_sig)

        # Locate the first occurrence of every header up front so a file that
        # is not in the expected format fails with a clear message.
        bounds_start = fii.find(bounds_sig)
        fields_start = fii.find(fields_sig)
        Natoms_start = fii.find(Natom_sig)

        if min(bounds_start, fields_start, Natoms_start) < 0:

            raise ValueError(
                f"{data_path}: missing BOX BOUNDS, NUMBER OF ATOMS or ATOMS header"
            )

        # Box bounds: the rest of the header line holds the boundary flags, the
        # next three lines hold one (lo, hi) pair each.
        bounds_end = fii.find(b"\n", bounds_start)
        bounds_type = fii[bounds_start + bounds_len : bounds_end].split()[0]
        bounds = np.zeros((3, 2))
        fii.seek(bounds_end + 1)

        for i in range(3):

            bounds[i] = np.array(fii.readline().split(), dtype=float)

        # Column names are the tokens following "ITEM: ATOMS" on the header line.
        fields_end = fii.find(b"\n", fields_start)
        fields = fii[fields_start + len(fields_sig) : fields_end].split()
        # Header length excluding the newline; assumed identical for every frame.
        fields_length = fields_end - fields_start
        Nfield = len(fields)

        # The atom count sits on the line after its header.
        Natoms_end = fii.find(b"\n", Natoms_start)
        fii.seek(Natoms_end + 1)
        Natom = int(fii.readline())

        # Byte offsets of every ATOMS header and of the end of every TIMESTEP header.
        all_fields = np.array(
            [[m.start(), m.end()] for m in re.compile(fields_sig).finditer(fii)],
            dtype=int,
        )
        timestep_indices = np.array(
            [m.end() for m in re.compile(timestep_sig).finditer(fii)], dtype=int
        )

        # A usable frame needs both headers; a truncated mapping may end between them.
        Nt = min(timestep_indices.shape[0], all_fields.shape[0])

        # A frame's rows stop where the next TIMESTEP header starts; the final
        # frame runs to the end of the mapping (not to -1, which would silently
        # drop the last byte).
        frame_ends = np.append(timestep_indices[1:] - timestep_len, map_end)[:Nt]

        # When only a prefix of the file is mapped, a frame that reaches the end
        # of the mapping may be cut mid-row, so it is discarded.
        if length and Nt > 0 and frame_ends[-1] == map_end:

            Nt -= 1
            frame_ends = frame_ends[:Nt]

        if Nt == 0:

            raise ValueError(f"{data_path}: no complete timestep found")

        all_fields = all_fields[:Nt]
        timestep_indices = timestep_indices[:Nt]
        all_fields_start = all_fields[:, 0]
        all_fields_end = all_fields[:, 1]

        timesteps = np.zeros(Nt, dtype=int)

        for i, t_index in enumerate(timestep_indices):

            fii.seek(t_index)
            timesteps[i] = int(fii.readline())

        # [start, stop) byte range of the per-atom rows of every frame.
        data_indices = np.empty((Nt, 2), dtype=int)
        data_indices[:, 0] = all_fields_start + fields_length
        data_indices[:, 1] = frame_ends

        all_data = np.zeros((Nt, Natom, Nfield))

        for t in range(Nt):

            start, stop = data_indices[t]
            # reshape(Natom, Nfield) raises if a frame does not hold exactly
            # Natom rows instead of silently broadcasting a short frame.
            data_t = np.array(fii[start:stop].split(), dtype=float).reshape(
                Natom, Nfield
            )
            # Rows are ordered by the first column so that atom k sits at the
            # same index in every frame.
            sort = np.argsort(data_t[:, 0])

            all_data[t] = data_t[sort]

In [5]:
# Build the per-atom table: a dict of (Nt, Natom) arrays keyed by column name.

# The parser leaves the column names as bytes. Decode only what is still bytes
# so this cell can be re-run without failing on already-decoded strings.
fields = [
    field.decode("utf-8") if isinstance(field, bytes) else field for field in fields
]
print(fields)

atomic_masses_arr = np.array(list(atomic_masses.values()))

# Dump columns that are not carried over into the table.
SKIPPED_FIELDS = {"ix", "iy", "iz"}
# Dump columns stored under a name other than "atom_<field>".
RENAMED_FIELDS = {
    "c_ld[1]": "atom_ld",
    "c_ld[2]": "atom_lt",
    "c_pe": "atom_pe",
    "c_ke": "atom_ke",
    "mol": "molecule_id",
}
# Dump columns holding identifiers; stored as int instead of float.
INTEGER_FIELDS = {"id", "type", "mol"}

df_atom = {}

for i, field in enumerate(fields):

    if field in SKIPPED_FIELDS:

        continue

    column = all_data[:, :, i]

    if field in INTEGER_FIELDS:

        column = column.astype(int)

    df_atom[RENAMED_FIELDS.get(field, "atom_" + field)] = column

# atom_map comes from utils; presumably it looks up one mass per atom type.
# atomic_masses_arr follows the insertion order of atomic_masses, so the
# type-id -> index convention must match what atom_map expects -- TODO confirm.
df_atom["atom_mass"] = atom_map(
    atomic_masses_arr, df_atom["atom_type"].flatten()
).reshape(Nt, Natom)

# Every atom of frame k carries that frame's timestep value.
df_atom["timestep"] = np.repeat(timesteps, Natom).reshape(Nt, Natom)

# atom_to_molecule comes from utils; later cells read "molecule_*" keys from its result.
df_molecule = atom_to_molecule(df_atom)

['id', 'type', 'mol', 'q', 'x', 'y', 'z', 'vx', 'vy', 'vz', 'ix', 'iy', 'iz', 'c_pe', 'c_ke', 'c_ld[1]', 'c_ld[2]']


In [15]:
# Dash app: pick clustering variables and a timestep, view the clustered
# molecules in a 3-D scatter plot.
app = Dash()

# Spacing between consecutive frames, used as the slider step. Falls back to 1
# when the dump holds a single frame (timesteps[1] would not exist).
dt = int(timesteps[1] - timesteps[0]) if len(timesteps) > 1 else 1

# Every molecule-level quantity except identifiers, timestep and mass columns.
cluster_vars = [
    name
    for name in df_molecule
    if not any(tag in name for tag in ("id", "timestep", "mass"))
]

# Plain Python ints for the component props instead of numpy scalars.
first_timestep = int(timesteps.min())
last_timestep = int(timesteps.max())


app.layout = [
    html.H1(children="", style={"textAlign": "center"}),
    html.Div(
        [
            dcc.RadioItems(
                ["ld", "lt", "vx", "z", "pe", "ke"],
                value="z",
                id="cluster-algo",
                inline=True,
                style={"color": "white", "padding-top": 10, "padding-bottom": 6},
            ),
            dcc.Dropdown(cluster_vars, "molecule_z", id="cluster-vars", multi=True),
        ],
        # A CSS length given as a string needs a unit; a bare "300" is ignored.
        style={"width": "300px", "display": "inline-block"},
    ),
    html.Div(
        [
            dcc.Graph(id="graph-content"),
            dcc.Slider(
                min=first_timestep,
                max=last_timestep,
                step=dt,
                # Start on the first available frame; 0 may lie outside [min, max].
                value=first_timestep,
                id="timestep",
                # One labelled mark every 100 frames.
                marks={int(ts): str(ts) for ts in timesteps[::100]},
            ),
        ],
        style={"width": "1000px", "height": "1000px", "display": "inline-block"},
    ),
]


@callback(
    Output("graph-content", "figure"),
    Input("cluster-vars", "value"),
    Input("timestep", "value"),
)
def update_graph(vars, t):
    """Cluster the molecules of one frame and draw them as a 3-D scatter plot.

    Args:
        vars: Key(s) of ``df_molecule`` to cluster on -- a single string (the
            dropdown's initial value) or a list of strings (multi-select).
        t: Timestep value chosen on the slider.

    Returns:
        A ``go.Figure`` with one marker per molecule, coloured by the label
        returned from ``kmeans.run(features, 2)``.

    Raises:
        PreventUpdate: when no variable is selected or no timestep is set, so
            the current figure is left untouched instead of the callback failing.
    """

    from dash.exceptions import PreventUpdate

    selected = [name for name in (vars if isinstance(vars, list) else [vars]) if name]

    if not selected or t is None:

        raise PreventUpdate

    # Map the slider value to the nearest stored frame. This does not assume
    # that the timesteps start at 0 or are evenly spaced.
    frame = int(np.abs(timesteps - t).argmin())

    # One row per molecule, one column per selected variable. Stacking the
    # per-variable vectors as columns keeps each molecule's values together;
    # reshaping a (n_vars, n_molecules) array to (n_molecules, -1) would
    # interleave values of different molecules once more than one variable is
    # selected. It also removes the hard-coded molecule count.
    features = np.column_stack(
        [np.abs(df_molecule[name][frame]) for name in selected]
    )

    # kmeans is a project module; presumably run(data, k) returns one label per row.
    res = kmeans.run(features, 2)

    x = df_molecule["molecule_x"][frame]

    y = df_molecule["molecule_y"][frame]

    z = df_molecule["molecule_z"][frame]

    fig = go.Figure(
        data=[go.Scatter3d(x=x, y=y, z=z, mode="markers", marker=dict(color=res))]
    )

    # fig.update_layout(width=800, height=800)
    fig.update_layout(
        margin=dict(l=20, r=20, t=20, b=20),
        paper_bgcolor="black",
        # A constant uirevision keeps the user's camera/zoom across updates.
        uirevision=True,
        transition={"duration": 3000, "easing": "cubic-in-out"},
        # Pin the axes to the simulation box so frames are comparable.
        scene=dict(
            xaxis=dict(range=[bounds[0, 0], bounds[0, 1]]),
            yaxis=dict(range=[bounds[1, 0], bounds[1, 1]]),
            zaxis=dict(range=[bounds[2, 0], bounds[2, 1]]),
        ),
        width=400,
        height=400,
    )

    return fig


# Start the Dash server with debug mode enabled. The guard keeps the server
# from starting if this code is imported as a module rather than run directly.
if __name__ == "__main__":

    app.run(debug=True)

In [9]:
# Static 3-D scatter of every atom in the first stored frame, coloured by atom type.
t = 0

x, y, z = df_atom["atom_x"][t], df_atom["atom_y"][t], df_atom["atom_z"][t]

atom_trace = go.Scatter3d(
    x=x,
    y=y,
    z=z,
    mode="markers",
    marker={"color": df_atom["atom_type"][t]},
)

fig = go.Figure(data=[atom_trace])
fig.update_layout(width=800, height=2000)
fig.show()

In [9]:
# importlib.reload(kmeans)
# import kmeans

# data = np.array([df_molecule["molecule_ld"]]).reshape(1001, 101, -1)

# data = np.array(
#     [
#         np.abs(df_molecule["molecule_z"]),
#         df_molecule["molecule_ld"],
#         np.abs(df_molecule["molecule_vz"]),
#         df_molecule["molecule_pe"],
#         df_molecule["molecule_ke"],
#     ]
# ).reshape(1001, 101, -1)

# kmeans_results = np.zeros((Nt, 101), dtype=np.uint8)

# for t in nb.prange(Nt):

#     data_t = data[t]

#     res = kmeans.run(data_t, 2)

#     kmeans_results[t] = res


# x = df_molecule["molecule_x"][0]
# y = df_molecule["molecule_y"][0]
# z = df_molecule["molecule_z"][0]

# fig = go.Figure(
#     data=[
#         go.Scatter3d(
#             x=x, y=y, z=z, mode="markers", marker=dict(color=kmeans_results[0])
#         )
#     ]
# )
# fig.update_layout(width=800, height=800)
# fig.show()

In [None]:
# def f(x):

#     return x.struct.field("atom_vx") + x.struct.field("atom_mass")


# def f(x):

#     a = x.struct.field("atom_x").to_numpy()
#     b = x.struct.field("atom_mass").to_numpy()

#     return weighted_average_dataframe(a, b)


# df_gb.agg(
#     pl.struct(["atom_x", "atom_mass"])
#     .map_elements(
#         lambda dff: weighted_average_dataframe(
#             dff.struct.field("atom_x").to_numpy(),
#             dff.struct.field("atom_mass").to_numpy(),
#         ),
#     )


#     .alias("new")

# )


# df_gb = df.group_by(["timestep", "molecule_id"], maintain_order=True)
# df_gb.mean()

# df_gb.agg(pl.struct(pl.col("atom_x"), pl.col("atom_mass")).map_elements(f).alias("new"))