In [None]:
import torch
import sys
import os

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from ase.io import read
from tqdm import tqdm
from torch.nn.functional import pad
from omegaconf import OmegaConf
from ViSNetGW.data.gwset import GWSet



# Fix PyTorch's global RNG seed so any random draws in this notebook are repeatable.
SEED = 42
torch.manual_seed(SEED)

In [None]:
# Fixed per-molecule length that atomic numbers and positions are padded to
# when the tensors are built further down.
# Presumably the largest atom count of any QM9 molecule — TODO confirm.
QM9_N_MAX = 29

In [None]:
# Dataset locations.
# The root defaults to the original local checkout but can be redirected with
# the GWSET_ROOT environment variable, so the notebook can run on another
# machine without editing this cell.
gwset_root = os.environ.get("GWSET_ROOT", "/Users/dario/datasets/GWSet")

xyz_path = f"{gwset_root}/QM9/QM9_xyz_files"  # holds the mol_<i>.xyz geometry files
results_path = f"{gwset_root}/results"
eqp_path = f"{results_path}/E_qp"             # holds the mol_<i>.dat energy files
homo_path = f"{results_path}/homo_idx"        # holds the mol_<i>.dat index files

In [None]:
# Number of atoms in every molecule file mol_1.xyz ... mol_133885.xyz,
# in index order.
all_num_atoms = [
    len(read(os.path.join(xyz_path, f"mol_{idx}.xyz")))
    for idx in tqdm(range(1, 133886), leave=False)
]

In [None]:
# Histogram of molecule sizes (atoms per molecule), with a tick every two atoms.
tick_positions = list(range(0, 31, 2))
plt.hist(all_num_atoms, bins=26)
plt.xticks(tick_positions)
plt.show()

In [None]:
# Summary statistics of the atom counts: mean, standard deviation, and then the
# mean of the counts shifted by -2, -1, +1 and +2 standard deviations.
sigma = np.std(all_num_atoms)

print(np.mean(all_num_atoms))
print(sigma)
for k in (-2, -1, 1, 2):
    # `list + numpy scalar` is handled by NumPy and broadcasts element-wise.
    print(np.mean(all_num_atoms + k * sigma))

In [None]:
def _load_padded_molecule(index):
    """Read molecule `index` from disk and return its (N, Z, R, E) tensors.

    N is a one-element tensor holding the atom count. Z (atomic numbers) and
    R (positions) are padded at the end along the atom axis by
    QM9_N_MAX - atom count. E is a one-element tensor holding the entry of
    the molecule's E_qp file selected by its homo_idx file.
    """
    stem = f"mol_{index}"
    molecule = read(f"{xyz_path}/{stem}.xyz", format="xyz")
    homo = np.loadtxt(f"{homo_path}/{stem}.dat", dtype=int)

    n_atoms = len(molecule)
    n_fill = QM9_N_MAX - n_atoms  # trailing padding along the atom axis

    numbers = pad(torch.from_numpy(molecule.get_atomic_numbers()), pad=(0, n_fill))
    # For a 2-D input the pad tuple is (last-dim left, last-dim right,
    # first-dim left, first-dim right): only the atom axis grows.
    coords = pad(torch.from_numpy(molecule.get_positions()), pad=(0, 0, 0, n_fill))
    energy = torch.tensor([np.loadtxt(f"{eqp_path}/{stem}.dat")[homo]])

    return torch.tensor([n_atoms]), numbers, coords, energy


# Per-molecule tensors, collected in file-index order.
all_N, all_Z, all_R, all_E = [], [], [], []
for i in tqdm(range(1, 133886), leave=False):
    N, Z, R, E = _load_padded_molecule(i)
    all_N.append(N)
    all_Z.append(Z)
    all_R.append(R)
    all_E.append(E)

In [None]:
# sys.getsizeof() on a list reports only the list object itself (its array of
# references), not the tensors it points to. That is why the earlier estimate
# came out at only a few MB. Sum the tensors' data buffers instead.
def _payload_bytes(items):
    """Return the bytes used by the element data of the tensors in `items`.

    Entries that are not tensors fall back to their shallow sys.getsizeof().
    """
    return sum(
        item.element_size() * item.nelement()
        if isinstance(item, torch.Tensor)
        else sys.getsizeof(item)
        for item in items
    )


# Total tensor data held by the four lists, in MB (10**6 bytes).
print(sum(_payload_bytes(group) for group in (all_N, all_Z, all_R, all_E)) / 1000000)

In [None]:
# Join each list of per-molecule tensors into a single tensor.
# NOTE(review): concatenate joins along the existing first dimension, so the
# per-molecule entries are laid end to end rather than stacked into a new
# batch dimension — confirm this is intended (torch.stack would give one row
# per molecule).
N, Z, R, E = (
    torch.concatenate(parts) for parts in (all_N, all_Z, all_R, all_E)
)

In [None]:
# Quick sanity check: show the first ten entries of the joined atomic-number tensor.
print(Z[:10])

In [None]:
# Load the project configuration (path is relative to the notebook's working
# directory) and build the GWSet data module from its `data` section, whose
# entries are passed as keyword arguments.
cfg = OmegaConf.load("../config.yaml")
data_module = GWSet(**cfg.data)

In [None]:
def _fourth_as_float(sample):
    """Unpack a four-item training sample and return its last item as a Python scalar."""
    _first, _second, _third, target = sample  # raises if the sample is not exactly four items
    return target.item()


# Collect the last element of every training sample.
# The helper keeps its names local, so the notebook-level tensor `E` and
# IPython's `_` (last output) are no longer overwritten by the loop.
# `all_E` keeps its name because the comparison histogram reads it; note that
# it replaces the list of per-molecule tensors built earlier in the notebook.
n = len(data_module.train_dataset)
all_E = [
    _fourth_as_float(data_module.train_dataset[i])
    for i in tqdm(range(n), leave=False)
]

In [None]:
# OMol25 value tables for the 1M, 5M and 10M subsets, loaded in that order.
omol25_csvs = (
    "../test_datasets/omol25_values_1M.csv",
    "../test_datasets/omol25_values_5M.csv",
    "../test_datasets/omol25_values_10M.csv",
)
df1, df2, df3 = map(pd.read_csv, omol25_csvs)

In [None]:
# Overlay normalised histograms of the GWSet values in `all_E` and two columns
# of the OMol25 1M table.
# Each series has its own legend label (the two OMol25 series previously shared
# one) and is drawn semi-transparent so overlapping bars stay visible.
plt.hist(all_E, bins=50, density=True, alpha=0.6, label="GWSet")
plt.hist(df1["e_homo"], bins=50, density=True, alpha=0.6, label="OMol25 1M (e_homo)")
# Optional: the larger OMol25 subsets.
#plt.hist(df2["e_homo"], bins=50, density=True, alpha=0.6, label="OMol25 5M (e_homo)")
#plt.hist(df3["e_homo"], bins=50, density=True, alpha=0.6, label="OMol25 10M (e_homo)")
plt.hist(df1["gap"], bins=50, density=True, alpha=0.6, label="OMol25 1M (gap)")
plt.legend()
plt.show()