In [None]:
import torch
import sys
import os

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from ase.io import read
from tqdm import tqdm
from torch.nn.functional import pad
from omegaconf import OmegaConf
from ViSNetGW.data.gwset import GWSet



# Seed PyTorch's global random number generator so torch-based randomness in
# the cells below is repeatable across kernel restarts.
torch.manual_seed(42)

In [None]:
# NOTE(review): presumably the largest number of atoms in any QM9 molecule;
# it is not referenced by the visible cells — confirm where it is consumed.
QM9_N_MAX = 29

In [None]:
# Root of the local GWSet download. The default is the original location;
# export GWSET_ROOT to run the notebook on another machine without editing it.
gwset_root = os.environ.get("GWSET_ROOT", "/Users/dario/datasets/GWSet")

# Directory holding the per-molecule mol_<i>.xyz geometry files.
xyz_path = f"{gwset_root}/QM9/QM9_xyz_files"
# Directory holding the computed results and its two sub-directories.
# NOTE(review): presumably E_qp = quasiparticle energies and homo_idx = HOMO
# orbital indices — confirm against the dataset layout.
results_path = f"{gwset_root}/results"
eqp_path = f"{results_path}/E_qp"
homo_path = f"{results_path}/homo_idx"

# Number of atoms

In [None]:
# Read every mol_<i>.xyz structure (i = 1 .. 133885) and keep only its atom
# count; the progress bar is removed once the scan finishes.
all_num_atoms = [
    len(read(os.path.join(xyz_path, f"mol_{mol_id}.xyz")))
    for mol_id in tqdm(range(1, 133886), leave=False)
]

In [None]:
# Histogram of molecule sizes with an x tick every two atoms from 0 to 30.
even_ticks = list(range(0, 31, 2))
plt.hist(all_num_atoms, bins=26)
plt.xticks(even_ticks)
plt.show()

In [None]:
# Summary statistics of the per-molecule atom counts.
# Convert once to an ndarray so the arithmetic below does not depend on
# `list ± numpy scalar` being silently coerced, and compute mean / std a
# single time instead of once per printed line.
num_atoms = np.asarray(all_num_atoms)
num_atoms_mean = num_atoms.mean()
num_atoms_std = num_atoms.std()

print(num_atoms_mean)
print(num_atoms_std)
# Same order as before: mean - 2*std, mean - std, mean + std, mean + 2*std.
for n_sigma in (-2, -1, 1, 2):
    print(num_atoms_mean + n_sigma * num_atoms_std)

# Distribution of energies

In [None]:
# Load the project configuration from the parent directory and build the
# GWSet data module from its `data` section (expanded as keyword arguments).
cfg = OmegaConf.load("../config.yaml")
data_module = GWSet(**cfg.data)

In [None]:
# Collect the scalar stored in the fourth slot of every training sample.
# NOTE(review): assumes `train_dataset` is a plain attribute, so reading it
# once is equivalent to reading it on every iteration — confirm.
train_set = data_module.train_dataset
n = len(train_set)
all_E = []
for sample_idx in tqdm(range(n), leave=False):
    # Each sample unpacks into exactly four items; only the last is used.
    _, _, _, energy = train_set[sample_idx]
    all_E.append(energy.item())

In [None]:
# Load the three OMol25 value tables (1M, 5M and 10M variants) in one pass.
df1, df2, df3 = [
    pd.read_csv(csv_file)
    for csv_file in (
        "../test_datasets/omol25_values_1M.csv",
        "../test_datasets/omol25_values_5M.csv",
        "../test_datasets/omol25_values_10M.csv",
    )
]

In [None]:
# Overlay density-normalised histograms of the GWSet training values and the
# OMol25 1M columns. Every series gets its own legend label (the e_homo and
# gap series previously shared "OMol25 1M"), and the bars are drawn
# semi-transparent so overlapping distributions do not hide each other.
hist_kwargs = dict(bins=50, density=True, alpha=0.5)
plt.hist(all_E, label="GWSet", **hist_kwargs)
plt.hist(df1["e_homo"], label="OMol25 1M (e_homo)", **hist_kwargs)
# Larger OMol25 subsets, disabled by default:
#plt.hist(df2["e_homo"], label="OMol25 5M (e_homo)", **hist_kwargs)
#plt.hist(df3["e_homo"], label="OMol25 10M (e_homo)", **hist_kwargs)
plt.hist(df1["gap"], label="OMol25 1M (gap)", **hist_kwargs)
plt.legend()
plt.show()

# Correlation between GW gap and first GW-BSE energy

In [None]:
# Load the stored EXC_SS and GAP training tensors, drop singleton dimensions
# and convert each to a NumPy array by way of a plain Python list.
EXC_SS, GAP = (
    np.array(torch.load(tensor_file).squeeze().tolist())
    for tensor_file in (
        "/Users/dario/ViSNetGW/GWSet/train/EXC_SS.pt",
        "/Users/dario/ViSNetGW/GWSet/train/GAP.pt",
    )
)

In [None]:
# Mean of each target, printed side by side (EXC_SS first, then GAP).
print(EXC_SS.mean(), GAP.mean())

In [None]:
# Offset between the two means (GAP minus EXC_SS), shown as the cell output.
np.mean(GAP) - np.mean(EXC_SS)

In [None]:
# Re-centre EXC_SS so that its mean coincides with the mean of GAP.
# This overwrites EXC_SS: the cells above show different values if they are
# re-run after this one.
EXC_SS = (EXC_SS - EXC_SS.mean()) + GAP.mean()

In [None]:
# Density-normalised histograms of the (shifted) EXC_SS values and of GAP,
# drawn on the same axes.
for values, legend_name in ((EXC_SS, "EXC"), (GAP, "GAP")):
    plt.hist(values, bins=50, density=True, label=legend_name)
plt.legend()
plt.show()

In [None]:
# One point per sample: EXC_SS on the x axis against GAP on the y axis.
plt.scatter(x=EXC_SS, y=GAP)
plt.show()

# QCDGE

In [None]:
import h5py
import json
import torch

import pandas as pd

from tqdm import tqdm

In [None]:
# Load the QCDGE table from the external volume. Later cells read its
# "Index" and "Smiles_rdkit_can" columns.
df = pd.read_csv("/Volumes/LaCie/QCDGE/final_all.csv")

In [None]:
# Number of rows in the QCDGE table.
print(df.shape[0])

In [None]:
# Rows whose "Smiles_rdkit_can" value already appeared in an earlier row.
# `duplicated` already returns a boolean mask, so it is used directly
# instead of being compared against True.
df[df.duplicated("Smiles_rdkit_can")]

In [None]:
# Scan every QCDGE entry and record the largest molecule size in N_max.
# The earlier version kept a debug `print` + `break` inside the loop, so it
# stopped after the first molecule and N_max was not a maximum at all.
#
# Other per-molecule fields referenced by earlier drafts of this cell:
#   f[key]["ground_state"]["coords"]
#   f[key]["excited_state"]["Info_of_AllExcitedStates"]  (bytes, JSON-decoded;
#       first excitation energy read from ["1"]["excitation_e_eV"])
n = len(df)
N_max = -1  # stays -1 when the table is empty
with h5py.File("/Volumes/LaCie/QCDGE/final_all.hdf5", "r") as f:
    # Iterate over the "Index" column directly rather than materialising a
    # whole row with df.iloc[i] on every step.
    for mol_key in tqdm(df["Index"], leave=False):
        atomic_numbers = f[mol_key]["ground_state"]["labels"]
        # NOTE(review): assumes `labels` is 2-D with the atoms along axis 1
        # (the original also read shape[1]) — confirm against the HDF5 schema.
        N_max = max(N_max, atomic_numbers.shape[1])
print(N_max)

In [None]:
# Show the value of N_max left by the scan in the previous cell.
print(N_max)