# Train using Fock Matrix
We use all Lethola Geometries and train only using RMSE on Fock matrix

In [12]:
import os, glob, sys
scripts_path = "../../scripts"
if scripts_path not in sys.path:
    sys.path.append(scripts_path)

from to_cache import density_fock_overlap


geometry_paths = "../../datasets/lethola/all_geometries"

In [13]:
def load_xyz_files(path):
    xyz_files = glob.glob(os.path.join(path, "*.xyz"))
    return xyz_files

def clear_cache(path="../../datasets/lethola/cache"):
    y = input(f"Do you really want to clear the cache @ {path}")
    if y.lower() != "y": 
        print("abort")
        return
    import shutil
    if os.path.exists(path):
        shutil.rmtree(path)
        print(f"Cache at {path} cleared.")
    else:
        print(f"No cache found at {path}.")

xyz_files = load_xyz_files(geometry_paths)
print(f"Found {len(xyz_files)} .xyz files.")

Found 261 .xyz files.


In [14]:
# test
test_file = xyz_files[9]

# basis_path = "/home/etschgi1/REPOS/Masterarbeit/scripts/6-31g_2df_p_custom_nwchem.gbs"
ret = density_fock_overlap(test_file, "test","dft", basis = "6-31g(2df,p)", 
                         functional = "b3lypg", cache="../../datasets/lethola/cache")


Loaded mol from ../../datasets/lethola/all_geometries/alh.xyz
converged SCF energy = -242.983872426811


Basic test working - now we create the reference results and cache them!

In [15]:
for c, file in enumerate(xyz_files): 
    mol_name = os.path.basename(file).split(".")[0]
    print(f"{mol_name} ({c+1}/{len(xyz_files)})")
    ret = density_fock_overlap(filepath = file,
                         filename = mol_name,
                         method = "dft",
                         basis = "sto-3g",
                         functional = "b3lypg",
                         guess = "minao",
                         backend = "pyscf",
                         cache = "../../datasets/lethola/cache_sto3g")
    if any([x is None for x in ret]): 
        print("Not all data available!")
    else: 
        print(f"Got all data for {mol_name}")
    print("\n---")

cf2cl2 (1/261)
Got all data for cf2cl2

---
n-pentane (2/261)
Got all data for n-pentane

---
PhSeH (3/261)
Got all data for PhSeH

---
hccf (4/261)
Got all data for hccf

---
CrO2F2 (5/261)
Got all data for CrO2F2

---
Mn(CO)5CN (6/261)
Got all data for Mn(CO)5CN

---
FeCp2 (7/261)
Got all data for FeCp2

---
Ti(BH4)3 (8/261)
Got all data for Ti(BH4)3

---
Fe(C5Me5)(P5) (9/261)
Got all data for Fe(C5Me5)(P5)

---
alh (10/261)
Got all data for alh

---
n2o (11/261)
Got all data for n2o

---
PR12 (12/261)
Got all data for PR12

---
Mn(CO)4NO (13/261)
Got all data for Mn(CO)4NO

---
formic (14/261)
Got all data for formic

---
PR05 (15/261)
Got all data for PR05

---
hocn (16/261)
Got all data for hocn

---
clcof (17/261)
Got all data for clcof

---
Ni(C5H5)NO (18/261)
Got all data for Ni(C5H5)NO

---
allene (19/261)
Got all data for allene

---
acetic (20/261)
Got all data for acetic

---
c2h2 (21/261)
Got all data for c2h2

---
t-hooo (22/261)
Got all data for t-hooo

---
chf3 (23/261)

ok now let's take a look at prominent matrix sizes

In [46]:
import numpy as np
fock_matrices = []
for file in xyz_files:
    mol_name = os.path.basename(file).split(".")[0]
    _, f, _, _, _ = density_fock_overlap(filepath = file,
                            filename = mol_name,
                            method = "dft",
                            basis = "sto-3g",
                            functional = "b3lypg",
                            guess = "minao",
                            backend = "pyscf",
                            cache = "../../datasets/lethola/cache_sto3g")
    fock_matrices.append(f)
shapes = []
for f in fock_matrices: 
    if type(f) == tuple: 
        shapes.append(f[0].numpy.shape)
    else:
        shapes.append(f.numpy.shape)

unique_shapes, counts = np.unique(shapes, return_counts=True)    
unique_shapes = [(int(u), int(c)) for u,c in zip(unique_shapes, counts)]
print(f"Unique shapes of Fock matrices:")
unique_shapes

Unique shapes of Fock matrices:


[(2, 2),
 (6, 10),
 (7, 8),
 (8, 6),
 (9, 2),
 (10, 26),
 (11, 18),
 (12, 28),
 (13, 10),
 (14, 24),
 (15, 14),
 (16, 30),
 (17, 16),
 (18, 12),
 (19, 22),
 (20, 18),
 (21, 10),
 (22, 4),
 (23, 8),
 (24, 20),
 (25, 6),
 (26, 8),
 (27, 8),
 (28, 8),
 (29, 4),
 (30, 14),
 (31, 2),
 (32, 2),
 (33, 10),
 (34, 6),
 (35, 2),
 (36, 6),
 (37, 2),
 (38, 14),
 (39, 2),
 (40, 2),
 (41, 4),
 (42, 2),
 (43, 2),
 (45, 4),
 (46, 4),
 (48, 4),
 (50, 2),
 (52, 2),
 (53, 2),
 (54, 4),
 (57, 2),
 (58, 20),
 (59, 2),
 (64, 4),
 (66, 2),
 (68, 12),
 (69, 2),
 (70, 2),
 (72, 8),
 (73, 2),
 (74, 4),
 (75, 2),
 (78, 12),
 (79, 2),
 (82, 4),
 (84, 2),
 (88, 2),
 (90, 2),
 (102, 6),
 (114, 2),
 (120, 2),
 (128, 2),
 (132, 2),
 (144, 2),
 (148, 2),
 (170, 2),
 (172, 2)]

Nicht gerade leicht - wir hätten zb. 30x 16x16 Molekühle - etwas wenig für ein Training! 