In [1]:
import h5py    
import numpy as np    
import torch
import gc
import csv
from torch import nn
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
import random
from tqdm.auto import tqdm
# from tqdm.notebook import tqdm
from NN_models import NN_2_256, NN_8_256, NN_8_64


def ref(x, y):
    ''' 
    returns reference energies for points of a reaction grid from Reference_data.csv
    '''
    hartree2kcal = 627.5095
    with open("Reference_data.csv", newline='', encoding='cp1251') as csvfile:
        ref_file = csv.reader(csvfile, delimiter=",")
        k = 1
        if y == 391:
            k = hartree2kcal
        ref = []
        for n, i in enumerate(ref_file):
            if x <= n + 1 <= y:
                ref.append((i[0], float(i[2]) * k))

        return ref

def load_ref_energies():
    '''Returns {db_name: [equation, energy]}'''
    ref_e = { # Получение референсных энергий
        "MGAE109":ref(8, 116),
        "IP13":ref(155, 167),
        "EA13":ref(180, 192),
        "PA8":ref(195, 202),
        "DBH76":ref(251, 288) + ref(291, 328),
        "NCCE31":ref(331, 361),
        "ABDE4":ref(206, 209),
        # "AE17":ref(375, 391),
        "pTC13":ref(232, 234) + ref(237, 241) + ref(244, 248)
        } 
    return ref_e

def load_component_names():
    '''
    Returns {db_name: {id: {'Components': [...], 'Coefficients: [...]'
                                }
                            }
                        }
     which is a dictionary with Components and Coefficients data about all reactions
    '''
    with open("total_dataframe_sorted_final.csv", newline='', encoding='cp1251') as csvfile:
        ref_file = csv.reader(csvfile, delimiter=",")
        ref = dict()
        current_database = None
        
        for n, line in enumerate(ref_file):
            line = np.array(line)
            if n == 0:
                components = np.array(line)
            else:
                reaction_id = int(line[0])
                reaction_database = line[1]
                reaction_component_num = np.nonzero(list(map(float, line[2:])))[0] + 2
                if reaction_database in ref:
                    ref[reaction_database][reaction_id] = {'Components': components[reaction_component_num], 'Coefficients': line[reaction_component_num]}
                else: 
                    ref[reaction_database] = {reaction_id: {'Components': components[reaction_component_num], 'Coefficients': line[reaction_component_num]}}
        return ref
    
    
def get_compounds_coefs_energy_v2(reactions, energies):
    '''Returns {id: 
                    {'Components': [...], 'Coefficients: [...]', 'Energy: float', Database: str
                                }
                            }
    which is a dictionaty from load_component_names with Energy information added
    '''
    data_final = dict()
    i = 0
    databases = load_ref_energies().keys()
    for database in databases:
        data = reactions[database]
        for reaction in data:
            data_final[i] = {'Database': database,
                             'Components': reactions[database][reaction]['Components'], #.astype(object),
                             'Coefficients': torch.Tensor(reactions[database][reaction]['Coefficients'].astype(np.float32)),
                             'Energy': torch.Tensor(np.array(energies[database][reaction][1]))
            
        }
            i += 1
        
    return data_final


def get_h5_names(reaction):
    '''reaction must be from the function get_compounds_coefs_energy_v2'''
    database_match = {
        'MGAE109': 'mgae109',
        'IP13': 'ip13',
        'EA13': 'ea13',
        'PA8': 'pa8',
        'DBH76': 'ntbh38',
        'NCCE31': 'ncce31',
        'ABDE4': 'abde4',
        'AE17': 'ae17',
        'pTC13': 'ptc13'
    }
    names = []
    for elem in reaction['Components']:
        database = database_match[reaction['Database']]
        names.append(f'{elem}.h5')
    return names


def add_reaction_info_from_h5(reaction):
    '''
    reaction must be from get_compounds_coefs_energy_v2
    returns merged descriptos array X, integration weights, 
    a and b densities and indexes for backsplitting
    
    Adds the following information to the reaction dict using h5 files from the dataset:
    Grid : np.array with grid descriptors
    Weights : list with integration weights of grid points
    Densities : np.array with alpha and beta densities data for grid points
    HF_energies : list of Total HF energy (T+V) which needs to be added to E_xc
    backsplit_ind: list of indexes where we concatenate molecules' grids
    '''
    X = np.array([])
    backsplit_ind = []
    HF_energies = np.array([])
    for component_filename in get_h5_names(reaction):
        with h5py.File(f'data/{component_filename}', "r") as f:
            HF_energies = np.append(HF_energies, f["ener"][:][0])
            X_raw = np.array(f["grid"][:])
            if len(X) == 0:
                X = X_raw[:, 3:-1]
            else:
                X = np.vstack((X, X_raw[:, 3:-1]))
            backsplit_ind.append(len(X))
    densities = X[:, 1:3]
    weights = X[:,0]
    X = X[:, 1:]

    labels = ['Grid', 'Weights', 'Densities', 'HF_energies', 'backsplit_ind']
    values = [X, weights, densities, HF_energies, backsplit_ind]
    for label, value in zip(labels, values):
        reaction[label] = torch.Tensor(value)

    return reaction


def make_reactions_dict():
    '''
    Returns a dict like {reaction_id: {*reaction info}} with all info available listed below:
    ['Database', 'Components', 'Coefficients', 'Energy', 'Grid', 'Weights', 'Densities', 'HF_energies', 'backsplit_ind']
    '''
    data = get_compounds_coefs_energy_v2(load_component_names(), load_ref_energies())
    for i in data.keys():
        data[i] = add_reaction_info_from_h5(data[i])

    return data


def set_random_seed(seed):
    torch.backends.cudnn.deterministic = True
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

set_random_seed(42)


data = make_reactions_dict()

In [2]:
data[0]

{'Database': 'MGAE109',
 'Components': array(['C_mgae109', 'H_mgae109', 'CH_mgae109'], dtype='<U20'),
 'Coefficients': tensor([ 1.,  1., -1.]),
 'Energy': tensor(84.2300),
 'Grid': tensor([[6.3204e+01, 6.3213e+01, 1.9346e+00,  ..., 1.9352e+00, 7.5028e+00,
          3.8268e-03],
         [6.3204e+01, 6.3213e+01, 1.6335e+02,  ..., 1.6340e+02, 7.8221e+00,
          3.2312e-01],
         [6.3201e+01, 6.3210e+01, 2.1850e+03,  ..., 2.1857e+03, 1.1821e+01,
          4.3224e+00],
         ...,
         [4.8742e-04, 3.2433e-04, 9.2476e-07,  ..., 5.0238e-07, 2.6551e-04,
          1.9790e-04],
         [6.3506e-04, 4.3839e-04, 1.5955e-06,  ..., 9.3007e-07, 3.5282e-04,
          2.7110e-04],
         [4.8742e-04, 3.2433e-04, 9.2476e-07,  ..., 5.0238e-07, 2.6551e-04,
          1.9790e-04]]),
 'Weights': tensor([2.5717e-17, 9.9780e-15, 3.2611e-13,  ..., 2.6018e-02, 2.2653e-02,
         2.6018e-02]),
 'Densities': tensor([[6.3204e+01, 6.3213e+01],
         [6.3204e+01, 6.3213e+01],
         [6.3201e+

In [3]:
def encode_components(data):
    for i in data:
        data[i]['Components'] = data[i]['Components'].tobytes()
        
encode_components(data)

In [4]:
y_single = [0.0310907, 0.01554535, 
            3.72744,   7.06042,
            12.9352,   18.0578,
            -0.10498,  -0.32500,
            0.0310907,  0.01554535,  -1/(6*np.pi**2),
            13.0720,    20.1231,      1.06835,
            42.7198,   101.578,      11.4813,
            -0.409286,  -0.743294,   -0.228344,
            1]

nconstants = len(y_single)

device = torch.device('cuda:0') if torch.cuda.is_available else torch.device('cpu')
device


all_grid_data = data[0]['Grid']
for i in range(len(data)-1):
    all_grid_data = torch.cat([all_grid_data, data[i]['Grid']])
print(all_grid_data.shape)


stdscaler = StandardScaler()
stdscaler.fit(np.array(all_grid_data))

for i in data:
    data[i]['Grid'] = torch.Tensor(stdscaler.transform(data[i]['Grid']))

torch.Size([87591726, 7])


In [5]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, data):

        self.data = data
        
    def __getitem__(self, i):
        self.data[i].pop('Database', None)

        return self.data[0], y_single
    
    def __len__(self):
        return len(self.data.keys())


train_set = Dataset(data=data)
train_dataloader = torch.utils.data.DataLoader(train_set,
                                               batch_size=1,
                                               num_workers=1,
                                               pin_memory=True,
                                               shuffle=True)

criterion = nn.MSELoss()

In [74]:
# np.frombuffer(X_batch['Components'][0], dtype='<U20')

In [75]:
# torch.cuda.reset()
# torch.cuda.empty_cache()
# del X_batch, y_batch

In [76]:
from GPUtil import showUtilization as gpu_usage

In [6]:
def train(model, criterion, optimizer, scheduler, train_dataloader, n_epochs=2):
    
    torch.set_printoptions(precision=5)
    
    train_loss_mse = []
    train_loss_mae = []
    test_loss_mse = []
    test_loss_mae = []


    for epoch in range(n_epochs):
        print('Epoch', epoch+1)
        # train
        model.train()


        train_mse_losses_per_epoch = []
        train_mae_losses_per_epoch = []
        
        progress_bar = tqdm(train_dataloader)

        # gradscaler = GradScaler()
        for i, (X_batch, y_batch) in enumerate(progress_bar):
            # print(np.frombuffer(X_batch['Components'][0], dtype='<U20'), X_batch['Grid'][0].shape)

            X_batch = X_batch['Grid'][0].to(device, non_blocking=True)
            y_batch = torch.tile(torch.Tensor(y_batch), [X_batch.shape[0],1]).to(device, non_blocking=True)
            # with torch.cuda.amp.autocast(dtype=torch.bfloat16):
            predictions = model(X_batch)
            loss = criterion(predictions, y_batch)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad(set_to_none=True)
            MAE = mean_absolute_error(predictions.cpu().detach(), y_batch.cpu().detach())
            MSE = loss.item()
            train_mse_losses_per_epoch.append(MSE)
            train_mae_losses_per_epoch.append(MAE)
            scheduler.step()
            progress_bar.set_postfix(MAE = MAE, MSE = MSE)

            del X_batch, y_batch, predictions, loss, MAE, MSE
            gc.collect()
            torch.cuda.empty_cache()
            
        train_loss_mse.append(np.mean(train_mse_losses_per_epoch))
        train_loss_mae.append(np.mean(train_mae_losses_per_epoch))
        
        print(f'train MSE Loss = {train_loss_mse[epoch]:.8f}')
        print(f'train MAE Loss = {train_loss_mae[epoch]:.8f}')
        
    return train_loss_mse, train_loss_mae, predictions[0].cpu().detach().numpy()

In [7]:
model = NN_2_256(DFT='SVWN').to(device)
# model.load_state_dict(torch.load('predoptimized_3.param'))

In [8]:
optimizer = torch.optim.Adam(model.parameters(), lr=3e-3)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[100, 400, 700, 1000], gamma=1)

In [9]:
train_loss_mse, train_loss_mae, preds = train(model, criterion, optimizer, 
                                              scheduler, train_dataloader, n_epochs=5)

# print(train_loss_mse, train_loss_mae, test_loss_mse, test_loss_mae)
# print('predicted coef', '\n', preds)
# print('exact coef', '\n', np.array(y_single))

  0%|          | 0/267 [00:00<?, ?it/s]

Epoch 1


100%|██████████| 267/267 [00:42<00:00,  6.35it/s, MAE=0.778, MSE=4.25]
  0%|          | 0/267 [00:00<?, ?it/s]

train MSE Loss = 107.99707187
train MAE Loss = 2.67343998
Epoch 2


100%|██████████| 267/267 [00:46<00:00,  5.77it/s, MAE=0.689, MSE=3.51]
  0%|          | 0/267 [00:00<?, ?it/s]

train MSE Loss = 3.79537282
train MAE Loss = 0.72444433
Epoch 3


100%|██████████| 267/267 [00:45<00:00,  5.86it/s, MAE=0.64, MSE=3.15] 
  0%|          | 0/267 [00:00<?, ?it/s]

train MSE Loss = 3.29507779
train MAE Loss = 0.66153526
Epoch 4


  0%|          | 0/267 [00:00<?, ?it/s, MAE=0.64, MSE=3.14]Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fcba845c5e0>
Traceback (most recent call last):
  File "/home/danis/ML/ML_env/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1481, in __del__
    self._shutdown_workers()
  File "/home/danis/ML/ML_env/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1445, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/lib/python3.9/multiprocessing/process.py", line 149, in join
    res = self._popen.wait(timeout)
  File "/usr/lib/python3.9/multiprocessing/popen_fork.py", line 40, in wait
    if not wait([self.sentinel], timeout):
  File "/usr/lib/python3.9/multiprocessing/connection.py", line 936, in wait
    ready = selector.select(timeout)
  File "/usr/lib/python3.9/selectors.py", line 416, in select
    fd_event_list = self._selector.poll(timeout)
KeyboardInterrupt: 
 27%|██▋       | 73/267 [

In [12]:
!nvidia-smi

Sat Dec  3 22:17:05 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.86.01    Driver Version: 515.86.01    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0  On |                  N/A |
| N/A   43C    P8    13W /  N/A |     23MiB /  8192MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces