<a href="https://colab.research.google.com/github/Open-Catalyst-Project/ocp/blob/tutorials_01_11/tutorials/OCP_Tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
# Work from the repository root so the relative paths used below
# (./data/..., configs/...) resolve. Set the OCP_ROOT environment variable to
# run the notebook on another machine; the default is the original checkout.
os.chdir(os.environ.get("OCP_ROOT", "/l/users/elizaveta.starykh/OCP_project/ocp-git/"))

%load_ext autoreload
%autoreload 2

import sys
# print(sys.version)
import torch
# print(torch.__version__)

import torch
import matplotlib
matplotlib.use('Agg')
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import ase.io
from ase.io.trajectory import Trajectory
from ase.io import extxyz
from ase.calculators.emt import EMT
from ase.build import fcc100, add_adsorbate, molecule
from ase.constraints import FixAtoms
from ase.optimize import LBFGS
from ase.visualize.plot import plot_atoms
from ase import Atoms
from IPython.display import Image

import ocpmodels
import lmdb
import torch_geometric

from ocpmodels.datasets import LmdbDataset

### May 2024, python9_kernel, python9 env, python=3.9.18, CSCC
import e3nn
from ocpmodels.trainers import OCPTrainer
from ocpmodels.datasets import LmdbDataset
from ocpmodels import models
from ocpmodels.common import logger
from ocpmodels.common.utils import setup_logging, setup_imports

# setup_imports() can fail on an early call in a fresh kernel (the saved output
# of this cell shows one failed call before it succeeded), so retry it -- but
# only a bounded number of times, and re-raise the last error instead of
# spinning forever when the failure is persistent.
MAX_SETUP_IMPORTS_ATTEMPTS = 5
for attempt in range(1, MAX_SETUP_IMPORTS_ATTEMPTS + 1):
    try:
        setup_imports()
        break
    except (ModuleNotFoundError, RuntimeError, TypeError, NameError):
        if attempt == MAX_SETUP_IMPORTS_ATTEMPTS:
            raise
        print('setup_imports() error raised, continue...')

setup_logging()
import copy
from torch.utils.data import DataLoader, SubsetRandomSampler
import torch_geometric.loader  
import yaml
from tqdm.auto import tqdm
import pickle as pkl
import wandb
import random

setup_imports() error raised, continue...


  from .autonotebook import tqdm as notebook_tqdm


### Training

In [2]:
# Load the dataset registry; entries live in a list under the "datasets" key.
with open("./data/dataset_config.yaml") as config_file:
    dataset_info = yaml.safe_load(config_file)

In [3]:
# Registry entries are selected by position in the "datasets" list.
# NOTE(review): confirm that the entry at index 2 (validation) does not overlap
# with the entry at index 1 (training).
train_dataset_config, val_dataset_config = (
    dataset_info["datasets"][1],
    dataset_info["datasets"][2],
)

# Filesystem locations passed on to the trainer's dataset config.
train_src, val_src = train_dataset_config["path"], val_dataset_config["path"]

In [5]:
# Model: read the "model" section of the PaiNN config under
# /configs/oc22/s2ef/painn and override a few fields.

with open('configs/oc22/s2ef/painn/painn.yml', 'r') as file:
    model = yaml.safe_load(file)["model"]

model.update(
    name='ocpmodels.models.painn.painn.PaiNN',
    hidden_channels=512,
    efermi_length=128,
)
# The efermi options are only set when the training-set registry entry has
# `efermi_available` set.
if train_dataset_config["efermi_available"]:
    model.update(multiply_efermi=False, concatenate_efermi=True)
# model

In [6]:
# Task: energy/force regression settings handed to the trainer.
task = {
    "dataset": "lmdb",  # dataset used for the S2EF task
    "description": "Regressing to energies and forces for DFT trajectories from OCP",
    "type": "regression",
    "metric": "mae",
    "labels": ["potential energy"],
    "grad_input": "atomic forces",
    "train_on_free_atoms": True,
    "eval_on_free_atoms": True,
}

# Optimizer: batch sizes, learning-rate schedule and loss settings handed to
# the trainer.
optimizer = {
    "batch_size": 32,  # originally 32
    "eval_batch_size": 32,  # originally 32
    "num_workers": 2,
    "lr_initial": 5e-4,
    "optimizer": "AdamW",
    "optimizer_params": {"amsgrad": True},
    "scheduler": "ReduceLROnPlateau",
    "mode": "min",
    "factor": 0.8,
    "patience": 3,
    "max_epochs": 1,  # used for demonstration purposes
    "force_coefficient": 100,
    "ema_decay": 0.999,
    "clip_grad_norm": 10,
    "loss_energy": "mae",
    "loss_force": "l2mae",
    # "eval_every": 500
}
# Dataset: [train, val]. The normalization statistics come from the registry
# entry of the training set (instead of being recomputed in the notebook).
dataset = [
    {  # train set
        "src": train_src,
        "normalize_labels": True,
        "target_mean": train_dataset_config["mean"],
        "target_std": train_dataset_config["stdev"],
        "grad_target_mean": 0.0,
        # Same value as target_std.
        "grad_target_std": train_dataset_config["stdev"],
    },
    {"src": val_src},  # val set (optional)
]


In [33]:
# def train_wandb(config=None):
#     with wandb.init(project="PaiNN-DOS-efermi", config = sweep_configuration):
#         config = wandb.config

#         for epoch in range(config.epochs):

#             print(wandb.config["batch_size"])
#             batch_size = config['batch_size']

#             task, optimizer, dataset = task_optim_dataset(batch_size)


#             trainer = OCPTrainer(
#                 task=task,
#                 model=copy.deepcopy(model), # copied for later use, not necessary in practice.
#                 dataset=dataset,
#                 optimizer=optimizer,
#                 outputs={},
#                 loss_fns={},
#                 eval_metrics={},
#                 name="s2ef",
#                 identifier="S2EF-example",
#                 run_dir=".", # directory to save results if is_debug=False. Prediction files are saved here so be careful not to override!
#                 is_debug=False, # if True, do not save checkpoint, logs, or results
#                 print_every=5,
#                 seed=0, # random seed to use
#                 logger="tensorboard", # logger of choice (tensorboard and wandb supported)
#                 local_rank=0,
#                 amp=True, # use PyTorch Automatic Mixed Precision (faster training and less memory usage),
#             )



#             trainer.train()
#             avg_loss = trainer.final_loss
#             wandb.log({"loss": avg_loss, "epoch": epoch}) 

In [None]:
# Seed torch's global RNG here; the trainer is given the same seed value below.
torch.manual_seed(0)

trainer = OCPTrainer(
    name="s2ef",
    identifier="S2EF-20k-train",
    task=task,
    model=copy.deepcopy(model),  # copied for later use, not necessary in practice.
    dataset=dataset,
    optimizer=optimizer,
    outputs={},
    loss_fns={},
    eval_metrics={},
    # Directory to save results if is_debug=False. Prediction files are saved
    # here so be careful not to override!
    run_dir=".",
    is_debug=False,  # if True, do not save checkpoint, logs, or results
    print_every=5,
    seed=0,  # random seed to use
    logger="tensorboard",  # logger of choice (tensorboard and wandb supported)
    local_rank=0,
    amp=True,  # use PyTorch Automatic Mixed Precision (faster training and less memory usage)
)

In [None]:
# Start training with the trainer built in the previous cell.
trainer.train()

: 

In [None]:
# Raises SystemExit with this message. NOTE(review): presumably a deliberate
# stop so that "Run All" does not continue into the exploratory cells below.
sys.exit("finishing trainer.train() function, exiting...")

----

### Dataset energy mean and standard deviation (optional, feel free to skip)

In [4]:
# Compute the standard deviation and mean of the target energies (`data.y`)
# over one dataset; the values obtained for each dataset are recorded below.
# NOTE(review): this rebinds `dataset`, which the training section uses for the
# trainer's dataset config list -- re-run that cell before building a trainer
# again. Later cells (efermi collection, LMDB writing) read this `dataset`.
# train_dataset = LmdbDataset({"src": "./data/s2ef/200k/train/output_lmdb"})
dataset = LmdbDataset({"src": "./tutorial_data/s2ef/train_100"})

energies = []
for data in dataset:
    energies.append(data.y)

mean = np.mean(energies)
stdev = np.std(energies)

stdev, mean 
# Recorded (stdev, mean) per dataset:
## == (2.8471219290033876, -0.7877437496095779) for /normalized_efermi/ 176k dataset
## == (2.8412495666979143, -0.78793442576679) for ??
## == (2.8392264933285123, -0.7946160068500009) for 20k dataset
# (2.8873626757898343, -0.7555769032155011) for 200k train dataset
# == (1.5156444102461508, 0.45158625849998374) for tutorial_data/train_100 dataset

(1.5156444102461508, 0.45158625849998374)

In [21]:
# NOTE(review): the saved output (176027) does not match the train_100 dataset
# loaded above; it was presumably produced while `dataset` pointed at the 176k
# normalized_efermi set (out-of-order execution) -- confirm.
len(dataset)

176027

In [2]:
# Re-read the dataset registry for inspection in the cells below.
with open("./data/dataset_config.yaml") as config_file:
    yaml_data = yaml.safe_load(config_file)


In [4]:
# Show which fields the first registry entry provides.
yaml_data["datasets"][0].keys()

dict_keys(['name', 'path', 'mean', 'stdev', 'efermi_available', 'description'])

In [22]:

# Pick a registry entry by position and display its name.
# yaml_data
# yaml_data["datasets"][1]["description"]


selected_dataset_info = yaml_data["datasets"][0]
selected_dataset_info["name"]

'200k_train'

In [None]:

# Build the dataset registry and write it to YAML.
# Each entry records the dataset location, the (mean, stdev) of its target
# energies as computed in the statistics cell of this section, and
# `efermi_available`, which the model-config cell checks before setting the
# efermi options.

# (stdev, mean) == (2.8873626757898343, -0.7555769032155011) for 200k train dataset
dataset_1 = {'name': "200k_train",
             'path': "/l/users/elizaveta.starykh/OCP_project/ocp-git/data/s2ef/200k/train/",
             "mean": -0.7555769032155011,
             "stdev": 2.8873626757898343,
             "efermi_available": False}

# (stdev, mean) == (2.8471219290033876, -0.7877437496095779) for /normalized_efermi/ 176k dataset
dataset_2 = {'name': "176k_train",
             'path': "/l/users/elizaveta.starykh/OCP_project/ocp-git/data/s2ef/200k/train/output_lmdb/normalized_efermi",
             "mean": -0.7877437496095779,
             "stdev": 2.8471219290033876,
             "efermi_available": True}

# (stdev, mean) == (2.8392264933285123, -0.7946160068500009) for 20k dataset
dataset_3 = {'name': "20k_train",
             'path': "/l/users/elizaveta.starykh/OCP_project/ocp-git/data/s2ef/200k/train/output_lmdb/normalized_efermi/train_20000_systems",
             "mean": -0.7946160068500009,
             "stdev": 2.8392264933285123,
             "efermi_available": True}

# (stdev, mean) == (1.5156444102461508, 0.45158625849998374) for tutorial_data/train_100 dataset
dataset_4 = {'name': "tutorial_train",
             'path': "/l/users/elizaveta.starykh/OCP_project/ocp-git/tutorial_data/s2ef/train_100",
             "mean": 0.45158625849998374,
             "stdev": 1.5156444102461508,
             "efermi_available": False}

# The loader cells index the registry as <config>["datasets"][i], so the entries
# go into a list under the "datasets" key. The order is significant because
# entries are selected by position.
dataset_conf = {"datasets": [dataset_1, dataset_2, dataset_3, dataset_4]}

# NOTE(review): the loader cells open ./data/dataset_config.yaml, while this
# cell writes ./data/dataset_config.yml. The file name is left unchanged so an
# existing hand-edited .yaml (the loaded entries also carry a 'description'
# field that is not produced here) is not overwritten -- confirm which file
# name is intended.
with open('./data/dataset_config.yml', 'w') as outfile:
    yaml.dump(dataset_conf, outfile, default_flow_style=False)

In [None]:
# Collect the `efermi` attribute of every sample in `dataset`, in iteration
# order. The LMDB-writing loop later indexes the normalized values by dataset
# position, so `dataset` must not be rebound between this cell and that loop.
# train_dataset = LmdbDataset({"src": "./data/s2ef/200k/train/output_lmdb"})

efermi_info = []
# for data in train_dataset:
for data in dataset:
    efermi_info.append(data.efermi)


In [None]:
def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), computed with NumPy."""
    denominator = 1 + np.exp(-z)
    return 1.0 / denominator

def tanh(z):
    """Return the hyperbolic tangent of ``z`` (scalar or array-like).

    Delegates to ``np.tanh`` rather than evaluating
    ``(e^z - e^-z) / (e^z + e^-z)`` directly: in that formula the exponentials
    overflow to ``inf`` for large ``|z|`` and the quotient becomes NaN, whereas
    ``np.tanh`` saturates at +/-1.
    """
    return np.tanh(z)


# Convert the collected values to an array and map them through tanh; the
# result is what gets written back as `efermi` in the normalized LMDB.
efermi_info = np.array(efermi_info)
efermi_tanh = tanh(efermi_info)


########
# efermi_normalized = (efermi_info - efermi_info.min()) / (efermi_info.max() - efermi_info.min()) 

# efermi_normalized.min(), efermi_normalized.max(), efermi_normalized.std()
# efermi_sigmoid = sigmoid(efermi_info)

# mean = np.mean(efermi_info)
# stdev = np.std(efermi_info)

# stdev, mean 
## == (2.8471219290033876, -0.7877437496095779) for /output_lmdb/ full dataset
## == (2.8412495666979143, -0.78793442576679)

In [None]:
# Summary of the tanh-normalized values: (min, max, std, count). Kept as one
# tuple because a notebook cell only displays its last expression -- as two
# separate statements the min/max/std line was silently discarded.
efermi_tanh.min(), efermi_tanh.max(), efermi_tanh.std(), len(efermi_tanh)

In [None]:
# Raw efermi against its tanh-normalized value, drawn as blue circle markers.
fig, ax = plt.subplots()
ax.plot(efermi_info, efermi_tanh, 'bo')
# ax.set_yscale("log")
ax.set_xlabel("efermi")
ax.set_ylabel("efermi tanh")
plt.show()

### Values testing:

In [None]:
# Scratch cell: build float tensors from the `efermi` attribute of a few
# samples, once as a single tensor and once as a list of one-element tensors.
# NOTE(review): only the final `batch_efermi` expression is displayed; the
# bare expressions in the middle of the cell produce no output.
tst_dataset = LmdbDataset({"src": "./data/s2ef/200k/train/output_lmdb/"})

# tst_efermi = torch.FloatTensor([train_dataset[9].efermi])
# tst_efermi.dim(), tst_efermi.shape
# tst_efermi

# NOTE(review): not used anywhere in the visible cells.
values_efermi = [6.7264, 8.2544, 1.5132]

long_efermi = torch.FloatTensor([tst_dataset[0].efermi, tst_dataset[5].efermi, tst_dataset[7].efermi])
long_efermi.dim(), long_efermi.shape
long_efermi



batch_efermi = [torch.FloatTensor([tst_dataset[x].efermi]) for x in range(3)]

batch_efermi


# tst_dataset[9].efermi


In [None]:
# Scratch check: scale the first two rows of a random tensor by a Python float
# and inspect the type of the result.
torch.manual_seed(0)
rand_tensor = torch.rand(3, 4)  # shape = [3, 4]
## embedding.shape = [153, 512]  ### 153 elements, each has a vector of size 512

float_number = -1.1

# Earlier in-place experiment, kept for reference:
# rand_tensor[1] = torch.tensor([1, 2, 3, 4])
# rand_tensor[1] = torch.mul(rand_tensor[1], float_number)
# rand_tensor[1], rand_tensor

obj = rand_tensor[:2] * float_number
obj.type()

---


## Normalization of `efermi`

In [None]:
# Create the output directory for the LMDB with normalized efermi; with
# exist_ok=True the cell can be re-run without raising if it already exists.
# initial_lmdb = LmdbDataset({"src": "./data/s2ef/200k/train/output_lmdb/"})
os.makedirs("./data/s2ef/200k/train/output_lmdb/normalized_efermi", exist_ok=True)

In [None]:
# Sanity check before writing: both lengths should match, and the first raw
# efermi value is shown next to its normalized counterpart.
# len(train_dataset), len(efermi_tanh), train_dataset[0].efermi, efermi_tanh[0] 
len(dataset), len(efermi_tanh), dataset[0].efermi, efermi_tanh[0] 

In [None]:
# Open the output LMDB environment. subdir=False: the path names the database
# file itself rather than a directory.
db = lmdb.open(
    "./data/s2ef/200k/train/output_lmdb/normalized_efermi/train_data_200k_efermi.lmdb",
    subdir=False,
    map_size=2 * 1024**4,  # 2 TiB, same value as 1099511627776 * 2
    map_async=True,
    meminit=False,
)

In [None]:
# Write every system of `dataset` to the output LMDB with `efermi` replaced by
# its tanh-normalized value. `db` is the environment opened in the previous
# cell; it is closed here even when the loop fails, so re-run that cell before
# retrying.

# efermi_tanh is indexed by dataset position below, so it must have been built
# from the same `dataset` (guards against stale notebook state).
assert len(efermi_tanh) == len(dataset), (
    f"efermi_tanh has {len(efermi_tanh)} entries but dataset has {len(dataset)}"
)

# Key of the next record. Kept separate from system_nr so that keys stay
# contiguous if a filter that skips systems is added to the loop.
correct_sys_nr = 0

try:
    # tqdm reports progress, so no per-100 progress prints are needed.
    for system_nr, system in tqdm(enumerate(dataset), total=len(dataset)):
        system.efermi = efermi_tanh[system_nr]

        # One write transaction per record; the context manager commits on
        # success and aborts the transaction if put/pickling raises.
        with db.begin(write=True) as txn:
            txn.put(f"{correct_sys_nr}".encode("ascii"), pkl.dumps(system, protocol=-1))
        correct_sys_nr += 1

    # Flush once after all records are committed instead of after every record.
    db.sync()
finally:
    db.close()

In [None]:
# Re-open the directory that holds the newly written LMDB to verify the result.
dta = LmdbDataset({"src": "./data/s2ef/200k/train/output_lmdb/normalized_efermi"})

In [None]:
# Spot-check: number of records, one normalized efermi value, and a full sample.
len(dta), dta[10].efermi, dta[0]

-----