# NONCOVToolbox: Step 2
## Generate the datasets from the ORCA output files with @OrcaAnalysis

Use the OrcaAnalysis module of the NONCOVToolbox to postprocess the ORCA output files from DFT calculations and save the extracted data to a CSV file through a pandas DataFrame.

### Load necessary modules from the NONCOVToolbox src

In [1]:
# Get the NONCOVToolbox library and print header
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
import glob
import numpy as np
from sklearn.cluster import KMeans
import pathlib as Path

# Make the toolbox sources (../src relative to this notebook) importable
path_noncov = os.path.abspath(os.path.join(os.pardir, 'src'))
if path_noncov not in sys.path:
    sys.path.append(path_noncov)

# Everything this notebook needs from the toolbox, in one place:
#   NONCOVToolbox, NONCOVHeader - toolbox object and its header printer
#   OrcaAnalysis                - postprocessing of DFT calculations
#   MachineLearning             - functions to store data in dataframes
#   NMRFunctions                - performance and features of various NMR functions
from noncov import (
    NONCOVToolbox,
    NONCOVHeader,
    OrcaAnalysis,
    MachineLearning,
    NMRFunctions,
)

# NOTE: this rebinds the name `noncov` to a toolbox instance; the
# `from noncov import ...` statement above is unaffected by that.
noncov = NONCOVToolbox()

#NONCOVHeader.print_header()

# State shared by blockPrint()/enablePrint(): the stream that was active
# before output was silenced and the devnull handle that replaced it.
# Both are None while printing is enabled.
_stdout_before_block = None
_devnull_handle = None

# Disable printing
def blockPrint():
    """Silence ``print`` by redirecting ``sys.stdout`` to ``os.devnull``.

    The stream that is active at call time is remembered so that
    ``enablePrint`` can put exactly that stream back. Calling this while
    output is already blocked is a no-op, so the remembered stream is never
    overwritten by the devnull handle.
    """
    global _stdout_before_block, _devnull_handle
    if _stdout_before_block is not None:
        return
    _stdout_before_block = sys.stdout
    _devnull_handle = open(os.devnull, 'w')
    sys.stdout = _devnull_handle

# Restore printing
def enablePrint():
    """Restore the stream that was active before ``blockPrint`` was called.

    ``sys.__stdout__`` is deliberately not used: inside a Jupyter kernel it is
    the process-level stream, not the one that feeds cell output, so restoring
    it would make later prints disappear from the notebook. The devnull handle
    opened by ``blockPrint`` is closed here. Calling this without a preceding
    ``blockPrint`` leaves ``sys.stdout`` untouched.
    """
    global _stdout_before_block, _devnull_handle
    if _stdout_before_block is None:
        return
    sys.stdout = _stdout_before_block
    _stdout_before_block = None
    if _devnull_handle is not None:
        _devnull_handle.close()
        _devnull_handle = None

In [2]:
# Report where the notebook runs and where output data will be written
current_dir = os.getcwd()
print(f'Current work directory is: {current_dir}')

# The scratch folder is a sibling of the current directory
scratch_dir = os.path.abspath(os.path.join(os.pardir, 'scratch'))
print(f'Current scratch directory is: {scratch_dir}')

# Pass the path through the toolbox's path converter; in the recorded run it
# printed and returned a forward-slash version of the path
scratch_dir = OrcaAnalysis().convert_path(scratch_dir)

Current work directory is: C:\Users\ettor\Desktop\NONCOV\results
Current scratch directory is: C:\Users\ettor\Desktop\NONCOV\scratch
Normalized path using os.path: C:/Users/ettor/Desktop/NONCOV/scratch


### Preallocate two empty dataframes for ML applications

In [3]:
# Check whether the dataset exists in the data directory and create it if it doesn't
datasets_dir = os.path.join(scratch_dir, 'GenerateMLDataset/data/')
print(f'Dataset directory is: {datasets_dir}')
datasets_dir = OrcaAnalysis().convert_path(datasets_dir)

dataset_name = 'fragments_hopt_nmr.csv'

# Create the directory first: listing or writing into a missing directory
# would raise FileNotFoundError on a fresh checkout
os.makedirs(datasets_dir, exist_ok=True)

# Test for the dataset file itself rather than for "any file in the folder":
# the next cell reads exactly this file, so an unrelated file in the
# directory must not cause the creation step to be skipped.
# NOTE(review): assumes make_empty_nuc_prop_df writes to
# <datasets_dir>/<dataset_name>, which is the path the notebook reads
# afterwards - confirm against the MachineLearning module.
if not os.path.isfile(os.path.join(datasets_dir, dataset_name)):
    print(f"Dataset '{dataset_name}' not found in the directory, creating datasets... \n")

    # Make the dataset for the individual NMR properties
    MachineLearning().make_empty_nuc_prop_df(datasets_dir, dataset_name)
else:
    print(f"Dataset '{dataset_name}' found in the directory, skipping... \n")

Dataset directory is: C:/Users/ettor/Desktop/NONCOV/scratch\GenerateMLDataset/data/
Normalized path using os.path: C:/Users/ettor/Desktop/NONCOV/scratch/GenerateMLDataset/data/
Some files found in the directory, skipping... 



In [4]:
# Read the nuclear-property dataset back from disk so it can be inspected;
# `nucprop` (the CSV path) is reused later when the updated table is saved
nucprop = os.path.join(datasets_dir, dataset_name)
nucprop_df = pd.read_csv(filepath_or_buffer=nucprop)

In [5]:
# Individual nuclear properties: the bare expression makes the notebook
# render the dataframe as this cell's output (only the column header is
# shown in the recorded run, i.e. the table had no rows yet)
nucprop_df

Unnamed: 0,Molecule,Atom,x_coord,y_coord,z_coord,sigma_iso,sigma_xx,sigma_yy,sigma_zz,dia_sigma_xx,...,dia_sigma_zz,para_sigma_xx,para_sigma_yy,para_sigma_zz,sigma_11,sigma_22,sigma_33,s_tot_symmetry,span,skew


### Process ORCA calculations with @OrcaAnalysis

In [6]:
# Ask interactively for the ORCA output file to process.
# NOTE: because the path is typed in by hand, this cell stops a
# "Restart & Run All" until an answer is given.
typed_path = input("Enter the path to the ORCA file you want to work with: ")

# Run the answer through the toolbox's path converter before using it
orca_output = OrcaAnalysis().convert_path(typed_path)

Enter the path to the ORCA file you want to work with: "C:\Users\ettor\Desktop\NONCOV\tests\nmr_ncs_comp.mpi8.out"
Normalized path using os.path: C:/Users/ettor/Desktop/NONCOV/tests/nmr_ncs_comp.mpi8.out


In [7]:
# File stem used to name saved files later: the file name without its
# directory, cut at the FIRST dot (e.g. 'a.b.out' gives 'a')
basename = os.path.basename(orca_output)
outname = basename.partition('.')[0]

In [8]:
# Collect the name of every molecule in the calculation; the list is indexed
# per job further down when the ML dataset rows are built
list_molecules = OrcaAnalysis().extract_molecule_names(orca_output)
print('You have calculated the following molecules: {}\n'.format(list_molecules))

You have calculated the following molecules: ['d_cut_0_n1_opt.xyz', 'df_cut_4_n1_opt.xyz', 'df_cut_4_n1_opt_disp_struct_0.xyz', 'df_cut_4_n1_opt_disp_struct_1.xyz', 'df_cut_4_n1_opt_disp_struct_10.xyz', 'df_cut_4_n1_opt_disp_struct_11.xyz', 'df_cut_4_n1_opt_disp_struct_12.xyz', 'df_cut_4_n1_opt_disp_struct_13.xyz', 'df_cut_4_n1_opt_disp_struct_14.xyz', 'df_cut_4_n1_opt_disp_struct_15.xyz', 'df_cut_4_n1_opt_disp_struct_16.xyz', 'df_cut_4_n1_opt_disp_struct_17.xyz', 'df_cut_4_n1_opt_disp_struct_18.xyz', 'df_cut_4_n1_opt_disp_struct_19.xyz', 'df_cut_4_n1_opt_disp_struct_2.xyz', 'df_cut_4_n1_opt_disp_struct_3.xyz', 'df_cut_4_n1_opt_disp_struct_4.xyz', 'df_cut_4_n1_opt_disp_struct_5.xyz', 'df_cut_4_n1_opt_disp_struct_6.xyz', 'df_cut_4_n1_opt_disp_struct_7.xyz', 'df_cut_4_n1_opt_disp_struct_8.xyz', 'df_cut_4_n1_opt_disp_struct_9.xyz', 'dw_cut_4_n1_1f_opt.xyz', 'dw_cut_4_n1_1f_opt_disp_struct_0.xyz', 'dw_cut_4_n1_1f_opt_disp_struct_1.xyz', 'dw_cut_4_n1_1f_opt_disp_struct_10.xyz', 'dw_cut_4_n1

In [9]:
# Working with ORCA .out files

# Count how many sequential calculations have been done
n_jobs = OrcaAnalysis().count_jobs_number(orca_output)
print(f'Number of ORCA jobs in file: {n_jobs}\n')

# Compute size of the .out file and suggest Git LFS
# os.path.getsize returns bytes; dividing by 1e6 converts to (decimal) MB
size_orca_output = os.path.getsize(orca_output)
size_orca_output = size_orca_output/1e6
print(f'Size of ORCA file is: {size_orca_output} MB\n')

# Many sequential jobs usually mean a very large combined output file
if n_jobs > 20:
    print('Careful, you are working with a possibly large output file of several GB\n')
    print('If using version controls consider setting up a .gitignore \n')

# The threshold and the reported value are both in MB (the message previously
# labelled the MB value as KB)
if size_orca_output > 1:
    print(f"Careful, you are working with a '{size_orca_output}' MB large file..\n")
    print('Set up a .gitignore or Git LFS before pushing to Git\n')

# Extract level of theory
# NOTE(review): in the recorded run this printed a "Job started from ..."
# line rather than a functional/basis set - check extract_level_of_theory.
lot_out = OrcaAnalysis().extract_level_of_theory(orca_output)
print(f'Level of theory for the NMR calculations is: {lot_out}\n')

# Split orca output in several subfiles for ease of handling (takes a while)
if n_jobs > 2:
    print('Your output file will be now spilt into subfiles. \n')
    OrcaAnalysis().split_orca_output(scratch_dir, orca_output)

Number of ORCA jobs in file: 39

Size of ORCA file is: 783.935089 MB

Careful, you are working with a possibly large output file of several GB

If using version controls consider setting up a .gitignore 

Careful, you are working with a '783.935089' KB large file..

Set up a .gitignore or Git LFS before pushing to Git

Level of theory for the NMR calculations is: Job started from odin1, running /scratch/bartalucci/nmr_ncs_comp__LJyf9K__121777/orca/orca

Your output file will be now spilt into subfiles. 

Output file path is C:/Users/ettor/Desktop/NONCOV/scratch\OrcaAnalysis/split_orca_output/splitted_orca_job1.out
Wrote job 1 to C:/Users/ettor/Desktop/NONCOV/scratch\OrcaAnalysis/split_orca_output/splitted_orca_job1.out
Output file path is C:/Users/ettor/Desktop/NONCOV/scratch\OrcaAnalysis/split_orca_output/splitted_orca_job2.out
Wrote job 2 to C:/Users/ettor/Desktop/NONCOV/scratch\OrcaAnalysis/split_orca_output/splitted_orca_job2.out
Output file path is C:/Users/ettor/Desktop/NONCOV/sc

Output file path is C:/Users/ettor/Desktop/NONCOV/scratch\OrcaAnalysis/split_orca_output/splitted_orca_job36.out
Wrote job 36 to C:/Users/ettor/Desktop/NONCOV/scratch\OrcaAnalysis/split_orca_output/splitted_orca_job36.out
Output file path is C:/Users/ettor/Desktop/NONCOV/scratch\OrcaAnalysis/split_orca_output/splitted_orca_job37.out
Wrote job 37 to C:/Users/ettor/Desktop/NONCOV/scratch\OrcaAnalysis/split_orca_output/splitted_orca_job37.out
Wrote job 38 to C:/Users/ettor/Desktop/NONCOV/scratch\OrcaAnalysis/split_orca_output/splitted_orca_job38.out
ORCA output has been split into 38 sub files for further analysis


In [10]:
# Per-job containers; entry i holds the data read from split file number i+1
S_dia = []
S_para = []
S_tot = []
nuclear_identities = []
mayer_bo = []
nuc_coords = []

# Extract NMR data from each splitted file.
# Split files are numbered from 1; in the recorded run n_jobs was 39 and 38
# split files were written, which is the range covered here.
for job_number in range(1, n_jobs):

    # Silence the toolbox's own printing while extracting. The try/finally
    # guarantees that printing is switched back on even if one of the
    # extraction calls raises; otherwise every later cell would stay silent.
    blockPrint()
    try:
        # Path to the splitted outputs from the .out MPI8 file
        orca_splitted_output = OrcaAnalysis().convert_path(os.path.join(scratch_dir, 'OrcaAnalysis/split_orca_output', f'splitted_orca_job{job_number}.out'))

        # Extract CSA data
        shielding_dia, shielding_para, shielding_tot, nucleus_info = OrcaAnalysis().extract_tensor_data(orca_splitted_output)

        # Here include j coupling extraction
        #-------------

        # Extract bond orders
        bond_orders = OrcaAnalysis().extract_mayer_bond_order(orca_splitted_output)

        # Print the bond orders and their interacting nuclei
        # (kept inside the silenced section, so nothing is shown unless the
        # blockPrint() call above is removed for debugging)
        for nucleus, bonds in bond_orders.items():
            print(f"{nucleus}:")
            for interacting_nucleus, bond_order in bonds:
                print(f"  Bond with {interacting_nucleus}: {bond_order}")
    finally:
        enablePrint()

    coords = OrcaAnalysis().extract_xyz_coords(orca_splitted_output)

    # Append shielding tensor matrices (non-diagonalized) - all nuclei for each job iteration
    S_dia.append(shielding_dia)
    S_para.append(shielding_para)
    S_tot.append(shielding_tot)
    nuclear_identities.append(nucleus_info)

    # Append bond orders
    mayer_bo.append(bond_orders)

    # Append coordinates
    nuc_coords.append(coords)

In [14]:
# Get all the data that are not pairwise
data = []


def _fill_partial_shielding(job_rows, shielding_dict, prefix):
    """Copy the eigenvalues of a partial shielding tensor into matching rows.

    Parameters
    ----------
    job_rows : list of dict
        Rows created for the current job; updated in place.
    shielding_dict : dict or other
        Mapping of nucleus key to tensor for the current job. Anything that
        is not a dict is ignored.
    prefix : str
        'dia' or 'para'; selects the '<prefix>_sigma_xx/yy/zz' columns.
    """
    if not isinstance(shielding_dict, dict):
        return
    for nucleus_key, tensor in shielding_dict.items():
        # test_diagonalize_tensor returns an 8-tuple; element 3 holds the eigenvalues
        eigenvals = NMRFunctions().test_diagonalize_tensor(tensor)[3]
        components = {
            f'{prefix}_sigma_xx': eigenvals[0],
            f'{prefix}_sigma_yy': eigenvals[1],
            f'{prefix}_sigma_zz': eigenvals[2],
        }
        # Rows are matched on the nucleus key stored in 'Atom'
        for row in job_rows:
            if row['Atom'] == nucleus_key:
                row.update(components)


# Loop through the number of jobs and get each molecule, each job has a different one
# NOTE(review): the per-job lists are 0-indexed, so starting at 1 never
# processes entry 0 (the first split file / first molecule name). Confirm
# whether skipping it is intended before changing the range.
for job_number in range(1, n_jobs-1):
    molecule_name = list_molecules[job_number]

    # Rows of this job only. Partial-shielding updates are restricted to them,
    # so a repeated molecule name cannot overwrite values of an earlier job
    # and the whole table is not rescanned for every nucleus.
    job_rows = []

    # Process each job for S_tot
    shielding_dict = S_tot[job_number]
    if isinstance(shielding_dict, dict):
        for nucleus_index, (nucleus_key, tensor) in enumerate(shielding_dict.items()):
            (shielding_tensor, s_iso, diagonal_mehring, eigenvals,
             eigenvecs, symmetry, span, skew) = NMRFunctions().test_diagonalize_tensor(tensor)

            # Extract coordinates and identities for the current nucleus
            # NOTE(review): both lookups use the enumeration position inside
            # the shielding dict - verify that nuclear_identities and
            # nuc_coords follow the same ordering as the dict.
            nuc_id = nuclear_identities[job_number][nucleus_index]
            coords = nuc_coords[job_number][nucleus_index]

            # x, y, z are read from positions 1..3, so at least four elements
            # are required (a 3-element entry would raise IndexError on
            # coords[3]); shorter entries get empty coordinates.
            if len(coords) >= 4:
                x_coord, y_coord, z_coord = coords[1], coords[2], coords[3]
            else:
                x_coord = y_coord = z_coord = None

            # Collect the data for this nucleus; dia/para columns are filled below
            job_rows.append({
                'Molecule': molecule_name,
                'Atom': nuc_id,
                'x_coord': x_coord,
                'y_coord': y_coord,
                'z_coord': z_coord,
                'sigma_iso': s_iso,
                'sigma_xx': eigenvals[0],
                'sigma_yy': eigenvals[1],
                'sigma_zz': eigenvals[2],
                'dia_sigma_xx': None,
                'dia_sigma_yy': None,
                'dia_sigma_zz': None,
                'para_sigma_xx': None,
                'para_sigma_yy': None,
                'para_sigma_zz': None,
                'sigma_11' : diagonal_mehring[0][0],
                'sigma_22' : diagonal_mehring[1][1],
                'sigma_33' : diagonal_mehring[2][2],
                's_tot_symmetry' : symmetry,
                'span' : span,
                'skew' : skew
            })

    # After collecting data from S_tot, update with data from S_dia and S_para
    _fill_partial_shielding(job_rows, S_dia[job_number], 'dia')
    _fill_partial_shielding(job_rows, S_para[job_number], 'para')

    data.extend(job_rows)

# Convert the list of rows to a DataFrame
datadf = pd.DataFrame(data)

# Concatenate with the existing DataFrame
# NOTE(review): this cell is not idempotent - running it again appends the
# same rows once more to both nucprop_df and the CSV written below.
nucprop_df = pd.concat([nucprop_df, datadf], ignore_index=True)

# Save the updated DataFrame to a CSV file
nucprop_df.to_csv(nucprop, index=False)

# Display the updated DataFrame
nucprop_df


Unnamed: 0,Molecule,Atom,x_coord,y_coord,z_coord,sigma_iso,sigma_xx,sigma_yy,sigma_zz,dia_sigma_xx,...,dia_sigma_zz,para_sigma_xx,para_sigma_yy,para_sigma_zz,sigma_11,sigma_22,sigma_33,s_tot_symmetry,span,skew
0,df_cut_4_n1_opt.xyz,Nucleus 7H :,1.004732,0.618235,0.399603,28.71,34.44,26.20,25.50,45.09,...,20.37,-12.17,3.32,5.17,25.50,26.20,34.44,0,8.94,-0.84
1,df_cut_4_n1_opt.xyz,Nucleus 8H :,-0.479139,0.82372,0.426466,28.66,33.20,27.90,24.88,24.68,...,34.50,-8.86,-2.88,2.09,24.88,27.90,33.20,0,8.32,-0.27
2,df_cut_4_n1_opt.xyz,Nucleus 9H :,-1.139424,1.29359,-0.715278,25.80,29.45,26.15,21.81,45.32,...,30.01,-19.56,0.18,-7.79,21.81,26.15,29.45,0,7.64,0.14
3,df_cut_4_n1_opt.xyz,Nucleus 10H :,-2.52968,1.410024,-0.731269,25.71,21.87,27.98,27.28,45.53,...,18.25,-18.92,-4.64,4.48,21.87,27.28,27.98,0,6.11,0.77
4,df_cut_4_n1_opt.xyz,Nucleus 11H :,-3.272787,1.055261,0.391918,23.23,22.20,23.28,24.21,12.13,...,40.98,10.96,-20.55,-16.79,22.20,23.28,24.21,0,2.01,0.07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
847,dw_cut_4_n1_1f_opt_disp_struct_3.xyz,Nucleus 20C :,0.776414,-3.63766,3.208965,3.58,-82.25,69.89,23.09,201.34,...,264.86,-131.50,-342.79,-238.18,-82.25,23.09,69.89,0,152.14,0.38
848,dw_cut_4_n1_1f_opt_disp_struct_3.xyz,Nucleus 5N :,1.831751,-2.679966,1.421577,124.20,66.01,167.33,139.27,299.41,...,330.00,-132.41,-198.35,-263.96,66.01,139.27,167.33,0,101.32,0.45
849,dw_cut_4_n1_1f_opt_disp_struct_3.xyz,Nucleus 21O :,2.415792,-5.494936,1.881939,-43.41,210.39,-49.29,-291.33,398.23,...,420.34,-200.82,-450.73,-706.10,-291.33,-49.29,210.39,0,501.72,-0.04
850,dw_cut_4_n1_1f_opt_disp_struct_3.xyz,Nucleus 22O :,3.704628,-4.283362,1.954744,-38.95,213.04,-281.99,-47.91,398.53,...,429.01,-204.97,-690.85,-466.52,-281.99,-47.91,213.04,0,495.03,-0.05
