In [None]:
# Path of the output file that the J-coupling extraction cell will parse.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell
# only runs on a machine where exactly this file exists.
filename = "D:/PhD/Data/DFT/NONCOV/DFT_simulations/NMR/Frozen_coords_old/Arg_Pi_charged/orca_out/run_all_displaced_distances.mpi8.out"
# Row count handed to extract_j_coupling together with `filename`.
# NOTE(review): presumably the number of nuclei in the system — confirm against the output file.
n_nuclei = 20

In [None]:
import pandas as pd
import re

def extract_j_coupling(n_nuclei, filename):
    """Read the isotropic J-coupling summary table from an output file.

    The parser looks for the line containing
    ``SUMMARY OF ISOTROPIC COUPLING CONSTANTS (Hz)`` and reads the table that
    follows it, stopping at the line containing
    ``Maximum memory used throughout the entire EPRNMR-calculation:`` (or at a
    second summary heading, so only the first table in the file is returned).

    A table may be printed in several column blocks. Every block starts with a
    header line made only of ``<index> <element>`` pairs (for example
    ``0 C   1 H   2 Cl``) and is followed by rows of the form
    ``<index> <element> <value> <value> ...``. All blocks are merged into one
    matrix; a table printed as a single block gives the same frame as before.

    Parameters
    ----------
    n_nuclei : int
        Maximum number of rows to read after each header line.
    filename : str or path-like
        Path of the text file to parse.

    Returns
    -------
    pandas.DataFrame
        One row per row label (index named ``'Nucleus'``) and one column per
        column label, holding the values as floats. Cells that never appear
        in the file are NaN. The frame is empty if no table is found.

    Raises
    ------
    OSError
        If ``filename`` cannot be opened.
    ValueError
        If a value in a table row cannot be converted to ``float``.
    """
    summary_heading = 'SUMMARY OF ISOTROPIC COUPLING CONSTANTS (Hz)'
    end_marker = 'Maximum memory used throughout the entire EPRNMR-calculation:'
    # One capital letter plus an optional lowercase one, so "Cl" is not cut to "C".
    element_symbol = re.compile(r'[A-Z][a-z]?')

    def _header_labels(tokens):
        """Return the column labels if ``tokens`` form a header line, else []."""
        if len(tokens) < 2 or len(tokens) % 2:
            return []
        indices, symbols = tokens[0::2], tokens[1::2]
        if not all(token.isdigit() for token in indices):
            return []
        if not all(element_symbol.fullmatch(token) for token in symbols):
            return []
        return [f'{index} {symbol}' for index, symbol in zip(indices, symbols)]

    column_labels = []   # every column label, in order of first appearance
    table = {}           # row label -> {column label: value}, insertion ordered
    block_labels = []    # column labels of the block currently being read
    rows_to_read = 0     # rows still expected in the current block
    pending = None       # [row label, raw values] of a row not yet complete
    in_summary = False

    with open(filename, 'r') as file:
        for line in file:
            if summary_heading in line:
                if table:
                    # A second summary section: keep only the first table.
                    break
                in_summary = True
                continue

            if end_marker in line:
                break

            if not in_summary:
                continue

            tokens = line.split()
            if not tokens:
                continue

            # Header lines are checked first so that the header of a later
            # block is never mistaken for a data row.
            labels = _header_labels(tokens)
            if labels:
                block_labels = labels
                rows_to_read = n_nuclei
                pending = None
                column_labels.extend(
                    label for label in labels if label not in column_labels
                )
                print(f"Column labels extracted: {labels}")
                continue

            # A data row needs a two-token label plus at least one value.
            if rows_to_read <= 0 or len(tokens) <= 2:
                continue

            # Accumulate values until the row covers every column of the block
            # (the label of the first fragment is the one that is kept).
            if pending is None:
                pending = [tokens[0] + ' ' + tokens[1], []]
            pending[1].extend(tokens[2:])
            if len(pending[1]) < len(block_labels):
                continue

            row_label, raw_values = pending
            pending = None
            rows_to_read -= 1

            row = table.setdefault(row_label, {})
            # zip() drops any surplus values beyond the block's column count.
            for label, value in zip(block_labels, raw_values):
                row[label] = float(value)

    # Assemble the merged matrix; missing cells become NaN.
    nan = float('nan')
    values = [
        [table[row_label].get(label, nan) for label in column_labels]
        for row_label in table
    ]
    df = pd.DataFrame(
        values,
        index=pd.Index(list(table), name='Nucleus'),
        columns=column_labels,
    )

    return df

# Parse the coupling table from the configured output file
df = extract_j_coupling(n_nuclei=n_nuclei, filename=filename)

# Keep the frame as the cell's last expression so the notebook renders it
df

In [1]:
# Imports for the notebook, then make the local `noncov` package importable
# and create a NONCOVToolbox instance.
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
import glob
import numpy as np
from sklearn.cluster import KMeans
# NOTE(review): this binds the *module* `pathlib` to the name `Path`, not the
# `pathlib.Path` class, so `Path(...)` is not callable; the class would be
# `Path.Path`. Confirm what later cells expect before changing it.
import pathlib as Path

# Absolute path of the `src` directory one level above the current working
# directory; the result therefore depends on where the kernel was started.
path_noncov = os.path.abspath(os.path.join('..', 'src'))

# Add it to the import search path only once, so re-running the cell does not
# keep appending duplicates.
if path_noncov not in sys.path:
    sys.path.append(path_noncov)

# Must come after the sys.path change above, otherwise `noncov` may not be found.
from noncov import NONCOVToolbox, NONCOVHeader

# Toolbox instance kept at notebook level under the name `noncov`.
noncov = NONCOVToolbox()

#NONCOVHeader.print_header()

# Pre work on molecular geometries
from noncov import StructureModifier

# OrcaAnalysis module for postprocessing of DFT calculations
from noncov import OrcaAnalysis

# Graph molecular representations
from noncov import MolecularGraph

# Functions to store data in dataframes
from noncov import MachineLearning

# Show performance and features of various NMR functions in module
from noncov import NMRFunctions

# Display the molecule while its displaced, not yet interactive in Jupyter but interactive in VS Code
from noncov import MolView

# State shared by blockPrint() and enablePrint(): the stream that was active
# before printing was blocked ('saved') and the devnull handle that replaced
# it ('sink'). Both are None while printing is not blocked.
_print_state = {'saved': None, 'sink': None}

# Disable printing
def blockPrint():
    """Send everything written to ``sys.stdout`` to the null device.

    The stream that is active at the time of the call is remembered so that
    ``enablePrint`` can put back exactly that stream. Calling this again
    while printing is already blocked does nothing, so no additional devnull
    handle is opened (and leaked).
    """
    if _print_state['sink'] is not None:
        return
    _print_state['saved'] = sys.stdout
    _print_state['sink'] = open(os.devnull, 'w')
    sys.stdout = _print_state['sink']

# Restore printing
def enablePrint():
    """Undo ``blockPrint``: restore the saved stream and close the devnull handle.

    The previously active stream is restored rather than ``sys.__stdout__``,
    because the two differ whenever stdout had already been replaced (for
    example by a notebook kernel or an output-capturing test runner).
    Does nothing if printing is not currently blocked.
    """
    sink = _print_state['sink']
    if sink is None:
        return
    sys.stdout = _print_state['saved']
    _print_state['saved'] = None
    _print_state['sink'] = None
    sink.close()

In [2]:
# Get work directory and scratch folder for the output data
# (everything is resolved relative to the kernel's current working directory).
current_dir = os.getcwd()
print(f'Current work directory is: {current_dir}')

# `scratch` is expected one level above the working directory.
scratch_dir = os.path.abspath(os.path.join('..', 'scratch'))
# Printed before conversion, so this line shows the un-normalized path.
print(f'Current scratch directory is: {scratch_dir}')
# NOTE(review): convert_path is defined in the noncov package; judging by the
# recorded output it prints and returns a forward-slash version of the path — confirm.
scratch_dir = OrcaAnalysis().convert_path(scratch_dir)

# Despite the name, `mol_dir` is the path of a single .xyz file, not a directory.
mol_dir = os.path.join(scratch_dir, 'test_structs/benzene_H2O.xyz')
print(f'Current molecule directory is: {mol_dir}')
mol_dir = OrcaAnalysis().convert_path(mol_dir)

Current work directory is: C:\Users\ettor\Desktop\NONCOV\results
Current scratch directory is: C:\Users\ettor\Desktop\NONCOV\scratch
Normalized path using os.path: C:/Users/ettor/Desktop/NONCOV/scratch
Current molecule directory is: C:/Users/ettor/Desktop/NONCOV/scratch\test_structs/benzene_H2O.xyz
Normalized path using os.path: C:/Users/ettor/Desktop/NONCOV/scratch/test_structs/benzene_H2O.xyz


In [3]:
import os
# Folder holding the dataset CSV files, located under the scratch directory.
datasets_dir = os.path.join(scratch_dir, 'GenerateMLDataset/data/')
# Printed before conversion, so this line shows the un-normalized path.
print(f'Dataset directory is: {datasets_dir}')
datasets_dir = OrcaAnalysis().convert_path(datasets_dir)

# Per-nucleus table. `nucprop` (the file path) is kept as a separate name
# because a later cell writes the updated table back to the same file.
nucprop = os.path.join(datasets_dir, 'nuc_prop_nmr_observables.csv')
nucprop_df = pd.read_csv(nucprop)

# Pairwise (two-atom) table, loaded from the same folder.
pw_nucprop = os.path.join(datasets_dir, 'pairwise_nuc_prop_nmr_observables.csv')
pw_nucprop_df = pd.read_csv(pw_nucprop)

Dataset directory is: C:/Users/ettor/Desktop/NONCOV/scratch\GenerateMLDataset/data/
Normalized path using os.path: C:/Users/ettor/Desktop/NONCOV/scratch/GenerateMLDataset/data/


In [4]:
# Render the per-nucleus table as loaded from the CSV file.
nucprop_df

Unnamed: 0,Molecule,Atom,x_coord,y_coord,z_coord,sigma_iso,sigma_11,sigma_22,sigma_33,dia_sigma_11,dia_sigma_22,dia_sigma_33,para_sigma_11,para_sigma_22,para_sigma_33,nmr_functional,nmr_basis_set


In [5]:
# Render the pairwise table as loaded from the CSV file.
pw_nucprop_df

Unnamed: 0,Molecule,Atom_1,Atom_2,x_coord_1,y_coord_1,z_coord_1,x_coord_2,y_coord_2,z_coord_2,J_iso,...,J_DSO_11,J_DSO_22,J_DSO_33,J_PSO_11,J_PSO_22,J_PSO_33,J_SD_11,J_SD_22,J_SD_33,Mayer_BO


In [9]:
import pandas as pd

# Records are collected as plain dicts; the DataFrame is built once at the end
data = []

# Pass 1: create each record with only its identifying fields and x/y coordinates
for i in range(5):
    row_data = dict(
        Molecule=f'molecule_{i}',
        Atom=f'atom_{i}',
        x_coord=i * 0.1,
        y_coord=i * 0.2,
    )
    data.append(row_data)

# Pass 2: fill in the remaining fields on the records created in pass 1
for i, row_data in enumerate(data):
    row_data.update(
        z_coord=i * 0.3,
        sigma_iso=i * 0.4,
        nmr_functional='functional_value',
        nmr_basis_set='basis_set_value',
    )

datadf = pd.DataFrame(data)

# Append the new rows below the existing table and renumber the index
nucprop_df = pd.concat([nucprop_df, datadf], ignore_index=True)

# Write the combined table to the CSV path held in `nucprop`
nucprop_df.to_csv(nucprop, index=False)

# Plain-text dump of the combined table
print(nucprop_df)


     Molecule    Atom  x_coord  y_coord  z_coord  sigma_iso sigma_11 sigma_22  \
0  molecule_0  atom_0      0.0      0.0      0.0        0.0      NaN      NaN   
1  molecule_1  atom_1      0.1      0.2      0.3        0.4      NaN      NaN   
2  molecule_2  atom_2      0.2      0.4      0.6        0.8      NaN      NaN   
3  molecule_3  atom_3      0.3      0.6      0.9        1.2      NaN      NaN   
4  molecule_4  atom_4      0.4      0.8      1.2        1.6      NaN      NaN   

  sigma_33 dia_sigma_11 dia_sigma_22 dia_sigma_33 para_sigma_11 para_sigma_22  \
0      NaN          NaN          NaN          NaN           NaN           NaN   
1      NaN          NaN          NaN          NaN           NaN           NaN   
2      NaN          NaN          NaN          NaN           NaN           NaN   
3      NaN          NaN          NaN          NaN           NaN           NaN   
4      NaN          NaN          NaN          NaN           NaN           NaN   

  para_sigma_33    nmr_fun