# Noncovalent interactions and NMR observables
## NONCOV Toolbox in Python

### Ettore Bartalucci, Progress Report 17.09.2024

## Introduction

## Structure

## Preconfig NONCOVToolbox Library

In [1]:
# Get the NONCOVToolbox library and print header
import sys
import os

path_noncov = os.path.abspath(os.path.join('..', 'src'))

if path_noncov not in sys.path:
    sys.path.append(path_noncov)

from noncov import NONCOVToolbox, NONCOVHeader

noncov = NONCOVToolbox()

#NONCOVHeader.print_header()

# OrcaAnalysis module for postprocessing of DFT calculations
from noncov import OrcaAnalysis

# Show performance and features of various NMR functions in module
from noncov import NMRFunctions

# Display the molecule while its displaced, not yet interactive in Jupyter but interactive in VS Code
from noncov import MolView

In [None]:
# Get configuration file 
configdir = os.getcwd()
configdir = os.path.abspath(os.path.join('..', 'config'))

configs = os.path.join(configdir, 'configuration.yml')
print(f'Change with care, configuration file is in:', configs)

print('And looks like this...\n')
with open(configs,'r') as f:
    config_file = f.read()
    print(config_file)

In [2]:
# Get work directory and scratch folder for the output data
current_dir = os.getcwd()
print(f'Current work directory is: {current_dir}')

scratch_dir = os.path.abspath(os.path.join('..', 'scratch'))
print(f'Current scratch directory is: {scratch_dir}')

mol_dir = os.path.join(scratch_dir, 'test_structs/benzene_H2O.xyz')
print(f'Current molecule directory is: {mol_dir}')

Current work directory is: d:\PhD\Data\DFT\NONCOV\DFT_simulations\codes\results
Current scratch directory is: d:\PhD\Data\DFT\NONCOV\DFT_simulations\codes\scratch
Current molecule directory is: d:\PhD\Data\DFT\NONCOV\DFT_simulations\codes\scratch\test_structs/benzene_H2O.xyz


## Modules: OrcaAnalysis

here some description

In [3]:
# Provide files you want to process as input 
# Example: "D:\PhD\Data\DFT\NONCOV\DFT_simulations\codes\tests\data\KLaL_cation_pi_RCCE_opt.mpi8.out"
orca_output = input("Enter the path to the ORCA file you want to work with: ")
orca_output = OrcaAnalysis().convert_path(orca_output)
scratch_dir = OrcaAnalysis().convert_path(scratch_dir)
mol_dir = OrcaAnalysis().convert_path(mol_dir)

KeyboardInterrupt: Interrupted by user

In [None]:
# Working with ORCA .out files

# Count how many sequential calculations have been done
n_jobs = OrcaAnalysis().count_jobs_number(orca_output)
print(f'Number of ORCA jobs in file: {n_jobs}\n')

# Compute size of the .out file and suggest Git LFS 
size_orca_output = os.path.getsize(orca_output)
size_orca_output = size_orca_output/1024
print(f'Size of ORCA file is: {size_orca_output} KB\n')

if n_jobs > 20:
    print(f'Careful, you are working with a possibly large output file of several GB\n')
    print(f'If using version controls consider setting up a .gitignore \n')

if size_orca_output > 1000:
    print(f"Careful, you are working with a '{size_orca_output}' KB large file..\n")
    print(f'Set up a .gitignore or Git LFS before pushing to Git\n')

# Extract level of theory
lot_out = OrcaAnalysis().extract_level_of_theory(orca_output)
print(f'Level of theory for the NMR calculations is: {lot_out}\n')

# Split orca output in several subfiles for ease of handling (takes a while)
if n_jobs > 2:
    print('Your output file will be now spilt into subfiles. \n')
    #OrcaAnalysis().split_orca_output(scratch_dir, orca_output)

In [None]:
# Define the empirical boundaries ([A]) for various noncovalent interactions
OrcaAnalysis().run_boundary_checks()

In [None]:
# Initialize displacement steps in Angstrom - need to find a clever way to do this
displacement_steps_distance = [job * 0.25 for job in range(1,n_jobs+1)]

In [65]:
# Disable printing
def blockPrint():
    sys.stdout = open(os.devnull, 'w')

# Restore printing
def enablePrint():
    sys.stdout = sys.__stdout__

In [None]:
# --- Extract the CSA tensor components for each nucleus at each distance iteration --- #

# Initialize variables for shielding tensor components
S_dia = []
S_para = []
S_tot = []
nuclear_identities = []

# Extract NMR data from each splitted file
for job_number in range (1, n_jobs+1): # split files = number of jobs
        
    # Path to the splitted outputs from the .out MPI8 file
    orca_splitted_output = OrcaAnalysis().convert_path(os.path.join(scratch_dir, 'OrcaAnalysis/split_orca_output', f'splitted_orca_job{job_number}.out'))
    
    # Extract CSA data
    shielding_dia, shielding_para, shielding_tot, nucleus_info = OrcaAnalysis().extract_csa_data(orca_splitted_output)
    
    # Append shielding tensor matrices (non-diagonalized) - all nuclei for each job iteration
    S_dia.append(shielding_dia)
    S_para.append(shielding_para)
    S_tot.append(shielding_tot)
    nuclear_identities.append(nucleus_info)

# Transform into PAS 
original_shielding_tensors = []
s_iso_all = []
diagonal_mehring_all = []
diagonal_haberlen_all = []
eigenvals_all = []
eigenvecs_all = []
symmetry_all = []

# Iterate over each job's shielding tensors in S_tot
for job_index, shielding_dict in enumerate(S_tot):
    
    # Check if shielding_dict is a dictionary and contains items
    if isinstance(shielding_dict, dict):
        for nucleus_key, tensor in shielding_dict.items():
            
            # Diagonalize the tensor
            shielding_tensor, s_iso, diagonal_mehring, diagonal_haberlen, eigenvals, eigenvecs, symmetry = NMRFunctions().diagonalize_tensor(tensor)
            
            # Append the results to the lists
            original_shielding_tensors.append((nucleus_key, shielding_tensor, job_index))
            s_iso_all.append((nucleus_key, s_iso, job_index))
            diagonal_mehring_all.append((nucleus_key, diagonal_mehring, job_index))
            diagonal_haberlen_all.append((nucleus_key, diagonal_haberlen, job_index))
            eigenvals_all.append((nucleus_key, eigenvals, job_index))
            eigenvecs_all.append((nucleus_key, eigenvecs, job_index))
            symmetry_all.append((nucleus_key, symmetry, job_index))
    else:
        print(f"Error: Expected a dictionary but got {type(shielding_dict)}")

# Do the same for diamagnetic and paramagnetic


In [None]:
import os
import matplotlib.pyplot as plt

def plot_tensor_shielding(nuclear_identities, s_iso_all, diagonal_mehring_all, displacement_steps_distance, scratch_dir, n_jobs):

    tensor_plots = os.path.join(scratch_dir, 'OrcaAnalysis/test_tensor_plots')
    tensor_plots = OrcaAnalysis().convert_path(tensor_plots)
    os.makedirs(tensor_plots, exist_ok=True)

    for job_number in range (1, n_jobs+1): # split files = number of jobs
        
        s_iso = []
        s_11 = []
        s_22 = []
        s_33 = []

        for i, nucleus in enumerate(nuclear_identities):

            sigma_iso = s_iso_all[i]
            sigma_iso = sigma_iso[1]

            diag_tensor = diagonal_mehring_all[i]
            diag_tensor = diag_tensor[1]
            sigma_11 = diag_tensor[0]  
            sigma_22 = diag_tensor[1]  
            sigma_33 = diag_tensor[2] 
            
            sigma_11 = sigma_11[0]  
            sigma_22 = sigma_22[1]  
            sigma_33 = sigma_33[2] 

            s_iso.append(sigma_iso)
            s_11.append(sigma_11)
            s_22.append(sigma_22)
            s_33.append(sigma_iso)

        # Plot the isotropic shift and the diagonal tensor components
        plt.plot(displacement_steps_distance[job_number], s_iso[job_number], marker='o', linestyle='-', color='blue', label=r'$\sigma$_iso')
        plt.plot(displacement_steps_distance, s_11, marker='o', linestyle='-', color='red', label=r'$\sigma$_11')
        plt.plot(displacement_steps_distance, s_22, marker='o', linestyle='-', color='magenta', label=r'$\sigma$_22')
        plt.plot(displacement_steps_distance, s_33, marker='o', linestyle='-', color='gold', label=r'$\sigma$_33')

        # Highlight the NONCOV effective region (optional, can be commented out if not needed)
        #if min_distance_value is not None and max_distance_value is not None:
        #    plt.axvspan(min_distance_value, max_distance_value, alpha=0.2, color='grey', label='NONCOV \n effective region')

        # Set labels and title
        plt.xlabel('Displacement')
        plt.ylabel('Shielding / ppm')
        plt.title(f'Nucleus {nucleus[i]}')
        pdf_filename = os.path.join(tensor_plots, f'{nucleus[i]}.pdf')
        jpg_filename = os.path.join(tensor_plots, f'{nucleus[i]}.jpg')


    # Display legend
    plt.legend(loc='best')

    # Save the plot as a PDF in the output folder
    plt.savefig(pdf_filename, bbox_inches='tight')

    # Save the plot as a JPEG in the output folder
    plt.savefig(jpg_filename, bbox_inches='tight')        

    # Clear the current figure for the next iteration
    plt.clf()


In [None]:
import os
import matplotlib.pyplot as plt
import numpy as np

def plot_tensor_shielding(nuclear_identities, s_iso_all, diagonal_mehring_all, displacement_steps_distance, scratch_dir, n_jobs):
    
    tensor_plots = os.path.join(scratch_dir, 'OrcaAnalysis/test_tensor_plots')
    tensor_plots = OrcaAnalysis().convert_path(tensor_plots)
    os.makedirs(tensor_plots, exist_ok=True)

    if isinstance(nuclear_identities[0], list):
        nuclear_identities = [tuple(nuc) for nuc in nuclear_identities]

    nucleus_data = {nucleus: {'displacements': [], 's_iso': []} for nucleus in nuclear_identities}

    for i, nucleus in enumerate(nuclear_identities):
        sigma_iso = s_iso_all[i]
        sigma_iso = sigma_iso[1]  

        for job_number in range(1, n_jobs + 1):
            
            displacements = displacement_steps_distance[job_number - 1]  # Correct zero-based indexing
            nucleus_data[nucleus]['displacements'].append(displacements)

            s_iso = sigma_iso
            nucleus_data[nucleus]['s_iso'].append(s_iso)

    for nucleus, data in nucleus_data.items():

        for i in range(1,len(nucleus)):

            plt.figure()
            
            # Plot the progression of s_iso as a function of displacement steps
            plt.plot(data['displacements'], data['s_iso'], marker='o', linestyle='-', label=f'Nucleus {i}')
                    
            nucleus_name = nucleus.replace(' ', '_').replace(':', '')

            plt.xlabel('Displacement Steps Distance')
            plt.ylabel(r'$\sigma_\text{iso}$')
            plt.title(f'Isotropic Shielding Progression for Nucleus: {nucleus_name}')
            plt.legend()
            
            # Save the plot for the current nucleus
            plot_file = os.path.join(tensor_plots, f'{nucleus_name}_isotropic_shielding.png')
            plt.savefig(plot_file)
            plt.close()


In [None]:
import warnings
import numpy as np
warnings.simplefilter("ignore", np.ComplexWarning)

plot_tensor_shielding(nuclear_identities, s_iso_all, diagonal_mehring_all, 
                          displacement_steps_distance, scratch_dir, n_jobs)

## Modules: MolView

here some text

In [None]:
# molecule = 'D:/PhD/Data/DFT/NONCOV/DFT_simulations/codes/scratch/test_structs/benzene_H2O.xyz'
molecule = 'C:/Users/ettor/Desktop/NONCOV/scratch/test_structs/benzene_H2O.xyz'

MolView().plot_3d_molecule(molecule)

In [None]:
# Plot euler angles and rotated tensor
tensor_pas = np.diag([1.0, 2.0, 3.0])  # Diagonal tensor in PAS
alpha, beta, gamma = 30, 45, 60  # Euler angles in degrees
MolView().plot_3D_tensors_and_axes(tensor_pas, alpha, beta, gamma)


## Modules: NMRFunctions

here some text

In [37]:
import matplotlib.pyplot as plt
import pandas as pd

# Test random tensor
xx =-5.9766
xy =-65.5206
xz =-9.5073
yx =-60.3020
yy =-23.0881
yz =-28.2399
zx =-10.8928
zy =-25.2372
zz =56.277

tensor = [xx, xy, xz], [yx, yy, yz], [zx, zy, zz]
print(tensor)

# Diagonalize tensor
shielding_tensor, s_iso, diagonal_mehring, diagonal_haberlen, eigenvals, eigenvecs, symmetry = NMRFunctions().diagonalize_tensor(tensor)

sigma_11 = diagonal_mehring[0][0]
sigma_22 = diagonal_mehring[1][1]
sigma_33 = diagonal_mehring[2][2]

nuclei = []
a = []
b = []
c = []
d = []
e = []

for i in range (1,7):
    nucleus = i+1
    y = s_iso
    z = sigma_11
    j = sigma_22
    k = sigma_33
    l = symmetry

    nuclei.append(nucleus)
    a.append(y)
    b.append(z)
    c.append(j)
    d.append(k)
    e.append(l)

data = (nuclei,a,b,c,d,e)

columns = ('Nucleus', 's_iso', 'sigma_11', 'sigma_22', 'sigma_33', 'symmetry')

df = pd.DataFrame(data=data, index=columns).T
df 
df.plot()

([-5.9766, -65.5206, -9.5073], [-60.302, -23.0881, -28.2399], [-10.8928, -25.2372, 56.277])
# -------------------------------------------------- #
# TENSOR DIAGONALIZATION FUNCTION HAS BEEN REQUESTED #


Shielding Tensor is: 
[[ -5.9766 -65.5206  -9.5073]
 [-60.302  -23.0881 -28.2399]
 [-10.8928 -25.2372  56.277 ]]
Proceeding to transposing...

Transposed matrix is: 
[[ -5.9766 -60.302  -10.8928]
 [-65.5206 -23.0881 -25.2372]
 [ -9.5073 -28.2399  56.277 ]]
Proceeding to symmetrization...

Symmetric tensor is: 
[[ -5.9766  -62.9113  -10.20005]
 [-62.9113  -23.0881  -26.73855]
 [-10.20005 -26.73855  56.277  ]]

Antisymmetric tensor is. 
[[ 0.      -2.6093   0.69275]
 [ 2.6093   0.      -1.50135]
 [-0.69275  1.50135  0.     ]]

Since antisymmetric part does not contribute to observable, skipping...

Proceeding to diagonalization...

Eigenvalues are: [-83.17  43.79  66.59], Eigenvectors are: 
[[ 0.65  0.76  0.32]
 [ 0.74 -0.52 -0.47]
 [ 0.18 -0.39  0.82]]

Proceeding to ordering eigenvalue

In [66]:

# --- Extract the CSA tensor components for each nucleus at each distance iteration --- #
n_jobs = 1

# Initialize variables for shielding tensor components
S_dia = []
S_para = []
S_tot = []
nuclear_identities = []
data = []

# Extract NMR data from each splitted file
for job_number in range (1, n_jobs+1): # split files = number of jobs
        
    blockPrint()
    # Path to the splitted outputs from the .out MPI8 file
    orca_splitted_output = OrcaAnalysis().convert_path(os.path.join(scratch_dir, 'OrcaAnalysis/split_orca_output', f'splitted_orca_job{job_number}.out'))
    
    # Extract CSA data
    shielding_dia, shielding_para, shielding_tot, nucleus_info = OrcaAnalysis().extract_csa_data(orca_splitted_output)

    enablePrint()
    
    # Append shielding tensor matrices (non-diagonalized) - all nuclei for each job iteration
    S_dia.append(shielding_dia)
    S_para.append(shielding_para)
    S_tot.append(shielding_tot)
    nuclear_identities.append(nucleus_info)

# Transform into PAS 
original_shielding_tensors = []
s_iso_all = []
diagonal_mehring_all = []
diagonal_haberlen_all = []
eigenvals_all = []
eigenvecs_all = []
symmetry_all = []
df = []

# Iterate over each job's shielding tensors in S_tot
for job_index, shielding_dict in enumerate(S_tot):
    
    # Check if shielding_dict is a dictionary and contains items
    if isinstance(shielding_dict, dict):

        for nucleus_key, tensor in shielding_dict.items():
            
            # Diagonalize the tensor
            shielding_tensor, s_iso, diagonal_mehring, diagonal_haberlen, eigenvals, eigenvecs, symmetry = NMRFunctions().diagonalize_tensor(tensor)
            
            data2 = (nucleus_key, s_iso, diagonal_mehring[0][0], symmetry)
            df2 = pd.DataFrame(data=data2, index=columns).T
            df.append(df2)

            # Append the results to the lists
            original_shielding_tensors.append((nucleus_key, shielding_tensor, job_index))
            s_iso_all.append((nucleus_key, s_iso, job_index))
            diagonal_mehring_all.append((nucleus_key, diagonal_mehring, job_index))
            diagonal_haberlen_all.append((nucleus_key, diagonal_haberlen, job_index))
            eigenvals_all.append((nucleus_key, eigenvals, job_index))
            eigenvecs_all.append((nucleus_key, eigenvecs, job_index))
            symmetry_all.append((nucleus_key, symmetry, job_index))
    else:
        print(f"Error: Expected a dictionary but got {type(shielding_dict)}")


columns = ('Nucleus', 'sdia', 'spara', 'stot')

df 

#df.plot()


[        Nucleus   sdia  spara stot
 0  Nucleus 3H :  27.48  23.16    0,
         Nucleus        sdia       spara stot
 0  Nucleus 4H :  (26.93+0j)  (23.25+0j)    0,
         Nucleus   sdia  spara stot
 0  Nucleus 7H :  24.21  20.22    0,
         Nucleus   sdia  spara stot
 0  Nucleus 9H :  24.76  20.51    0,
          Nucleus   sdia  spara stot
 0  Nucleus 11H :  24.16  20.24    0,
          Nucleus       sdia       spara stot
 0  Nucleus 14H :  (26.3+0j)  (22.93+0j)    0,
          Nucleus  sdia  spara stot
 0  Nucleus 15H :  27.7  23.25    0,
          Nucleus  sdia  spara stot
 0  Nucleus 18H :  23.5  19.86    0,
          Nucleus   sdia  spara stot
 0  Nucleus 20H :  23.78  20.01    0,
          Nucleus   sdia spara stot
 0  Nucleus 22H :  23.78  19.8    0,
          Nucleus   sdia  spara stot
 0  Nucleus 29H :  27.36  22.97    0,
          Nucleus        sdia       spara stot
 0  Nucleus 30H :  (27.02+0j)  (23.25+0j)    0,
          Nucleus   sdia spara stot
 0  Nucleus 33H :  2

In [None]:
# Tensor to Euler angles
mode = 'AZYZ'
order = 'Ascending'
alpha, beta, gamma, tensor_pas = NMRFunctions().tensor_to_euler(shielding_tensor, eigenvals, eigenvecs, symmetry, mode, order)

In [None]:
# Generate equivalent sets of angles
NMRFunctions().EqEulerSet(alpha,beta,gamma)

## Modules: MolecularGraph

In [None]:
def main(molecule_path):

    mol_graph = MolecularGraph()

    # Parse the XYZ file
    atom_types, coordinates = mol_graph.parse_xyz(molecule_path)
    
    # Calculate pairwise distances
    distances = mol_graph.calculate_distances(coordinates)
    
    # Detect covalent bonds
    covalent_bonds = mol_graph.detect_bonds(atom_types, distances)
    
    # Detect non-covalent interactions
    noncovalent_interactions = mol_graph.detect_noncovalent_interactions(atom_types, distances)
    
    # Build the molecular graph
    #mol_graph = mol_graph.build_molecular_graph(atom_types, coordinates, covalent_bonds, noncovalent_interactions)
    
    # Visualize the molecular graph
    #mol_graph.draw()

    # Plots 
    mol_graph.plot_bond_dist_matrix(covalent_bonds, distances, atom_types)
    mol_graph.plot_noncov_distance_map(noncovalent_interactions, atom_types)

    # Build different graphs
    covalent_bonds_graph = mol_graph.build_covalent_bonds_graph(atom_types, coordinates, covalent_bonds)
    intramolecular_graph = mol_graph.build_intramolecular_graph(atom_types, coordinates, covalent_bonds, noncovalent_interactions)
    intermolecular_graph = mol_graph.build_intermolecular_graph(atom_types, coordinates, noncovalent_interactions)

    # Draw subplots while preserving atom positions
    mol_graph.draw_subplots(covalent_bonds_graph, intramolecular_graph, intermolecular_graph, coordinates)




threshold = 1.6

current_dir = os.getcwd()
print(f'Current working directory is: {current_dir}')
#molecule = os.path.join(current_dir, 'scratch/test_structs/caffeine.xyz')

molecule = 'D:/PhD/Data/DFT/NONCOV/DFT_simulations/codes/scratch/test_structs/benzene_H2O.xyz'

main(molecule)


## Modules: DistanceScanner & RotationScanner

In [None]:
# WORKING WITH RELATIVE PATHS 
current_dir = os.getcwd()
print(current_dir)

# LOGS AND ERRORS
error_log_file = 'error_log_file.txt' # to finish
log_file = 'log_file.txt' # to finish


# START TIMER: COMPUTE EFFECTIVE WALL TIME
start = timer() # this is not in [sec] i think

# SECTON: MAIN
def main():

    # Relative paths
    start_structure = os.path.join(current_dir, 'input_structures/KLaL_cation_pi_RCCE_opt_NICS.xyz')
    centroid_out = os.path.join(current_dir, 'centroid_output/centroid_file.xyz')
    input_file = os.path.join(current_dir, 'input_file/input_file.txt')

    # Read xyz file: this should be either a fully optimized geometry or one with relaxed H
    coordinates, atom_identities = read_atomic_coord(start_structure)
    print(f'Starting coordinates: {coordinates}')
    print(f'Atom identities: {atom_identities}')

    # Assign coordinates to molecular fragments, check nomenclature of your atoms in avogadro or any other molecular graphics soft
    coords1, coords2 = assign_molecule_fragments(coordinates, input_file)

    # Concatenate coordinates for k-means clustering
    all_coords = np.concatenate((coords1, coords2), axis=0)
    # print(f'All coords: {all_coords}')

    # Count how many fragments you have defined in the input file, important for accurate K-means clustering
    n_fragments = count_fragments(input_file)
    print(f"Number of '$fragment' occurrences: {n_fragments}")

    # Perform k-means clustering to compute centroids
    kmeans = KMeans(n_clusters=n_fragments) # K-means clusters = number of centroids = number of fragments
    kmeans.fit(all_coords)
    centroids = kmeans.cluster_centers_

    # Compute centroids for each fragment
    fragment_centroids = calculate_centroids([coords1, coords2])

    # Write centroid coordinates to file
    write_centroids(centroid_out, fragment_centroids)
    print(f'Centroid coordinates: {fragment_centroids}')

    # Calculate displacement direction (line connecting centroids)
    displacement_direction = centroids[1] - centroids[0]
    displacement_direction /= np.linalg.norm(displacement_direction)
    print(f'Displacement direction:{displacement_direction}')

    # Read displacement step size from input file
    displacement_step = None
    with open(input_file, 'r') as f:
        lines = f.readlines()
        read_displacement = False
        for line in lines:
            if read_displacement:
                displacement_values = line.strip().split()
                if displacement_values:
                    displacement_step = float(displacement_values[0])
                    break
            elif line.strip() == "$displacement":
                read_displacement = True

    if displacement_step is None:
        print('ERROR: displacement step size not found in input file, please specify it! Syntax => $displacement + number')
        return
    print(f'Displacement step is: {displacement_step}') # please doublecheck that it is the same value you defined in the input

    # Displace the first fragment iteratively and save each structure
    displaced_fragment_coords = coords1.copy()  # Make a copy of the original coordinates of the fragment that is displaced
    print(f'Original coordinates displaced fragment:', displaced_fragment_coords)

    # Initialize the coordinates for the fixed fragment (e.g., coords2)
    coords_fixed = coords2.copy() # make a copy of the fixed fragment coordinates to append to the displaced ones
    print(f'Original coordinates fixed fragment:', coords_fixed)

    all_displaced_fragment_coords = [displaced_fragment_coords]  # List to store all displaced structures

    # Combine displaced coordinates with original ones
    all_combined_coords = [np.concatenate((coords_fixed, displaced_fragment_coords), axis=0)]  # List to store all combined structures

    fragment_centroids = [fragment_centroids[0]]  # List to store all centroids

    # Dissociation limit NEED AT LEAST 40 OF THEM MINIMUM
    diss_lim = 50 # change with the output value in agnstrom from func(dissociation_limit)

    for i in range(1, diss_lim):  # Iterate 50 times (adjust the number as needed) put this as to be the dissociation limit of each DFT run
        
        displacement_vector = [] 

        # Compute new set of coordinates for displaced fragments, change $displacement value in input file to tune the displacement
        displaced_fragment_coords = displace_fragment(coords1, displacement_direction, displacement_step, i)
        #print(f'Displaced fragment coord is: {displaced_fragment_coords}')

        combined_coords = np.concatenate((coords_fixed, displaced_fragment_coords), axis=0)
        all_combined_coords.append(combined_coords)

        # Update centroids for the displaced structure
        fragment_centroid = calculate_centroids([displaced_fragment_coords])
        fragment_centroids.append(fragment_centroid[0])
        print(f'Updated centroids:', fragment_centroid)

        # Write displaced structure to file
        output_file = Path(os.path.join(current_dir, f'displaced_structures/displaced_structure_{i}.xyz'))
        write_displaced_xyz_file(output_file, coords_fixed, displaced_fragment_coords, atom_identities)

        all_displaced_fragment_coords.append(displaced_fragment_coords)

        # Compute distance between the fixed fragment centroid and all the atoms from the displaced fragment
        centroid_to_displaced_distance = compute_distance_from_centroid(displaced_fragment_coords, centroids)
        print(f'Distance between displaced coordinates and centroid is: {centroid_to_displaced_distance}')

        # Write distances to file - needed for DFT calculations outputs
        distance_output_file = Path(os.path.join(current_dir, f'distance_files/distances_structures_{i}.xyz'))
        write_distances_file(distance_output_file, displaced_fragment_coords, centroid_to_displaced_distance, atom_identities, displacement_step)


    # Plot initial topology for molecular fragments and centroids
    fig = plot_starting_molecular_fragments(coords1, coords2, centroids)

    # Generate colors for the plots based on displacement iteration
    num_iterations = len(all_displaced_fragment_coords)
    colors = plt.cm.viridis(np.linspace(0.2, 1.0, num_iterations))

    # Plot displaced molecular fragments and centroids
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    # Plot original fragments and centroids
    ax.scatter(coords1[:, 0], coords1[:, 1], coords1[:, 2], color=colors[0], label='Molecule 1 (Original)')
    ax.scatter(coords2[:, 0], coords2[:, 1], coords2[:, 2], color=colors[0], label='Molecule 2 (Original)')
    ax.scatter(centroids[:, 0], centroids[:, 1], centroids[:, 2], color=colors[0], marker='x', s=100, label='Centroids (Original)')

    # Plot displaced fragments and centroids
    for i, displaced_coords in enumerate(all_displaced_fragment_coords[1:], start=1):
        color = colors[i]
        label = f'Iteration {i}'
        ax.scatter(displaced_coords[:, 0], displaced_coords[:, 1], displaced_coords[:, 2], color=color, label=label)
        ax.scatter(fragment_centroids[i][0], fragment_centroids[i][1], fragment_centroids[i][2], color=color, marker='x', s=100, label=f'Centroids ({label})')

    ax.legend()
    plt.show()


    # END TIMER: STOP TIMER AND PRINT
    elapsed_time = timer() - start  # in seconds
    print(f'Elapsed time for the code to run is: {elapsed_time}')


if __name__ == '__main__':
    main()


## Modules: GenerateMLDataset

In [None]:
# def main():

# current_dir = os.getcwd()
# print(f'Current working directory is: {current_dir}')

# root_directory = os.path.join(current_dir, 'Machine_learning/raw')
# print(f'Dataset root directory is: {root_directory}')

# output_csv_path = os.path.join(current_dir, 'Machine_learning/datasets/model_structures/test.csv')
# print(f'Dataset directory is: {output_csv_path}')

# generate_dataset = GenerateMLDataset(root_directory, output_csv_path)
# generate_dataset.search_files()

# if __name__ == "__main__":
# main()

## Modules: AminoStat

In [None]:
# # Example usage
# toolbox = NONCOVToolbox()
# amino_stats = toolbox.AminoStat()

# # Example usage
# current_dir = os.getcwd()

# protein_sequence = os.path.join(current_dir, 'scratch/amino_acid_stats/spidersilks.txt')
# spaced_sequence = os.path.join(current_dir, 'scratch/amino_acid_stats/spaced_spidersilks.txt')
# count_file = os.path.join(current_dir, 'scratch/amino_acid_stats/silks_amino_acid_count.txt')
# plot_file = os.path.join(current_dir, 'scratch/amino_acid_stats/silks_amino_acid_statistics.pdf')

# #amino_stats = AminoStat()

# amino_stats.space_prot_seq(protein_sequence, spaced_sequence)
# amino_stats.count_amino_acids(spaced_sequence, count_file)
# amino_stats.plot_amino_acid_statistics(count_file, plot_file)

# amino_stats.define_protein_domains()


## Preliminary results