In [None]:
# ------ SIMULATION VALIDATOR ------
# Before running analysis it's important to check that all simulations have run as expected
# ----------------------------------

expected_number_of_dump_files_per_sim = 96             # =(total number of timesteps + 1)/100

# This expects the following file structure:
# <unique_key>/
#         └── NVT/
#               ├── dump_custom.C.00000.dat
#               ├── dump_custom.C.00001.dat
from pathlib import Path

root_directory = Path("/u/vld/scat9451/main_project/")
def simulation_validator(directory, expected_files=None):
    """Validate every simulation found under *directory* and print a report.

    Two checks are performed:

    1. Each ``NVT`` directory must contain exactly ``expected_files`` files,
       all of whose names start with ``dump_custom``.
    2. Each ``*.log`` file is scanned for lines containing ``ERROR`` or
       ``WARNING``; a log with an ``ERROR`` line marks its simulation invalid.

    Args:
        directory: Root directory that is searched recursively.
        expected_files: Number of files every ``NVT`` directory should hold.
            Defaults to the module-level
            ``expected_number_of_dump_files_per_sim``.

    Returns:
        None. All findings are printed.
    """
    if expected_files is None:
        expected_files = expected_number_of_dump_files_per_sim

    directory = Path(directory)

    errors = set()
    warnings = set()
    # Directories (not counters) are recorded so that a simulation failing
    # several checks is only counted once in the final total.
    invalid_dirs = set()

    # Check number of files and file names
    for nvt_dir in directory.rglob("NVT"):
        if not nvt_dir.is_dir():
            continue

        sim_dir = nvt_dir.parent
        unique_key = sim_dir.name

        all_files = [f for f in nvt_dir.iterdir() if f.is_file()]
        total = len(all_files)

        if total != expected_files:
            print(f"{total} files found in {unique_key}")
            invalid_dirs.add(sim_dir)

        unrecognized_files = [f for f in all_files if not f.name.startswith("dump_custom")]
        if unrecognized_files:
            print(f"Unrecognized files found in {unique_key}")
            print(unrecognized_files)
            invalid_dirs.add(sim_dir)

    # Check for ERROR and WARNING messages in .log files
    for log_file in directory.rglob("*.log"):
        if not log_file.is_file():
            # A directory that happens to be called "*.log" cannot be opened.
            continue

        has_error = False
        has_warning = False
        unique_key = log_file.parent.name

        with log_file.open("r", encoding="utf-8", errors="ignore") as f:
            for line in f:
                if "ERROR" in line:
                    has_error = True
                    break  # stop reading; error takes priority
                if "WARNING" in line:
                    has_warning = True

        if has_error:
            errors.add(unique_key)
            invalid_dirs.add(log_file.parent)
        elif has_warning:
            warnings.add(unique_key)

    # --- Print results nicely ---
    rule = "─" * 28
    print("\n🔍 Log scan summary")
    print(rule)

    if errors:
        print(f"❌ Simulations displaying errors ({len(errors)}):")
        for e in sorted(errors):
            print(f"   - {e}")
    else:
        print("✅ No simulations displaying errors.")

    if warnings:
        print(f"\n⚠️  Simulations displaying warnings ({len(warnings)}):")
        for w in sorted(warnings):
            print(f"   - {w}")
    else:
        print("\n✅ No simulations displaying warnings.")

    print(rule)
    print(f"Total invalid simulations: {len(invalid_dirs)}")

simulation_validator("/u/vld/scat9451/main_project/LAMMPS_simulations")

In [None]:
#--------- ANALYSIS SCRIPT---------------

# This script searches the LAMMPS_simulations directory and 
# 1. Identifies simulations that have not yet been analysed
# 2. Creates ovito files and renderings
# 3. Runs the following analyses as functions of density :
#   a) % of sp, sp2, sp3 environments 
#   c) Histogram of ring sizes for a given density
#   d) Frequency plots for n-mem rings
#   e) Radial Distribution Functions
#   f) Potential Energy
#   g) Bond Length

# # -------- OPTIONAL WAIT FUNCTION TO ALLOW FOR AUTOMATED RUNNING ---------
# import subprocess
# import time
# WAIT_TIME = 600  # seconds
# USER = "scat9451"

# while True:
#     job_status = subprocess.run(["qstat", "-u", USER], capture_output=True, text=True)
#     if not job_status.stdout.strip():  # empty means no jobs
#         break
#     print("Active jobs found - waiting ...")
#     time.sleep(WAIT_TIME)

# print("No active jobs found - proceeding ...")
# # ---------------------------------------------------------------------

import re
import numpy as np
import ovito
from ovito.io import import_file
from ovito.modifiers import CreateBondsModifier, FindRingsModifier, CoordinationAnalysisModifier, ColorCodingModifier, BondAnalysisModifier
from ovito.vis import Viewport, TachyonRenderer, ColorLegendOverlay, BondsVis
from ovito.qt_compat import QtCore

# ------ MAKE NEW DIRECTORIES ------
from pathlib import Path
import re
from collections import defaultdict

# Output locations, all rooted in the current working directory
cwd = Path.cwd()
analysis_dir = cwd / "Analysis"
ovito_dir = analysis_dir / "Ovito"
structural_analysis_dir = analysis_dir / "Structural Analysis"

# Create the parent first: mkdir() is called without parents=True
analysis_dir.mkdir(exist_ok=True)
ovito_dir.mkdir(exist_ok=True)
structural_analysis_dir.mkdir(exist_ok=True)
# ----------------------------------

# ------  IMPORT SIMULATION DATA ------
# 1. Searches recursively through the specified directory
# 2. Creates a dictionary sorted_imported_simulation_files = {unique_key: [sorted list of dump_file path objects]} 
# 3. This can be loaded like so: 
#   a) first item: unique_key, dump_file = next(iter(imported_simulation_files.items()))
#   b) loop through all items: for unique_key, dump_files in imported_simulation_files.items():

# NOTE: The unique_key is taken from the name of the grandparent directory of the dump files
# This function expects the following file structure, "dump_custom.C.00000.dat" file-name regex and unique_key regex:
#
# <unique_key>/
#         └── NVT/
#               ├── dump_custom.C.00000.dat
#               ├── dump_custom.C.00001.dat
root_directory = Path("/u/vld/scat9451/main_project/")
def import_simulation_data(directory):
    """Collect LAMMPS dump files grouped by simulation.

    *directory* is searched recursively for files named
    ``dump_custom.C.<index>.dat``. A file is accepted only when its parent
    directory is called ``NVT`` and the grandparent directory name matches the
    unique_key pattern; every rejected dump file is reported and counted.

    Args:
        directory: Root directory to search (str or Path).

    Returns:
        dict mapping unique_key -> list of dump-file Paths, ordered by the
        numeric index embedded in the file name.
    """
    dump_file_name = re.compile(r"^dump_custom\.C\.(\d+)\.dat$")  # Dump file regex
    unique_key_pattern = re.compile(r"^[A-Za-z]+_[A-Za-z0-9]+_[A-Za-z]+_\d+_\d+(?:\.\d+)?_\d+$")  # Unique key regex

    directory = Path(directory)

    imported_simulation_files = defaultdict(list)  # unique_key -> [(index, path), ...]

    imported_files_counter = 0
    skipped_files_counter = 0

    for path in directory.rglob("*"):

        if not path.is_file():  # Filters for files not directories
            continue

        m = dump_file_name.match(path.name)  # Enforce dump_file file naming
        if not m:
            continue

        parent = path.parent
        if parent.name != "NVT":  # Enforce NVT directory naming
            skipped_files_counter += 1
            print(f"ERROR: Parent directory for {path}, {parent} is not equal to NVT")
            continue

        grandparent = parent.parent
        # An empty name can never match the unique_key pattern, so this check
        # has to come first for its message to be reachable.
        if not grandparent.name:
            skipped_files_counter += 1
            print(f"ERROR: No grandparent directory for {path}")
            continue
        if not unique_key_pattern.match(grandparent.name):  # Enforce unique_key naming
            skipped_files_counter += 1
            print(f"ERROR: Invalid unique_key name format '{grandparent.name}'")
            continue

        numeric_index = int(m.group(1))
        imported_simulation_files[grandparent.name].append((numeric_index, path))
        imported_files_counter += 1

    # Sort each list by numeric index and drop the index from the final structure
    sorted_imported_simulation_files = {
        key: [p for _, p in sorted(items, key=lambda pair: pair[0])]
        for key, items in imported_simulation_files.items()
    }

    if imported_files_counter:
        print(f"Imported {imported_files_counter} dump files")
    if skipped_files_counter:
        print(f"Skipped {skipped_files_counter} dump files due to errors")

    return sorted_imported_simulation_files

imported_simulation_files = import_simulation_data("LAMMPS_simulations")
print(f"Imported {len(imported_simulation_files)} LAMMPS simulation files")

# Sets up an empty pipeline for each successive function to use
def empty_ovito_pipeline(imported_simulation_files):
    """Reset the OVITO scene and return a pipeline without modifiers.

    Every pipeline currently in ``ovito.scene`` is removed, then a new
    pipeline is created from the dump files of the first simulation in
    *imported_simulation_files*.

    Raises:
        ValueError: if the mapping is empty, or if the first simulation has
            no dump files.
    """
    # Iterate over a snapshot because removal changes the scene's pipeline list
    stale_pipelines = list(ovito.scene.pipelines)
    for stale in stale_pipelines:
        stale.remove_from_scene()

    if not imported_simulation_files:
        raise ValueError("No datafiles provided to empty_ovito_pipeline()")

    # The first entry seeds the pipeline
    first_key = next(iter(imported_simulation_files))
    first_dump_files = imported_simulation_files[first_key]

    if not first_dump_files:
        raise ValueError(f"No dump files found for simulation '{first_key}'")

    return import_file(first_dump_files)
pipeline = empty_ovito_pipeline(imported_simulation_files)

# Data visualisation in Ovito
def ovito_analysis(data_dict, pipeline):
    """Write an .ovito session, a .png image and an .avi animation per simulation.

    Only the first repeat of each simulation (unique_key ending in ``_1``) is
    processed. Each of the three outputs is checked independently: an output
    is produced only when no file of that name exists anywhere below
    ``analysis_dir``. New files are written directly into ``analysis_dir``.

    The bond, coordination and colour-coding modifiers added here are removed
    from *pipeline* again before returning, including when rendering fails.

    Args:
        data_dict: Mapping unique_key -> list of dump files.
        pipeline: OVITO pipeline whose source is re-pointed at each simulation.

    Raises:
        ValueError: if *data_dict* is empty.
    """
    if not data_dict:
        raise ValueError("No datafiles provided to ovito_analysis()")

    # ------- ANALYSIS OF IMPORTED FILES ------------
    # BUG: Image and video renderers error with:
    # "RuntimeError: Visual element 'Rings' reported an error:Failed to build non-periodic representation of periodic surface mesh. Periodic domain might be too small." if ring mod is included.

    # Bond Modifier and Visuals
    bond_modifier = CreateBondsModifier(cutoff=1.85)
    bond_modifier.vis.width = 0.15
    bond_modifier.vis.coloring_mode = BondsVis.ColoringMode.Uniform
    bond_modifier.vis.color = (0.5, 0.5, 0.5)

    # Coordination Modifier and Colour Coding
    coordination_mod = CoordinationAnalysisModifier(cutoff=1.85)
    colour_coding_mod = ColorCodingModifier(property="Coordination",start_value=1.0,end_value=4.0,gradient=ColorCodingModifier.Viridis(),discretize_color_map=True)

    pipeline.modifiers.append(bond_modifier)
    pipeline.modifiers.append(coordination_mod)
    pipeline.modifiers.append(colour_coding_mod)

    output_suffixes = (".ovito", ".png", ".avi")
    skipped = dict.fromkeys(output_suffixes, 0)
    created = dict.fromkeys(output_suffixes, 0)

    # Anchored on the underscore so that run 11, 21, ... are not mistaken for run 1
    first_repeat = re.compile(r"_1$")

    try:
        # Add to Scene
        pipeline.add_to_scene()

        # Viewing settings
        vp = Viewport()
        vp.type = Viewport.Type.Perspective

        # Coordination Legend
        legend = ColorLegendOverlay(
            title = "Coordination",
            modifier = colour_coding_mod,
            alignment = QtCore.Qt.AlignmentFlag.AlignHCenter | QtCore.Qt.AlignmentFlag.AlignBottom,
            orientation = QtCore.Qt.Orientation.Horizontal,
            font_size = 0.1,
            format_string = '%.0f'
            )
        vp.overlays.append(legend)

        tachyon = TachyonRenderer(shadows=False, direct_light_intensity=1.1)

        for unique_key, dump_files in data_dict.items():

            if not first_repeat.search(unique_key):  # Only renders the first repeat
                continue

            # Work out which outputs are still missing. Each output is checked
            # on its own, so an existing .ovito file no longer prevents a
            # missing .png or .avi from being rendered.
            pending = []
            for suffix in output_suffixes:
                target = analysis_dir / f"{unique_key}{suffix}"
                if any(analysis_dir.rglob(target.name)):
                    skipped[suffix] += 1
                else:
                    pending.append((suffix, target))

            if not pending:
                continue

            # Load the simulation once for all outputs that need it
            pipeline.source.load(dump_files)
            final_frame = max(0, pipeline.source.num_frames - 1)

            # Set particle scaling (datafile specific)
            data = pipeline.compute(frame = final_frame)
            data.particles.vis.scaling = 0.3

            # Set Zoom
            vp.zoom_all()

            for suffix, target in pending:
                if suffix == ".ovito":
                    ovito.scene.save(target)
                elif suffix == ".png":
                    vp.render_image(size=(1920,1080),
                                    filename=str(target),
                                    background=(1,1,1),
                                    frame=final_frame,
                                    renderer=tachyon)
                else:
                    vp.render_anim(size=(1920,1080),
                                   filename=str(target),
                                   fps=10,
                                   renderer=tachyon)
                created[suffix] += 1
    finally:
        # Remove the three modifiers appended above so the shared pipeline is
        # handed back clean even if loading or rendering raised.
        pipeline.modifiers.pop()
        pipeline.modifiers.pop()
        pipeline.modifiers.pop()

    # Print Skipped/Created files
    for suffix in output_suffixes:
        if skipped[suffix]:
            print(f"Skipped {skipped[suffix]} existing {suffix} files")
    for suffix in output_suffixes:
        if created[suffix]:
            print(f"Created {created[suffix]} {suffix} files")


# ------ DATA GENERATION FUNCTIONS ------
# file_analysis(data_dict, pipeline, data_tag, data_function):
#   1. uses imported_simulation_files from import_simulation_data()
#   2. uses the pipeline from empty_ovito_pipeline(): no modifiers by default
#   3. checks whether the output file already exists anywhere under the "Analysis" directory
#   4. loads the dump files of each simulation into the existing pipeline
#   5. computes the requested data object for the final frame of each simulation and saves it to a file named "<unique_key>_<data_tag>"
#   NOTE:
#       a) requires "data_tag": e.g. "bond_length.txt" or "RDF.txt" (include the file suffix, e.g. ".txt")
#       b) "data_function" is a function that takes the computed ovito data and returns the desired data object
#               e.g. "data.particles['Coordination']" or "data.tables['coordination-rdf'].xy()" or "data.particles['c_pea']"
#       c) use the "lambda data:" syntax to create a throwaway function
#               e.g. file_analysis(data_dict, pipeline, "ring.txt", lambda data: data.tables["ring-size-histogram"].xy())

def file_analysis(data_dict, pipeline, data_tag, data_function):
    """Evaluate *data_function* on the final frame of every simulation and save it.

    For each ``unique_key`` the dump files are loaded into *pipeline*, the
    last frame is computed and ``data_function(data)`` is written with
    ``np.savetxt`` to ``analysis_dir / "<unique_key>_<data_tag>"``.
    A simulation is skipped when a file with that name already exists
    anywhere below ``analysis_dir``, unless ``REPLACE_OLD_FILES`` is set.

    Raises:
        ValueError: if *data_dict* is empty.
    """
    if not data_dict:
        raise ValueError("No datafiles provided")

    n_skipped = 0
    n_created = 0

    for unique_key, dump_files in data_dict.items():
        output_path = analysis_dir / f"{unique_key}_{data_tag}"

        # The search is recursive because finished files may have been sorted
        # into sub-folders of analysis_dir
        already_present = any(analysis_dir.rglob(output_path.name))
        if already_present and not REPLACE_OLD_FILES:
            n_skipped += 1
            continue

        # Point the pipeline at this simulation and evaluate its last frame
        pipeline.source.load(dump_files)
        last_frame = max(0, pipeline.source.num_frames - 1)
        snapshot = pipeline.compute(frame=last_frame)

        np.savetxt(output_path, data_function(snapshot), delimiter=",", fmt="%.6f")
        n_created += 1

    # Summary
    if n_skipped:
        print(f"Skipped {n_skipped} existing {data_tag} files")
    if n_created:
        print(f"Created {n_created} {data_tag} files")

def list_attributes(pipeline):
    """Print the name of every per-particle property the pipeline produces."""
    computed = pipeline.compute()
    print("Per Particle Attributes:")
    property_names = computed.particles.keys()
    for name in property_names:
        print(name)

def ring_analysis(data_dict, pipeline, min_ring_size, max_ring_size, bond_length):
    """Save the ring-size histogram of every simulation to "<unique_key>_ring.txt".

    Args:
        data_dict: Mapping unique_key -> list of dump files.
        pipeline: Shared OVITO pipeline; it is returned without the modifiers
            added here, even if the analysis raises.
        min_ring_size: Passed to FindRingsModifier as ``minimum_ring_size``.
        max_ring_size: Passed to FindRingsModifier as ``maximum_ring_size``.
        bond_length: Cutoff used by CreateBondsModifier.
    """
    # Build both modifiers before touching the pipeline so a constructor
    # error cannot leave it half-configured.
    bond_modifier = CreateBondsModifier(cutoff=bond_length)
    ring_mod = FindRingsModifier(minimum_ring_size=min_ring_size, maximum_ring_size=max_ring_size)

    pipeline.modifiers.append(bond_modifier)
    pipeline.modifiers.append(ring_mod)

    try:
        # Analysis
        file_analysis(data_dict, pipeline, "ring.txt", lambda data: data.tables["ring-size-histogram"].xy())
    finally:
        # Remove Modifiers: the pipeline is shared with the other analyses
        pipeline.modifiers.pop()
        pipeline.modifiers.pop()

def coordination_analysis(data_dict, pipeline, coordination_cutoff):
    """Save per-particle coordination numbers to "<unique_key>_coordination.txt".

    Args:
        data_dict: Mapping unique_key -> list of dump files.
        pipeline: Shared OVITO pipeline; the modifier added here is removed
            again even if the analysis raises.
        coordination_cutoff: Cutoff passed to CoordinationAnalysisModifier.
    """
    # Coordination Analysis Modifier
    coord_mod = CoordinationAnalysisModifier(cutoff=coordination_cutoff)
    pipeline.modifiers.append(coord_mod)

    try:
        # Analysis
        file_analysis(data_dict, pipeline, "coordination.txt", lambda data: data.particles['Coordination'])
    finally:
        # Remove Modifier: the pipeline is shared with the other analyses
        pipeline.modifiers.pop()
    
def energy_analysis(data_dict, pipeline):
    """Save the per-particle "c_pea" property to "<unique_key>_potential_energy.txt".

    The property is read straight from the dump data, so no modifier is added.
    """
    def extract_energy(data):
        return data.particles["c_pea"]

    file_analysis(data_dict, pipeline, "potential_energy.txt", extract_energy)

def RDF_analysis(data_dict, pipeline, RDF_cutoff, bins):
    """Save the radial distribution function table to "<unique_key>_RDF.txt".

    Args:
        data_dict: Mapping unique_key -> list of dump files.
        pipeline: Shared OVITO pipeline; the modifier added here is removed
            again even if the analysis raises.
        RDF_cutoff: Cutoff passed to CoordinationAnalysisModifier.
        bins: Passed to CoordinationAnalysisModifier as ``number_of_bins``.
    """
    # Coordination Analysis Modifier for RDF
    RDF_coord_mod = CoordinationAnalysisModifier(cutoff=RDF_cutoff, number_of_bins=bins)
    pipeline.modifiers.append(RDF_coord_mod)

    try:
        # Analysis
        file_analysis(data_dict, pipeline, "RDF.txt", lambda data: data.tables['coordination-rdf'].xy())
    finally:
        # Remove Modifier: the pipeline is shared with the other analyses
        pipeline.modifiers.pop()

def bond_length_analysis(data_dict, pipeline, bins, bond_length, bond_length_analysis_cutoff):
    """Save the bond-length distribution table to "<unique_key>_bond_length.txt".

    Args:
        data_dict: Mapping unique_key -> list of dump files.
        pipeline: Shared OVITO pipeline; the modifiers added here are removed
            again even if the analysis raises.
        bins: Passed to BondAnalysisModifier as ``bins``.
        bond_length: Cutoff used by CreateBondsModifier.
        bond_length_analysis_cutoff: Passed to BondAnalysisModifier as
            ``length_cutoff``.
    """
    # Build both modifiers before touching the pipeline so a constructor
    # error cannot leave it half-configured.
    bond_modifier = CreateBondsModifier(cutoff=bond_length)
    bond_analysis_mod = BondAnalysisModifier(bins = bins, length_cutoff=bond_length_analysis_cutoff)

    pipeline.modifiers.append(bond_modifier)
    pipeline.modifiers.append(bond_analysis_mod)

    try:
        # Analysis
        file_analysis(data_dict, pipeline, "bond_length.txt", lambda data: data.tables["bond-length-distr"].xy())
    finally:
        # Remove Modifiers: the pipeline is shared with the other analyses
        pipeline.modifiers.pop()
        pipeline.modifiers.pop()
 
def force_analysis(data_dict, pipeline):
    """Save the per-particle "Force" property to "<unique_key>_forces.txt".

    The property is read straight from the dump data, so no modifier is added.
    """
    def extract_forces(data):
        return data.particles["Force"]

    file_analysis(data_dict, pipeline, "forces.txt", extract_forces)

# ----- POSSIBLE ANALYSIS ------
# 2. Bond Angle
# 3. Conditional analysis (i.e. for sp/sp2/sp3 individually)
# 4. Young's Modulus
# 5. Coordination for each frame in a given sim plotted against simulation time
# ------------------------------

# -----------------------
# Use carefully - will regenerate ALL files (apart from renders)
REPLACE_OLD_FILES = False

# Replacing is destructive, so it only stays enabled after an explicit "y"
if REPLACE_OLD_FILES:
    confirm = input("Are you sure you want to replace old files? (y/n): ").strip().lower()
    REPLACE_OLD_FILES = confirm == "y"
# -----------------------

#list_attributes(pipeline)
# force_analysis(imported_simulation_files, pipeline)
# bond_length_analysis(imported_simulation_files, pipeline, bins=1000, bond_length = 1.85, bond_length_analysis_cutoff=2.0)
# RDF_analysis(imported_simulation_files, pipeline, RDF_cutoff=6.0, bins=200)
# ring_analysis(imported_simulation_files, pipeline, min_ring_size=3, max_ring_size=24, bond_length=1.85)
# coordination_analysis(imported_simulation_files, pipeline, coordination_cutoff=1.85)
# energy_analysis(imported_simulation_files, pipeline)
# ovito_analysis(imported_simulation_files, pipeline)

In [None]:
# ----- FILE ORGANISER -----
import re, shutil
from pathlib import Path

# Assign Directories
cwd = Path.cwd()

analysis_dir = cwd / "Analysis"
analysis_dir.mkdir(parents=True,exist_ok=True)

ovito_dir = analysis_dir / "Ovito"
ovito_dir.mkdir(parents=True,exist_ok=True)

ovito_file_data_tags = [".ovito", ".png", ".avi"]
structural_analysis_file_data_tags = ["bond_length.txt", "coordination.txt", "potential_energy.txt", "RDF.txt", "ring.txt", "forces.txt"]

structural_analysis_dir = analysis_dir / "Structural Analysis"
structural_analysis_dir.mkdir(parents=True,exist_ok=True)

#structural_analysis_file_tags = [""] <--- This may be worth implementing if more than 2 folders are used in the future

# Regex pattern for reading "unique_key" + "data_tag""
data_file_name = re.compile(
    r'^(?P<element_symbol>[A-Za-z]{1,6})_'          # e.g. C
    r'(?P<potential_name>[^_]+)_'                   # e.g. GAP17
    r'(?P<simulation_type>[^_]+)_'                  # e.g. NVT
    r'(?P<num_atoms>\d+)_'                          # e.g. 64
    r'(?P<density>[\d.eE+-]+)_'                     # e.g. 1.5 or 1.85e+00
    r'(?P<run>\d+)'                                 # e.g. 1 (run number) 
    r'(?:_(?P<data_tag>.+)|(?P<data_tag2>\..+))$'   # e.g. ring.txt or .png (allows underscore after run_number or .avi etc...)   
)

# General function for moving a directory with overwrite function
def directory_move(directory, destination_dir, overwrite=None):
    """Move *directory* (a file or directory) into *destination_dir*.

    Args:
        directory: Path of the item to move.
        destination_dir: Directory to move it into; created if necessary.
        overwrite: Whether an existing item of the same name in
            *destination_dir* may be replaced. Defaults to the module-level
            ``OVERWRITE`` flag.

    Returns:
        One of the status strings "missing" (source does not exist),
        "skipped" (target exists and overwriting is disabled), "failed"
        (the move raised an error) or "success".
    """
    if overwrite is None:
        overwrite = OVERWRITE

    directory = Path(directory)
    destination_dir = Path(destination_dir)

    if not directory.exists():
        print(f"ERROR: {directory} does not exist")
        return "missing"

    # Make Destination Directory
    destination_dir.mkdir(parents=True, exist_ok=True)

    # New Path with existence check
    moved_dir = destination_dir / directory.name
    if moved_dir.exists() and not overwrite:
        return "skipped"

    # Move (with failsafe): a failed move is reported, not raised, so that the
    # caller can carry on with the remaining files
    try:
        shutil.move(str(directory), str(moved_dir))
    except (OSError, shutil.Error) as e:
        print(f"ERROR: Failed to move {directory} --> {moved_dir}: {e}")
        return "failed"

    return "success"

# Robust, general function for sorting all files using the unique_key and directory_move()
def sort_directory(working_directory):
    """Sort analysis output files into folders derived from their unique_key.

    Every file below *working_directory* whose name matches the module-level
    ``data_file_name`` pattern is moved with ``directory_move()`` to

        <base>/Element: ../Potential: ../Type: ../Atoms: ../Density: ..

    where ``<base>`` is ``ovito_dir`` for tags listed in
    ``ovito_file_data_tags`` and ``structural_analysis_dir`` for tags listed in
    ``structural_analysis_file_data_tags``. Tags are compared
    case-insensitively. A summary of the outcome is printed.

    Args:
        working_directory: Directory that is searched recursively.
    """
    working_directory = Path(working_directory)

    # element_symbol --> element_name mapping (extend for other elements)
    element_names = {"C": "Carbon"}

    # The tag read from the file name is lower-cased below, so the reference
    # tags have to be lower-cased too; otherwise "RDF.txt" never matches.
    ovito_tags = {tag.lower() for tag in ovito_file_data_tags}
    structural_tags = {tag.lower() for tag in structural_analysis_file_data_tags}

    sorted_files = 0
    skipped_files = 0
    failed_files = 0
    missing_files = 0
    unrecognized_data_tags = 0

    # Take a snapshot first: files are moved within the tree being searched,
    # and changing a directory while it is being walked is unreliable.
    candidates = [p for p in working_directory.rglob("*") if p.is_file()]

    for directory in candidates:

        m = data_file_name.match(directory.name)
        if not m:
            continue

        # Parse each element of the unique key
        element_name = element_names.get(m.group("element_symbol"))
        if element_name is None:
            print(f"Unrecognized element symbol for {directory}. Skipping file. \nAdd element_symbol --> element_name mapping")
            skipped_files += 1
            continue
        potential_name = m.group("potential_name")
        simulation_type = m.group("simulation_type")
        num_atoms = int(m.group("num_atoms"))
        density = m.group("density")

        data_tag = (m.group("data_tag") or m.group("data_tag2") or "").lower()

        # Pick the base folder from the data tag; unknown tags are not moved
        if data_tag in ovito_tags:
            base_dir = ovito_dir
        elif data_tag in structural_tags:
            base_dir = structural_analysis_dir
        else:
            unrecognized_data_tags += 1
            continue

        # Destination: evaluated using the unique key
        destination_dir = (
            base_dir
            / f"Element: {element_name}"
            / f"Potential: {potential_name}"
            / f"Type: {simulation_type}"
            / f"Atoms: {num_atoms}"
            / f"Density: {density}"
        )

        # Already where it belongs: nothing to move
        if directory.parent == destination_dir:
            skipped_files += 1
            continue

        status = directory_move(directory, destination_dir)

        if status == "success":
            sorted_files += 1
        elif status == "skipped":
            skipped_files += 1
        elif status == "failed":
            failed_files += 1
        elif status == "missing":
            missing_files += 1

    if sorted_files:
        print(f"Sorted {sorted_files} files")
    if skipped_files:
        print(f"Skipped {skipped_files} existing files")
    if missing_files:
        print(f"{missing_files} missing files")
    if failed_files:
        print(f"{failed_files} failed files")
    if unrecognized_data_tags:
        print(f"{unrecognized_data_tags} unrecognized data tags")

    if not sorted_files and not skipped_files and not missing_files and not failed_files:
        print(f"No matching run files found in {working_directory}. Change run file regex if required.")

# -----------------------
# Use carefully - will replace ALL existing files 
OVERWRITE = False

if OVERWRITE:
    confirm = input("Are you sure you want to overwrite existing files? (y/n): ").strip().lower()
    if confirm != "y":
        # Declining has to switch off OVERWRITE itself; resetting any other
        # flag would leave overwriting enabled despite the "n" answer.
        OVERWRITE = False
# -----------------------

# Organise ovito files and structural_analysis files
sort_directory(analysis_dir)


In [3]:
# ------ GRAPHICAL ANALYSIS --------

# Graphical data points are means of all repeat runs with errors given as 1 standard deviation
import pandas as pd
import numpy as np
import re

# Create Graphical Analysis Directories
import re
from pathlib import Path
from collections import defaultdict
from typing import Dict, List

# Output locations for the graphs, rooted in the current working directory
cwd = Path.cwd()
analysis_dir = cwd / "Analysis"
graph_dir = analysis_dir / "Graphical Analysis"

# Create the parent first: mkdir() is called without parents=True
analysis_dir.mkdir(exist_ok=True)
graph_dir.mkdir(exist_ok=True)


# ------ FIGURE FORMATTING ------
import matplotlib.pyplot as plt
plt.style.use('1_column_fig.mplstyle')
# -------------------------------

# ------ IMPORT DATA FILES ------
# 1. Searches recursively through the specified directory
# 2. Creates a dictionary imported_data_files = {unique_data_key: [(density, Path), ...]} covering all densities and runs (sorted by density)
#   a) Where unique_data_key = the unique_key without its density and run number: "<element>_<potential>_<simulation type>_<number of atoms>"
#   b) e.g. C_GAP17_NVT_64

def import_data_files(directory, data_tag):
    """Collect the analysis files carrying *data_tag*, grouped by simulation family.

    *directory* is searched recursively. File names must follow
    ``<element>_<potential>_<type>_<atoms>_<density>_<run>_<data_tag>``; files
    that do not match are reported and ignored, files with a different data
    tag are ignored silently.

    Args:
        directory: Root directory to search (str or Path).
        data_tag: Tag to select, including the suffix, e.g. "ring.txt".

    Returns:
        defaultdict(list) mapping "<element>_<potential>_<type>_<atoms>" to a
        list of ``(density, path)`` tuples. ``density`` is the string found in
        the file name; each list is ordered by its numeric value.
    """
    directory = Path(directory)

    # Regex pattern for reading "unique_key" + "data_tag"
    data_file_name = re.compile(
    r'^(?P<element_symbol>[A-Za-z]{1,6})_'          # e.g. C
    r'(?P<potential_name>[^_]+)_'                   # e.g. GAP17
    r'(?P<simulation_type>[^_]+)_'                  # e.g. NVT
    r'(?P<num_atoms>\d+)_'                          # e.g. 64
    r'(?P<density>[\d.eE+-]+)_'                     # e.g. 1.5 or 1.85e+00
    r'(?P<run>\d+)'                                 # e.g. 1 (run number)
    r'(?:_(?P<data_tag>.+)|(?P<data_tag2>\..+))$'   # e.g. ring.txt or .png (allows underscore after run_number or .avi etc...)
    )

    def density_sort_key(pair):
        # Order numerically: as plain strings "10.0" would sort before "2.0".
        # Densities that cannot be parsed are placed last, in text order.
        density = pair[0]
        try:
            return (0, float(density), density)
        except ValueError:
            return (1, 0.0, density)

    imported_data_files = defaultdict(list)  # Imported files dictionary

    for path in directory.rglob("*"):

        if not path.is_file():  # Filters for files not directories
            continue

        m = data_file_name.match(path.name)  # Enforce data file naming
        if not m:
            print(f"ERROR: Skipped {path}. Invalid data file name")
            continue

        if m.group("data_tag") != data_tag:
            continue

        # Construct unique_data_key (unique_key without density and run number)
        unique_data_key = "_".join(
            m.group(part)
            for part in ("element_symbol", "potential_name", "simulation_type", "num_atoms")
        )

        imported_data_files[unique_data_key].append((m.group("density"), path))

    # Sort by density for each key
    for items in imported_data_files.values():
        items.sort(key=density_sort_key)

    return imported_data_files

# ------ PLOTTING DATA ------
# Plot_type: marker (with error bars), line (with shaded regions) 
def plot(plot_type, data, x_label, y_label, chart_title, save_path):
    """Plot mean values against density and save the figure to *save_path*.

    Args:
        plot_type: "marker" draws connected markers with error bars, "line"
            draws a line with a shaded band of one standard deviation. The
            value must be contained in the module-level ``existing_plot_types``.
        data: Table providing the columns "density", "mean" and "std".
        x_label: Label of the x axis.
        y_label: Label of the y axis.
        chart_title: Title of the figure.
        save_path: Output file; missing parent directories are created.

    Raises:
        ValueError: if *plot_type* is not a known plot type.
    """
    if plot_type not in existing_plot_types:
        raise ValueError(f"Unknown plot_type: {plot_type!r}")

    # Ensure Parent Directory Exists
    save_path = Path(save_path)
    save_path.parent.mkdir(parents=True, exist_ok=True)

    # Read the columns before a figure exists, so a missing column cannot
    # leave an open figure behind
    x = data["density"].values
    mean = data["mean"].values
    std = data["std"].values

    fig, ax = plt.subplots()  # Local figure for each plot
    try:
        if plot_type == "marker":
            ax.errorbar(x, mean, yerr=std,fmt='-o', capthick=0.5, elinewidth=0.5)
        elif plot_type == "line":
            alpha_fill = 0.25
            ax.plot(x, mean, label="Mean")
            ax.fill_between(x, mean - std, mean + std, alpha=alpha_fill)

        # Labels and Titles
        ax.set_xlabel(f"{x_label}")
        ax.set_ylabel(f'{y_label}')
        ax.set_title(f"{chart_title}")

        # Save Plot to Graphical Analysis
        fig.savefig(save_path)
    finally:
        # Close the figure even when plotting or saving failed, to free memory
        plt.close(fig)
# ---------------------------

# File Reader Function
def file_reader(path):
    """Load a comma-separated numeric file with ``np.loadtxt``.

    Returns:
        The loaded array, or None (after printing a message) when the file
        cannot be loaded or holds no data.
    """
    try:
        loaded = np.loadtxt(path, delimiter=',')
    except Exception as err:
        print(f"Skipping {path}; unable to load: {err}")
        return None

    if loaded.size:
        return loaded

    # Failsafe in case there's no data in the file
    print(f"No data found in {path}")
    return None

# Imports data using import_data_files(directory, data_tag)
# Manipulates data using the unique_data_function
# Must be called from within other functions, which supply the unique_data_function
def by_density_data_analysis(directory, data_tag, unique_data_function, save_file_name, chart_title, y_label):
    """Reduce each data file to one scalar and plot it against density.

    For every unique key returned by ``import_data_files(directory, data_tag)``
    the files are loaded with ``file_reader``, reduced with
    ``unique_data_function``, aggregated per density (mean and standard
    deviation over repeats) and plotted under
    ``graph_dir/<key>/<key>_<save_file_name>``.

    Parameters
    ----------
    directory, data_tag
        Forwarded to ``import_data_files``.
    unique_data_function : callable
        Takes the loaded array and returns a scalar, or ``None`` to skip
        that file.
    save_file_name : str
        File name for the chart; only its final path component is used.
    chart_title, y_label : str
        Forwarded to ``plot``.

    Returns
    -------
    None
    """
    imported_data_files = import_data_files(directory, data_tag)
    if not imported_data_files:
        return None

    # Keep only the final component so a caller-supplied path cannot
    # redirect the output outside the per-key graph directory.
    base = Path(save_file_name).name
    x_label = "Density (g/cm³)"

    # Loop over all unique keys
    for unique_data_key, entries in imported_data_files.items():

        results = []  # In the form: (density, unique_data_function_output)

        # Loop over all MD runs (densities and repeats) that match the key
        for density, path in entries:

            data = file_reader(Path(path))
            if data is None:
                continue

            # The float conversion sits inside the try so that a
            # non-scalar result skips this file instead of aborting the run.
            try:
                output = unique_data_function(data)
                value = None if output is None else float(output)
            except Exception as exc:
                print(f"Skipping {path}; analysis failed: {exc}")
                continue
            if value is None:
                continue

            results.append((density, value))

        if not results:
            # Nothing to plot: do not write an empty chart or report success
            print(f"Failed to analyse data for {unique_data_key}")
            continue

        save_file_dir = graph_dir / Path(unique_data_key)
        save_file_dir.mkdir(parents=True, exist_ok=True)

        final_name = f"{unique_data_key}_{base}"
        save_path = save_file_dir / final_name

        df = pd.DataFrame(results, columns=["density", "unique_data_function_output"])
        # std is NaN for a density with a single repeat; plot it as zero spread
        agg_df = (
            df.groupby("density")["unique_data_function_output"]
            .agg(["mean", "std"])
            .fillna(0.0)
            .reset_index()
        )

        plot(set_plot_type, agg_df, x_label, y_label, chart_title, save_path)

        print(f"{final_name} created")

# Coordination analysis 
def coordination_analysis(directory, coordination_number):
    """Plot the proportion of atoms with a given coordination vs. density.

    For every unique key returned by
    ``import_data_files(directory, "coordination.txt")`` the fraction of
    entries equal to ``coordination_number`` is computed per file,
    aggregated per density (mean and standard deviation over repeats) and
    plotted to ``graph_dir/<key>/<env>_coordination_density_plot.png``.

    Parameters
    ----------
    directory
        Forwarded to ``import_data_files``.
    coordination_number : int
        2 (sp), 3 (sp2) or 4 (sp3). Any other value prints an error and
        returns without doing any work.

    Returns
    -------
    None
    """
    data_tag = "coordination.txt"

    # Label the carbon environment for the requested coordination number
    environments = {2: "sp", 3: "sp2", 4: "sp3"}
    env = environments.get(coordination_number)
    if env is None:
        print("ERROR: Coordination number should be between 2 and 4")
        return None
    y_label = f"{env} Carbon Proportion"

    imported_data_files = import_data_files(directory, data_tag)
    if not imported_data_files:
        return None

    x_label = "Density (g/cm³)"
    chart_title = "Coordination vs. Density"

    # Loop over all unique keys
    for unique_data_key, entries in imported_data_files.items():

        results = []  # In the form: (density, coordination_proportion)

        # Loop over all MD runs (densities and repeats) that match the key
        for density, path in entries:

            data = file_reader(path)
            if data is None:
                continue

            # file_reader never returns an empty array, so data.size > 0
            coordination_proportion = np.count_nonzero(data == coordination_number) / data.size
            results.append((density, coordination_proportion))

        if not results:
            print(f"Failed to analyse data for {unique_data_key}")
            continue

        save_file_dir = graph_dir / Path(unique_data_key)
        save_file_dir.mkdir(parents=True, exist_ok=True)
        save_path = save_file_dir / f"{env}_coordination_density_plot.png"

        df = pd.DataFrame(results, columns=["density", "coordination_proportions"])
        # std is NaN for a density with a single repeat; plot it as zero spread
        coordination_df = (
            df.groupby("density")["coordination_proportions"]
            .agg(["mean", "std"])
            .fillna(0.0)
            .reset_index()
        )

        # plot() names its output parameter save_path
        plot(plot_type=set_plot_type, data=coordination_df,
             x_label=x_label, y_label=y_label,
             chart_title=chart_title,
             save_path=save_path)

# Ring Size analysis
def ring_analysis(directory, ring_size):
    """Plot the number of rings of one size against density.

    Delegates loading, aggregation and plotting to
    ``by_density_data_analysis`` using the ``"ring.txt"`` data files.

    Parameters
    ----------
    directory
        Forwarded to ``by_density_data_analysis``.
    ring_size : int
        Ring size to look up in the first column of each file.
    """
    data_tag = "ring.txt"
    chart_title = f"Number of {ring_size} Membered Rings vs. Density"
    save_file_name = f"{ring_size}_ring_density_plot.png"
    y_label = f"{ring_size} Membered Rings"

    def ring_function(data: np.ndarray):
        """Return the count stored for ``ring_size``, or None if absent."""
        # A file with a single row loads as a 1-D array; promote it so the
        # column indexing below works instead of dropping the file.
        table = np.atleast_2d(data)
        if table.shape[1] < 2:
            return None

        # Column 0 is compared against ring_size, column 1 holds the value
        matches = table[table[:, 0] == ring_size, 1]
        if matches.size == 0:
            return None
        return float(matches[0])

    by_density_data_analysis(directory, data_tag, ring_function, save_file_name, chart_title, y_label)
   
# Potential energy analysis 
def potential_energy_analysis(directory):
    """Plot the mean potential energy against density.

    For every unique key returned by
    ``import_data_files(directory, "potential_energy.txt")`` the mean of
    each file is taken, aggregated per density (mean and standard deviation
    over repeats) and plotted to
    ``graph_dir/<key>/mean_potential_energy_density_plot.png``.

    Parameters
    ----------
    directory
        Forwarded to ``import_data_files``.

    Returns
    -------
    None
    """
    data_tag = "potential_energy.txt"

    imported_data_files = import_data_files(directory, data_tag)
    if not imported_data_files:
        return None

    x_label = "Density (g/cm³)"
    chart_title = "Mean Potential Energy vs. Density"

    # Loop over all unique keys
    for unique_data_key, entries in imported_data_files.items():

        results = []  # In the form: (density, mean PE)

        # Loop over all MD runs (densities and repeats) that match the key
        for density, path in entries:

            data = file_reader(path)
            if data is None:
                continue

            results.append((density, float(np.mean(data))))

        if not results:
            print(f"Failed to analyse data for {unique_data_key}")
            continue

        save_file_dir = graph_dir / Path(unique_data_key)
        save_file_dir.mkdir(parents=True, exist_ok=True)
        save_path = save_file_dir / "mean_potential_energy_density_plot.png"

        df = pd.DataFrame(results, columns=["density", "mean_potential_energy"])
        # std is NaN for a density with a single repeat; plot it as zero spread
        PE_df = (
            df.groupby("density")["mean_potential_energy"]
            .agg(["mean", "std"])
            .fillna(0.0)
            .reset_index()
        )

        # plot() names its output parameter save_path
        plot(plot_type=set_plot_type, data=PE_df,
             x_label=x_label, y_label='Mean Potential Energy (eV)',
             chart_title=chart_title,
             save_path=save_path)

# Bond Length analysis
def bond_length_analysis(directory):
    """Plot the frequency-weighted mean bond length against density.

    For every unique key returned by
    ``import_data_files(directory, "bond_length.txt")`` each file is read
    as two columns (bond length, frequency). The weighted mean bond length
    is computed per file, aggregated per density (mean and standard
    deviation over repeats) and plotted to
    ``graph_dir/<key>/mean_bond_length_density_plot.png``.

    Parameters
    ----------
    directory
        Forwarded to ``import_data_files``.

    Returns
    -------
    None
    """
    data_tag = "bond_length.txt"

    imported_data_files = import_data_files(directory, data_tag)
    if not imported_data_files:
        return None

    x_label = "Density (g/cm³)"
    chart_title = "Mean Bond Length vs. Density"

    # Loop over all unique keys
    for unique_data_key, entries in imported_data_files.items():

        results = []  # In the form: (density, mean bond length)

        # Loop over all MD runs (densities and repeats) that match the key
        for density, path in entries:

            data = file_reader(path)
            if data is None:
                continue

            # Skip the file rather than crash on an unexpected layout
            if data.ndim != 2 or data.shape[1] < 2:
                print(f"Skipping {path}; expected two columns (bond length, frequency)")
                continue

            bond_lengths = data[:, 0]
            frequencies = data[:, 1]

            # np.average raises ZeroDivisionError when the weights sum to zero
            try:
                mean_bond_length = np.average(bond_lengths, weights=frequencies)
            except ZeroDivisionError:
                print(f"Skipping {path}; frequencies sum to zero")
                continue
            results.append((density, float(mean_bond_length)))

        if not results:
            print(f"Failed to analyse data for {unique_data_key}")
            continue

        save_file_dir = graph_dir / Path(unique_data_key)
        save_file_dir.mkdir(parents=True, exist_ok=True)
        save_path = save_file_dir / "mean_bond_length_density_plot.png"

        df = pd.DataFrame(results, columns=["density", "mean_bond_length"])
        # std is NaN for a density with a single repeat; plot it as zero spread
        bond_length_df = (
            df.groupby("density")["mean_bond_length"]
            .agg(["mean", "std"])
            .fillna(0.0)
            .reset_index()
        )

        # plot() names its output parameter save_path
        plot(plot_type=set_plot_type, data=bond_length_df,
             x_label=x_label, y_label='Mean Bond Length (Å)',
             chart_title=chart_title,
             save_path=save_path)

# Force Analysis
def force_analysis(directory):
    """Plot the mean force magnitude against density.

    Delegates loading, aggregation and plotting to
    ``by_density_data_analysis`` using the ``"forces.txt"`` data files.

    Parameters
    ----------
    directory
        Forwarded to ``by_density_data_analysis``.
    """
    data_tag = "forces.txt"
    chart_title = "Mean Force Magnitude vs. Density"
    save_file_name = "mean_forces_density_plot.png"
    # NOTE(review): units are not stated in this file; append them to the
    # label if known.
    y_label = "Mean Force Magnitude"

    def force_function(data: np.ndarray):
        """Return the mean Euclidean norm of the rows of ``data``."""
        # A file holding a single vector loads as a 1-D array; promote it
        # so that the row-wise norm is still defined.
        vectors = np.atleast_2d(data)
        magnitude_of_force_vectors = np.linalg.norm(vectors, axis=1)
        return float(np.mean(magnitude_of_force_vectors))

    by_density_data_analysis(directory, data_tag, force_function, save_file_name, chart_title, y_label)

# ------ RDF/DENSITY ANALYSIS ------
# Allows comparison of different densities
# The RDF is given as the mean of all repeats
# Shading represents the standard deviation
# Note: this assumes all repeats share the same r values in their RDFs, which is only true
# for the same number of atoms and bins (set in the structure generator and the earlier structural analysis).
# For other analysis types, the first value of densities/quench_times is used if more than one is given.
def RDF_density_analysis(directory, melt_time, quench_time, densities):
    """Plot the mean radial distribution function for several densities.

    For each requested density the RDF files of all repeats are loaded,
    checked to share the same r grid, and averaged point by point. All
    densities are drawn on one chart, with a band of +/- one standard
    deviation, and saved under ``graph_dir``.

    Parameters
    ----------
    directory
        Forwarded to ``import_data_files``.
    melt_time, quench_time
        Forwarded to ``import_data_files``; also used in the title and
        file name.
    densities : iterable of float
        Densities to compare; those with no usable data are skipped.

    Returns
    -------
    None
    """
    data_name = "RDF"

    per_density_dataframes = {}

    for density in densities:
        # NOTE(review): other callers in this file use
        # import_data_files(directory, data_tag) and iterate a dict of
        # (density, path) pairs; confirm that this six-argument call and
        # the 4-tuple unpacking below still match the current helper.
        datafiles = import_data_files(directory, data_name, "single_value", density, melt_time, quench_time)

        if not datafiles:
            continue

        RDF_results = []  # in the form (density, r_array, g_r_array)

        for file, file_density, _, _ in datafiles:
            # Failsafe in case a file does not load
            try:
                RDF_data = np.loadtxt(file, delimiter=',')
            except Exception as e:
                print(f"Skipping {file}: {e}")
                continue

            # Failsafe in case there is no data in the file
            if RDF_data.size == 0:
                continue

            # Skip the file rather than crash on an unexpected layout
            if RDF_data.ndim != 2 or RDF_data.shape[1] < 2:
                print(f"Skipping {file}: expected at least two columns (r, g(r))")
                continue

            RDF_results.append((file_density, RDF_data[:, 0], RDF_data[:, 1]))

        # Skip if no usable files were found
        if not RDF_results:
            continue

        # Check that the r scales agree across all repeats. Shapes are
        # compared first because np.allclose raises on arrays that cannot
        # be broadcast together (different bin counts).
        ref_r = RDF_results[0][1]
        consistent_r = all(
            r_arr.shape == ref_r.shape and np.allclose(ref_r, r_arr, atol=1e-6)
            for _, r_arr, _ in RDF_results[1:]
        )
        if not consistent_r:
            print(f"ERROR: RDF data not consistent in r scale — skipping density {density}")
            continue  # skip this density entirely

        # One long table of (density, r, g_r) rows, built per repeat
        df = pd.concat(
            [
                pd.DataFrame({"density": d, "r": r_arr, "g_r": g_arr})
                for d, r_arr, g_arr in RDF_results
            ],
            ignore_index=True,
        )
        # std is NaN when there is a single repeat; plot it as zero spread
        RDF_df = df.groupby(["density", "r"])["g_r"].agg(["mean", "std"]).fillna(0.0).reset_index()

        per_density_dataframes[density] = RDF_df

    if not per_density_dataframes:
        print("ERROR: No data found for RDF plot")
        return

    # Plot data; shaded areas are +/- one standard deviation
    def plot_RDFs(rdf_dict, title, alpha_fill=0.25):

        fig, ax = plt.subplots()
        try:
            ax.set_xlabel("r (Å)")
            ax.set_ylabel("g(r)")
            ax.set_title(f"{title}")

            # Sort densities for a consistent legend order
            for density in sorted(rdf_dict.keys()):
                df = rdf_dict[density]
                r = df['r'].values
                mean = df['mean'].values
                std = df['std'].values

                ax.plot(r, mean, label=f"{density:.2f} g/cm³")
                ax.fill_between(r, mean - std, mean + std, alpha=alpha_fill)

            ax.legend(title="Density")

            # Save image; the file name lists every requested density
            dens_str = "-".join(f"{d:.2f}" for d in densities)
            save_file_name = f"RDF_m{melt_time}_c{quench_time}_{dens_str}gcm.png"
            filepath = graph_dir / save_file_name
            filepath.parent.mkdir(parents=True, exist_ok=True)
            # Save through the figure itself, not pyplot's "current figure"
            fig.savefig(filepath)
        finally:
            plt.close(fig)
        print(f"{filepath.name} created")

    plot_RDFs(per_density_dataframes, title=f"Radial Distribution Function\nMelt Time = {melt_time} fs, Quench Time = {quench_time} fs")
#-----------------------------------

# ------ RING SIZE/ DENSITY HISTOGRAM ------
# Allows comparison of different densities
# Error bars represent the standard deviation
def ring_histogram_density_analysis(directory, melt_time, quench_time, densities):
    """Plot mean ring-size frequencies for several densities on one chart.

    For each requested density the ring files of all repeats are loaded and
    the frequency of each ring size is averaged over repeats. Each density
    is drawn as joined markers with error bars of one standard deviation,
    and the chart is saved under ``graph_dir``.

    Parameters
    ----------
    directory
        Forwarded to ``import_data_files``.
    melt_time, quench_time
        Forwarded to ``import_data_files``; also used in the title and
        file name.
    densities : iterable of float
        Densities to compare; those with no usable data are skipped.

    Returns
    -------
    None
    """
    data_name = "ring"

    per_density_dataframes = {}

    for density in densities:
        # NOTE(review): other callers in this file use
        # import_data_files(directory, data_tag) and iterate a dict of
        # (density, path) pairs; confirm that this six-argument call and
        # the 4-tuple unpacking below still match the current helper.
        datafiles = import_data_files(directory, data_name, "single_value", density, melt_time, quench_time)

        if not datafiles:
            # Skip only this density; the others can still be plotted
            continue

        ring_results = []  # in the form (density, ring_size_array, frequency_array)

        for file, file_density, _, _ in datafiles:

            # Failsafe in case a file does not load
            try:
                ring_data = np.loadtxt(file, delimiter=',')
            except Exception as e:
                print(f"Skipping {file}: {e}")
                continue

            # Failsafe in case there is no data in the file
            if ring_data.size == 0:
                continue

            # A file with a single row loads as a 1-D array; promote it so
            # the column indexing below works.
            ring_data = np.atleast_2d(ring_data)
            if ring_data.shape[1] < 2:
                print(f"Skipping {file}: expected at least two columns (ring size, frequency)")
                continue

            ring_results.append((file_density, ring_data[:, 0], ring_data[:, 1]))

        # Skip if no usable files were found
        if not ring_results:
            continue

        # One long table of (density, ring_size, frequency) rows
        df = pd.concat(
            [
                pd.DataFrame({"density": d, "ring_size": sizes, "frequency": freqs})
                for d, sizes, freqs in ring_results
            ],
            ignore_index=True,
        )
        # std is NaN when there is a single repeat; plot it as zero spread
        ring_df = df.groupby(["density", "ring_size"])["frequency"].agg(["mean", "std"]).fillna(0.0).reset_index()

        per_density_dataframes[density] = ring_df

    if not per_density_dataframes:
        print("ERROR: No data found for ring histogram plot")
        return

    # Plot data; error bars are +/- one standard deviation
    def plot_ring_hist(ring_dict, title):

        fig, ax = plt.subplots()
        try:
            ax.set_xlabel("Ring Size")
            ax.set_ylabel("Frequency")
            ax.set_title(f"{title}")

            # Sort densities for a consistent legend order
            for density in sorted(ring_dict.keys()):
                df = ring_dict[density]
                ring_sizes = df['ring_size'].values
                mean = df['mean'].values
                std = df['std'].values

                ax.errorbar(ring_sizes, mean, yerr=std, fmt='-o', capthick=0.5,
                            elinewidth=0.5, label=f"{density} g/cm³")

            ax.legend(title="Density")

            # Save image; the file name lists every requested density
            dens_str = "-".join(f"{d:.2f}" for d in densities)
            save_file_name = f"ring_histogram_m{melt_time}_c{quench_time}_{dens_str}gcm.png"
            filepath = graph_dir / save_file_name
            filepath.parent.mkdir(parents=True, exist_ok=True)
            # Save through the figure itself, not pyplot's "current figure"
            fig.savefig(filepath)
        finally:
            plt.close(fig)
        print(f"{filepath.name} created")

    plot_ring_hist(per_density_dataframes, title=f"Ring Size Histogram\nMelt Time = {melt_time} fs, Quench Time = {quench_time} fs")
# ------------------------------------------

# ------ ANALYSIS PARAMETERS ------
# Plot styles accepted by plot(); any other value raises ValueError there.
existing_plot_types = ["marker", "line"]

# Style used by every analysis function below ("marker" or "line").
set_plot_type = "line"

# Directory handed to the analysis functions as their `directory` argument.
set_analysis_directory = "Analysis/Structural Analysis"

# Arguments for coordination_analysis (2, 3 or 4) and ring_analysis.
set_coordination_number = 4
set_ring_size = 6
# ---------------------------------

# Driver section: uncomment a call to run that analysis when this cell executes.

#coordination_analysis(directory=set_analysis_directory, coordination_number=set_coordination_number)

#bond_length_analysis(directory=set_analysis_directory)

#potential_energy_analysis(directory=set_analysis_directory)

# ring_analysis(directory= set_analysis_directory, ring_size=set_ring_size)

# Currently enabled: mean force magnitude vs. density.
force_analysis(directory=set_analysis_directory)

# #Multi density plots
# ring_histogram_density_analysis('Analysis/Structural Analysis', 
#                                 melt_time=5000, quench_time=10000, densities=(2.0,3.0))

# RDF_density_analysis('Analysis/Structural Analysis', 
#                      melt_time=5000, quench_time=10000, densities=(2.0,3.5,1.5))

C_GAP17_NVT_64_mean_forces_density_plot.png created
C_GAP17_NVT_216_mean_forces_density_plot.png created
