In [None]:
# ------ SIMULATION VALIDATOR ------
# Before running analysis it's important to check that all simulations have run as expected
# ----------------------------------


# # -------- OPTIONAL WAIT FUNCTION ---------
import subprocess
import time
WAIT_TIME = 600  # seconds
USER = "scat9451"

# Poll `qstat -u USER` until it prints nothing; blank output is taken to mean
# that the user has no jobs left, so the rest of the cell can proceed.
while (job_status := subprocess.run(["qstat", "-u", USER], capture_output=True, text=True)).stdout.strip():
    print("Active jobs found - waiting ...")
    time.sleep(WAIT_TIME)

print("No active jobs found - proceeding ...")
# # ---------------------------------------------------------------------

# Number of files each NVT directory is expected to hold; simulation_validator()
# reports any NVT directory whose file count differs from this value.
expected_number_of_dump_files_per_sim = 96             # =(total number of timesteps + 1)/100

# This expects file structure:
# <unique_key>/
#         └── NVT/
#               ├── dump_custom.C.00000.dat
#               ├── dump_custom.C.00001.dat
from pathlib import Path

root_directory = Path("/u/vld/scat9451/main_project/")


def simulation_validator(directory, expected_files=None):
    """Check every simulation under ``directory`` and print a summary.

    Two scans are performed:

    1. Every directory named ``NVT`` is inspected. A simulation is flagged when
       the number of files in it differs from ``expected_files`` or when it
       holds files whose names do not start with ``dump_custom``.
    2. Every ``*.log`` file is read line by line. A log containing ``ERROR``
       marks its parent directory as erroneous (and invalid); otherwise a log
       containing ``WARNING`` marks it as displaying warnings.

    Each simulation is counted at most once in "Total invalid simulations",
    however many of the checks it fails.

    Args:
        directory: Root directory that is searched recursively.
        expected_files: Expected number of files per ``NVT`` directory.
            Defaults to the module-level
            ``expected_number_of_dump_files_per_sim``.

    Returns:
        None. All results are printed.
    """
    directory = Path(directory)
    if expected_files is None:
        expected_files = expected_number_of_dump_files_per_sim

    errors = set()
    warnings = set()
    # Directories of simulations that failed at least one check. A set of
    # paths keeps one simulation from being counted once per failed check.
    invalid_sims = set()

    # Check number of files and file types
    for nvt_dir in directory.rglob("NVT"):
        if not nvt_dir.is_dir():
            continue

        simulation_dir = nvt_dir.parent
        unique_key = simulation_dir.name

        all_files = [f for f in nvt_dir.iterdir() if f.is_file()]
        total = len(all_files)

        if total != expected_files:
            print(f"{total} files found in {unique_key}")
            invalid_sims.add(simulation_dir)

        unrecognized_files = [f for f in all_files if not f.name.startswith("dump_custom")]
        if unrecognized_files:
            print(f"Unrecognized files found in {unique_key}")
            print(unrecognized_files)
            invalid_sims.add(simulation_dir)

    # Check for ERROR and WARNING messages in .log files
    for log_file in directory.rglob("*.log"):
        if not log_file.is_file():
            # A directory whose name happens to end in ".log" cannot be read.
            continue

        has_error = False
        has_warning = False
        unique_key = log_file.parent.name

        with log_file.open("r", encoding="utf-8", errors="ignore") as f:
            for line in f:
                if "ERROR" in line:
                    has_error = True
                    break  # stop reading; error takes priority
                if "WARNING" in line:
                    has_warning = True

        if has_error:
            errors.add(unique_key)
            invalid_sims.add(log_file.parent)
        elif has_warning:
            warnings.add(unique_key)

    # --- Print results nicely ---
    rule = "‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ"
    print("\nüîç Log scan summary")
    print(rule)

    if errors:
        print(f"‚ùå Simulations displaying errors ({len(errors)}):")
        for e in sorted(errors):
            print(f"   - {e}")
    else:
        print("‚úÖ No simulations displaying errors.")

    if warnings:
        print(f"\n‚ö†Ô∏è  Simulations displaying warnings ({len(warnings)}):")
        for w in sorted(warnings):
            print(f"   - {w}")
    else:
        print("\n‚úÖ No simulations displaying warnings.")

    print(rule)
    print(f"Total invalid simulations: {len(invalid_sims)}")

# Validate every simulation found beneath the LAMMPS_simulations directory.
simulation_validator("/u/vld/scat9451/main_project/LAMMPS_simulations")

In [None]:
#--------- ANALYSIS SCRIPT---------------

# This script searches the LAMMPS_simulations directory and 
# 1. Identifies simulations that have not yet been analysed
# 2. Creates ovito files and renderings
# 3. Runs the following analyses as functions of density :
#   a) % of sp, sp2, sp3 environments 
#   c) Histogram of ring sizes for a given density
#   d) Frequency plots for n-mem rings
#   e) Radial Distribution Functions
#   f) Potential Energy
#   g) Bond Length

import re
import numpy as np
import ovito
from ovito.io import import_file
from ovito.modifiers import CreateBondsModifier, FindRingsModifier, CoordinationAnalysisModifier, ColorCodingModifier, BondAnalysisModifier
from ovito.vis import Viewport, TachyonRenderer, ColorLegendOverlay, BondsVis
from ovito.qt_compat import QtCore

# ------ MAKE NEW DIRECTORIES ------
from pathlib import Path
import re
from collections import defaultdict

cwd = Path.cwd()

# Output locations, all rooted in the current working directory.
analysis_dir = cwd.joinpath("Analysis")
ovito_dir = analysis_dir.joinpath("Ovito")
structural_analysis_dir = analysis_dir.joinpath("Structural Analysis")

# Create the parent first; the sub-folders are made without parents=True.
analysis_dir.mkdir(exist_ok=True)
ovito_dir.mkdir(exist_ok=True)
structural_analysis_dir.mkdir(exist_ok=True)
# ----------------------------------

# ------  IMPORT SIMULATION DATA ------
# 1. Searches recursively through the specified directory
# 2. Creates a dictionary sorted_imported_simulation_files = {unique_key: [sorted list of dump_file path objects]} 
# 3. This can be loaded like so: 
#   a) first item: unique_key, dump_file = next(iter(imported_simulation_files.items()))
#   b) loop through all items: for unique_key, dump_files in imported_simulation_files.items():

# NOTE: The unique_key is generated from the grandparent of the dumpfiles
# This function expects the following file structure, "dump_custom.C.00000" regex and unique_key regex:
#
# <unique_key>/
#         └── NVT/
#               ├── dump_custom.C.00000.dat
#               ├── dump_custom.C.00001.dat
root_directory = Path("/u/vld/scat9451/main_project/")


def import_simulation_data(directory):
    """Collect LAMMPS dump files under ``directory``, grouped by simulation.

    Only files named ``dump_custom.C.<digits>.dat`` are considered. Each one
    must sit in a directory called ``NVT`` whose own parent directory name
    matches the unique-key pattern; the name of that grandparent directory
    becomes the dictionary key. Files that break either rule are reported
    with an ``ERROR`` message and skipped.

    Args:
        directory: Root directory that is searched recursively.

    Returns:
        dict mapping ``unique_key`` to the list of dump-file ``Path`` objects
        for that simulation, ordered by the integer embedded in the file name.
    """
    dump_file_name = re.compile(r"^dump_custom\.C\.(\d+)\.dat$")  # Dump file regex
    unique_key_pattern = re.compile(r"^[A-Za-z]+_[A-Za-z0-9]+_[A-Za-z]+_\d+_\d+(?:\.\d+)?_\d+$")  # Unique key regex

    directory = Path(directory)

    imported_simulation_files = defaultdict(list)  # unique_key -> [(index, path), ...]
    imported_files_counter = 0
    skipped_files_counter = 0

    for path in directory.rglob("*"):

        if not path.is_file():  # Filters for files not directories
            continue

        m = dump_file_name.match(path.name)  # Enforce dump_file file naming
        if not m:
            continue

        parent = path.parent
        if parent.name != "NVT":  # Enforce NVT file naming
            skipped_files_counter += 1
            print(f"ERROR: Parent directory for {path}, {parent} is not equal to NVT")
            continue

        grandparent = parent.parent
        # This guard must run before the pattern check: the pattern can never
        # match an empty name, so testing it first would hide this message.
        if not grandparent.name:  # Protect against missing grandparent
            skipped_files_counter += 1
            print(f"ERROR: No grandparent directory for {path}")
            continue
        if not unique_key_pattern.match(grandparent.name):  # Enforce unique_key file naming
            skipped_files_counter += 1
            print(f"ERROR: Invalid unique_key name format '{grandparent.name}'")
            continue

        imported_simulation_files[grandparent.name].append((int(m.group(1)), path))
        imported_files_counter += 1

    # Sort each list by numeric index and drop the index from the final structure
    sorted_imported_simulation_files = {
        key: [p for _, p in sorted(items, key=lambda pair: pair[0])]
        for key, items in imported_simulation_files.items()
    }

    if imported_files_counter:
        print(f"Imported {imported_files_counter} dump files")
    if skipped_files_counter:
        print(f"Skipped {skipped_files_counter} dump files due to errors")

    return sorted_imported_simulation_files

# Index every dump file under "LAMMPS_simulations" (a relative path, so it is
# resolved against the current working directory).
imported_simulation_files = import_simulation_data("LAMMPS_simulations")
# The dictionary has one entry per simulation, so its length is the number of
# simulations found rather than the number of dump files.
print(f"Imported {len(imported_simulation_files)} LAMMPS simulation files")

# Sets up an empty pipeline for each successive function to use
def empty_ovito_pipeline(imported_simulation_files):
    """Reset the OVITO scene and return a pipeline without modifiers.

    Every pipeline currently in ``ovito.scene`` is removed first. A new
    pipeline is then created from the dump files of the first entry of
    ``imported_simulation_files``.

    Args:
        imported_simulation_files: Mapping ``unique_key -> list of dump files``.

    Returns:
        The pipeline returned by ``import_file``.

    Raises:
        ValueError: If the mapping is empty or its first entry has no files.
    """
    # Iterate over a copy because removing a pipeline changes the scene list.
    stale_pipelines = list(ovito.scene.pipelines)
    for stale in stale_pipelines:
        stale.remove_from_scene()

    if not imported_simulation_files:
        raise ValueError("No datafiles provided to empty_ovito_pipeline()")

    # Only the first simulation seeds the pipeline.
    first_key = next(iter(imported_simulation_files))
    first_dump_files = imported_simulation_files[first_key]

    if first_dump_files:
        return import_file(first_dump_files)

    raise ValueError(f"No dump files found for simulation '{first_key}'")
# Shared pipeline that is passed to every analysis function below.
pipeline = empty_ovito_pipeline(imported_simulation_files)

# Data visualisation in Ovito
def _is_first_run(unique_key):
    """Return True when the trailing ``_<run>`` token of ``unique_key`` is the number 1."""
    run_token = unique_key.rsplit("_", 1)[-1]
    return run_token.isdigit() and int(run_token) == 1


def ovito_analysis(data_dict, pipeline):
    """Save an OVITO session, a still image and an animation per first-run simulation.

    For every key in ``data_dict`` whose run number is 1, up to three files
    named ``<unique_key>.ovito``, ``<unique_key>.png`` and ``<unique_key>.avi``
    are written into ``analysis_dir``. Each output is checked independently:
    a file with the same name anywhere below ``analysis_dir`` causes only that
    output to be skipped, so a missing image or video is still produced when
    the ``.ovito`` file already exists.

    Bonds, coordination analysis and colour coding are added to ``pipeline``
    for the duration of the call and removed again afterwards, even when
    rendering raises.

    Args:
        data_dict: Mapping ``unique_key -> list of dump files``.
        pipeline: OVITO pipeline to load each simulation into.

    Raises:
        ValueError: If ``data_dict`` is empty.
    """
    if not data_dict:
        raise ValueError("No datafiles provided to ovito_analysis()")

    # ------- ANALYSIS OF IMPORTED FILES ------------
    # BUG: Image and video renderers error with:
    # "RuntimeError: Visual element 'Rings' reported an error:Failed to build non-periodic representation of periodic surface mesh. Periodic domain might be too small." if ring mod is included.

    # Bond Modifier and Visuals
    bond_modifier = CreateBondsModifier(cutoff=1.85)
    bond_modifier.vis.width = 0.15
    bond_modifier.vis.coloring_mode = BondsVis.ColoringMode.Uniform
    bond_modifier.vis.color = (0.5, 0.5, 0.5)

    # Coordination Modifier and Colour Coding
    coordination_modifier = CoordinationAnalysisModifier(cutoff=1.85)
    colour_coding_mod = ColorCodingModifier(property="Coordination",start_value=1.0,end_value=4.0,gradient=ColorCodingModifier.Viridis(),discretize_color_map=True)

    temporary_modifiers = (bond_modifier, coordination_modifier, colour_coding_mod)
    for modifier in temporary_modifiers:
        pipeline.modifiers.append(modifier)

    # Skipped/Created file counters, keyed by output suffix
    suffixes = (".ovito", ".png", ".avi")
    skipped = dict.fromkeys(suffixes, 0)
    created = dict.fromkeys(suffixes, 0)

    try:
        # Add to Scene
        pipeline.add_to_scene()

        # Viewing settings
        vp = Viewport()
        vp.type = Viewport.Type.Perspective

        # Coordination Legend
        legend = ColorLegendOverlay(
            title = "Coordination",
            modifier = colour_coding_mod,
            alignment = QtCore.Qt.AlignmentFlag.AlignHCenter | QtCore.Qt.AlignmentFlag.AlignBottom,
            orientation = QtCore.Qt.Orientation.Horizontal,
            font_size = 0.1,
            format_string = '%.0f'
            )
        vp.overlays.append(legend)

        for unique_key, dump_files in data_dict.items():

            # Note: renders are only produced for the first repeat
            if not _is_first_run(unique_key):
                continue

            # Decide per output whether it still has to be produced.
            targets = {}
            for suffix in suffixes:
                save_file = analysis_dir / f"{unique_key}{suffix}"
                if any(analysis_dir.rglob(save_file.name)):
                    skipped[suffix] += 1
                else:
                    targets[suffix] = save_file

            if not targets:
                continue

            # Load the trajectory once for all outputs of this simulation.
            pipeline.source.load(dump_files)

            # Set particle scaling (datafile specific)
            n_frames = pipeline.source.num_frames
            final_frame = max(0, n_frames - 1)
            data = pipeline.compute(frame = final_frame)
            data.particles.vis.scaling = 0.3

            # Set Zoom
            vp.zoom_all()

            tachyon = TachyonRenderer(shadows=False, direct_light_intensity=1.1)

            if ".ovito" in targets:
                ovito.scene.save(targets[".ovito"])
                created[".ovito"] += 1

            if ".png" in targets:
                vp.render_image(size=(1920,1080),
                                filename=str(targets[".png"]),
                                background=(1,1,1),
                                frame=final_frame,
                                renderer=tachyon)
                created[".png"] += 1

            if ".avi" in targets:
                vp.render_anim(size=(1920,1080),
                               filename=str(targets[".avi"]),
                               fps=10,
                               renderer=tachyon)
                created[".avi"] += 1
    finally:
        # Remove the modifiers added above so later analyses that share this
        # pipeline start from a clean state, even after a failed render.
        for _ in temporary_modifiers:
            pipeline.modifiers.pop()

    # Print Skipped/Created files
    for suffix in suffixes:
        if skipped[suffix]:
            print(f"Skipped {skipped[suffix]} existing {suffix} files")
    for suffix in suffixes:
        if created[suffix]:
            print(f"Created {created[suffix]} {suffix} files")


# ------ DATA GENERATION FUNCTIONS ------
# file_analysis(): 
#   1. uses imported_simulation_files from import_simulation_data()
#   2. uses the pipeline from empty_ovito_pipeline(): no modifiers by default
#   3. checks if files already exist in "Structural Analysis"
#   4. loads each file in datafiles into the existing pipeline
#   5. computes a specified data object for the given pipeline on each file and saves to a file name given by the "unique_key" + "data_tag"
#   NOTE:
#       a) requires "data_tag": e.g. "bond_length_data.txt" or "RDF_data.txt" (include file suffix, e.g. ".txt")
#       b) "data_function" refers to the ovito function that return the desired data object 
#               e.g. "data.particles['Coordination']" or "data.tables['coordination-rdf'].xy()" or "data.particles["c_pea"]" 
#       c) requires use of the "lambda data:" syntax for creating a throwaway function
#               e.g. When calling this func, use "file_analysis(data_dict, pipeline, "ring.txt", lambda data: data.tables["ring-size-histogram"].xy())"

def file_analysis(data_dict, pipeline, data_tag, data_function):
    """Compute one quantity per simulation and save it as a text file.

    For each ``unique_key`` the output is ``analysis_dir/<unique_key>_<data_tag>``.
    A simulation is skipped when a file with that name already exists anywhere
    below ``analysis_dir`` and ``REPLACE_OLD_FILES`` is false. Otherwise its
    dump files are loaded into ``pipeline``, the last frame is computed, and
    ``data_function`` is applied to the result before it is written with
    ``np.savetxt`` (comma separated, six decimal places).

    Args:
        data_dict: Mapping ``unique_key -> list of dump files``.
        pipeline: OVITO pipeline, already carrying any modifiers required.
        data_tag: File-name suffix including the extension, e.g. ``"RDF.txt"``.
        data_function: Callable that extracts the data to save from the
            computed data collection.

    Raises:
        ValueError: If ``data_dict`` is empty.
    """
    if not data_dict:
        raise ValueError("No datafiles provided")

    tally = {"skipped": 0, "created": 0}

    # ----- STRUCTURAL ANALYSIS -----
    for unique_key, dump_files in data_dict.items():
        output_path = analysis_dir.joinpath(f"{unique_key}_{data_tag}")

        # The search is recursive because earlier outputs may have been moved
        # into sub-folders of analysis_dir.
        already_present = any(analysis_dir.rglob(output_path.name))

        if already_present and not REPLACE_OLD_FILES:
            tally["skipped"] += 1
        else:
            # Load the simulation and evaluate the pipeline on its last frame.
            pipeline.source.load(dump_files)
            last_frame = max(0, pipeline.source.num_frames - 1)
            snapshot = pipeline.compute(frame=last_frame)

            np.savetxt(output_path, data_function(snapshot), delimiter=",", fmt="%.6f")
            tally["created"] += 1

    # Print Skipped/Created Files
    if tally["skipped"]:
        print(f"Skipped {tally['skipped']} existing {data_tag} files")
    if tally["created"]:
        print(f"Created {tally['created']} {data_tag} files")

def list_attributes(pipeline):
    """Print the name of every per-particle property the pipeline produces.

    Args:
        pipeline: Object whose ``compute()`` result exposes ``particles.keys()``.
    """
    snapshot = pipeline.compute()
    print("Per Particle Attributes:")
    attribute_names = [str(name) for name in snapshot.particles.keys()]
    if attribute_names:
        print("\n".join(attribute_names))

def ring_analysis(data_dict, pipeline, min_ring_size, max_ring_size, bond_length):
    """Save the ring-size histogram of every simulation to ``<unique_key>_ring.txt``.

    Bonds are created with cutoff ``bond_length`` and rings are searched
    between ``min_ring_size`` and ``max_ring_size``. Both modifiers are
    removed from ``pipeline`` again afterwards, even if the analysis raises.

    Args:
        data_dict: Mapping ``unique_key -> list of dump files``.
        pipeline: Shared OVITO pipeline.
        min_ring_size: Passed to ``FindRingsModifier`` as ``minimum_ring_size``.
        max_ring_size: Passed to ``FindRingsModifier`` as ``maximum_ring_size``.
        bond_length: Cutoff passed to ``CreateBondsModifier``.
    """
    # Build both modifiers before touching the pipeline so that a constructor
    # failure cannot leave a half-configured pipeline behind.
    bond_modifier = CreateBondsModifier(cutoff=bond_length)
    ring_mod = FindRingsModifier(minimum_ring_size=min_ring_size, maximum_ring_size=max_ring_size)

    pipeline.modifiers.append(bond_modifier)
    pipeline.modifiers.append(ring_mod)
    try:
        # Analysis
        file_analysis(data_dict, pipeline, "ring.txt", lambda data: data.tables["ring-size-histogram"].xy())
    finally:
        # Remove Modifiers; the pipeline is shared with the other analyses.
        pipeline.modifiers.pop()
        pipeline.modifiers.pop()

def coordination_analysis(data_dict, pipeline, coordination_cutoff):
    """Save the per-particle coordination of every simulation to ``<unique_key>_coordination.txt``.

    The coordination modifier is removed from ``pipeline`` again afterwards,
    even if the analysis raises.

    Args:
        data_dict: Mapping ``unique_key -> list of dump files``.
        pipeline: Shared OVITO pipeline.
        coordination_cutoff: Cutoff passed to ``CoordinationAnalysisModifier``.
    """
    # Coordination Analysis Modifier
    coord_mod = CoordinationAnalysisModifier(cutoff=coordination_cutoff)
    pipeline.modifiers.append(coord_mod)
    try:
        # Analysis
        file_analysis(data_dict, pipeline, "coordination.txt", lambda data: data.particles['Coordination'])
    finally:
        # Remove Modifier; the pipeline is shared with the other analyses.
        pipeline.modifiers.pop()
    
def energy_analysis(data_dict, pipeline):
    """Save the per-particle ``c_pea`` property of every simulation.

    The output file is ``<unique_key>_potential_energy.txt``. No modifier is
    needed because the property is read straight from the dump files.
    NOTE(review): ``c_pea`` is presumably a per-atom potential-energy compute
    written by LAMMPS - confirm against the input script.

    Args:
        data_dict: Mapping ``unique_key -> list of dump files``.
        pipeline: Shared OVITO pipeline.
    """
    def select_potential_energy(data):
        return data.particles["c_pea"]

    file_analysis(data_dict, pipeline, "potential_energy.txt", select_potential_energy)

def RDF_analysis(data_dict, pipeline, RDF_cutoff, bins):
    """Save the radial distribution function of every simulation to ``<unique_key>_RDF.txt``.

    The RDF is read from the ``coordination-rdf`` table produced by a
    ``CoordinationAnalysisModifier``. The modifier is removed from
    ``pipeline`` again afterwards, even if the analysis raises.

    Args:
        data_dict: Mapping ``unique_key -> list of dump files``.
        pipeline: Shared OVITO pipeline.
        RDF_cutoff: Cutoff passed to ``CoordinationAnalysisModifier``.
        bins: Passed to the modifier as ``number_of_bins``.
    """
    # Coordination Analysis Modifier for RDF
    RDF_coord_mod = CoordinationAnalysisModifier(cutoff=RDF_cutoff, number_of_bins=bins)
    pipeline.modifiers.append(RDF_coord_mod)
    try:
        # Analysis
        file_analysis(data_dict, pipeline, "RDF.txt", lambda data: data.tables['coordination-rdf'].xy())
    finally:
        # Remove Modifier; the pipeline is shared with the other analyses.
        pipeline.modifiers.pop()

def bond_length_analysis(data_dict, pipeline, bins, bond_length, bond_length_analysis_cutoff):
    """Save the bond-length distribution of every simulation to ``<unique_key>_bond_length.txt``.

    Bonds are created with cutoff ``bond_length`` and the distribution is read
    from the ``bond-length-distr`` table of a ``BondAnalysisModifier``. Both
    modifiers are removed from ``pipeline`` again afterwards, even if the
    analysis raises.

    Args:
        data_dict: Mapping ``unique_key -> list of dump files``.
        pipeline: Shared OVITO pipeline.
        bins: Passed to ``BondAnalysisModifier`` as ``bins``.
        bond_length: Cutoff passed to ``CreateBondsModifier``.
        bond_length_analysis_cutoff: Passed to ``BondAnalysisModifier`` as
            ``length_cutoff``.
    """
    # Build both modifiers before touching the pipeline so that a constructor
    # failure cannot leave a half-configured pipeline behind.
    bond_modifier = CreateBondsModifier(cutoff=bond_length)
    bond_analysis_mod = BondAnalysisModifier(bins = bins, length_cutoff=bond_length_analysis_cutoff)

    pipeline.modifiers.append(bond_modifier)
    pipeline.modifiers.append(bond_analysis_mod)
    try:
        # Analysis
        file_analysis(data_dict, pipeline, "bond_length.txt", lambda data: data.tables["bond-length-distr"].xy())
    finally:
        # Remove Modifiers; the pipeline is shared with the other analyses.
        pipeline.modifiers.pop()
        pipeline.modifiers.pop()
 
def force_analysis(data_dict, pipeline):
    """Save the per-particle ``Force`` property of every simulation.

    The output file is ``<unique_key>_forces.txt``. No modifier is needed
    because the property is read straight from the dump files.

    Args:
        data_dict: Mapping ``unique_key -> list of dump files``.
        pipeline: Shared OVITO pipeline.
    """
    def select_forces(data):
        return data.particles["Force"]

    file_analysis(data_dict, pipeline, "forces.txt", select_forces)

# ----- POSSIBLE ANALYSIS ------
# 2. Bond Angle
# 3. Conditional analysis (i.e. for sp/sp2/sp3 individually)
# 4. Young's Modulus
# 5. Coordination for each frame in a given sim plotted against simulation time
# ------------------------------

# -----------------------
# Use carefully - will regenerate ALL files (apart from renders)
REPLACE_OLD_FILES = False

# Regeneration is destructive, so it only stays enabled after an explicit "y".
if REPLACE_OLD_FILES and input("Are you sure you want to replace old files? (y/n): ").strip().lower() != "y":
    REPLACE_OLD_FILES = False
# -----------------------

#list_attributes(pipeline)
force_analysis(imported_simulation_files, pipeline)
bond_length_analysis(
    imported_simulation_files,
    pipeline,
    bins=1000,
    bond_length=1.85,
    bond_length_analysis_cutoff=2.0,
)
RDF_analysis(
    imported_simulation_files,
    pipeline,
    RDF_cutoff=6.0,
    bins=200,
)
ring_analysis(
    imported_simulation_files,
    pipeline,
    min_ring_size=3,
    max_ring_size=24,
    bond_length=1.85,
)
coordination_analysis(
    imported_simulation_files,
    pipeline,
    coordination_cutoff=1.85,
)
energy_analysis(imported_simulation_files, pipeline)
# ovito_analysis(imported_simulation_files, pipeline)

In [None]:
# ----- FILE ORGANISER -----
import re, shutil
from pathlib import Path

# Assign Directories
cwd = Path.cwd()

# Destination folders used by the organiser.
analysis_dir = cwd / "Analysis"
ovito_dir = analysis_dir / "Ovito"
structural_analysis_dir = analysis_dir / "Structural Analysis"

analysis_dir.mkdir(parents=True, exist_ok=True)
ovito_dir.mkdir(parents=True, exist_ok=True)
structural_analysis_dir.mkdir(parents=True, exist_ok=True)

# data_tag values that decide which destination folder a file is sorted into.
ovito_file_data_tags = [".ovito", ".png", ".avi"]
structural_analysis_file_data_tags = ["bond_length.txt", "coordination.txt", "potential_energy.txt", "RDF.txt", "ring.txt", "forces.txt"]

#structural_analysis_file_tags = [""] <--- This may be worth implementing if more than 2 folders are used in the future

# Regex pattern for reading "unique_key" + "data_tag"
data_file_name = re.compile(
    r'^(?P<element_symbol>[A-Za-z]{1,6})_'          # e.g. C
    r'(?P<potential_name>[^_]+)_'                   # e.g. GAP17
    r'(?P<simulation_type>[^_]+)_'                  # e.g. NVT
    r'(?P<num_atoms>\d+)_'                          # e.g. 64
    r'(?P<density>[\d.eE+-]+)_'                     # e.g. 1.5 or 1.85e+00
    r'(?P<run>\d+)'                                 # e.g. 1 (run number) 
    r'(?:_(?P<data_tag>.+)|(?P<data_tag2>\..+))$'   # e.g. ring.txt or .png (allows underscore after run_number or .avi etc...)   
)

# General function for moving a file with overwrite function
def directory_move(file, destination_dir, overwrite=None):
    """Move ``file`` into ``destination_dir`` and report what happened.

    Args:
        file: Path of the file to move.
        destination_dir: Directory to move it into; created if necessary.
        overwrite: Whether an existing file of the same name in
            ``destination_dir`` may be replaced. Defaults to the module-level
            ``OVERWRITE`` flag.

    Returns:
        One of the strings ``"missing"`` (``file`` does not exist),
        ``"success"`` (moved), ``"skipped"`` (a file of that name is already
        in the destination and was left alone) or ``"overwritten"`` (the
        existing destination file was replaced).
    """
    file = Path(file)
    if not file.exists():
        print(f"ERROR: {file} does not exist")
        return "missing"

    # Make Destination Directory
    destination_dir = Path(destination_dir)
    destination_dir.mkdir(parents=True, exist_ok=True)

    moved_file = destination_dir / file.name
    if not moved_file.exists():
        shutil.move(str(file), str(moved_file))
        return "success"

    # The file may already be in its destination (e.g. when an organised tree
    # is scanned again). Unlinking the target would then delete the source
    # itself, so leave it untouched whatever the overwrite setting is.
    if moved_file.samefile(file):
        return "skipped"

    if overwrite is None:
        overwrite = OVERWRITE
    if not overwrite:
        return "skipped"

    moved_file.unlink()
    shutil.move(str(file), str(moved_file))
    return "overwritten"

# Robust, general function for sorting all files using the unique_key and directory_move()
def sort_directory(working_directory):
    """Sort analysis output files into folders derived from their names.

    Every file below ``working_directory`` whose name matches
    ``data_file_name`` is moved with ``directory_move()`` into::

        <base>/Element: <name>/Potential: <p>/Type: <t>/Atoms: <n>/Density: <d>

    where ``<base>`` is ``ovito_dir`` for tags in ``ovito_file_data_tags`` and
    ``structural_analysis_dir`` for tags in
    ``structural_analysis_file_data_tags``. Files with any other tag, or with
    an element symbol that has no name mapping, are reported and left alone.
    A summary of the outcome is printed at the end.

    Args:
        working_directory: Directory that is searched recursively.
    """
    working_directory = Path(working_directory)

    # element_symbol --> element_name mapping
    element_names = {"C": "Carbon"}

    sorted_files = 0
    skipped_files = 0
    overwritten_files = 0
    missing_files = 0
    unrecognized_data_tags = 0

    # Take a snapshot of the tree before moving anything: files are moved into
    # sub-folders of the directory being scanned, and changing a tree while
    # rglob() is still walking it gives unpredictable results.
    candidates = [path for path in working_directory.rglob("*") if path.is_file()]

    for candidate in candidates:

        m = data_file_name.match(candidate.name)
        if not m:
            continue

        # Parse each element of the unique key
        element_name = element_names.get(m.group("element_symbol"))
        if element_name is None:
            print(f"Unrecognized element symbol for {candidate}. Skipping file. \nAdd element_symbol --> element_name mapping")
            skipped_files += 1
            continue
        potential_name = m.group("potential_name")
        simulation_type = m.group("simulation_type")
        num_atoms = int(m.group("num_atoms"))
        density = m.group("density")

        data_tag = m.group("data_tag") or m.group("data_tag2") or ""

        # Choose the base folder from the data tag; unknown tags are not moved
        if data_tag in ovito_file_data_tags:
            base_dir = ovito_dir
        elif data_tag in structural_analysis_file_data_tags:
            base_dir = structural_analysis_dir
        else:
            unrecognized_data_tags += 1
            print(f"Unrecognized data_tag: {data_tag}")
            continue

        # Destination: evaluated using the unique key
        destination_dir = (
            base_dir
            / f"Element: {element_name}"
            / f"Potential: {potential_name}"
            / f"Type: {simulation_type}"
            / f"Atoms: {num_atoms}"
            / f"Density: {density}"
        )

        status = directory_move(candidate, destination_dir)

        if status == "success":
            sorted_files += 1
        elif status == "skipped":
            skipped_files += 1
        elif status == "overwritten":
            overwritten_files += 1
        elif status == "missing":
            missing_files += 1

    if sorted_files:
        print(f"Sorted {sorted_files} files")
    if skipped_files:
        print(f"Skipped {skipped_files} existing files")
    if missing_files:
        print(f"{missing_files} missing files")
    if overwritten_files:
        print(f"Overwrote {overwritten_files} existing files")
    if unrecognized_data_tags:
        print(f"{unrecognized_data_tags} unrecognized data tags")

    if not (sorted_files or skipped_files or missing_files or overwritten_files):
        print(f"No matching run files found in {working_directory}. Change run file regex if required.")

# -----------------------
# Use carefully - will replace ALL existing files 
OVERWRITE = False

if OVERWRITE:
    confirm = input("Are you sure you want to overwrite existing files? (y/n): ").strip().lower()
    if confirm != "y":
        # Declining has to switch off OVERWRITE itself; resetting any other
        # flag would leave overwriting enabled despite the answer.
        OVERWRITE = False
# -----------------------

# Organise ovito files and structural_analysis files
sort_directory(analysis_dir)


In [None]:
# ------ GRAPHICAL ANALYSIS --------

# Graphical data points are means of all repeat runs with errors given as 1 standard deviation
import pandas as pd
import numpy as np
import re
import shutil

# Create Graphical Analysis Directories
import re
from pathlib import Path
from collections import defaultdict

cwd = Path.cwd()

# Output locations for the graphs, all rooted in the current working directory.
analysis_dir = cwd.joinpath("Analysis")
graph_dir = analysis_dir.joinpath("Graphical Analysis")
potential_comparison_dir = graph_dir.joinpath("Potential Comparison Plots")

# Create parents before children; only the last call uses parents=True.
analysis_dir.mkdir(exist_ok=True)
graph_dir.mkdir(exist_ok=True)
potential_comparison_dir.mkdir(parents=True, exist_ok=True)


# ------ FIGURE FORMATTING ------
import matplotlib.pyplot as plt
# Apply the '1_column_fig.mplstyle' style sheet to the figures created below.
# NOTE(review): presumably a style file kept next to the notebook - confirm
# that it is available wherever this cell is run.
plt.style.use('1_column_fig.mplstyle')
# -------------------------------

# ------ IMPORT DATA FILES ------
# 1. Searches recursively through the specified directory
# 2. Creates a dictionary:
#    imported_data_files = {unique_data_key : [list of (density, Path), ...]} 
#    only includes files with the specified "data_tag" 
#    Where unique_data_key = "unique_key (without density or repeat) e.g. C_GAP17_NVT_64
#    Therefore only imports files with the corresponding data_tag
# 3. If specify_density = True, it will only import files with density in key_analysis_densities
key_analysis_densities = [1.5, 2.0, 2.5, 3.0, 3.5]


def import_data_files(directory, data_tag, specify_density):
    """Collect analysis data files with a given tag, grouped by simulation family.

    File names must follow ``<element>_<potential>_<type>_<atoms>_<density>_<run>_<data_tag>``.
    Files whose name does not match at all, or whose density cannot be read
    as a number, are reported and counted as skipped. Matching files with a
    different ``data_tag`` are ignored silently.

    Args:
        directory: Root directory that is searched recursively.
        data_tag: Tag (including extension) to import, e.g. ``"RDF.txt"``.
        specify_density: When true, only files whose density is listed in
            ``key_analysis_densities`` are imported.

    Returns:
        defaultdict mapping ``"<element>_<potential>_<type>_<atoms>"`` to a
        list of ``(density, Path)`` tuples sorted by density, where density
        is a float.
    """
    directory = Path(directory)

    # Regex pattern for reading "unique_key" + "data_tag"
    data_file_name = re.compile(
    r'^(?P<element_symbol>[A-Za-z]{1,6})_'          # e.g. C
    r'(?P<potential_name>[^_]+)_'                   # e.g. GAP17
    r'(?P<simulation_type>[^_]+)_'                  # e.g. NVT
    r'(?P<num_atoms>\d+)_'                          # e.g. 64
    r'(?P<density>[\d.eE+-]+)_'                     # e.g. 1.5 or 1.85e+00
    r'(?P<run>\d+)'                                 # e.g. 1 (run number) 
    r'(?:_(?P<data_tag>.+)|(?P<data_tag2>\..+))$'   # e.g. ring.txt or .png (allows underscore after run_number or .avi etc...)   
    )

    imported_data_files = defaultdict(list)  # Imported files dictionary

    skipped_data_files_counter = 0
    imported_data_files_counter = 0

    for path in directory.rglob("*"):

        if not path.is_file():  # Filters for files not directories
            continue

        m = data_file_name.match(path.name)  # Enforce data file naming
        if not m:
            print(f"ERROR: Skipped {path}. Invalid data file name")
            skipped_data_files_counter += 1
            continue

        # Only import files with the correct data_tag
        if m.group("data_tag") != data_tag:
            continue

        # The density pattern is permissive (it accepts e.g. "1.e+"), so the
        # conversion can fail even though the file name matched.
        density_text = m.group("density")
        try:
            density = float(density_text)
        except ValueError:
            print(f"ERROR: Skipped {path}. Invalid density '{density_text}'")
            skipped_data_files_counter += 1
            continue

        # Functionality for only importing files with a specific density
        if specify_density and density not in key_analysis_densities:
            continue

        # Construct unique_data_key (unique key without density or run number)
        unique_data_key = "_".join(
            m.group(name)
            for name in ("element_symbol", "potential_name", "simulation_type", "num_atoms")
        )

        imported_data_files[unique_data_key].append((density, path))
        imported_data_files_counter += 1

    # Sort by density for each key
    for items in imported_data_files.values():
        items.sort(key=lambda pair: pair[0])

    if imported_data_files_counter:
        print(f"Imported {imported_data_files_counter} {data_tag} files")
    if skipped_data_files_counter:
        print(f"Skipped {skipped_data_files_counter} {data_tag} files")
    return imported_data_files
#-------------------------------

# ------ DATA ANALYSIS ----------
# Imports data using import_data_files()
# Manipulates data using specified unique_data_function to return dependent variables
# OPTIONS:
# 1. potential_comparison = False:
#   a) specify_density = False: returns plots of scalar values (e.g. potential energy) against density for each unique key
#   b) specify_density = True: returns plots of arrays (e.g. g(r) vs r) for each density in "key_analysis_densities" for each (element, potential, sim_type, num_atoms) key
# 2. Potential comparison = True: 
#   a) performs the same analysis but overlays plots of each different potential
# Default is specify_density=False, which runs the analysis with density as the independent variable
# Setting specify_density=True instead returns one graph per density in key_analysis_densities
def by_density_data_analysis(directory, data_tag, unique_data_function, save_file_name, chart_title, x_label, y_label, specify_density=False):
    """Analyse every imported data file, plot the results per unique key, then overlay potentials.

    Args:
        directory: Passed straight to import_data_files().
        data_tag: Passed straight to import_data_files(); also selects which files are imported.
        unique_data_function: Callable applied to each loaded numpy array. It must return a
            scalar when specify_density is False, or a 2-item
            (independent_variable, dependent_variable) pair when specify_density is True.
        save_file_name: File name of the plot (only its final path component is used).
        chart_title: Title of the plot (prefixed with the density when specify_density is True).
        x_label: Label of the x axis.
        y_label: Label of the y axis.
        specify_density: False plots the mean/std of the scalar against density (one plot per
            unique key). True plots the mean/std of the dependent variable against the
            independent variable, one plot per density.

    Returns:
        None. Plots are written below the module-level graph_dir and potential_comparison_dir.
    """

    # Import Data Files
    imported_data_files = import_data_files(directory, data_tag, specify_density=specify_density)
    if not imported_data_files:
        return None

    # Safety check to ensure not parsing paths when trying to name graphs
    base = Path(save_file_name).name

    # Used for potential comparison
    unique_data_key_dict = {}         # Dictionary of {unique_data_key: agg_df}, where agg_df is a processed data_frame including mean and std

    # Loop over all unique keys and create per_density and specific density plots. Adds processed data to the unique_data_key_dict
    for unique_data_key, entries in imported_data_files.items():

        results = _collect_results(entries, unique_data_function, specify_density)
        if not results:
            print(f"Failed to analyse data for {unique_data_key}")
            continue

        save_file_dir = graph_dir / Path(unique_data_key)
        save_file_dir.mkdir(parents=True, exist_ok=True)

        final_name = f"{unique_data_key}_{base}"

        agg_df = _aggregate_results(results, specify_density, unique_data_key)
        if agg_df is None:
            continue

        if specify_density:
            # One plot per density
            for density in agg_df["density"].unique():
                slice_df = agg_df[agg_df["density"] == density].reset_index(drop=True)
                per_density_chart_title = f"{density}gcm {chart_title}"
                per_density_save_path = save_file_dir / f"{density}_{final_name}"
                plot(set_plot_type, slice_df, x_label, y_label, per_density_chart_title, per_density_save_path, specify_density=specify_density)
                print(f"{final_name} created")

            # Organise Plots into subfolders
            _organise_plots_into_subfolders(graph_dir, base)

        else:
            plot(set_plot_type, agg_df, x_label, y_label, chart_title, save_file_dir / final_name, specify_density=specify_density)
            print(f"{final_name} created")

        # Add agg_df to the unique_data_key_dict
        unique_data_key_dict[unique_data_key] = agg_df

    # Parsing unique_data_key_information
    unique_data_key_name = re.compile(
    r'^(?P<element_symbol>[A-Za-z]{1,6})_'          # e.g. C
    r'(?P<potential_name>[^_]+)_'                   # e.g. GAP17
    r'(?P<simulation_type>[^_]+)_'                  # e.g. NVT
    r'(?P<num_atoms>\d+)$'                          # e.g. 64
    )

    # Creates a new dictionary of {(element, simulation_type, num_atoms): {potential: agg_df}}
    per_potential_dict = {}
    for unique_data_key, agg_df in unique_data_key_dict.items():

        match = unique_data_key_name.match(unique_data_key)
        if not match:
            print(f"Unique_data_key name: {unique_data_key} does not match expected format. Change unique_data_key_name regex or file naming")
            continue

        key_information = match.groupdict()
        element = key_information["element_symbol"]
        potential = key_information["potential_name"]
        simulation_type = key_information["simulation_type"]
        num_atoms = int(key_information["num_atoms"])

        per_potential_key = (element, simulation_type, num_atoms)
        potential_frames = per_potential_dict.setdefault(per_potential_key, {})

        if potential in potential_frames:
            potential_frames[potential] = pd.concat([potential_frames[potential], agg_df], ignore_index=True)
        else:
            potential_frames[potential] = agg_df.copy()

    # Plots multi line comparison plots, overlaying every potential of a group on one figure
    for (element, simulation_type, num_atoms), potential_frames in per_potential_dict.items():

        group_name = f"{element}_{simulation_type}_{num_atoms}"

        if len(potential_frames) <= 1:
            print(f"{group_name}: Only 1 potential found, skipping potential comparison analysis")
            continue

        # Print the potentials found for THIS group only
        print(f"{group_name}: Datafiles found for {len(potential_frames)} different potentials")
        print(sorted(potential_frames))

        final_name = f"{group_name}_{base}"

        if specify_density:

            # Every density that appears for at least one potential of this group
            densities = sorted({density for agg_df in potential_frames.values() for density in agg_df["density"].unique()})

            for density in densities:
                # Only overlay the potentials that actually have data at this density
                per_density_frames = {}
                for potential, agg_df in potential_frames.items():
                    slice_df = agg_df[agg_df["density"] == density].reset_index(drop=True)
                    if not slice_df.empty:
                        per_density_frames[potential] = slice_df

                per_density_chart_title = f"{density}gcm {chart_title}"
                per_density_save_path = potential_comparison_dir / f"{density}_{final_name}"
                # multi_plot draws one line per potential; plotting each potential separately
                # to the same path would leave only the last potential in the saved file
                multi_plot(set_plot_type, per_density_frames, x_label, y_label, per_density_chart_title,
                    per_density_save_path, specify_density=specify_density)
                print(f"{final_name} created")

            # Organise the comparison plots into subfolders
            _organise_plots_into_subfolders(potential_comparison_dir, base)

        else:
            multi_plot(set_plot_type, potential_frames, x_label, y_label, chart_title,
                potential_comparison_dir / final_name, specify_density=specify_density)


def _collect_results(entries, unique_data_function, specify_density):
    """Load each (density, path) entry, apply unique_data_function and return the valid results.

    Returns a list of (density, dependent_variable) tuples, or of
    (density, independent_variable, dependent_variable) tuples when specify_density is True.
    Files that cannot be loaded, are empty, or give an invalid output are reported and skipped.
    """
    results = []

    # Loop over all MD runs for different densities and repeats that match the unique key
    for density, path in entries:

        try:
            data = np.loadtxt(path, delimiter=',')
        except Exception as e:
            print(f"Skipping {path}; unable to load: {e}")
            continue

        # Failsafe in case there's no data in the file
        if data.size == 0:
            print(f"Data file {path} is empty")
            continue

        # Performs data analysis function
        try:
            output = unique_data_function(data)
        except Exception as e:
            print(f"ERROR: failed unique_data_function for {path}: {e}")
            continue

        if output is None:
            continue

        if specify_density:

            # Check that the unique data_function is returning correct data format: (independent_variable, dependent_variable)
            if not hasattr(output, "__iter__") or isinstance(output, (str, bytes)):
                print(f"Invalid output for {path}: expected (independent, dependent), got {type(output)}")
                continue
            if not hasattr(output, "__len__") or len(output) != 2:
                print(f"Invalid output length for {path}: expected 2, got {len(output) if hasattr(output, '__len__') else 'N/A'}")
                continue

            independent_variable, dependent_variable = output
            results.append((float(density), independent_variable, dependent_variable))

        else:

            # Check that the unique data_function is returning correct data format: scalar value (dependent_variable)
            if not np.isscalar(output):
                print("Error: function should return a single number.")
                continue

            results.append((float(density), float(output)))

    return results


def _aggregate_results(results, specify_density, unique_data_key):
    """Turn the collected results into a mean/std data frame, or None if nothing is usable."""
    if not specify_density:
        df = pd.DataFrame(results, columns=["density", "dependent_variable"])
        return df.groupby("density")["dependent_variable"].agg(mean="mean", std="std").fillna(0.0).reset_index()

    # Sample the first result to decide between array-like and scalar handling
    first_ind = results[0][1]
    first_dep = results[0][2]

    if isinstance(first_ind, (list, np.ndarray)) and isinstance(first_dep, (list, np.ndarray)):

        # Lists / arrays: one row per (density, independent value) pair
        frames = []
        for density, independent_variable, dependent_variable in results:

            ind_arr = np.asarray(independent_variable)
            dep_arr = np.asarray(dependent_variable)

            # Check for length mismatch
            if ind_arr.shape[0] != dep_arr.shape[0]:
                print(f"ERROR: Length mismatch for density {density}: indep {ind_arr.shape} vs dep {dep_arr.shape}; skipping this entry.")
                continue

            frames.append(pd.DataFrame({
                "density": [float(density)] * ind_arr.shape[0],
                "independent_variable": ind_arr,
                "dependent_variable": dep_arr
            }))

        if not frames:
            print(f"No valid array entries for {unique_data_key}")
            return None

        df = pd.concat(frames, ignore_index=True)

    else:
        # Scalar values
        df = pd.DataFrame(results, columns=["density", "independent_variable", "dependent_variable"])

    # std is NaN for groups with a single sample; report it as 0.0
    return df.groupby(["density", "independent_variable"])["dependent_variable"].agg(mean="mean", std="std").fillna(0.0).reset_index()


def _organise_plots_into_subfolders(search_dir, file_name):
    """Move each plot below search_dir whose name ends with file_name into a '<stem> plots' subfolder."""
    sub_dir_name = f"{Path(file_name).stem} plots"   # e.g. 'RDF plots'

    # Materialise the matches first so moving files cannot disturb the directory walk
    for path in list(search_dir.rglob(f"*{file_name}")):

        # Don't move files once moved (avoid nesting)
        if sub_dir_name in path.parts:
            continue

        sub_dir = path.parent / sub_dir_name
        sub_dir.mkdir(parents=True, exist_ok=True)
        shutil.move(str(path), str(sub_dir / path.name))

# -------------------------------

# ------ PLOTTING DATA ---------
# Plot Types: marker (with error bars), line (with shaded regions) 
def plot(plot_type, data, x_label, y_label, chart_title, save_path, specify_density):
    """Draw one mean/std series from `data` and save it to `save_path`.

    Args:
        plot_type: "marker" (line with error bars) or "line" (line with a shaded +/- std band).
        data: Data frame with "mean" and "std" columns plus "density" and, optionally,
            "independent_variable".
        x_label: Label of the x axis.
        y_label: Label of the y axis.
        chart_title: Title of the plot.
        save_path: Destination file; its parent directory is created if missing.
        specify_density: When True and `data` has an "independent_variable" column, that
            column is the x axis; otherwise "density" is.

    Raises:
        ValueError: If plot_type is not listed in existing_plot_types.
    """
    if plot_type not in existing_plot_types:
        raise ValueError(f"Unknown plot_type: {plot_type!r}")

    # Ensure Parent Directory Exists
    save_path = Path(save_path)
    save_path.parent.mkdir(parents=True, exist_ok=True)

    # Resolve the columns before creating the figure so a missing column cannot leak a figure
    if specify_density and "independent_variable" in data.columns:
        x = data["independent_variable"].values
    else:
        x = data["density"].values

    mean = data["mean"].values
    std = data["std"].values

    fig, ax = plt.subplots()  # Local figure for each plot
    try:
        if plot_type == "marker":
            ax.errorbar(x, mean, yerr=std, fmt='-o', capthick=0.5, elinewidth=0.5)
        elif plot_type == "line":
            alpha_fill = 0.25
            ax.plot(x, mean, label="Mean")
            ax.fill_between(x, mean - std, mean + std, alpha=alpha_fill)

        # Labels and Titles
        ax.set_xlabel(f"{x_label}")
        ax.set_ylabel(f'{y_label}')
        ax.set_title(f"{chart_title}")

        # Save Plot to Graphical Analysis
        fig.savefig(save_path)
    finally:
        # Always close the figure, even if plotting or saving fails, to free memory
        plt.close(fig)


def multi_plot(plot_type, per_potential_dict, x_label, y_label, chart_title, save_path, specify_density):
    """Overlay one mean/std series per potential on a single figure and save it.

    Args:
        plot_type: "marker" (line with error bars) or "line" (line with a shaded +/- std band).
        per_potential_dict: Mapping of {potential name: data frame}; each data frame needs
            "mean" and "std" columns plus "density" and, optionally, "independent_variable".
        x_label: Label of the x axis.
        y_label: Label of the y axis.
        chart_title: Title of the plot.
        save_path: Destination file; its parent directory is created if missing.
        specify_density: When True and a data frame has an "independent_variable" column,
            that column is the x axis; otherwise "density" is.

    Raises:
        ValueError: If plot_type is not listed in existing_plot_types.
    """
    # Same validation as plot(): an unknown type would otherwise save an empty figure
    if plot_type not in existing_plot_types:
        raise ValueError(f"Unknown plot_type: {plot_type!r}")

    save_path = Path(save_path)
    save_path.parent.mkdir(parents=True, exist_ok=True)

    fig, ax = plt.subplots()
    try:
        for potential, agg_df in per_potential_dict.items():

            if specify_density and "independent_variable" in agg_df.columns:
                x = agg_df["independent_variable"].values
            else:
                x = agg_df["density"].values

            mean = np.asarray(agg_df["mean"])
            std = np.asarray(agg_df["std"])

            if plot_type == "marker":
                ax.errorbar(x, mean, yerr=std, fmt='-o', capthick=0.5, elinewidth=0.5, label=potential)
            elif plot_type == "line":
                alpha_fill = 0.25
                ax.plot(x, mean, label=potential)
                ax.fill_between(x, mean - std, mean + std, alpha=alpha_fill)

        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        ax.set_title(chart_title)
        ax.legend(title="Potential")

        fig.savefig(save_path)
    finally:
        # Always close the figure, even if plotting or saving fails, to free memory
        plt.close(fig)
# ------------------------------

# -------------------------------------------------------------------------------------------------
# Instructions: 
# 1. Assign data_tag, chart_title, save_file_name and y_label (RAW TEXT ONLY, NO PATHS)
# 2. Create a function that computes a desired value per structure file and returns this value
# 3. Call by_density_data_analysis, using your new function as its unique_data_function
# 4. Set specify_density=True to plot for only 1 density (e.g. RDF, ring histogram)
# 5. If specify_density = True, the unique data function MUST return: (independent_variable, dependent_variable)
# --------------------------------------------------------------------------------------------------

# Coordination analysis 
def coordination_analysis(directory, coordination_number):
    """Plot the proportion of entries equal to `coordination_number` against density.

    Args:
        directory: Passed to by_density_data_analysis().
        coordination_number: Value counted in each "coordination.txt" file. 2, 3 and 4 are
            labelled sp, sp2 and sp3; any other value prints an error and falls back to a
            generic "<n>_coordinate" label, but the analysis still runs.
    """
    # Label coordination number
    mapping = {
        2: ("sp", "sp Carbon Proportion"),
        3: ("sp2", "sp2 Carbon Proportion"),
        4: ("sp3", "sp3 Carbon Proportion")
    }
    env, y_label = mapping.get(coordination_number, (None, None))

    if env is None:
        print("ERROR: Coordination number should be between 2 and 4")
        env = f"{coordination_number}_coordinate"
        y_label = f"{coordination_number} coordinate atoms"

    data_tag = "coordination.txt"
    chart_title = "Coordination vs. Density"
    save_file_name = f"{env}_coordination_proportion.png"
    x_label = "Density (g/cm\u00b3)"   # \u00b3 is the superscript three

    # unique_data_function must be a callable that takes the loaded numpy array and returns a scalar
    def coord_function(data: np.ndarray) -> float:
        # Fraction of all entries in the file that equal the requested coordination number
        return float(np.count_nonzero(data == coordination_number) / data.size)

    by_density_data_analysis(directory, data_tag, coord_function, save_file_name, chart_title, x_label, y_label, specify_density=False)

# Ring Size analysis
def ring_analysis(directory, ring_size):
    """Plot the number of `ring_size`-membered rings against density.

    Args:
        directory: Passed to by_density_data_analysis().
        ring_size: Ring size looked up in the first column of each "ring.txt" file.
    """
    data_tag = "ring.txt"
    chart_title = f"Number of {ring_size} Membered Rings vs. Density"
    save_file_name = f"{ring_size}_ring_frequency.png"
    x_label = "Density (g/cm\u00b3)"   # \u00b3 is the superscript three
    y_label = f"{ring_size} Membered Rings"

    # unique_data_function must be a callable that takes the loaded numpy array and returns a scalar
    def ring_function(data: np.ndarray) -> float:
        # Second-column value of the first row whose first column equals ring_size.
        # Raises IndexError when no row matches; by_density_data_analysis reports and skips that file.
        return float(data[data[:, 0] == ring_size, 1][0])

    by_density_data_analysis(directory, data_tag, ring_function, save_file_name, chart_title, x_label, y_label, specify_density=False)
   
# Potential energy analysis 
def potential_energy_analysis(directory):
    """Plot the mean of each "potential_energy.txt" file against density.

    Args:
        directory: Passed to by_density_data_analysis().
    """
    data_tag = "potential_energy.txt"
    chart_title = "Mean Potential Energy vs. Density"
    save_file_name = "mean_PE.png"
    x_label = "Density (g/cm\u00b3)"   # \u00b3 is the superscript three
    y_label = 'Mean Potential Energy (eV)'

    # unique_data_function must be a callable that takes the loaded numpy array and returns a scalar
    def PE_function(data: np.ndarray):
        # Mean over every value in the file
        return np.mean(data)

    by_density_data_analysis(directory, data_tag, PE_function, save_file_name, chart_title, x_label, y_label, specify_density=False)

# Bond Length analysis
def bond_length_analysis(directory):
    """Plot the weighted mean of each "bond_length.txt" file against density.

    Args:
        directory: Passed to by_density_data_analysis().
    """
    data_tag = "bond_length.txt"
    chart_title = "Mean Bond Length vs. Density"
    save_file_name = "mean_bond_length.png"
    x_label = "Density (g/cm\u00b3)"          # \u00b3 is the superscript three
    y_label = 'Mean Bond Length (\u00c5)'     # \u00c5 is the angstrom symbol

    # unique_data_function must be a callable that takes the loaded numpy array and returns a scalar
    def bond_length_function(data: np.ndarray):
        # Average of column 0 weighted by column 1
        # (presumably bond length and its count - TODO confirm against the file writer)
        return np.average(data[:, 0], weights=data[:, 1])

    by_density_data_analysis(directory, data_tag, bond_length_function, save_file_name, chart_title, x_label, y_label, specify_density=False)

# Force Analysis
def force_analysis(directory):
    """Plot the mean row-wise vector norm of each "forces.txt" file against density.

    Args:
        directory: Passed to by_density_data_analysis().
    """
    data_tag = "forces.txt"
    chart_title = "Mean Force Magnitude vs. Density"
    save_file_name = "mean_force_magnitude.png"
    x_label = "Density (g/cm\u00b3)"                 # \u00b3 is the superscript three
    y_label = "Mean Force Magnitude (eV/\u00c5)"     # \u00c5 is the angstrom symbol

    # unique_data_function must be a callable that takes the loaded numpy array and returns a scalar
    def force_function(data: np.ndarray):
        # Norm of every row (presumably one force vector per atom - TODO confirm), then the mean
        return np.mean(np.linalg.norm(data, axis=1))

    by_density_data_analysis(directory, data_tag, force_function, save_file_name, chart_title, x_label, y_label, specify_density=False)

# RDF Analysis
def RDF_analysis(directory):
    """Plot column 1 against column 0 of each "RDF.txt" file, one graph per density.

    Args:
        directory: Passed to by_density_data_analysis().
    """
    data_tag = "RDF.txt"
    chart_title = "Radial Distribution Function"
    save_file_name = "RDF.png"
    y_label = "g(r)"
    x_label = "r (\u00c5)"   # \u00c5 is the angstrom symbol

    # With specify_density=True the unique_data_function must return
    # (independent_variable, dependent_variable) rather than a scalar
    def RDF_function(data: np.ndarray):
        r_array = data[:, 0]
        g_r_array = data[:, 1]
        return (r_array, g_r_array)

    by_density_data_analysis(directory, data_tag, RDF_function, save_file_name, chart_title, x_label, y_label, specify_density=True)

# Ring Histogram Analysis
def ring_size_distribution_analysis(directory):
    """Plot column 1 against column 0 of each "ring.txt" file, one graph per density.

    Args:
        directory: Passed to by_density_data_analysis().
    """

    # Returns the (independent_variable, dependent_variable) pair needed when specify_density=True
    def ring_distribution_function(data: np.ndarray):
        sizes, frequencies = data[:, 0], data[:, 1]
        return (sizes, frequencies)

    by_density_data_analysis(
        directory,
        "ring.txt",                     # data_tag
        ring_distribution_function,     # unique_data_function
        "ring_size_distribution.png",   # save_file_name
        "Ring Size Distribution",       # chart_title
        "Ring Size",                    # x_label
        "Frequency",                    # y_label
        specify_density=True,
    )
# -------------------------------------------------------------------------------------------------


# ------ ANALYSIS PARAMETERS ------
# Plot styles accepted by plot(); any other plot_type raises ValueError there
existing_plot_types = ["marker", "line"]

# Plot style read by by_density_data_analysis() for every graph it creates
set_plot_type = "line"

# Value passed as `directory` to every analysis below
set_analysis_directory = "Analysis/Structural Analysis"

# Coordination number counted by coordination_analysis() (2, 3 and 4 are labelled sp, sp2 and sp3)
set_coordination_number = 2
# Ring size looked up by ring_analysis()
set_ring_size = 6
# ---------------------------------

# Run each analysis in turn over the same directory; each call writes its own plots
coordination_analysis(directory=set_analysis_directory, coordination_number=set_coordination_number)

bond_length_analysis(directory=set_analysis_directory)

potential_energy_analysis(directory=set_analysis_directory)

ring_analysis(directory= set_analysis_directory, ring_size=set_ring_size)

force_analysis(directory=set_analysis_directory)

RDF_analysis(directory=set_analysis_directory)

# Currently disabled; uncomment to also plot the ring size distribution per density
# ring_size_distribution_analysis(directory=set_analysis_directory)
