# Build the seed train and test sets


#### Families

Initial seed name:

AlGaN_333_K_F_e_r

K = composition
F = family
e = expansion
r = randomised displacements

Folder structure:

supercell_size
    |
    -- functional
        |
        -- family
            |
            -- optgeom
            -- expansion
            -- random displacements

So we can select the families (F) with a random split and then select a subset of the structures within each selected family.

In [None]:
%load_ext autoreload
%reload_ext autoreload
%autoreload 2

from pymatgen.core.structure import Structure
from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
from pymatgen.io.ase import AseAtomsAdaptor
from ase.io import read
from ase.data import chemical_symbols
import copy
import numpy as np
import pandas as pd
from scipy.constants import physical_constants
# Unit-conversion factors built from scipy.constants.physical_constants
# (index [0] of each entry is the numerical value).
HARTREE_TO_EV = physical_constants['Hartree energy in eV'][0]
BOHR_TO_ANGSTROM = physical_constants['Bohr radius'][0] * 1e10  # Convert meters to Ångström
BOHR_CUBED_TO_ANGSTROM_CUBED = BOHR_TO_ANGSTROM**3

import os
import json
import re
import shutil as sh

from janus_core.calculations.single_point import SinglePoint
from janus_core.calculations.geom_opt import GeomOpt
import sys
sys.path.append('../src')   # add src/ to Python path
from functions import *
from structure_generation import *
from helper_functions import *
from crystal_helper_functions import *

# Put the current working directory on the import path.
current_dir = os.getcwd()
sys.path.append(current_dir)

def vview(structure):
    """Open the ASE viewer on ``structure``.

    ``structure`` is converted with ``AseAtomsAdaptor().get_atoms`` and the
    result is passed to ``ase.visualize.view``. Nothing is returned.
    (Presumably ``structure`` is a pymatgen ``Structure`` -- confirm.)
    """
    # Imported lazily so the viewer is only needed when this helper is called.
    from ase.visualize import view as ase_view
    from pymatgen.io.ase import AseAtomsAdaptor as Adaptor

    ase_atoms = Adaptor().get_atoms(structure)
    ase_view(ase_atoms)

### AlGaN - OPTIMISE STRUCTURES WITH CRYSTAL23

In [None]:
# Load the bulk AlN cell from its CIF file.
AlN_bulk_r2scan = Structure.from_file('../data/bulk_structures/AlN.cif')

# Diagonal 3x3x3 supercell matrix.
supercell_matrix = 3 * np.eye(3)

# Expand a deep copy, so AlN_bulk_r2scan keeps referring to the loaded cell.
AlN_333_r2scan = copy.deepcopy(AlN_bulk_r2scan)
AlN_333_r2scan.make_supercell(supercell_matrix)

# Last expression of the cell: shows the number of sites in the supercell.
AlN_333_r2scan.num_sites

In [None]:
# write_CRYSTAL_gui_from_data(AlN_bulk_crystal.lattice.matrix,AlN_bulk_crystal.atomic_numbers,AlN_bulk_crystal.cart_coords,'../data/bulk_structures/crystal/AlN.gui')

In [None]:
# Load the bulk GaN cell from its CIF file.
GaN_bulk_r2scan = Structure.from_file('../data/bulk_structures/GaN.cif')

# Diagonal 3x3x3 supercell matrix.
supercell_matrix = 3 * np.eye(3)

# Expand a deep copy, so GaN_bulk_r2scan keeps referring to the loaded cell.
GaN_333_r2scan = copy.deepcopy(GaN_bulk_r2scan)
GaN_333_r2scan.make_supercell(supercell_matrix)

# Last expression of the cell: shows the number of sites in the supercell.
GaN_333_r2scan.num_sites

## Symmetry analysis

In [None]:
# Run the project helper get_all_configurations_pmg on the AlN 3x3x3 supercell
# and store its result as an integer CSV table.
# NOTE(review): presumably these are symmetry-related site indices -- confirm
# against the helper's definition.
atom_indices_aln_333 = get_all_configurations_pmg(AlN_333_r2scan)
np.savetxt(
    '../data/symmetry/aln_333_indices.csv',
    atom_indices_aln_333,
    fmt='%d',
    delimiter=',',
)

In [None]:
# Reload the stored index table from disk and cast it to integers.
atom_indices_aln = np.genfromtxt(
    '../data/symmetry/aln_333_indices.csv',
    delimiter=',',
).astype(int)

## Generate SIC random structures

This saves the data to a JSON file; I'm not sure we need it.

In [None]:
# Indices of the sites whose atomic number is 13 (Al) in the AlN supercell.
active_sites = np.where(np.array(AlN_333_r2scan.atomic_numbers) == 13)[0]
num_active_sites = len(active_sites)

# NOTE(review): N_atom is not used in this cell; it has the same value as the
# new_species argument below -- confirm whether it was meant to be passed there.
N_atom = 31

# Maps str(N_atoms) -> list of per-structure atomic-number lists.
all_config_atom_number = {}

for N_atoms in np.arange(27, 28):

    structures_random = generate_random_structures(
        AlN_333_r2scan,
        atom_indices=atom_indices_aln,
        N_atoms=N_atoms,
        new_species=31,
        N_config=500,
        DFT_config=20,
        active_sites=active_sites,
    )

    # Keep only the atomic numbers of every generated structure.
    all_config_atom_number[str(N_atoms)] = [
        list(structure.atomic_numbers) for structure in structures_random
    ]

# with open('data/supercell_structures/AlGaN/AlGaN_super3.json', 'w') as json_file:
#     json.dump(all_config_atom_number, json_file)

In [None]:
# Load the stored configurations; the cells below iterate over the keys of
# this mapping and over the list stored under each key.
with open('data/supercell_structures/AlGaN/AlGaN_super3.json', mode='r', encoding='utf-8') as handle:
    AlGaN_super3_all_config = json.load(handle)

In [None]:
# Write one extended-XYZ file per stored configuration, using one folder per
# key of AlGaN_super3_all_config. Every structure shares the lattice and the
# fractional coordinates of the AlN 3x3x3 supercell; only the species differ.
lattice = AlN_333_r2scan.lattice.matrix
positions = AlN_333_r2scan.frac_coords

for N_atoms, configs in AlGaN_super3_all_config.items():

    folder_name = f'data/supercell_structures/AlGaN/AlGaN_super3_{N_atoms}'
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)

    for i, config in enumerate(configs):
        structure = Structure(lattice, config, positions)
        xyz_path = os.path.join(folder_name, f'AlGaN_super3_{N_atoms}_{i}.xyz')
        write_extended_xyz(structure, xyz_path)

## Write CRYSTAL input files

In [None]:
# NOTE(review): AlN_super3 / GaN_super3 are not assigned in the cells shown
# above (those create AlN_333_r2scan / GaN_333_r2scan) -- confirm where these
# names come from before running this cell.
AlN_lattice_matrix = np.round(AlN_super3.lattice.matrix[:3], decimals=6)
GaN_lattice_matrix = np.round(GaN_super3.lattice.matrix[:3], decimals=6)

# Element-wise mean of the two rounded lattice matrices.
AlGaN_lattice_matrix = (AlN_lattice_matrix + GaN_lattice_matrix) / 2

In [None]:
from structure_generation import write_CRYSTAL_gui_from_data

# Every configuration is written on the averaged AlN/GaN lattice
# (AlGaN_lattice_matrix) with the rounded Cartesian coordinates of AlN_super3;
# only the atomic numbers change from file to file.
# NOTE(review): AlN_super3 is not assigned in the cells shown above -- confirm.
lattice_matrix = AlGaN_lattice_matrix
cart_coords = np.round(AlN_super3.cart_coords, 8)

for N_atoms, configs in AlGaN_super3_all_config.items():

    for i, config in enumerate(configs):

        atomic_numbers = config

        # NOTE(review): the folder name depends only on i, so configurations
        # with the same index but different N_atoms share a folder.
        folder_name = f'data/crystal/AlGaN/super3/config_{i}/'
        file_name = f'AlGaN_super3_{N_atoms}_{i}_0.gui'
        full_name = os.path.join(folder_name, file_name)
        os.makedirs(folder_name, exist_ok=True)

        # The .gui file is written exactly once per configuration.
        write_CRYSTAL_gui_from_data(lattice_matrix, atomic_numbers,
                                    cart_coords, full_name, dimensionality=3)

In [None]:
# For every sub-folder of the super3 directory: give each .gui file a matching
# .d12 input (copied from the shared template) and write one SLURM script.
folder_path = 'data/crystal/AlGaN/super3/'

folders = [
    entry for entry in os.listdir(folder_path)
    if os.path.isdir(os.path.join(folder_path, entry))
]

for folder in folders:

    folder_path_new = os.path.join(folder_path, folder)
    slurm_file_name = os.path.join(folder_path_new, f'{folder}_0.slurm')

    # Regular files in this folder whose name ends with .gui
    files = []
    for name in os.listdir(folder_path_new):
        if name.endswith('.gui') and os.path.isfile(os.path.join(folder_path_new, name)):
            files.append(name)

    # Copy the template .d12 next to every .gui, with the same base name
    # ([:-4] strips the ".gui" suffix).
    for gui_name in files:
        input_file = os.path.join(folder_path_new, f'{gui_name[:-4]}.d12')
        sh.copy('data/crystal/AlGaN/super3/super3_input.d12', input_file)

    # generate_slurm_file is a project helper; its result is written piece by
    # piece without adding separators.
    bash_script = generate_slurm_file(files)
    with open(slurm_file_name, 'w') as slurm_handle:
        slurm_handle.writelines(f"{line}" for line in bash_script)



## Read CRYSTAL output files

In [None]:
# Read a single CRYSTAL output file into a list of lines (newlines kept).
with open('data/crystal/AlGaN/super3/output_files/AlGaN_super3_1_0_0.out', 'r') as f:
    file_content = list(f)

Example usage

In [None]:

# num_atoms = 108  
# # Parse the file and extract structures with lattice matrix conversion
# parsed_structures = parse_crystal_output(file_content, num_atoms)

# # Convert to DataFrame for inspection
# df_structures = pd.DataFrame(parsed_structures)

# # Generate extended XYZ files
# generate_extended_xyz_files_from_df(df_structures, 'data/crystal/AlGaN/super3/output_files/test')

#### Write all the extxyz from all output files

In [None]:
# Write all the extxyz from all output files
import os
import numpy as np
import pandas as pd

# Folder containing the .out files
# folder_path = "data/crystal/AlGaN/super3/output_files"
# output_folder = "data/crystal/AlGaN/super3/extxyz_files"
folder_path = "data/crystal/AlGaN/pbe0/output_files"
output_folder = "data/crystal/AlGaN/pbe0/extxyz_files"
os.makedirs(output_folder, exist_ok=True)  # make sure the destination exists

# Output files are named AlGaN_super3_{X}_{Y}_{Z}.out. For each (X, Y) pair the
# Z files are read in increasing order until one is missing, and every parsed
# structure is written to its own extended-XYZ file numbered by global_index.
for X in range(54):  # adjust the range as needed
    for Y in range(10):  # adjust the range as needed
        global_index = 0  # running structure counter for this (X, Y) pair
        Z = 0

        while True:
            file_name = f"AlGaN_super3_{X}_{Y}_{Z}.out"
            file_path = os.path.join(folder_path, file_name)

            # No more Z files for this (X, Y) pair
            if not os.path.exists(file_path):
                break

            with open(file_path, "r") as f:
                file_content = f.readlines()

            # 108 is the atom count handed to the parser; replace it for other cells
            parsed_structures = parse_crystal_output(file_content, num_atoms=108)
            df_structures = pd.DataFrame(parsed_structures)

            for _, row in df_structures.iterrows():
                output_file = os.path.join(
                    output_folder, f"AlGaN_super3_{X}_{Y}_{global_index}.xyz"
                )

                with open(output_file, "w") as out_f:
                    # Line 1: number of atoms
                    num_atoms = len(row['cartesian_coordinates'])
                    out_f.write(f"{num_atoms}\n")

                    # Line 2: space-separated key=value metadata
                    lattice_flat = " ".join(f"{value:.12e}" for value in row['lattice_matrix'].flatten())
                    stress_flat = " ".join(f"{value:.12e}" for value in np.array(row['stress']).flatten())
                    header_fields = [
                        f"dft_energy={row['energy_ev']:.12e}",
                        f'Lattice="{lattice_flat}"',
                        f'dft_stress="{stress_flat}"',
                        'Properties=species:S:1:pos:R:3:dft_forces:R:3',
                        'config_type=random',
                        # alternative: f'system_name={os.path.basename(output_file[:-4])}'
                        'system_name=random',
                    ]
                    out_f.write(" ".join(header_fields) + "\n")

                    # Remaining lines: symbol, position (3 values), force (3 values)
                    atom_rows = zip(row['atomic_symbols'], row['cartesian_coordinates'], row['forces'])
                    for symbol, coord, force in atom_rows:
                        values = [coord[0], coord[1], coord[2], force[0], force[1], force[2]]
                        numbers = " ".join(f"{value:.12e}" for value in values)
                        out_f.write(f"{symbol} {numbers}\n")

                # Next structure of this (X, Y) pair gets the next index
                global_index += 1

            # Move on to the next output file of this (X, Y) pair
            Z += 1

Check for duplicates

In [None]:
# Folder holding the generated extended-XYZ files
folder_path = "data/crystal/AlGaN/pbe0/extxyz_files/"

# Check one (x, y) series, i.e. the files whose names start with
# AlGaN_super3_{x}_{y}_, using the project helper find_duplicate_files.
x = 1
y = 1
pattern_prefix = f"AlGaN_super3_{x}_{y}_"
duplicates = find_duplicate_files(folder_path, pattern_prefix)

if not duplicates:
    print("No duplicate files found.")
else:
    print("Duplicate files found:")
    for first, second in duplicates:
        print(f"{first} and {second}")

#### Concatenate files

In [None]:
# Example usage
# Pass the extxyz folder and the target file to the project helper
# concatenate_xyz_files (presumably it merges every file in the folder into
# the single output file -- confirm against the helper).
input_folder = "data/crystal/AlGaN/pbe0/extxyz_files"
output_file = "data/crystal/AlGaN/pbe0/concatenated_files/AlGaN_super3_all.xyz"
concatenate_xyz_files(input_folder, output_file)

### Read structures ASE

The stress is rounded; change it to the full-precision value from CRYSTAL.

In [None]:
# NOTE(review): this path points at .../super3/concatenated_files, whereas the
# concatenation cell writes to .../pbe0/concatenated_files -- confirm which
# data set is intended.
test_file = "data/crystal/AlGaN/super3/concatenated_files/AlGaN_super3_all.xyz"
# index=":" asks ase.io.read for every frame in the file, not just the last one.
atoms = read(test_file, index=":")

In [None]:

# # Directory containing the extxyz files
# directory = 'data/crystal/AlGaN/super3/extxyz_files/'

# # List to store the atoms and stress tensors
# atoms_list = []
# stress_list = []

# # Iterate over all files in the directory
# for filename in os.listdir(directory):
#     if filename.endswith('.xyz'):  # Only process .extxyz files
#         file_path = os.path.join(directory, filename)
        
#         # Read the ASE atoms object
#         atoms = read(file_path, format='extxyz')
#         atoms_list.append(atoms)
        
#         # Extract the stress tensor if it exists
#         stress_flat = atoms.info.get("Stress")
#         if stress_flat is not None:
#             stress = stress_flat.reshape(3, 3)
#             stress_list.append(stress)
#         else:
#             print(f"No stress information found in {filename}")
#             stress_list.append(None)

#### Test/Train split

In [None]:
# Random 80/20 train/test split of the structures and their stress tensors.
# NOTE(review): atoms_list and stress_list are only assigned in the
# commented-out cell above; they must exist before this cell is run.

# Hold the structures in a 1-D object array for easy fancy indexing. The array
# is filled element by element: np.array() on a list of sequence-like objects
# (e.g. ASE Atoms, which support len() and indexing) recurses into them and can
# produce a 2-D array of their items instead of one entry per structure.
atoms_array = np.empty(len(atoms_list), dtype=object)
for position, item in enumerate(atoms_list):
    atoms_array[position] = item
stress_array = np.array(stress_list, dtype=object)

# Number of structures that go to the test set (fraction rounded down)
n_samples = len(atoms_array)
test_size = 0.2
n_test = int(n_samples * test_size)

# Create a random permutation of indices
# (np.random's global state is used, so the split changes from run to run
# unless a seed is set beforehand)
indices = np.arange(n_samples)
np.random.shuffle(indices)

# The first n_test shuffled indices form the test set, the rest the train set
test_indices = indices[:n_test]
train_indices = indices[n_test:]

# Split the data
atoms_train = atoms_array[train_indices]
atoms_test = atoms_array[test_indices]
stress_train = stress_array[train_indices]
stress_test = stress_array[test_indices]

# Output information
print(f"Total structures: {n_samples}")
print(f"Training set: {len(atoms_train)} structures")
print(f"Testing set: {len(atoms_test)} structures")