# Select Starting Structures
Get a handful of structures for each cluster size. Some should be the lowest-energy structures.

Start by downloading the ZIP file of structures from [WDBase](https://sites.uw.edu/wdbase/database-of-water-clusters/) and store it in a `wdbase` folder.

In [1]:
from fff.simulation.utils import read_from_string
from more_itertools import batched
from io import TextIOWrapper
from zipfile import ZipFile
from random import sample
from pathlib import Path
from tqdm import tqdm
import pandas as pd

Configuration

In [2]:
# How many structures to take per cluster size, both for the lowest-energy picks and for the random draw
k: int = 4

## Get the Structures
For each water cluster size, take the $k$ lowest-energy structures, then draw up to $k$ more at random from the remaining structures that lie within 5 kcal/mol of the ground state.

In [3]:
# Open the downloaded WDBase archive read-only; it stays open so later cells can iterate over its members
top_level_zip = ZipFile(file='wdbase/W3-W30_all_geoms_TTM2.1-F.zip', mode='r')

In [4]:
# One (xyz, n_waters, wdbase_index, source, ttm_energy) tuple per selected structure,
# where source is either 'top' or 'random'
structures = []
for file in tqdm(top_level_zip.filelist):
    # Skip directories
    if file.is_dir():
        continue

    # Number of waters: first "_"-separated token of the file name, minus its leading character
    n_waters = int(Path(file.filename).name.split("_")[0][1:])
    # Lines per XYZ frame: atom-count line + comment line + 3 atoms per water
    lines_per_frame = n_waters * 3 + 2
    to_select_randomly = []  # (xyz, index, energy) candidates for the random draw
    best_energy = None

    # Each member of the top-level archive is itself a ZIP file. Open both levels with
    # context managers so the nested archive is released too, not only the outer member.
    with top_level_zip.open(file) as fp, ZipFile(fp) as inside_zip:
        # Read the first file inside that ZIP
        with inside_zip.open(inside_zip.filelist[0], 'r') as fpp:
            fpp_text = TextIOWrapper(fpp)
            for i, lines in enumerate(batched(fpp_text, lines_per_frame)):  # Over each XYZ
                # Parse structure; the energy is the last token of the XYZ comment line
                xyz = "".join(lines)
                energy = float(lines[1].split()[-1])
                read_from_string(xyz, 'xyz')  # Make sure it parses

                # The first structure in the file is used as the reference energy
                if i == 0:
                    best_energy = energy

                # The first k structures are always kept as the 'top' structures
                if i < k:
                    structures.append(
                        (xyz, n_waters, i, 'top', energy)
                    )
                else:
                    # Stop reading once the energy is more than 5 above the reference
                    # NOTE(review): this early exit assumes the file is sorted by ascending energy - confirm
                    if energy > best_energy + 5:
                        break
                    to_select_randomly.append((xyz, i, energy))

    # Pick the random structures: at most k, or every candidate if fewer are available
    for xyz, i, energy in sample(to_select_randomly, min(len(to_select_randomly), k)):
        structures.append((xyz, n_waters, i, 'random', energy))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:29<00:00,  1.03s/it]


Save the results

In [5]:
# Tabulate the selected structures, ordered by cluster size and then by TTM energy
structures = pd.DataFrame(
    structures,
    columns=['xyz', 'n_waters', 'wdbase_index', 'source', 'ttm_energy'],
).sort_values(by=['n_waters', 'ttm_energy'])

In [6]:
# Write the table to disk without the DataFrame index column
structures.to_csv(path_or_buf='test-set-structures.csv', index=False)