# Ship Dataset

> Download the patrol ship dataset from DaRUS (doi:10.18419/darus-2905) and convert its `.tab` files to HDF5

In [None]:
#| default_exp datasets.ship

In [None]:
#| export
from nonlinear_benchmarks.utilities import get_tmp_benchmark_directory
from pathlib import Path
import os
import h5py
import numpy as np
from easyDataverse import Dataverse
import pandas as pd
import shutil

In [None]:
#| export

def ship(
        save_path: Path, #directory the files are written to, created if it does not exist
        force_download: bool = False, # force download the dataset
        remove_download: bool = True # remove the downloaded raw files after conversion
):
    """Download the ship dataset from DaRUS and convert its `.tab` files to HDF5.

    The dataset `doi:10.18419/darus-2905` is loaded into a `Ship` folder inside
    the temporary benchmark directory. Every `.tab` file located in one of the
    known split folders is then written as `<name>.hdf5` into the matching
    subfolder of `save_path` (`train`, `valid`, `test`, `test_ood`). Each
    column of a `.tab` file becomes one float32 HDF5 dataset named after the
    column.
    """
    save_path = Path(save_path)
    download_dir = Path(get_tmp_benchmark_directory()) / 'Ship'

    if force_download and download_dir.exists():
        print(f"Force reload: Removing existing directory: {download_dir}")
        shutil.rmtree(download_dir)

    dataverse = Dataverse('https://darus.uni-stuttgart.de/')
    dataverse.load_dataset(
        pid='doi:10.18419/darus-2905',
        filedir=download_dir,
    )

    # Keys are Path objects (not strings) so the lookup is platform independent
    structure_mapping = {
        Path('patrol_ship_routine/processed/train'): 'train',
        Path('patrol_ship_routine/processed/validation'): 'valid',
        Path('patrol_ship_routine/processed/test'): 'test',
        Path('patrol_ship_ood/processed/test'): 'test_ood'
    }

    # Ensure the target directories exist
    for target_subdir in structure_mapping.values():
        (save_path / target_subdir).mkdir(parents=True, exist_ok=True)

    def convert_tab_to_hdf5(tab_path, hdf5_path):
        """Write every column of the tab-separated file as a float32 HDF5 dataset."""
        df = pd.read_csv(tab_path, sep='\t')
        with h5py.File(hdf5_path, 'w') as hdf:
            for column in df.columns:
                data = df[column].astype(np.float32).values
                hdf.create_dataset(column, data=data, dtype='f4')

    # Walk through the downloaded directory structure and convert matching files
    for subdir, _dirs, files in os.walk(download_dir):
        # The mapping lookup depends only on the directory, so do it once per directory
        relative_subdir = Path(os.path.relpath(subdir, download_dir))
        desired_subdir = structure_mapping.get(relative_subdir)
        if desired_subdir is None:
            continue

        for file in files:
            if not file.endswith('.tab'):
                continue
            # Strip only the trailing extension; str.replace would also remove
            # any '.tab' occurring in the middle of the file name
            hdf5_name = Path(file).stem + '.hdf5'
            convert_tab_to_hdf5(
                Path(subdir) / file,
                save_path / desired_subdir / hdf5_name,
            )

    # Remove the downloaded raw files
    if remove_download:
        shutil.rmtree(download_dir)

In [None]:
# Example run: download the dataset and write the converted files to ./tmp/ship
tmp_dir = Path('./tmp')
ship(tmp_dir / 'ship')





HTTPStatusError: Redirect response '303 See Other' for url 'https://darus.uni-stuttgart.de/api/access/datafile/114588'
Redirect location: 'https://s3.tik.uni-stuttgart.de/fokus-dv-prod-1/10.18419/darus-2905/180f5800bee-63d629f6662b?response-content-disposition=attachment%3B%20filename%2A%3DUTF-8%27%27README.txt&response-content-type=text%2Fplain&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20250428T052632Z&X-Amz-SignedHeaders=host&X-Amz-Expires=172800&X-Amz-Credential=ZYFB5FYVYI021REQFP7K%2F20250428%2Fdataverse%2Fs3%2Faws4_request&X-Amz-Signature=51a9381fb352904d35a8db5089f8a01f9a4a8402e0fe1f2f5539666285774446'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/303

In [None]:
#| hide
#clean temporary hdf5 file
# shutil.rmtree(tmp_dir)

In [None]:
#| hide
# Run nbdev's export step for this project's notebooks
import nbdev; nbdev.nbdev_export()