In [1]:
from typing import List

from pathlib import Path

In [38]:
ZENODO_SANDBOX = True

ZENODO_NOMODELS_ID = "52280"
ZENODO_MODELS_ID = "52280"

In [3]:
ray_results_path = Path("~/ray_results").expanduser()
assert ray_results_path.is_dir()

# Upload results

## Select files to upload

In [4]:
def get_files_nomodel():
    total = 0
    all_paths = []
    for f in ray_results_path.rglob('*'):
        if not f.is_file() or f.name.startswith('model.data'):
            continue
        
        total += f.stat().st_size
        all_paths.append(f)

    return total, all_paths

total, all_paths = get_files_nomodel()
print("Total GB:", total / 2**30, "files:", len(all_paths))

Total GB: 24.8716683909297 files: 32229


In [5]:
all_paths[0].relative_to(Path('~').expanduser())

PosixPath('ray_results/LightGCN_optim=map@10,dao=DEAD FoundationsDAO,freq=2d,normalize=True,cutoff_date=2021-11-28T00:00:00,fold=0_2024-03-21T22:11:14.136280/.validate_storage_marker')

## Create tar

In [31]:
import tarfile
from tqdm.autonotebook import tqdm

In [39]:
def compress_files(tar_fname, compress, flist):
    if tar_fname.exists():
        try:
            with tarfile.open(tar_fname, f'r:*') as tar:
                if len(tar.getnames()) == len(flist):
                    print("Same length, not removing", len(tar.getnames()), len(flist))
                    return
                else:
                    tar_fname.unlink()
        except:
            tar_fname.unlink()
    
    original_size = 0
    original_total = sum(f.stat().st_size for f in flist)
    with tarfile.open(tar_fname, f'x:{compress}') as tar:
        with tqdm(total=len(flist)) as pbar:
            for i, f in enumerate(flist):
                original_size += f.stat().st_size
                tar.add(f, recursive=False, arcname=f.relative_to(Path('~').expanduser()))
                pbar.update()
    
                if i % 5 == 0 and i > 1:
                    final_size = tar_fname.stat().st_size
                    ratio = final_size / original_size
                    pbar.set_postfix({
                        'ratio': f'{ratio*100:.2f}%',
                        'original': f'{original_size / 2**30:.2f}G',
                        'actual': f'{final_size / 2**30:.2f}G',
                        'est.': f'{ratio*original_total / 2**30:.2f}G',
                    })

    print(f"Original size: {original_size/ 2**30}G, New size: {final_size / 2**30}G, ratio: {final_size / original_size*100:.2f}%")


compress_files(Path("ray_results_nomodels.tar.xz"), "xz", all_paths)

NameError: name 'all_paths' is not defined

## Upload to Zenodo

In [78]:
!md5sum "ray_results_nomodels.tar.xz"

3ba771d323dc8c5b9a285eeab8b3731c  ray_results_nomodels.tar.xz


In [80]:
import requests
from requests_toolbelt import MultipartEncoder, MultipartEncoderMonitor
from zenodo_client import Zenodo

class ZenodoPlus(Zenodo):
    def _upload_big_files(self, *, bucket: str, paths, done=[]) -> List[requests.Response]:
        _paths = [paths] if isinstance(paths, (str, Path)) else paths
        _paths = [ Path(p) for p in _paths]
        _done_dict = { f['filename'] : f for f in done }
    
        rv = []
        # see https://developers.zenodo.org/#quickstart-upload
        for path in _paths:
            total_size = path.stat().st_size
            
            if path.name in _done_dict:
                _info = _done_dict[path.name]
                print("Should skip?", total_size)
                print(_info)
                raise
            
            with tqdm(
                desc=str(path),
                total=total_size,
                unit='B',
                unit_scale=True,
                unit_divisor=1024,
            ) as pbar:
                with open(path, "rb") as file:
                    e = MultipartEncoder(fields={'file': (path.name, file, 'text/plain')})
                    m = MultipartEncoderMonitor(
                        e, lambda monitor: pbar.update(monitor.bytes_read - pbar.n)
                    )

                    assert e.len >= total_size, "Encoder should have bigger size than file"
                    res = requests.put(
                        f"{bucket}/{path.name}",
                        data=m,
                        params={"access_token": self.access_token},
                        # headers={'Content-Type': m.content_type},
                    )

            res.raise_for_status()
            rv.append(res)
        return rv
    
    def upload_to_record(self, deposition_id: str, paths):
        url = f"{self.depositions_base}/{deposition_id}"
        res = requests.get(url, params={"access_token": self.access_token})
        res.raise_for_status()

        deposition_data = res.json()

        bucket = deposition_data['links']['bucket']
        self._upload_big_files(bucket=bucket, paths=paths, done=deposition_data['files'])

Z = ZenodoPlus(sandbox=ZENODO_SANDBOX)
Z.upload_to_record(ZENODO_NOMODELS_ID, ["ray_results_nomodels.tar.xz"])
print(f"Remember to publish {Z.depositions_base}/{ZENODO_NOMODELS_ID}")

ray_results_nomodels.tar.xz:   0%|          | 0.00/2.09G [00:00<?, ?B/s]

Remember to publish https://sandbox.zenodo.org/api/deposit/depositions/52280


# Upload models

In [9]:
def get_files_model():
    total = 0
    all_paths = []
    for f in ray_results_path.rglob('*'):
        if not f.is_file() or not f.name.startswith('model.data'):
            continue
        
        total += f.stat().st_size
        all_paths.append(f)

    return total, all_paths

total, model_paths = get_files_model()
print("Total GB:", total / 2**30, "files:", len(all_paths))

Total GB: 62.57409965991974 files: 32229


In [10]:
compress_files(Path("ray_results_models.tar.xz"), "xz", model_paths)

Same length, not removing 4000 4000
