# Archive

> Utilities for downloading entries from a remote archive and optionally extracting them.

In [None]:
#| default_exp utils.archive

In [None]:
#| hide
from nbdev.showdoc import *

In [1]:
#| export
import os, pathlib, itertools
from pathlib import Path
from dataclasses import dataclass, field, KW_ONLY
from typing import Optional, List, ClassVar, Any, TypeAlias, Union

from ipos.imp import is_mod, is_var_imp, Module

In [88]:
#| export
from iza.types import (
    PathLike, PathType,
    RichConsole, RichProgress, RichText, RichTree
)
from iza.static import EXT_PY
from iza.imp import RichImp

### Directory Viewer

#### Archive Downloader

- `Directory` defined in `_02_utils/_03_directory.ipynb`
- `ConsoleType` defined in `_02_utils/_03_directory.ipynb`
- `get_console` imported in `_02_utils/_03_directory.ipynb`
- `is_rich_available` defined in `_02_utils/_08_archive.ipynb`
- `urljoin` defined in `_02_utils/_01_files.ipynb`
- `parse_url` imported in `_02_utils/_01_files.ipynb`

In [None]:
#| export
@dataclass
class ArchiveDownloader:
    """Download entries of a remote archive into a local directory.

    All fields are keyword-only.

    Attributes:
        rootdir: Base URL that archive paths are joined onto.
        archive: Path of the archive relative to `rootdir`. Not used for URL
            building when `compound_archive` is set and `archives` is given.
        entries: One entry name or a list of them; a single string is wrapped
            in a list during initialisation.
        savedir: Local destination directory. `~` is expanded and the value
            is stored as a `Path`.
        extract: When true, `execute` decompresses after downloading.
        cleanup: Forwarded as `remove=` to the decompressor.
        compound_archive: When true and `archives` is given, every archive in
            `archives` is combined with every entry.
        archives: Archive paths used in compound mode.
        console: Console used for output; obtained from `get_console()` when
            not supplied.
        progress: Progress display; created on demand when rich is available.
    """

    _: KW_ONLY
    rootdir: str
    archive: str
    entries: Union[str, list[str]]
    savedir: Union[str, Path]
    extract: bool = False
    cleanup: bool = False
    compound_archive: bool = False
    archives: Optional[list[str]] = None
    console: Optional[RichConsole] = None
    progress: Optional[RichProgress] = None

    # Both are populated in __post_init__ from the RichImp probe.
    _rich: Module = field(init=False, repr=False, default=None)
    has_rich: bool = field(init=False, repr=False, default=False)

    def __post_init__(self):
        """Probe for rich, normalise `entries`, and prepare `savedir`."""
        r = RichImp()
        self._rich = r._module
        self.has_rich = is_mod(self._rich)

        if not isinstance(self.entries, list):
            self.entries = [self.entries]

        # Keep a caller-supplied console instead of overwriting it.
        if self.console is None:
            self.console = get_console()
        self.progress = self.get_progress()

        self.savedir = Path(self.savedir).expanduser()
        make_missing_dirs(self.savedir)

    def get_progress(self):
        """Return the existing progress object, creating one if possible.

        Returns:
            The current `progress` if set; otherwise a new `Progress` bound
            to `console` when rich and `Progress` are available; else `None`.
        """
        progress = getattr(self, 'progress', None)
        if progress is not None:
            return progress

        if self.has_rich and is_var_imp('Progress'):
            self.progress = Progress(console=self.console)
            return self.progress

        return None

    @property
    def path(self) -> str:
        """URL of the archive: `rootdir` joined with `archive`."""
        return urljoin(self.rootdir, self.archive)

    @property
    def urls(self) -> list[str]:
        """URLs of all entries to download.

        In compound mode (flag set and `archives` given) this is the product
        of `archives` and `entries` under `rootdir`; otherwise each entry is
        joined onto `path`.
        """
        if self.compound_archive and self.archives is not None:
            return [
                urljoin(self.rootdir, archive, entry)
                for archive, entry in itertools.product(self.archives, self.entries)
            ]
        return [urljoin(self.path, entry) for entry in self.entries]

    def _download_if_missing(self, url: str) -> None:
        """Stream `url` into `savedir` unless the target file already exists."""
        filename = Path(parse_url(url).path).name
        fullpath = self.savedir / filename
        if not fullpath.exists():
            stream_file(url, str(fullpath))

    def download_missing_files(self) -> None:
        """Download every URL whose file is not yet present in `savedir`.

        Reports through the rich progress display when available, otherwise
        through `tqdm`.
        """
        # `urls` is a computed property; evaluate it once.
        urls = self.urls
        if self.has_rich and self.progress:
            with self.progress:
                task = self.progress.add_task("[cyan]Downloading...", total=len(urls))
                for url in urls:
                    self._download_if_missing(url)
                    # Advance whether the file was fetched or already present.
                    self.progress.advance(task)
        else:
            for url in tqdm(urls, desc='Downloading'):
                self._download_if_missing(url)

    def calc_n_to_extract(self) -> int:
        """Count files in `savedir` that are tarballs or gz files."""
        files = get_gz_files_in_dir(self.savedir)
        return sum(1 for file in files if is_tarball(file) or is_gz(file))

    def extract_files(self) -> None:
        """Decompress the downloaded entries with a `RecursiveDecompressor`."""
        recurser = RecursiveDecompressor(
            dirname=self.savedir,
            entries=self.entries,
            strategy=undo_gz,
            remove=self.cleanup,
            progress=self.progress
        )
        recurser.decompress()

    def execute(self) -> None:
        """Download missing files, optionally extract, then print `savedir`."""
        use_rich = self.has_rich and self.console

        if use_rich:
            self.console.print(f"Processing archive: [bold cyan]{self.archive}[/bold cyan]")
        else:
            print(f"Processing archive: {self.archive}")

        self.download_missing_files()
        if self.extract:
            self.extract_files()

        if use_rich:
            listing = RichDirectory(self.savedir, console=self.console)
            listing.print_rich()
        else:
            listing = Directory(self.savedir)
            listing.print()

##### Example

In [None]:
#| eval: False
from iza.static import AMAZON_BUCKET_FLUENTBIO

# Fetch two entries of the PBMC archive into ~/Downloads/fluent_bio,
# extracting after download and passing cleanup=True to the extractor.
downloader = ArchiveDownloader(
    rootdir=AMAZON_BUCKET_FLUENTBIO,
    archive='public-datasets/pbmc/',
    entries=['combined.html', 'filtered_matrix.tar.gz'],
    savedir='~/Downloads/fluent_bio',
    extract=True,
    cleanup=True,
)
downloader.execute()

#### Typer

In [None]:
#| eval: False
#| export
# Optional command-line interface; only defined when typer is installed.
try:
    import typer
except ImportError:
    pass
else:
    app = typer.Typer()

    @app.command()
    def download(rootdir: str, archive: str, entries: List[str], savedir: str, extract: bool = False, cleanup: bool = False):
        """Download `entries` of `archive` under `rootdir` into `savedir`."""
        # ArchiveDownloader declares all fields after KW_ONLY, so they must
        # be passed by keyword; positional arguments raise TypeError.
        downloader = ArchiveDownloader(
            rootdir=rootdir,
            archive=archive,
            entries=entries,
            savedir=savedir,
            extract=extract,
            cleanup=cleanup,
        )
        downloader.execute()

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()