# Files

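> Utilities for working with files: extension checks, decompression, directories, streaming downloads, and temporary files.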

In [None]:
#| default_exp utils.files

In [None]:
#| hide
from nbdev.showdoc import *

In [2]:
#| export
import atexit
import gzip
import inspect
import os
import pwd
import shutil
import sys
import tarfile
import tempfile
from typing import Any, Iterable, List, Optional, Tuple, Union

import requests
from tqdm.auto import tqdm

In [3]:
#| export
from iza.static import (
    EXT_GZ, EXT_TAR, EXT_TAR_GZ,
)

### User Utils

In [4]:
#| export
def get_user() -> str:
    '''Return the login name registered for the current process's user id.'''
    # NOTE: field 0 of a passwd entry is exposed as `pw_name`
    passwd_entry = pwd.getpwuid(os.getuid())
    return passwd_entry.pw_name

def collapse_user(path: str) -> str:
    '''
    Replace everything up to and including the current user name with `~`.

    Parameters
    ----------
    path : str
        A path expected to contain the name returned by `get_user`, e.g.
        `/home/<user>/Downloads`.

    Returns
    -------
        collapsed : str
            `~` followed by whatever comes after the first occurrence of the
            user name, e.g. `~/Downloads`. When the user name does not occur
            in `path`, `path` is returned unchanged.
    '''
    # NOTE: `partition` splits on the first occurrence only and always yields
    # three parts, so a path containing the user name several times (or not
    # at all) no longer fails tuple unpacking.
    _, found, rest = path.partition(get_user())
    if not found:
        return path
    return '~' + rest

### Extensions

In [49]:
#| export
def check_ext(filename:str, extension:str) -> bool:
    '''
    Check whether `filename` ends with `extension`.

    Parameters
    ----------
    filename : str
        The file name or path to test.

    extension : str
        The suffix to look for, e.g. `.gz`.

    Returns
    -------
        has_ext : bool
            `True` when `extension` is the trailing part of `filename`. An
            empty `extension` trivially matches.
    '''
    # NOTE: a suffix match already implies containment, so the separate
    # `in` / `split` checks were redundant (and `split('')` raised on an
    # empty extension).
    return filename.endswith(extension)

def drop_ext(filename:str, extension:Optional[str]=None) -> str:
    '''
    Remove an extension from the last component of `filename`.

    Parameters
    ----------
    filename : str
        The file name or path to strip.

    extension : str, optional
        The trailing extension to remove, e.g. `.tar.gz`. When `None`,
        everything from the first `.` of the base name onwards is removed.

    Returns
    -------
        stripped : str
            `filename` with the extension removed from its base name; the
            directory part is kept as-is. If `extension` is given but the
            base name does not end with it, `filename` is returned with its
            base name untouched.
    '''
    dirname, basename = os.path.split(filename)
    if extension is None:
        basename = basename.split('.', 1)[0]
    elif extension and basename.endswith(extension):
        # NOTE: only the trailing occurrence is dropped, and only from the
        # base name, so directory components are never altered or duplicated.
        basename = basename[:-len(extension)]
    return os.path.join(dirname, basename)

In [50]:
#| export
def is_tar(filename:str) -> bool:
    '''Whether `filename` passes `check_ext` for the `EXT_TAR` extension.'''
    matches_tar = check_ext(filename, EXT_TAR)
    return matches_tar

def is_gz(filename:str) -> bool:
    '''Whether `filename` passes `check_ext` for the `EXT_GZ` extension.'''
    matches_gz = check_ext(filename, EXT_GZ)
    return matches_gz

def is_targz(filename:str) -> bool:
    '''Whether `filename` passes `check_ext` for the `EXT_TAR_GZ` extension.'''
    matches_targz = check_ext(filename, EXT_TAR_GZ)
    return matches_targz

def is_tarball(filename:str) -> bool:
    '''Whether `filename` is recognised by `is_tar` or by `is_targz`.'''
    # NOTE: `is_tar` is tried first; `is_targz` only runs if it fails
    if is_tar(filename):
        return True
    return is_targz(filename)


In [51]:
#| export
def filter_for_gz_files(files:List[str]) -> List[str]:
    '''Return the entries of `files`, in order, for which `is_gz` is true.'''
    return [candidate for candidate in files if is_gz(candidate)]

def get_gz_files_in_dir(dirname:str) -> List[str]:
    '''
    Recursively collect the gzipped files found under `dirname`.

    Returns
    -------
        gz_files : List[str]
            Paths (each joined onto the directory `os.walk` reported it in)
            that are kept by `filter_for_gz_files`.
    '''
    # NOTE: flatten every (directory, file) pair from the walk into one list
    candidates = [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(dirname)
        for name in names
    ]
    return filter_for_gz_files(candidates)

### Decompression

In [52]:
#| export
def decompress_tarball(filename:str) -> Tuple[str, Optional[EOFError]]:
    '''
    Extract the tarball `filename` into the directory that contains it.

    Parameters
    ----------
    filename : str
        Path to the archive.

    Returns
    -------
        dirname : str
            The name of the archive e.g. `~/Downloads/fluentbio.tar.gz` would
            yield `~/Downloads/fluentbio`

        error : EOFError, optional
            The `EOFError` raised during extraction, or `None` if there was
            none.

    Notes
    -----
    FluentBio has a weird gzip so it complains when it is 
        actually fine
    '''
    error = None
    decompress_dir = os.path.dirname(filename)
    dirname = drop_ext(filename, EXT_TAR_GZ)
    try:
        # NOTE: the `with` block closes the archive, no explicit close needed
        with tarfile.open(filename) as tarball:
            # NOTE(review): members are extracted without any filtering; 
            # only use on archives from a trusted source.
            tarball.extractall(decompress_dir)

    except EOFError as exc:
        # NOTE: the name bound by `except ... as` is deleted when the clause
        # ends, so the exception is copied into `error` to survive until the
        # return below.
        error = exc

    return dirname, error


def decompress_gunzip(filename:str, remove:bool=False) -> Tuple[str, Optional[EOFError]]:
    '''
    Gunzip `filename` next to itself.

    Parameters
    ----------
    filename : str
        Path to the gzipped file.

    remove : bool, default=False
        Whether to delete `filename` once the decompressed file exists.

    Returns
    -------
        file : str
            The name of the decompressed file e.g. `~/Downloads/fluentbio.tsv.gz` would
            yield `~/Downloads/fluentbio.tsv`

        error : EOFError, optional
            The `EOFError` raised while decompressing, or `None` if there was
            none.

    Notes
    -----
    FluentBio has a weird gzip so it complains when it is 
        actually fine
    '''
    error = None
    decompressed_file = drop_ext(filename, EXT_GZ)
    try:
        with gzip.open(filename, 'rb') as gunzipped:
            with open(decompressed_file, 'wb') as unzipped:
                shutil.copyfileobj(gunzipped, unzipped)

    except EOFError as exc:
        # NOTE: the name bound by `except ... as` is deleted when the clause
        # ends, so the exception is copied into `error` to survive until the
        # return below.
        error = exc

    # NOTE: the source is removed even after an EOFError, as long as the
    # output file was created (see Notes above).
    if os.path.isfile(decompressed_file) and remove:
        os.remove(filename)

    return decompressed_file, error

def undo_gz(filename: str) -> str:
    '''
    Decompress `filename` if it is a tarball or a gzipped file.

    Parameters
    ----------
    filename : str
        Path to the possibly compressed file.

    Returns
    -------
        filename : str
            The path reported by the decompression helper that was used, or
            `filename` unchanged when it is neither a tarball nor gzipped.

    Notes
    -----
    Tarballs are checked first. `is_gz` is a suffix test, so (assuming
    `EXT_TAR_GZ` ends with `EXT_GZ` -- TODO confirm against `iza.static`) a
    `.tar.gz` would otherwise be gunzipped with `remove=True` and never
    extracted.
    '''
    if is_tarball(filename):
        filename, _ = decompress_tarball(filename)
    elif is_gz(filename):
        filename, _ = decompress_gunzip(filename, remove=True)
    return filename

### Directories

In [53]:
#| export
def make_missing_dirs(dirs:Union[str, List[str]]) -> None:
    '''
    Create every directory in `dirs` that does not exist yet.

    Parameters
    ----------
    dirs : str or List[str]
        A single directory path or a list of them. Intermediate directories
        are created as needed.
    '''
    if isinstance(dirs, str):
        dirs = [dirs]

    for d in dirs:
        # NOTE: paths that already exist (as a directory or anything else) 
        # are skipped; `exist_ok` covers the case where the directory appears
        # between the check and the call.
        if not os.path.exists(d):
            os.makedirs(d, exist_ok=True)
            
def dir_dirs(dirname:str) -> List[str]:
    '''
    List the immediate subdirectories of `dirname`.

    Returns
    -------
        subdirs : List[str]
            Entry names (not full paths), in `os.listdir` order, that are
            directories.
    '''
    return [
        entry for entry in os.listdir(dirname)
        if os.path.isdir(os.path.join(dirname, entry))
    ]

In [54]:
#| export
def decompress_directory_of_gunzipped_files(
    dirname:str, desc:Optional[str]=None, remove:Optional[bool]=False
) -> None:
    '''
    Gunzip every gzipped file found (recursively) under `dirname`.

    Parameters
    ----------
    dirname : str
        The directory to search.

    desc : str, optional
        The description of the `tqdm` progress bar. Defaults to the last
        component of `dirname`.

    remove : bool, optional
        Passed on to `decompress_gunzip` for each file.
    '''
    if desc is None:
        # NOTE: normalising first keeps the label non-empty when `dirname`
        # ends with a path separator.
        desc = os.path.basename(os.path.normpath(dirname))

    for filename in tqdm(get_gz_files_in_dir(dirname), desc=desc):
        # NOTE: the returned (name, error) pair is intentionally not used
        decompress_gunzip(filename, remove)


def decompress_tarball_of_gunzipped_files(
    filename:str, desc:Optional[str]=None, remove:Optional[bool]=False
) -> None:
    '''
    Unpack the tarball `filename`, then gunzip the files inside the result.

    Parameters
    ----------
    filename : str
        Path to the tarball.

    desc : str, optional
        The progress bar description. Defaults to the last `/`-separated
        piece of the directory name returned by `decompress_tarball`.

    remove : bool, optional
        Passed on to `decompress_directory_of_gunzipped_files`.
    '''
    # NOTE: first pass unpacks the .tar.gz itself
    extracted_dir, _ = decompress_tarball(filename)

    label = extracted_dir.rsplit('/', 1)[-1] if desc is None else desc

    # NOTE: second pass gunzips every .gz file found inside
    decompress_directory_of_gunzipped_files(extracted_dir, label, remove)

### Streaming

In [11]:
#| export
def stream_file(uri:str, filename:Optional[str]=None, desc:Optional[str]=None) -> None:
    '''
    Download `uri` to `filename` in chunks, showing a progress bar.

    Parameters
    ----------
    uri : str
        The URI to download

    filename : str, optional
        The fullpath name of the file to download. Defaults to 
        `~/Downloads/os.path.basename(uri)`.

    desc : str, optional
        The description of the `tqdm` progress bar. Defaults to 
        `os.path.basename(filename)`.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status; nothing is written to
        `filename` in that case.
    '''
    if filename is None:
        download_dir = os.path.expanduser('~/Downloads')
        filename = os.path.join(download_dir, os.path.basename(uri))

    if desc is None:
        desc = os.path.basename(filename)

    # NOTE: both the response and the output file are context managed so the
    # connection is released and the data is flushed before returning.
    with requests.get(uri, stream=True) as response:
        # NOTE: fail before opening the file rather than saving an error page
        response.raise_for_status()
        # NOTE: a missing content-length header falls back to 0
        total = int(response.headers.get('content-length', 0))

        with open(filename, 'wb') as raw, tqdm.wrapattr(
            raw, 'write',
            miniters=1, desc=desc, total=total
        ) as fout:
            for chunk in response.iter_content(chunk_size=4096):
                fout.write(chunk)

### Downloads

In [12]:
#| export
def download_and_decompress_tarball_of_gunzipped_files(
    uri:str, download_dir:Optional[str]=None, desc:Optional[str]=None, remove:Optional[bool]=False
):
    '''
    Download a tarball of gzipped files and fully decompress it.

    Parameters
    ----------
    uri : str
        The URI of the tarball.

    download_dir : str, optional
        The directory to download into. Defaults to `~/Downloads`.

    desc : str, optional
        The progress bar description used for both steps. Defaults to
        `Downloading <file>` and `Decompressing <file>` respectively.

    remove : bool, optional
        Passed on to `decompress_tarball_of_gunzipped_files`.
    '''
    if download_dir is None:
        download_dir = os.path.expanduser('~/Downloads')

    filename = os.path.basename(uri)
    fullpath = os.path.join(download_dir, filename)

    # NOTE: Amazon --> filtered_matrix.tar.gz
    # The target path is passed explicitly so the download lands where the
    # decompression step below looks for it.
    download_desc = f'Downloading {filename}' if desc is None else desc
    stream_file(uri, fullpath, download_desc)

    # NOTE: filtered_matrix.tar.gz --> filtered_matrix/**/file.tsv
    decompress_desc = f'Decompressing {filename}' if desc is None else desc
    decompress_tarball_of_gunzipped_files(fullpath, decompress_desc, remove)


### Temporary Files

In [16]:
#| export
def make_temp_file(**kwargs: Any) -> tempfile.NamedTemporaryFile:
    '''
    Create a named temporary file that is closed at interpreter exit.

    Parameters
    ----------
    **kwargs
        Forwarded unchanged to `tempfile.NamedTemporaryFile`.

    Returns
    -------
        temp : file-like
            The open object returned by `tempfile.NamedTemporaryFile`.
    '''
    temp = tempfile.NamedTemporaryFile(**kwargs)

    # NOTE: registered so the file is closed even if the caller never does;
    # closing an already closed temp file is harmless.
    atexit.register(temp.close)
    return temp

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()