In [112]:
import os
import zipfile
import numpy as np
from pprint import pprint as pp # For pretty printing nested dictioanry


In [127]:
def infolist_to_dict(infolist):

    """return a dict with some key attributes from the ZipInfo object"""
     
    info_dict = {
        'filename':[],
        'file_size':[],
        'file_type':[],
        }

    for info in infolist:
        # is it faster to store fname as a temp var or access the attribute twice?
        fname = info.filename
        ftype = fname.split('.')[-1].lower()
        info_dict['filename'].append(fname)
        info_dict['file_size'].append(info.file_size)
        info_dict['file_type'].append(ftype)

    for k,v in info_dict.items():
        info_dict[k] = np.array(v)
    
    return info_dict

In [114]:
def number_of_files_summary(info_dict):
    
    ftypes = np.unique(info_dict['file_type'])
    print(f"number of files: {len(info_dict['file_type'])}")

    for ftype in ftypes:
        count = np.sum(info_dict['file_type'] == ftype)
        print("    {}: {:,}".format(ftype, count))  
    

In [115]:
def find_max_or_min_file_size(info_dict, find='max'):

    if find=='max':
        size = info_dict['file_size'].max()

    if find=='min':
        size = info_dict['file_size'].min()
    
    idx = np.where(info_dict['file_size'] == size)[0] # take first result
    fname = info_dict['filename'][idx][0]

    return fname, size


def format_bytes(size):
    # from https://stackoverflow.com/questions/12523586/python-format-size-application-converting-b-to-kb-mb-gb-tb
    # 2**10 = 1024
    power = 2**10
    n = 0
    power_labels = {0 : '', 1: 'kilo', 2: 'mega', 3: 'giga', 4: 'tera'}
    while size > power:
        size /= power
        n += 1

    label = power_labels[n]+'bytes'

    return "{:.3f} {}".format(size,label)


In [116]:
# Import zipfile module
# Read in zipfile

fpath = "../data/raw/graphische_sammlung_sample.zip"
f = zipfile.ZipFile(fpath, "r")

# list of file names of files in zipfile
flist = f.namelist()

# list with info about files in zipfile
infolist = f.infolist()

# atrributes of a specific file
# info_obj.filename
# info_obj.is_dir

In [13]:
info_dict = infolist_to_dict(infolist)
number_of_files_summary(info_dict)

for find in ['max','min']:
    fname, size = find_max_or_min_file_size(info_dict, find=find)
    print('{} file size:\n    {}  {}'.format(find, fname, format_bytes(size)))

number of files: 359
    csv: 1
    png: 358
max file size:
    2/2803.png  329.126 kilobytes
min file size:
    1/1288.png  62.265 kilobytes


In [125]:

def make_nested_filepath_tree(paths):
    
    """https://stackoverflow.com/questions/66994061/convert-list-of-file-paths-to-a-nested-dictionary-similar-to-a-json-file"""
        
    # Sort so deepest paths are first
    paths = sorted(paths, key = lambda s: len(s.lstrip('/').split('/')), reverse = True)

    tree_path = {}
    for path in paths:
        # Split into list and remove leading '/' if present
        levels = path.lstrip('/').split("/")
        
        file = levels.pop()
        acc = tree_path
        for i, p in enumerate(levels, start = 1):
            if i == len(levels):
                # Reached termination of a path
                # Use current terminal object is present, else use list
                acc[p] = acc[p] if p in acc else []
                if isinstance(acc[p], list):
                    # Only append if we are at a list
                    acc[p].append(file)
            else:
                # Exaand with dictionary by default
                acc.setdefault(p, {})
            acc = acc[p]

    return tree_path


def reformat_nested_dict_for_treeviewer(in_dict):
    
    """recursive function to convert nested dictionary to format needed for vuetify treeviewer copmonent"""
    tree = []
    sorted_keys = np.sort(list(in_dict.keys()))
    for i, key in enumerate(sorted_keys):
        
        value = in_dict[key]
        if isinstance(value, dict):
            # call function recursively if the children are a dictionary
            children = format_for_treeviewer(value)
        else:
            
            children = value
        
        obj = {
            'id':i,
            'name':key,
            'children': children
            }
        
        tree.append(obj)
    
    return tree


def format_filelist_for_treeview(fpath_list):
    tree = make_nested_filepath_tree(fpath_list)
    treeviewer = reformat_nested_dict_for_treeviewer(filenames_dict)
    
    return treeviewer

In [126]:
format_filelist_for_treeview(info_dict['filename'])

[{'id': 0,
  'name': '0',
  'children': ['128.png',
   '133.png',
   '140.png',
   '149.png',
   '152.png',
   '164.png',
   '172.png',
   '179.png',
   '18.png',
   '189.png',
   '19.png',
   '218.png',
   '219.png',
   '221.png',
   '224.png',
   '236.png',
   '240.png',
   '264.png',
   '277.png',
   '285.png',
   '287.png',
   '3.png',
   '300.png',
   '312.png',
   '313.png',
   '324.png',
   '327.png',
   '33.png',
   '336.png',
   '346.png',
   '351.png',
   '356.png',
   '388.png',
   '397.png',
   '398.png',
   '419.png',
   '420.png',
   '421.png',
   '430.png',
   '450.png',
   '451.png',
   '458.png',
   '460.png',
   '461.png',
   '465.png',
   '470.png',
   '476.png',
   '492.png',
   '511.png',
   '512.png',
   '513.png',
   '52.png',
   '520.png',
   '531.png',
   '549.png',
   '556.png',
   '567.png',
   '57.png',
   '576.png',
   '592.png',
   '599.png',
   '601.png',
   '602.png',
   '605.png',
   '61.png',
   '610.png',
   '616.png',
   '617.png',
   '620.png',
   '

In [None]:
items: [a
        {
          id: 1,
          name: 'Applications :',
          children: [
            { id: 2, name: 'Calendar : app' },
            { id: 3, name: 'Chrome : app' },
            { id: 4, name: 'Webstorm : app' },
          ],
        },
        {
          id: 5,

In [50]:

res = make_path(info_dict['filename'])
res = make_path_for_treeviewer(nested_filelist)
print(res)  # Pretty print result

{'b': {'0': ['140.png', '164.png', '18.png', '221.png', '264.png', '3.png', '351.png', '356.png', '450.png', '458.png', '476.png', '549.png', '567.png', '592.png', '655.png', '688.png', '725.png', '786.png', '791.png', '799.png', '805.png', '811.png', '82.png', '856.png', '945.png', '95.png'], '1': ['1031.png', '1048.png', '1060.png', '1067.png', '1103.png', '1128.png', '1143.png', '1144.png', '1177.png', '1237.png', '1279.png', '1311.png', '1319.png', '1375.png', '1376.png', '1392.png', '1407.png', '1475.png', '1510.png', '1513.png', '1552.png', '1595.png', '1597.png', '1656.png', '1811.png', '1815.png', '1905.png', '1913.png', '1940.png', '1971.png'], '2': ['2029.png', '2058.png', '2140.png', '2144.png', '2230.png', '2297.png', '2351.png', '2359.png', '2388.png', '2396.png', '2486.png', '2505.png', '2562.png', '2570.png', '2598.png', '2651.png', '2653.png', '2664.png', '2804.png', '2831.png', '2833.png', '2849.png', '2887.png', '2994.png']}, 'c': {'0': ['149.png', '152.png', '219.png

In [29]:
nested_filelist = [np.random.choice(['','a/','b/','c/'])+f for f in info_dict['filename'].tolist()]

In [18]:
df_info = pd.DataFrame(info_dict)
df_info['filename']

NameError: name 'pd' is not defined

In [41]:
info_obj = infolist[0]
info_obj.file_size

256942

In [18]:
f.getinfo('1/1528.png')

232795

In [7]:
np.unique(flist[:5])

array(['0/128.png', '0/133.png', '0/140.png', '0/149.png', '0/152.png'],
      dtype='<U9')