In [40]:
import os
import zipfile
import numpy as np
import pandas as pd


In [93]:
def infolist_to_dict(infolist):

    """return a dict with some key attributes from the ZipInfo object"""
     
    info_dict = {
        'filename':[],
        'file_size':[],
        'file_type':[],
        }

    for info in infolist:
        # is it faster to store fname as a temp var or access the attribute twice?
        fname = info.filename
        ftype = fname.split('.')[-1].lower()
        info_dict['filename'].append(fname)
        info_dict['file_size'].append(info.file_size)
        info_dict['file_type'].append(ftype)

    for k,v in info_dict.items():
        info_dict[k] = np.array(v)
    
    return info_dict

In [94]:
def number_of_files_summary(info_dict):
    
    ftypes = np.unique(info_dict['file_type'])
    print(f"number of files: {len(info_dict['file_type'])}")

    for ftype in ftypes:
        count = np.sum(info_dict['file_type'] == ftype)
        print("    {}: {:,}".format(ftype, count))  
    

In [150]:
def find_max_or_min_file_size(info_dict, find='max'):

    if find=='max':
        size = info_dict['file_size'].max()

    if find=='min':
        size = info_dict['file_size'].min()
    
    idx = np.where(info_dict['file_size'] == size)[0] # take first result
    fname = info_dict['filename'][idx][0]

    return fname, size


def format_bytes(size):
    # from https://stackoverflow.com/questions/12523586/python-format-size-application-converting-b-to-kb-mb-gb-tb
    # 2**10 = 1024
    power = 2**10
    n = 0
    power_labels = {0 : '', 1: 'kilo', 2: 'mega', 3: 'giga', 4: 'tera'}
    while size > power:
        size /= power
        n += 1

    label = power_labels[n]+'bytes'

    return "{:.3f} {}".format(size,label)


In [152]:
format_bytes(size)

'329.126 kilobytes'

   largest file: 2/2803.png (329.126 kilobytes)


In [95]:
# Import zipfile module
# Read in zipfile

fpath = "../data/raw/graphische_sammlung_sample.zip"
f = zipfile.ZipFile(fpath, "r")

# list of file names of files in zipfile
flist = f.namelist()

# list with info about files in zipfile
infolist = f.infolist()

# atrributes of a specific file
info_obj.filename
info_obj.is_dir

<bound method ZipInfo.is_dir of <ZipInfo filename='0/128.png' compress_type=deflate external_attr=0x20 file_size=256942 compress_size=255445>>

In [160]:
info_dict = infolist_to_dict(infolist)
number_of_files_summary(info_dict)

for find in ['max','min']:
    fname, size = find_max_or_min_file_size(info_dict, find=find)
    print('{} file size:\n    {}  {}'.format(find, fname, format_bytes(size)))

number of files: 359
    csv: 1
    png: 358
max file size:
    2/2803.png  329.126 kilobytes
min file size:
    1/1288.png  62.265 kilobytes


In [92]:
df_info = pd.DataFrame(info_dict)
df_info

Unnamed: 0,filename,file_size,file_type
0,0/128.png,256942,png
1,0/133.png,286240,png
2,0/140.png,218997,png
3,0/149.png,214385,png
4,0/152.png,283019,png
...,...,...,...
354,2/2979.png,109186,png
355,2/2980.png,193005,png
356,2/2982.png,248440,png
357,2/2994.png,251041,png


In [41]:
info_obj = infolist[0]
info_obj.file_size

256942

In [18]:
f.getinfo('1/1528.png')

232795

In [7]:
np.unique(flist[:5])

array(['0/128.png', '0/133.png', '0/140.png', '0/149.png', '0/152.png'],
      dtype='<U9')