# Examin the size of an environment, package by package

**NOTE**: this is created mostly from:

> [https://uwekorn.com/2020/09/08/trimming-down-pyarrow-conda-1-of-x.html](https://uwekorn.com/2020/09/08/trimming-down-pyarrow-conda-1-of-x.html)

## Import and utilities

In [19]:
from pathlib import Path
from glob import glob

import json
import pandas as pd
import matplotlib.pyplot as plt

def get_clean_suffix(name):
    """Get the filename suffix without numeric elements"""
    suffixes = [x for x in name.split(".")[1:] if not x.isnumeric()]
    return (suffixes or [""])[-1]

def gather_files(environment, verbose=False):
    """Gather the list of file in an environment"""
    files = []
    for meta in glob(f'{environment}/conda-meta/*.json'):
            with open(meta, "r") as f:
                info = json.load(f)
                for file in info["files"]:
                    try:
                        path = Path(f"{environment}/{file}")
                        if not path.is_symlink():
                            files.append({
                                "package": info["name"],
                                "name": file,
                                "size": path.stat().st_size,
                                "suffix": get_clean_suffix(path.name)
                            })
                    except:
                        if verbose:
                            print(f"Package: {meta} | File: {file}")
                        else:
                            pass
                    
    return pd.DataFrame(files)

## Collect sizes

In [None]:
%time fs = gather_files(Path("/opt/conda"))
fs["size_mb"] = fs["size"] / 1024 / 1024

In [2]:
fs.head()

Unnamed: 0,package,name,size,suffix,size_mb
0,zlib,include/zconf.h,16262,h,0.015509
1,zlib,include/zlib.h,96239,h,0.091781
2,zlib,lib/libz.so.1.2.11,109272,so,0.10421
3,zlib,lib/pkgconfig/zlib.pc,259,pc,0.000247
4,conda,bin/activate,184,,0.000175


## Explore sizes

In [4]:
fs["size_mb"].sum() / 1024

2.493656547740102

In [16]:
from qgrid import show_grid
pkgs = fs.groupby("package")\
         ["size_mb"]\
         .sum()\
         .sort_values(ascending=False)\
         .reset_index()
show_grid(pkgs)

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

In [17]:
pkgs[pkgs["package"].str.contains("geos")]

Unnamed: 0,package,size_mb
69,geos,4.387086
273,pygeos,0.278291
