# CytoTable object graph analysis

This notebook explores how CytoTable objects operate. The work is related to [CytoTable#75](https://github.com/cytomining/CytoTable/issues/75).

In [1]:
import gc
import sys

import cytotable
import pandas as pd

gc.set_debug(gc.DEBUG_LEAK)

cytotable.convert(
    source_path="./examples/data/all_cellprofiler.sqlite",
    dest_path="./examples/data/test-result.parquet",
    dest_datatype="parquet",
    preset="cellprofiler_sqlite_pycytominer",
)

collected = gc.collect()
if gc.garbage:
    print(f"Memory leak detected: {len(gc.garbage)} objects")
    df = pd.DataFrame(
        [
            {
                "id": id(obj),
                "type": type(obj),
                "refcount": sys.getrefcount(obj),
                "repr": repr(obj),
                "size": sys.getsizeof(obj),
            }
            for obj in gc.garbage
        ]
    )

df.head()

Memory leak detected: 1119 objects


gc: collectable <list 0x13a681300>
gc: collectable <list 0x13c003580>
gc: collectable <dict 0x13beea980>
gc: collectable <AppFuture 0x124f01d00>
gc: collectable <Condition 0x124f01040>
gc: collectable <builtin_function_or_method 0x13c12e1d0>
gc: collectable <dict 0x13bf21500>
gc: collectable <builtin_function_or_method 0x13c12e220>
gc: collectable <builtin_function_or_method 0x13c12e270>
gc: collectable <builtin_function_or_method 0x13c12e2c0>
gc: collectable <builtin_function_or_method 0x13c12e310>
gc: collectable <collections.deque 0x11a1397c0>
gc: collectable <dict 0x13c003c80>
gc: collectable <list 0x13c003040>
gc: collectable <list 0x13bf21800>
gc: collectable <list 0x13bf21740>
gc: collectable <list 0x118916880>
gc: collectable <method 0x13c123680>
gc: collectable <functools.partial 0x13c12e3b0>
gc: collectable <tuple 0x1104014c0>
gc: collectable <list 0x13c039f40>
gc: collectable <Future 0x13bebcf70>
gc: collectable <Condition 0x13bebcf40>
gc: collectable <builtin_function_or_me

Unnamed: 0,id,type,refcount,repr,size
0,5274866432,<class 'list'>,10,[],56
1,5301613952,<class 'list'>,4,[],56
2,5300464000,<class 'dict'>,7,"{'depends': [], 'executor': '_parsl_internal',...",1176
3,4914683136,<class 'parsl.dataflow.futures.AppFuture'>,4,<AppFuture at 0x124f01d00 state=finished retur...,48
4,4914679872,<class 'threading.Condition'>,4,<Condition(<unlocked _thread.RLock object owne...,48


In [2]:
# create a list of files to reference
list_of_sqlite_files = [
    "./examples/data/all_cellprofiler.sqlite",
]

In [3]:
cytotable.convert(
    source_path="./examples/data/all_cellprofiler.sqlite",
    dest_path="./examples/data/test-result.parquet",
    dest_datatype="parquet",
    preset="cellprofiler_sqlite_pycytominer",
)

Reusing previously loaded Parsl configuration.


PosixPath('/Users/dabu5788/Documents/work/CytoTable-benchmarks-d33bs/notebooks/examples/data/test-result.parquet')

In [4]:
collected = gc.collect()
if gc.garbage:
    print(f"Memory leak detected: {len(gc.garbage)} objects")
    df = pd.DataFrame(
        [
            {
                "id": id(obj),
                "type": type(obj),
                "refcount": sys.getrefcount(obj),
                "repr": repr(obj),
                "size": sys.getsizeof(obj),
            }
            for obj in gc.garbage
        ]
    )

df.head()

Memory leak detected: 2229 objects


gc: collectable <cell 0x1104019a0>
gc: collectable <tuple 0x13ca101f0>
gc: collectable <function 0x1104168b0>
gc: collectable <cell 0x1103f4820>
gc: collectable <tuple 0x13b689fa0>
gc: collectable <function 0x13ca11f70>
gc: collectable <list 0x13be7e480>
gc: collectable <list 0x13c9e5b40>
gc: collectable <dict 0x13bf74500>
gc: collectable <AppFuture 0x1103f41c0>
gc: collectable <Condition 0x1103f43d0>
gc: collectable <builtin_function_or_method 0x13c5f6090>
gc: collectable <dict 0x13bf4df80>
gc: collectable <builtin_function_or_method 0x13c5f61d0>
gc: collectable <builtin_function_or_method 0x110456450>
gc: collectable <builtin_function_or_method 0x110456860>
gc: collectable <builtin_function_or_method 0x110332e00>
gc: collectable <collections.deque 0x13c5d8a60>
gc: collectable <dict 0x13bf546c0>
gc: collectable <list 0x13c564140>
gc: collectable <list 0x13b6cd500>
gc: collectable <list 0x13b6aebc0>
gc: collectable <list 0x13abb0200>
gc: collectable <method 0x13c5f8700>
gc: collectable

Unnamed: 0,id,type,refcount,repr,size
0,5274866432,<class 'list'>,9,[],56
1,5301613952,<class 'list'>,4,[],56
2,5300464000,<class 'dict'>,7,"{'depends': [], 'executor': '_parsl_internal',...",1176
3,4914683136,<class 'parsl.dataflow.futures.AppFuture'>,4,<AppFuture at 0x124f01d00 state=finished retur...,48
4,4914679872,<class 'threading.Condition'>,4,<Condition(<unlocked _thread.RLock object owne...,48


In [5]:
df.sort_values(by=["size", "refcount"], ascending=False).drop_duplicates(
    subset="id"
).head(30)

Unnamed: 0,id,type,refcount,repr,size
2,5300464000,<class 'dict'>,7,"{'depends': [], 'executor': '_parsl_internal',...",1176
40,5302850560,<class 'dict'>,7,"{'depends': [], 'executor': '_parsl_internal',...",1176
122,5302878464,<class 'dict'>,7,{'depends': [<AppFuture at 0x13beeb4f0 state=f...,1176
172,5302881664,<class 'dict'>,7,{'depends': [<AppFuture at 0x13beeb970 state=f...,1176
473,5302926976,<class 'dict'>,7,{'depends': [<AppFuture at 0x13abb8190 state=f...,1176
572,5302926016,<class 'dict'>,7,{'depends': [<AppFuture at 0x13a162970 state=f...,1176
653,5307189120,<class 'dict'>,7,{'depends': [<AppFuture at 0x13b625c70 state=f...,1176
734,5307204096,<class 'dict'>,7,{'depends': [<AppFuture at 0x13b6f2100 state=f...,1176
1127,5301028096,<class 'dict'>,7,"{'depends': [], 'executor': '_parsl_internal',...",1176
1165,5292681664,<class 'dict'>,7,"{'depends': [], 'executor': '_parsl_internal',...",1176


In [6]:
df[
    ~df["repr"].str.contains("AppFuture") & ~df["repr"].str.contains("deque")
].sort_values(by=["size", "refcount"], ascending=False).drop_duplicates(
    subset="id"
).head(
    30
).to_csv(
    "leaks.csv"
)

In [7]:
{
    "source_group_name": "Per_image.sqlite",
    "source": {
        "source_path": PosixPath(
            "/Users/dabu5788/Documents/work/CytoTable-benchmarks-d33bs/notebooks/examples/data/all_cellprofiler.sqlite"
        ),
        "table_name": "Per_Image",
        "offsets": [0],
    },
    "chunk_size": 1000,
    "offset": 0,
    "dest_path": PosixPath(
        "/Users/dabu5788/Documents/work/CytoTable-benchmarks-d33bs/notebooks/examples/data/test-result.parquet"
    ),
    "data_type_cast_map": None,
}

NameError: name 'PosixPath' is not defined

In [None]:
df.sort_values(by="refcount", ascending=False).drop_duplicates(subset="id")[
    "type"
].value_counts()

In [None]:
df[df["id"] == 5304345792].sort_values(by="refcount", ascending=False).iloc[0]["repr"]

In [None]:
{
    "source_group_name": "Per_nuclei.sqlite",
    "source": {
        "source_path": PosixPath(
            "/Users/dabu5788/Documents/work/CytoTable-benchmarks-d33bs/notebooks/examples/data/all_cellprofiler.sqlite"
        ),
        "table_name": "Per_Nuclei",
        "offsets": [0],
    },
    "chunk_size": 1000,
    "offset": 0,
    "dest_path": PosixPath(
        "/Users/dabu5788/Documents/work/CytoTable-benchmarks-d33bs/notebooks/examples/data/test-result.parquet"
    ),
    "data_type_cast_map": None,
}