# CytoTable object graph analysis

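This notebook explores how CytoTable objects behave in memory: it runs `cytotable.convert` with garbage-collector leak debugging enabled and inspects the objects reported in `gc.garbage`. The work is related to [CytoTable#75](https://github.com/cytomining/CytoTable/issues/75).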

In [1]:
import gc
import pathlib
import sys

import cytotable
import pandas as pd

# DEBUG_LEAK includes DEBUG_SAVEALL, so every unreachable object the collector
# finds is kept in gc.garbage (instead of being freed) for inspection below.
gc.set_debug(gc.DEBUG_LEAK)

# NOTE(review): dest_path starts with "./" while source_path starts with "../"
# — confirm the destination directory is the intended one.
cytotable.convert(
    source_path="../examples/data/all_cellprofiler.sqlite",
    dest_path="./examples/data/test-result.parquet",
    dest_datatype="parquet",
    preset="cellprofiler_sqlite_pycytominer",
)

# Force a full collection so anything left unreachable by the conversion is
# recorded in gc.garbage.
collected = gc.collect()
if gc.garbage:
    print(f"Memory leak detected: {len(gc.garbage)} objects")

# Build the summary unconditionally so `df` is always defined (an empty frame
# with the expected columns when nothing was recorded); previously `df.head()`
# raised NameError whenever gc.garbage was empty.
df = pd.DataFrame(
    [
        {
            "id": id(obj),
            "type": type(obj),
            "refcount": sys.getrefcount(obj),
            "repr": repr(obj),
            "size": sys.getsizeof(obj),
        }
        for obj in gc.garbage
    ],
    columns=["id", "type", "refcount", "repr", "size"],
)

df.head()

Memory leak detected: 1119 objects


gc: collectable <list 0x14e9aca80>
gc: collectable <list 0x13f569740>
gc: collectable <dict 0x14e9c5a00>
gc: collectable <AppFuture 0x13ef3fa00>
gc: collectable <Condition 0x13ef3fd30>
gc: collectable <builtin_function_or_method 0x14ebec900>
gc: collectable <dict 0x13f3e2780>
gc: collectable <builtin_function_or_method 0x14ebec950>
gc: collectable <builtin_function_or_method 0x14ebec9a0>
gc: collectable <builtin_function_or_method 0x14ebec9f0>
gc: collectable <builtin_function_or_method 0x14ebeca40>
gc: collectable <collections.deque 0x12a373760>
gc: collectable <dict 0x14e9b23c0>
gc: collectable <list 0x13f569180>
gc: collectable <list 0x14e9ac680>
gc: collectable <list 0x14e9ac700>
gc: collectable <list 0x14ebe39c0>
gc: collectable <method 0x14ebe3a00>
gc: collectable <functools.partial 0x14ebecae0>
gc: collectable <tuple 0x10770b550>
gc: collectable <list 0x13f5690c0>
gc: collectable <Future 0x13ef3fc10>
gc: collectable <Condition 0x13ef3fb80>
gc: collectable <builtin_function_or_me

Unnamed: 0,id,type,refcount,repr,size
0,5613734528,<class 'list'>,10,[],56
1,5357606720,<class 'list'>,4,[],56
2,5613836800,<class 'dict'>,7,"{'depends': [], 'executor': '_parsl_internal',...",1176
3,5351143936,<class 'parsl.dataflow.futures.AppFuture'>,4,<AppFuture at 0x13ef3fa00 state=finished retur...,48
4,5351144752,<class 'threading.Condition'>,4,<Condition(<unlocked _thread.RLock object owne...,48


In [2]:
# SQLite source files referenced by this notebook
list_of_sqlite_files = ["../examples/data/all_cellprofiler.sqlite"]

In [None]:
# Repeat the conversion with the same arguments as the first cell so a second
# garbage collection pass can be compared against the first one.
convert_options = {
    "source_path": "../examples/data/all_cellprofiler.sqlite",
    "dest_path": "./examples/data/test-result.parquet",
    "dest_datatype": "parquet",
    "preset": "cellprofiler_sqlite_pycytominer",
}
cytotable.convert(**convert_options)

In [4]:
# Collect again after the repeated conversion. gc.garbage is a cumulative list,
# so it still holds whatever earlier collections appended to it.
collected = gc.collect()
if gc.garbage:
    print(f"Memory leak detected: {len(gc.garbage)} objects")

# Build the summary unconditionally so `df` is always (re)defined from the
# current gc.garbage contents. Previously an empty gc.garbage left `df`
# undefined (NameError) or silently showed a stale frame from an earlier cell.
df = pd.DataFrame(
    [
        {
            "id": id(obj),
            "type": type(obj),
            "refcount": sys.getrefcount(obj),
            "repr": repr(obj),
            "size": sys.getsizeof(obj),
        }
        for obj in gc.garbage
    ],
    columns=["id", "type", "refcount", "repr", "size"],
)

df.head()

Memory leak detected: 2232 objects


gc: collectable <cell 0x10770caf0>
gc: collectable <tuple 0x13eead280>
gc: collectable <function 0x110033790>
gc: collectable <cell 0x107705bb0>
gc: collectable <tuple 0x13efedc40>
gc: collectable <function 0x1075b5d30>
gc: collectable <list 0x11007f900>
gc: collectable <list 0x13f3b9dc0>
gc: collectable <dict 0x14eaaa9c0>
gc: collectable <AppFuture 0x1075b60d0>
gc: collectable <Condition 0x1075b6400>
gc: collectable <builtin_function_or_method 0x14ecaeb30>
gc: collectable <dict 0x14eab0340>
gc: collectable <builtin_function_or_method 0x10757dd60>
gc: collectable <builtin_function_or_method 0x1077205e0>
gc: collectable <builtin_function_or_method 0x107720680>
gc: collectable <builtin_function_or_method 0x1077206d0>
gc: collectable <collections.deque 0x1100944c0>
gc: collectable <dict 0x14eccf6c0>
gc: collectable <list 0x13eab8400>
gc: collectable <list 0x13e860940>
gc: collectable <list 0x13f3b4d00>
gc: collectable <list 0x10771a740>
gc: collectable <method 0x13f3b2c00>
gc: collectable

Unnamed: 0,id,type,refcount,repr,size
0,5613734528,<class 'list'>,9,[],56
1,5357606720,<class 'list'>,4,[],56
2,5613836800,<class 'dict'>,7,"{'depends': [], 'executor': '_parsl_internal',...",1176
3,5351143936,<class 'parsl.dataflow.futures.AppFuture'>,4,<AppFuture at 0x13ef3fa00 state=finished retur...,48
4,5351144752,<class 'threading.Condition'>,4,<Condition(<unlocked _thread.RLock object owne...,48


In [5]:
# Show the 30 largest recorded objects (ties broken by refcount), keeping only
# the first row per object id after sorting.
(
    df
    .sort_values(by=["size", "refcount"], ascending=False)
    .drop_duplicates(subset="id")
    .head(30)
)

Unnamed: 0,id,type,refcount,repr,size
2,5613836800,<class 'dict'>,7,"{'depends': [], 'executor': '_parsl_internal',...",1176
40,5616125760,<class 'dict'>,7,"{'depends': [], 'executor': '_parsl_internal',...",1176
122,5616141376,<class 'dict'>,7,{'depends': [<AppFuture at 0x13e7b2550 state=f...,1176
172,5616165248,<class 'dict'>,7,{'depends': [<AppFuture at 0x13e7b2ac0 state=f...,1176
473,5616200512,<class 'dict'>,7,{'depends': [<AppFuture at 0x13e84d400 state=f...,1176
572,5616568064,<class 'dict'>,7,{'depends': [<AppFuture at 0x13e84da30 state=f...,1176
653,5617087680,<class 'dict'>,7,{'depends': [<AppFuture at 0x14e9c7ca0 state=f...,1176
734,5617035264,<class 'dict'>,7,{'depends': [<AppFuture at 0x13e6e7cd0 state=f...,1176
1127,5614774720,<class 'dict'>,7,"{'depends': [], 'executor': '_parsl_internal',...",1176
1165,5355809600,<class 'dict'>,7,"{'depends': [], 'executor': '_parsl_internal',...",1176


In [6]:
# Write the 30 largest recorded objects whose repr mentions neither
# "AppFuture" nor "deque" to leaks.csv (one row per object id).
(
    df.loc[
        lambda frame: ~(
            frame["repr"].str.contains("AppFuture")
            | frame["repr"].str.contains("deque")
        )
    ]
    .sort_values(by=["size", "refcount"], ascending=False)
    .drop_duplicates(subset="id")
    .head(30)
    .to_csv("leaks.csv")
)

In [7]:
# Count recorded objects per type, one row per object id (the row with the
# highest refcount is the one kept for each id).
(
    df
    .sort_values(by="refcount", ascending=False)
    .drop_duplicates(subset="id")
    ["type"]
    .value_counts()
)

type
<class 'list'>                                558
<class 'builtin_function_or_method'>          520
<class 'dict'>                                346
<class 'tuple'>                               134
<class 'method'>                              107
<class 'functools.partial'>                   107
<class 'collections.deque'>                   104
<class 'threading.Condition'>                 104
<class 'parsl.dataflow.futures.AppFuture'>     52
<class 'concurrent.futures._base.Future'>      52
<class 'cell'>                                 51
<class 'pathlib.PosixPath'>                    26
<class 'function'>                             23
<class 'pyarrow._parquet.ParquetSchema'>       18
<class 'pyarrow._parquet.FileMetaData'>        18
<class 'cython_function_or_method'>             6
<class 'collections.OrderedDict'>               2
<class 'getset_descriptor'>                     2
<class 'pyarrow._s3fs.__Pyx_EnumMeta'>          1
<class 'pyarrow._fs.__Pyx_EnumMeta'>         