# CytoTable from the cloud (using cloud-based data sources)

![Image showing feature data from the cloud being read by CytoTable and exported to a CytoTable file afterwards.](../_static/features_to_cytotable_cloud.png)

__Figure 1.__ _CytoTable is capable of reading data from cloud-based locations such as AWS S3._

This notebook includes a quick demonstration of CytoTable with cloud-based data sources.
For a more general overview of using CytoTable and the concepts behind the work, please see: [CytoTable mise en place (general overview)](https://cytomining.github.io/CytoTable/examples/cytotable_mise_en_place_general_overview.html)

In [1]:
import pathlib
from collections import Counter

import cytotable
import pandas as pd
import pyarrow.parquet as pq
from cloudpathlib import CloudPath, S3Client

## Using CytoTable with cloud-based CSVs

In [2]:
# configuration for this section of the notebook:
# the local Parquet file to write and the S3 location to read CSV files from
dest_path = "./cloud_example.parquet"
source_path = (
    "s3://cellpainting-gallery/cpg0037-oasis/broad/"
    "workspace/analysis/2025_01_12_U2OS_Batch2/"
    "BR00146052/analysis/BR00146052-A01-1/"
)

In [3]:
# start clean: delete the destination file if an earlier run left one behind
existing_output = pathlib.Path(dest_path)
if existing_output.is_file():
    existing_output.unlink()

In [4]:
# create an S3 client which sends unsigned (anonymous) requests so that
# publicly-available data can be read, then build the source cloudpath from it
anonymous_s3_client = S3Client(no_sign_request=True)
source_cloud_path = anonymous_s3_client.CloudPath(source_path)
print(source_cloud_path)
# list what sits directly under the source path
list(source_cloud_path.glob("*"))

s3://cellpainting-gallery/cpg0037-oasis/broad/workspace/analysis/2025_01_12_U2OS_Batch2/BR00146052/analysis/BR00146052-A01-1/


[S3Path('s3://cellpainting-gallery/cpg0037-oasis/broad/workspace/analysis/2025_01_12_U2OS_Batch2/BR00146052/analysis/BR00146052-A01-1/outlines'),
 S3Path('s3://cellpainting-gallery/cpg0037-oasis/broad/workspace/analysis/2025_01_12_U2OS_Batch2/BR00146052/analysis/BR00146052-A01-1/Cells.csv'),
 S3Path('s3://cellpainting-gallery/cpg0037-oasis/broad/workspace/analysis/2025_01_12_U2OS_Batch2/BR00146052/analysis/BR00146052-A01-1/CellsIncludingEdges.csv'),
 S3Path('s3://cellpainting-gallery/cpg0037-oasis/broad/workspace/analysis/2025_01_12_U2OS_Batch2/BR00146052/analysis/BR00146052-A01-1/Cytoplasm.csv'),
 S3Path('s3://cellpainting-gallery/cpg0037-oasis/broad/workspace/analysis/2025_01_12_U2OS_Batch2/BR00146052/analysis/BR00146052-A01-1/Experiment.csv'),
 S3Path('s3://cellpainting-gallery/cpg0037-oasis/broad/workspace/analysis/2025_01_12_U2OS_Batch2/BR00146052/analysis/BR00146052-A01-1/Image.csv'),
 S3Path('s3://cellpainting-gallery/cpg0037-oasis/broad/workspace/analysis/2025_01_12_U2OS_Batch2

In [5]:
%%time

# run cytotable convert
result = cytotable.convert(
    source_path=source_path,
    source_datatype="csv",
    # set a chunk size for paginated
    # processing of results
    chunk_size=30000,
    # specify the destination path
    dest_path=dest_path,
    # specify a destination data format type
    dest_datatype="parquet",
    # specify a preset which enables quick use of common input file formats
    preset="cellprofiler_csv",
    # use unsigned (anonymous) requests to AWS S3
    no_sign_request=True,
)
print(pathlib.Path(result).name)

cloud_example.parquet
CPU times: user 665 ms, sys: 474 ms, total: 1.14 s
Wall time: 33.6 s


In [6]:
# preview the first five rows of the result as a pandas DataFrame
pq.read_table(result).to_pandas().head(n=5)

Unnamed: 0,Metadata_ImageNumber,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_ObjectNumber,Image_FileName_CellOutlines,Image_FileName_IllumAGP,Image_FileName_IllumBrightfield,Image_FileName_IllumDNA,Image_FileName_IllumER,Image_FileName_IllumMito,...,Nuclei_Texture_Variance_RNA_10_02_256,Nuclei_Texture_Variance_RNA_10_03_256,Nuclei_Texture_Variance_RNA_3_00_256,Nuclei_Texture_Variance_RNA_3_01_256,Nuclei_Texture_Variance_RNA_3_02_256,Nuclei_Texture_Variance_RNA_3_03_256,Nuclei_Texture_Variance_RNA_5_00_256,Nuclei_Texture_Variance_RNA_5_01_256,Nuclei_Texture_Variance_RNA_5_02_256,Nuclei_Texture_Variance_RNA_5_03_256
0,1,1,1,1,A01_s1--cell_outlines.png,BR00146052_IllumAGP.npy,BR00146052_IllumBrightfield.npy,BR00146052_IllumDNA.npy,BR00146052_IllumER.npy,BR00146052_IllumMito.npy,...,87.268467,96.580786,80.795675,81.534795,83.061909,82.135369,82.21461,83.472095,83.587552,85.04391
1,1,2,2,2,A01_s1--cell_outlines.png,BR00146052_IllumAGP.npy,BR00146052_IllumBrightfield.npy,BR00146052_IllumDNA.npy,BR00146052_IllumER.npy,BR00146052_IllumMito.npy,...,82.909202,80.373067,81.970114,81.418564,83.180896,82.860898,78.857701,80.516994,83.621363,85.74305
2,1,3,3,3,A01_s1--cell_outlines.png,BR00146052_IllumAGP.npy,BR00146052_IllumBrightfield.npy,BR00146052_IllumDNA.npy,BR00146052_IllumER.npy,BR00146052_IllumMito.npy,...,26.404112,26.620592,24.788549,25.995656,24.661882,26.114983,26.546437,26.839052,26.655321,29.181173
3,1,4,4,4,A01_s1--cell_outlines.png,BR00146052_IllumAGP.npy,BR00146052_IllumBrightfield.npy,BR00146052_IllumDNA.npy,BR00146052_IllumER.npy,BR00146052_IllumMito.npy,...,67.762866,142.014839,100.85994,106.72622,118.372463,102.754337,99.659909,108.074924,111.27744,108.468769
4,1,5,5,5,A01_s1--cell_outlines.png,BR00146052_IllumAGP.npy,BR00146052_IllumBrightfield.npy,BR00146052_IllumDNA.npy,BR00146052_IllumER.npy,BR00146052_IllumMito.npy,...,2859.40417,2317.858027,2883.111739,2923.312269,2835.480838,2883.54814,2905.25921,2941.765027,2879.090419,2799.688685


In [7]:
# display the Parquet file-level metadata of the result file
pq.read_metadata(where=result)

<pyarrow._parquet.FileMetaData object at 0x16e018680>
  created_by: parquet-cpp-arrow version 22.0.0
  num_columns: 4413
  num_rows: 36
  num_row_groups: 1
  format_version: 2.6
  serialized_size: 1354645

In [8]:
# display the metadata attached to the schema, which includes
# CytoTable information
# note: this information will travel with the file.
pq.read_schema(where=result).metadata

{b'data-producer': b'https://github.com/cytomining/CytoTable',
 b'data-producer-version': b'1.1.2.post8.dev0+5918618'}

In [9]:
# summarize the schema by counting columns per name prefix
# (the prefix is everything before the first underscore)
print("Column name prefix counts:")
column_names = pq.read_schema(result).names
dict(Counter(name.partition("_")[0] for name in column_names))

Column name prefix counts:


{'Metadata': 4, 'Image': 14, 'Cytoplasm': 1480, 'Cells': 1494, 'Nuclei': 1421}

## Using CytoTable with cloud-based SQLite databases

In [10]:
# configuration for this section of the notebook:
# the local Parquet file to write and the S3 location to read from
dest_path = "./cloud_example.parquet"
source_path = (
    "s3://cellpainting-gallery/"
    "cpg0016-jump/source_4/workspace/backend/"
    "2021_08_23_Batch12/BR00126114"
)

In [11]:
# start clean: delete the destination file if an earlier run left one behind
existing_output = pathlib.Path(dest_path)
if existing_output.is_file():
    existing_output.unlink()

In [12]:
# create an S3 client which sends unsigned (anonymous) requests so that
# publicly-available data can be read, then build the source cloudpath from it
anonymous_s3_client = S3Client(no_sign_request=True)
source_cloud_path = anonymous_s3_client.CloudPath(source_path)
print(source_cloud_path)
# list what sits directly under the source path
list(source_cloud_path.glob("*"))

s3://cellpainting-gallery/cpg0016-jump/source_4/workspace/backend/2021_08_23_Batch12/BR00126114


[S3Path('s3://cellpainting-gallery/cpg0016-jump/source_4/workspace/backend/2021_08_23_Batch12/BR00126114/BR00126114.csv'),
 S3Path('s3://cellpainting-gallery/cpg0016-jump/source_4/workspace/backend/2021_08_23_Batch12/BR00126114/BR00126114.sqlite')]

In [13]:
%%time

# run cytotable convert
result = cytotable.convert(
    source_path=source_path,
    source_datatype="sqlite",
    # set a chunk size for paginated
    # processing of results
    chunk_size=30000,
    # specify the destination path
    dest_path=dest_path,
    # specify a destination data format type
    dest_datatype="parquet",
    # specify a preset which enables quick use of common input file formats
    preset="cellprofiler_sqlite_cpg0016_jump",
    # use unsigned (anonymous) requests to AWS S3
    no_sign_request=True,
    # set a local cache to avoid challenges with
    # OS-specific temp disk space
    local_cache_dir=f"./sqlite_s3_cache",
)
print(pathlib.Path(result).name)

Error: Unable to open database "/var/folders/zw/l3g4vq6508g98qkt05z6hb_m0000gp/T/tmp8ef3uvt4/cellpainting-gallery/cpg0016-jump/source_4/workspace/backend/2021_08_23_Batch12/BR00126114/BR00126114.sqlite": unable to open database file

LINE 4:                         FROM sqlite_scan(?, 'sqlite_master')
                                     ^

In [14]:
# preview the first five rows of the result as a pandas DataFrame
pq.read_table(result).to_pandas().head(n=5)

FileNotFoundError: /Users/buntend/Documents/work/CytoTable/docs/source/examples/cloud_example.parquet

In [None]:
# display the Parquet file-level metadata of the result file
pq.read_metadata(where=result)

In [None]:
# display the metadata attached to the schema, which includes
# CytoTable information
# note: this information will travel with the file.
pq.read_schema(where=result).metadata

In [None]:
# summarize the schema by counting columns per name prefix
# (the prefix is everything before the first underscore)
print("Column name prefix counts:")
column_names = pq.read_schema(result).names
dict(Counter(name.partition("_")[0] for name in column_names))

In [None]:
# display the complete schema of the result file
pq.read_schema(where=result)