Skip to content

Commit

Permalink
Merge pull request #76 from gwaygenomics/fix-count-cells
Browse files Browse the repository at this point in the history
Enable cell counting with more than 2 defined strata
  • Loading branch information
gwaybio committed Mar 24, 2020
2 parents 17a1a6e + cab5778 commit 88fcf65
Show file tree
Hide file tree
Showing 3 changed files with 123 additions and 19 deletions.
22 changes: 13 additions & 9 deletions pycytominer/aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def __init__(
currently only supports one of ['mean', 'median']
output_file - [default: "none"] string if specified, write to location
compartments - list of compartments to process
merge_cols - column indicating which columns to merge compartments using
merge_cols - columns used to merge the image table with each compartment table
subsample_frac - [default: 1] float (0 < subsample <= 1) indicating percentage of
single cells to select
subsample_n - [default: "all"] int indicating how many samples to include
Expand Down Expand Up @@ -130,20 +130,24 @@ def count_cells(self, compartment="cells", count_subset=False):
if count_subset:
assert self.is_aggregated, "Make sure to aggregate_profiles() first!"
assert self.is_subset_computed, "Make sure to get_subsample() first!"
count_df = pd.crosstab(
self.subset_data_df.loc[:, self.strata[1]],
self.subset_data_df.loc[:, self.strata[0]],
).reset_index()
count_df = (
self.subset_data_df.groupby(self.strata)["ObjectNumber"]
.count()
.reset_index()
.rename({"ObjectNumber": "cell_count"}, axis="columns")
)
else:
query_cols = "TableNumber, ImageNumber, ObjectNumber"
query = "select {} from {}".format(query_cols, compartment)
count_df = self.image_df.merge(
pd.read_sql(sql=query, con=self.conn), how="inner", on=self.merge_cols
)

count_df = pd.crosstab(
count_df.loc[:, self.strata[1]], count_df.loc[:, self.strata[0]]
).reset_index()
count_df = (
count_df.groupby(self.strata)["ObjectNumber"]
.count()
.reset_index()
.rename({"ObjectNumber": "cell_count"}, axis="columns")
)

return count_df

Expand Down
9 changes: 6 additions & 3 deletions pycytominer/tests/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,12 @@

data_missing_df = pd.concat(
[
pd.DataFrame({"g": "a", "Cells_x": [1, 3, 8, np.nan], "Nuclei_y": [5, np.nan, 3, 1]}),
pd.DataFrame({"g": "b", "Cells_x": [1, 3, np.nan, 5], "Nuclei_y": [np.nan, 8, 3, 1]}),
pd.DataFrame(
{"g": "a", "Cells_x": [1, 3, 8, np.nan], "Nuclei_y": [5, np.nan, 3, 1]}
),
pd.DataFrame(
{"g": "b", "Cells_x": [1, 3, np.nan, 5], "Nuclei_y": [np.nan, 8, 3, 1]}
),
]
).reset_index(drop=True)

Expand Down Expand Up @@ -134,4 +138,3 @@ def test_aggregate_median_with_missing_values():
expected_result = expected_result.astype(dtype_convert_dict)

assert aggregate_result.equals(expected_result)

111 changes: 104 additions & 7 deletions pycytominer/tests/test_aggregate_profiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,11 @@
random.seed(123)


def build_random_data(compartment="cells"):
def build_random_data(
compartment="cells",
ImageNumber=sorted(["x", "y"] * 50),
TableNumber=sorted(["x_hash", "y_hash"] * 50),
):
a_feature = random.sample(range(1, 1000), 100)
b_feature = random.sample(range(1, 1000), 100)
c_feature = random.sample(range(1, 1000), 100)
Expand All @@ -19,12 +23,14 @@ def build_random_data(compartment="cells"):
{"a": a_feature, "b": b_feature, "c": c_feature, "d": d_feature}
).reset_index(drop=True)

data_df.columns = ["{}_{}".format(compartment.capitalize(), x) for x in data_df.columns]
data_df.columns = [
"{}_{}".format(compartment.capitalize(), x) for x in data_df.columns
]

data_df = data_df.assign(
ObjectNumber=list(range(1, 51)) * 2,
ImageNumber=sorted(["x", "y"] * 50),
TableNumber=sorted(["x_hash", "y_hash"] * 50),
ImageNumber=ImageNumber,
TableNumber=TableNumber,
)

return data_df
Expand Down Expand Up @@ -114,7 +120,13 @@ def test_AggregateProfiles_reset_variables():

def test_AggregateProfiles_count():
count_df = ap.count_cells()
expected_count = pd.DataFrame({"Metadata_Well": ["A01", "A02"], "plate": [50, 50]})
expected_count = pd.DataFrame(
{
"Metadata_Plate": ["plate", "plate"],
"Metadata_Well": ["A01", "A02"],
"cell_count": [50, 50],
}
)
pd.testing.assert_frame_equal(count_df, expected_count, check_names=False)


Expand Down Expand Up @@ -166,13 +178,25 @@ def test_aggregate_profiles():

def test_aggregate_subsampling_count_cells():
count_df = ap_subsample.count_cells()
expected_count = pd.DataFrame({"Metadata_Well": ["A01", "A02"], "plate": [50, 50]})
expected_count = pd.DataFrame(
{
"Metadata_Plate": ["plate", "plate"],
"Metadata_Well": ["A01", "A02"],
"cell_count": [50, 50],
}
)
pd.testing.assert_frame_equal(count_df, expected_count, check_names=False)

profiles = ap_subsample.aggregate_profiles()

count_df = ap_subsample.count_cells(count_subset=True)
expected_count = pd.DataFrame({"Metadata_Well": ["A01", "A02"], "plate": [2, 2]})
expected_count = pd.DataFrame(
{
"Metadata_Plate": ["plate", "plate"],
"Metadata_Well": ["A01", "A02"],
"cell_count": [2, 2],
}
)
pd.testing.assert_frame_equal(count_df, expected_count, check_names=False)


Expand Down Expand Up @@ -237,3 +261,76 @@ def test_aggregate_subsampling_profile_compress():
)

pd.testing.assert_frame_equal(result, expected_result)


def test_aggregate_count_cells_multiple_strata():
    """Check that count_cells() works with more than two strata columns.

    Writes image, cells, cytoplasm and nuclei tables to a temporary sqlite
    file in which each (TableNumber, ImageNumber) pair owns 25 objects, then
    verifies the per-stratum counts both for the full data and for the
    subsampled data.
    """
    # Launch a sqlite engine backed by a temporary file. No standalone
    # connection is opened: every write below goes through the engine, so a
    # separate connection would be unused and never closed.
    strata_file = "sqlite:///{}/test_strata.sqlite".format(tmpdir)
    test_engine = create_engine(strata_file)

    # Setup data: 100 objects per compartment, split evenly over four
    # TableNumber values (25 objects each), two per ImageNumber.
    base_image_number = sorted(["x", "y"] * 50)
    base_table_number = sorted(["x_hash_a", "x_hash_b", "y_hash_a", "y_hash_b"] * 25)
    compartment_tables = {
        compartment: build_random_data(
            compartment=compartment,
            ImageNumber=base_image_number,
            TableNumber=base_table_number,
        )
        for compartment in ("cells", "cytoplasm", "nuclei")
    }
    image_df = pd.DataFrame(
        {
            "TableNumber": ["x_hash_a", "x_hash_b", "y_hash_a", "y_hash_b"],
            "ImageNumber": ["x", "x", "y", "y"],
            "Metadata_Plate": ["plate"] * 4,
            "Metadata_Well": ["A01", "A02"] * 2,
            "Metadata_Site": [1, 1, 2, 2],
        }
    ).sort_values(by="Metadata_Well")

    # Ingest data into temporary sqlite file (image table first, then the
    # compartments in cells/cytoplasm/nuclei order)
    image_df.to_sql("image", con=test_engine, index=False, if_exists="replace")
    for table_name, table_df in compartment_tables.items():
        table_df.to_sql(table_name, con=test_engine, index=False, if_exists="replace")

    # Setup AggregateProfiles Class
    ap_strata = AggregateProfiles(
        sql_file=strata_file,
        subsample_n="4",
        strata=["Metadata_Plate", "Metadata_Well", "Metadata_Site"],
    )

    def expected_counts(cells_per_stratum):
        # One row per (plate, well, site) combination, ordered by the strata
        # columns, with the same count expected in every stratum.
        return pd.DataFrame(
            {
                "Metadata_Plate": ["plate"] * 4,
                "Metadata_Well": sorted(["A01", "A02"] * 2),
                "Metadata_Site": [1, 2] * 2,
                "cell_count": [cells_per_stratum] * 4,
            }
        )

    # Full data: every stratum holds all 25 of its objects
    count_df = ap_strata.count_cells()
    pd.testing.assert_frame_equal(count_df, expected_counts(25), check_names=False)

    # count_cells(count_subset=True) asserts that profiles were aggregated and
    # a subsample was computed, so aggregate first; the returned profiles are
    # not needed for this test.
    ap_strata.aggregate_profiles()

    # Subsampled data: subsample_n="4" should leave 4 objects per stratum
    count_df = ap_strata.count_cells(count_subset=True)
    pd.testing.assert_frame_equal(count_df, expected_counts(4), check_names=False)

0 comments on commit 88fcf65

Please sign in to comment.