Skip to content

Commit

Permalink
Merge pull request #73 from gwaygenomics/broad-sample-annotate
Browse files Browse the repository at this point in the history
Add CMAP options to annotate
  • Loading branch information
gwaybio committed Apr 8, 2020
2 parents 88fcf65 + 9936835 commit dd064c2
Show file tree
Hide file tree
Showing 3 changed files with 324 additions and 1 deletion.
129 changes: 129 additions & 0 deletions pycytominer/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,25 @@
Annotates profiles with metadata information
"""

import os
import numpy as np
import pandas as pd
from pycytominer.cyto_utils.output import output
from pycytominer.cyto_utils import infer_cp_features


def annotate(
profiles,
platemap,
cell_id="unknown",
join_on=["Metadata_well_position", "Metadata_Well"],
output_file="none",
add_metadata_id_to_platemap=True,
format_broad_cmap=False,
perturbation_mode="none",
external_metadata="none",
external_join_left="none",
external_join_right="none",
compression=None,
float_format=None,
):
Expand All @@ -22,6 +30,7 @@ def annotate(
Arguments:
profiles - either pandas DataFrame or a file that stores profile data
platemap - either pandas DataFrame or a file that stores platemap metadata
cell_id - [default: "unknown"] provide a string to annotate cell id column
join_on - list of length two indicating which variables to merge profiles and plate
[default: ["Metadata_well_position", "Metadata_Well"]]. The first element
indicates variable(s) in platemap and the second element indicates
Expand All @@ -31,6 +40,14 @@ def annotate(
if not specified, will return the annotated profiles. We recommend
that this output file be suffixed with "_augmented.csv".
add_metadata_id_to_platemap - boolean if the platemap variables should be recoded
format_broad_cmap - [default: False] boolean if we need to add columns to make
compatible with Broad CMAP naming conventions.
perturbation_mode - [default: "none"] - either "chemical", "genetic" or "none" and only
active if format_broad_cmap == True
external_metadata - [default: "none"] a string indicating a file with additional
metadata information
external_join_left - [default: "none"] the merge column in the profile metadata
external_join_right - [default: "none"] the merge column in the external metadata
compression - the mechanism to compress [default: None]
float_format - decimal precision to use in writing output file [default: None]
For example, use "%.3g" for 3 decimal precision.
Expand Down Expand Up @@ -62,6 +79,118 @@ def annotate(
profiles, left_on=join_on[0], right_on=join_on[1], how="inner"
).drop(join_on[0], axis="columns")

if format_broad_cmap:

pert_opts = ["none", "chemical", "genetic"]
assert (
perturbation_mode in pert_opts
), "perturbation mode must be one of {}".format(pert_opts)

assert (
"Metadata_broad_sample" in annotated.columns
), "Are you sure this is a CMAP file? 'Metadata_broad_sample column not found.'"

annotated = annotated.assign(
Metadata_pert_id=annotated.Metadata_broad_sample.str.extract(
r"(BRD[-N][A-Z0-9]+)"
),
Metadata_pert_mfc_id=annotated.Metadata_broad_sample,
Metadata_pert_well=annotated.loc[:, join_on[1]],
Metadata_pert_id_vendor="",
)

if "Metadata_pert_iname" in annotated.columns:
annotated = annotated.assign(
Metadata_pert_mfc_desc=annotated.Metadata_pert_iname,
Metadata_pert_name=annotated.Metadata_pert_iname,
)

if "Metadata_cell_id" not in annotated.columns:
annotated = annotated.assign(Metadata_cell_id=cell_id)

if perturbation_mode == "chemical":
annotated = annotated.assign(
Metadata_broad_sample_type=[
"control" if x in ["DMSO", np.nan] else "trt"
for x in annotated.Metadata_broad_sample
]
)

# Generate Metadata_broad_sample column
annotated.loc[
annotated.Metadata_broad_sample_type == "control",
"Metadata_broad_sample",
] = "DMSO"
annotated.loc[
annotated.Metadata_broad_sample == "empty", "Metadata_broad_sample_type"
] = "empty"

if "Metadata_mmoles_per_liter" in annotated.columns:
annotated.loc[
annotated.Metadata_broad_sample_type == "control",
"Metadata_mmoles_per_liter",
] = 0

if "Metadata_solvent" in annotated.columns:
annotated = annotated.assign(
Metadata_pert_vehicle=annotated.Metadata_solvent
)
if "Metadata_mg_per_ml" in annotated.columns:
annotated.loc[
annotated.Metadata_broad_sample_type == "control",
"Metadata_mg_per_ml",
] = 0

if perturbation_mode == "genetic":
if "Metadata_pert_name" in annotated.columns:
annotated = annotated.assign(
Metadata_broad_sample_type=[
"control" if x == "EMPTY" else "trt"
for x in annotated.Metadata_pert_name
]
)

if "Metadata_broad_sample_type" in annotated.columns:
annotated = annotated.assign(
Metadata_pert_type=annotated.Metadata_broad_sample_type
)
else:
annotated = annotated.assign(
Metadata_pert_type="", Metadata_broad_sample_type=""
)

# Add specific Connectivity Map (CMAP) formatting
if not isinstance(external_metadata, pd.DataFrame):
if external_metadata != "none":
assert os.path.exists(
external_metadata
), "external metadata at {} does not exist".format(external_metadata)

external_metadata = pd.read_csv(external_metadata)

if isinstance(external_metadata, pd.DataFrame):
external_metadata.columns = [
"Metadata_{}".format(x) if not x.startswith("Metadata_") else x
for x in external_metadata.columns
]

annotated = (
annotated.merge(
external_metadata,
left_on=external_join_left,
right_on=external_join_right,
how="left",
)
.reset_index(drop=True)
.drop_duplicates()
)

# Reorder annotated metadata columns
meta_cols = infer_cp_features(annotated, metadata=True)
other_cols = annotated.drop(meta_cols, axis="columns").columns.tolist()

annotated = annotated.loc[:, meta_cols + other_cols]

if output_file != "none":
output(
df=annotated,
Expand Down
194 changes: 194 additions & 0 deletions pycytominer/tests/test_annotate_cmap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
import os
import tempfile
import random
import pytest
import pandas as pd
from pycytominer import annotate

# Fix the RNG seed for reproducibility
random.seed(123)

# Directory that holds temporary test artifacts
tmpdir = tempfile.gettempdir()

# Path of the CSV file that the external-metadata test writes and reads back
output_file = os.path.join(tmpdir, "test_external.csv")

# Broad sample identifiers, one per well of the example plate
example_broad_samples = [
    "BRD-K76022557-003-28-9",
    "BRD-K65856711-001-03-6",
    "BRD-K38019854-323-01-4",
    "BRD-K06182768-001-02-3",
    "BRD-K91623615-001-06-8",
    "BRD-K13094524-001-09-1",
]

# Perturbation ids expected to be extracted from the broad samples above
expected_pert_ids = [
    "BRD-K76022557",
    "BRD-K65856711",
    "BRD-K38019854",
    "BRD-K06182768",
    "BRD-K91623615",
    "BRD-K13094524",
]

# Genetic perturbation names, one per well; "EMPTY" marks control wells
example_genetic_perts = ["TP53", "KRAS", "DNMT3", "PTEN", "EMPTY", "EMPTY"]

# Profile data: six wells (rows A and B) with two toy features
data_df = pd.DataFrame(
    {
        "Metadata_Well": ["A01", "A02", "A03", "B01", "B02", "B03"],
        "x": [1, 3, 8, 1, 3, 5],
        "y": [5, 3, 1, 8, 3, 1],
    }
)

# Platemap metadata keyed by well position
platemap_df = pd.DataFrame(
    {
        "well_position": ["A01", "A02", "A03", "B01", "B02", "B03"],
        "gene": ["x", "y", "z", "x", "y", "z"],
    }
)

# Platemap that additionally carries the Metadata_broad_sample column
broad_platemap_df = platemap_df.assign(Metadata_broad_sample=example_broad_samples)


def test_annotate_cmap_assert():
    """Check that CMAP formatting rejects data lacking Metadata_broad_sample.

    platemap_df has no Metadata_broad_sample column, so annotate() called with
    format_broad_cmap=True is expected to raise an AssertionError whose message
    asks whether the input really is a CMAP file.
    """
    with pytest.raises(AssertionError) as nocmap:
        # The return value is intentionally not bound: the call must raise.
        annotate(
            profiles=data_df,
            platemap=platemap_df,
            join_on=["Metadata_well_position", "Metadata_Well"],
            format_broad_cmap=True,
            perturbation_mode="none",
        )

    # Inspect the message only after the `with` block has exited; a statement
    # placed after the raising call inside the block would never execute.
    assert "Are you sure this is a CMAP file?" in str(nocmap.value)


def test_annotate_cmap_pertnone():
    """Check the CMAP columns added when perturbation_mode is "none"."""
    expected_new_columns = [
        "Metadata_pert_id",
        "Metadata_pert_mfc_id",
        "Metadata_pert_well",
        "Metadata_pert_id_vendor",
        "Metadata_cell_id",
        "Metadata_pert_type",
        "Metadata_broad_sample_type",
    ]

    annotated_df = annotate(
        profiles=data_df,
        platemap=broad_platemap_df,
        join_on=["Metadata_well_position", "Metadata_Well"],
        format_broad_cmap=True,
        perturbation_mode="none",
    )

    # Every CMAP column must be present in the annotated output
    missing_columns = [
        col for col in expected_new_columns if col not in annotated_df.columns
    ]
    assert not missing_columns

    # The perturbation id is the leading BRD identifier of each broad sample
    assert annotated_df.Metadata_pert_id.tolist() == expected_pert_ids


def test_annotate_cmap_pertgenetic():
    """Check sample-type labelling when perturbation_mode is "genetic".

    Wells whose Metadata_pert_name is "EMPTY" are expected to be labelled
    "control"; all other wells are expected to be labelled "trt".
    """
    genetic_platemap = broad_platemap_df.assign(
        Metadata_pert_name=example_genetic_perts
    )

    annotated_df = annotate(
        profiles=data_df,
        platemap=genetic_platemap,
        join_on=["Metadata_well_position", "Metadata_Well"],
        format_broad_cmap=True,
        perturbation_mode="genetic",
    )

    # Four treated wells followed by the two "EMPTY" control wells
    expected_types = ["trt"] * 4 + ["control"] * 2

    assert annotated_df.Metadata_pert_type.tolist() == expected_types
    assert annotated_df.Metadata_broad_sample_type.tolist() == expected_types
    assert annotated_df.Metadata_pert_id.tolist() == expected_pert_ids


def test_annotate_cmap_pertchemical():
    """Check CMAP annotation of a chemical-perturbation platemap.

    The first call verifies that the core CMAP columns are present. The second
    call uses a platemap with one DMSO well plus dose and solvent columns, and
    verifies that the DMSO well is labelled "control" with its doses zeroed.
    """
    core_columns = [
        "Metadata_pert_id",
        "Metadata_pert_mfc_id",
        "Metadata_pert_well",
        "Metadata_pert_id_vendor",
        "Metadata_cell_id",
        "Metadata_pert_type",
        "Metadata_broad_sample_type",
    ]

    # NOTE(review): this first call uses perturbation_mode="genetic" even
    # though the test targets the chemical mode - confirm this is intentional.
    first_result = annotate(
        profiles=data_df,
        platemap=broad_platemap_df,
        join_on=["Metadata_well_position", "Metadata_Well"],
        format_broad_cmap=True,
        perturbation_mode="genetic",
    )
    assert not [col for col in core_columns if col not in first_result.columns]

    # Turn the first well into a DMSO well and attach dose/solvent metadata
    doses = [1000, 2, 1, 1, 1, 1]
    chemical_platemap = broad_platemap_df.copy()
    chemical_platemap.loc[0, "Metadata_broad_sample"] = "DMSO"
    chemical_platemap = chemical_platemap.assign(
        Metadata_mmoles_per_liter=doses,
        Metadata_mg_per_ml=doses,
        Metadata_solvent="DMSO",
    )

    chemical_result = annotate(
        profiles=data_df,
        platemap=chemical_platemap,
        join_on=["Metadata_well_position", "Metadata_Well"],
        format_broad_cmap=True,
        perturbation_mode="chemical",
    )

    # Only the DMSO well is a control
    expected_types = ["control"] + ["trt"] * 5
    assert chemical_result.Metadata_pert_type.tolist() == expected_types
    assert chemical_result.Metadata_broad_sample_type.tolist() == expected_types

    # Doses of the control well are reset to zero; the others are untouched
    expected_doses = [0, 2, 1, 1, 1, 1]
    assert chemical_result.Metadata_mmoles_per_liter.tolist() == expected_doses
    assert chemical_result.Metadata_mg_per_ml.tolist() == expected_doses

    # The chemical-specific columns must be present alongside the core ones
    chemical_columns = core_columns + [
        "Metadata_mmoles_per_liter",
        "Metadata_mg_per_ml",
        "Metadata_solvent",
        "Metadata_pert_vehicle",
    ]
    assert not [
        col for col in chemical_columns if col not in chemical_result.columns
    ]


def test_annotate_cmap_externalmetadata():
    """Check that external metadata read from a CSV file is merged in.

    A one-row external metadata file is written to output_file and joined on
    the well column. The test also verifies that a Metadata_cell_id supplied by
    the platemap is kept in the annotated output.
    """
    # Write the external metadata fixture to disk
    external_df = pd.DataFrame(
        {"test_well_join": ["A01"], "test_info_col": ["DMSO is cool"]}
    )
    external_df.to_csv(output_file, index=False, sep=",")

    # Chemical platemap with one DMSO well and an explicit cell id
    doses = [1000, 2, 1, 1, 1, 1]
    chemical_platemap = broad_platemap_df.copy()
    chemical_platemap.loc[0, "Metadata_broad_sample"] = "DMSO"
    chemical_platemap = chemical_platemap.assign(
        Metadata_mmoles_per_liter=doses,
        Metadata_mg_per_ml=doses,
        Metadata_solvent="DMSO",
        Metadata_cell_id="A549",
    )

    # The external join column is referenced with its "Metadata_" prefix
    annotated_df = annotate(
        profiles=data_df,
        platemap=chemical_platemap,
        join_on=["Metadata_well_position", "Metadata_Well"],
        format_broad_cmap=True,
        perturbation_mode="chemical",
        external_metadata=output_file,
        external_join_left="Metadata_Well",
        external_join_right="Metadata_test_well_join",
    )

    assert annotated_df.loc[0, "Metadata_test_info_col"] == "DMSO is cool"
    assert annotated_df.Metadata_cell_id.unique()[0] == "A549"
2 changes: 1 addition & 1 deletion pycytominer/variance_threshold.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def variance_threshold(
population_df, features="infer", samples="none", freq_cut=0.05, unique_cut=0.01
):
"""
Exclude features that have correlations below a certain threshold
Exclude features that have low variance (low information content)
Arguments:
population_df - pandas DataFrame that includes metadata and observation features
Expand Down

0 comments on commit dd064c2

Please sign in to comment.