Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change whiten nomenclature to spherize #102

Merged
merged 4 commits into from
Sep 29, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 11 additions & 11 deletions pycytominer/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
infer_cp_features,
load_profiles,
)
from pycytominer.operations import Whiten, RobustMAD
from pycytominer.operations import Spherize, RobustMAD


def normalize(
Expand All @@ -22,8 +22,8 @@ def normalize(
output_file="none",
compression=None,
float_format=None,
whiten_center=True,
whiten_method="ZCA-cor",
spherize_center=True,
spherize_method="ZCA-cor",
):
"""
Normalize features
Expand All @@ -46,11 +46,11 @@ def normalize(
that this output file be suffixed with "_normalized.csv".
compression - the mechanism to compress [default: None]
float_format - decimal precision to use in writing output file [default: None]
For example, use "%.3g" for 3 decimal precision.
whiten_center - if data should be centered before whitening transform [default: True]
(only used if method = "whiten")
whiten_method - the type of whitening normalization used [default: 'ZCA-cor']
(only used if method = "whiten")
For example, use "%.3g" for 3 significant digits.
spherize_center - if data should be centered before sphering (aka whitening)
transform (only used if method = "spherize") [default: True]
spherize_method - the type of sphering (aka whitening) normalization used (only
used if method = "spherize") [default: 'ZCA-cor']

Return:
A normalized DataFrame
Expand All @@ -62,7 +62,7 @@ def normalize(
# Define which scaler to use
method = method.lower()

avail_methods = ["standardize", "robustize", "mad_robustize", "whiten"]
avail_methods = ["standardize", "robustize", "mad_robustize", "spherize"]
assert method in avail_methods, "operation must be one {}".format(avail_methods)

if method == "standardize":
Expand All @@ -71,8 +71,8 @@ def normalize(
scaler = RobustScaler()
elif method == "mad_robustize":
scaler = RobustMAD()
elif method == "whiten":
scaler = Whiten(center=whiten_center, method=whiten_method)
elif method == "spherize":
scaler = Spherize(center=spherize_center, method=spherize_method)

if features == "infer":
features = infer_cp_features(profiles)
Expand Down
2 changes: 1 addition & 1 deletion pycytominer/operations/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from .correlation_threshold import correlation_threshold
from .variance_threshold import variance_threshold, calculate_frequency
from .get_na_columns import get_na_columns
from .transform import Whiten, RobustMAD
from .transform import Spherize, RobustMAD
from .sparse_random_projection import sparse_random_projection
19 changes: 10 additions & 9 deletions pycytominer/operations/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,11 @@
from sklearn.base import BaseEstimator, TransformerMixin


class Whiten(BaseEstimator, TransformerMixin):
class Spherize(BaseEstimator, TransformerMixin):
"""
Class to whiten data in the base sklearn transform API
Note, this implementation is modified/inspired from the following sources:
Class to apply a sphering transform (aka whitening) to data using the base sklearn
transform API. Note, this implementation is modified from and inspired by the
following sources:
1) A custom function written by Juan C. Caicedo
2) A custom ZCA function at https://github.com/mwv/zca
3) Notes from Niranj Chandrasekaran (https://github.com/cytomining/pycytominer/issues/90)
Expand All @@ -26,7 +27,7 @@ def __init__(self, epsilon=1e-6, center=True, method="ZCA"):
Arguments:
epsilon - fudge factor parameter
center - option to center input X matrix
method - a string indicating which class of whitening to perform
method - a string indicating which class of sphering to perform
"""
avail_methods = ["PCA", "ZCA", "PCA-cor", "ZCA-cor"]

Expand All @@ -40,10 +41,10 @@ def __init__(self, epsilon=1e-6, center=True, method="ZCA"):

def fit(self, X, y=None):
"""
Identify the whitening transform given self.X
Identify the sphering transform given X

Argument:
X - dataframe to fit whitening transform
X - dataframe to fit sphering transform
"""
# Get the mean of the features (columns) and center if specified
self.mu = X.mean()
Expand All @@ -63,7 +64,7 @@ def fit(self, X, y=None):
# Process the eigenvalues into a diagonal matrix and fix rounding errors
D = np.diag(1.0 / np.sqrt(s.clip(self.epsilon)))

# Calculate the whitening matrix
# Calculate the sphering matrix
self.W = np.dot(D, U.transpose())

# If ZCA, perform additional rotation
Expand Down Expand Up @@ -91,7 +92,7 @@ def fit(self, X, y=None):
# process the covariance diagonal matrix and fix rounding errors
v = np.diag(1.0 / np.sqrt(np.diag(C).clip(self.epsilon)))

# Calculate the whitening matrix
# Calculate the sphering matrix
self.W = np.dot(np.dot(D, G.transpose()), v)

# If ZCA-cor, perform additional rotation
Expand All @@ -102,7 +103,7 @@ def fit(self, X, y=None):

def transform(self, X, y=None):
"""
Perform the whitening transform
Perform the sphering transform
"""
return np.dot(X - self.mu, self.W.transpose())

Expand Down
32 changes: 16 additions & 16 deletions pycytominer/tests/test_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@
d_feature = random.sample(range(1, 100), 10)
id_feature = ["control"] * 5 + ["treatment"] * 5

data_whiten_df = pd.DataFrame(
data_spherize_df = pd.DataFrame(
{"a": a_feature, "b": b_feature, "c": c_feature, "d": d_feature, "id": id_feature}
).reset_index(drop=True)

Expand Down Expand Up @@ -438,16 +438,16 @@ def test_normalize_standardize_allsamples_compress():
pd.testing.assert_frame_equal(normalize_result, expected_result)


def test_normalize_whiten():
for whiten_method in ["ZCA", "PCA", "ZCA-cor", "PCA-cor"]:
for whiten_center in [True, False]:
def test_normalize_spherize():
for spherize_method in ["ZCA", "PCA", "ZCA-cor", "PCA-cor"]:
for spherize_center in [True, False]:
result = normalize(
data_whiten_df,
data_spherize_df,
features=["a", "b", "c", "d"],
meta_features=["id"],
method="whiten",
whiten_method=whiten_method,
whiten_center=whiten_center,
method="spherize",
spherize_method=spherize_method,
spherize_center=spherize_center,
)
result_cov = (
pd.DataFrame(np.cov(np.transpose(result.drop("id", axis="columns"))))
Expand All @@ -456,17 +456,17 @@ def test_normalize_whiten():
.clip(1)
.sum()
)
expected_result = data_whiten_df.shape[1] - 1
expected_result = data_spherize_df.shape[1] - 1
assert result_cov == expected_result

result = normalize(
data_whiten_df,
data_spherize_df,
samples="id == 'control'",
features=["a", "b", "c", "d"],
meta_features=["id"],
method="whiten",
whiten_method=whiten_method,
whiten_center=whiten_center,
method="spherize",
spherize_method=spherize_method,
spherize_center=spherize_center,
)
result_cov = (
np.cov(
Expand All @@ -480,10 +480,10 @@ def test_normalize_whiten():
.sum()
)
# Add some tolerance to result b/c of low sample size
expected_result = data_whiten_df.shape[1]
expected_result = data_spherize_df.shape[1]
assert result_cov < expected_result

non_whiten_result_cov = (
non_spherize_result_cov = (
np.cov(
np.transpose(
result.query("id == 'treatment'").drop("id", axis="columns")
Expand All @@ -493,4 +493,4 @@ def test_normalize_whiten():
.sum()
.sum()
)
assert non_whiten_result_cov >= expected_result - 5
assert non_spherize_result_cov >= expected_result - 5
26 changes: 13 additions & 13 deletions pycytominer/tests/test_operations/test_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import numpy as np
import pandas as pd
from scipy.stats import median_absolute_deviation
from pycytominer.operations.transform import Whiten, RobustMAD
from pycytominer.operations.transform import Spherize, RobustMAD

random.seed(123)

Expand All @@ -18,11 +18,11 @@
).reset_index(drop=True)


def test_whiten():
whiten_methods = ["PCA", "ZCA", "PCA-cor", "ZCA-cor"]
for method in whiten_methods:
def test_spherize():
spherize_methods = ["PCA", "ZCA", "PCA-cor", "ZCA-cor"]
for method in spherize_methods:
for center in [True, False]:
scaler = Whiten(method=method, center=center)
scaler = Spherize(method=method, center=center)
scaler = scaler.fit(data_df)
transform_df = scaler.transform(data_df)

Expand All @@ -40,24 +40,24 @@ def test_whiten():
assert int(result) == expected_result


def test_low_variance_whiten():
def test_low_variance_spherize():
err_str = "Divide by zero error, make sure low variance columns are removed"
data_no_variance = data_df.assign(e=1)
whiten_methods = ["PCA-cor", "ZCA-cor"]
for method in whiten_methods:
spherize_methods = ["PCA-cor", "ZCA-cor"]
for method in spherize_methods:
for center in [True, False]:
scaler = Whiten(method=method, center=center)
scaler = Spherize(method=method, center=center)
with pytest.raises(ValueError) as errorinfo:
scaler = scaler.fit(data_no_variance)

assert err_str in str(errorinfo.value.args[0])


def test_whiten_precenter():
def test_spherize_precenter():
data_precentered = data_df - data_df.mean()
whiten_methods = ["PCA", "ZCA", "PCA-cor", "ZCA-cor"]
for method in whiten_methods:
scaler = Whiten(method=method, center=False)
spherize_methods = ["PCA", "ZCA", "PCA-cor", "ZCA-cor"]
for method in spherize_methods:
scaler = Spherize(method=method, center=False)
scaler = scaler.fit(data_precentered)
transform_df = scaler.transform(data_df)

Expand Down