Skip to content

Commit

Permalink
DP feature linear modeling and visualization (WayScience#31)
Browse files Browse the repository at this point in the history
* perform linear model with DP features

* add DP visualization for linear model

* add DP lm figure and small tweak to CP lm fig

* remove extra empty cell

* add DP cyto feature power analysis and viz

* add title to CP power analysis viz
  • Loading branch information
gwaybio committed Jan 20, 2023
1 parent 62170be commit a90feed
Show file tree
Hide file tree
Showing 16 changed files with 9,420 additions and 43 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -387,14 +387,6 @@
"dev.off()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "20688dd7-b20e-4509-98d4-e607c8d7f3e8",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "f0989e9b-604d-459d-bdda-bfe8752f5be3",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,6 @@ png(paste0(cp_heatmap_file_noext, ".png"), width = 6.5, height = 6, units = "in"
draw(ht)
dev.off()



# Load data
dp_file <- file.path(input_data_dir, "nf1_sc_norm_fs_deepprofiler_nuc.csv.gz")

Expand Down
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
582 changes: 577 additions & 5 deletions 5_analyze_data/notebooks/linear_model/fit_linear_model.ipynb

Large diffs are not rendered by default.

100 changes: 96 additions & 4 deletions 5_analyze_data/notebooks/linear_model/fit_linear_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,19 @@
# Define inputs and outputs
data_dir = pathlib.Path("..", "..", "..", "4_processing_features", "data")
cp_file = pathlib.Path(data_dir, "nf1_sc_norm_cellprofiler.csv.gz")
dp_file = pathlib.Path(data_dir, "nf1_sc_norm_fs_deepprofiler_cyto.csv.gz")

output_dir = pathlib.Path("results")
output_file = pathlib.Path(output_dir, "linear_model_cp_features.tsv")
output_cp_file = pathlib.Path(output_dir, "linear_model_cp_features.tsv")
output_dp_file = pathlib.Path(output_dir, "linear_model_dp_features.tsv")


# ## CellProfiler features

# In[3]:


# Load data


cp_df = pd.read_csv(cp_file, index_col=0)

# Define CellProfiler features
Expand Down Expand Up @@ -91,7 +93,7 @@
)

# Output file
lm_results.to_csv(output_file, sep="\t", index=False)
lm_results.to_csv(output_cp_file, sep="\t", index=False)

print(lm_results.shape)
lm_results.head()
Expand All @@ -103,3 +105,93 @@
# Small exploration visualization
lm_results.plot(x="cell_count_coef", y="WT_coef", kind="scatter")


# ## DeepProfiler features

# In[7]:


# Load data
dp_df = pd.read_csv(dp_file)

# Define CellProfiler features
dp_features = dp_df.columns[dp_df.columns.str.startswith("efficientnet")]

print(f"We are testing {len(dp_features)} DeepProfiler features")
print(dp_df.shape)
dp_df.head()


# In[8]:


# Merge number of single cells per well data
cell_count_df = (
dp_df
.groupby("Metadata_Well")["Metadata_Plate"]
.count()
.reset_index()
.rename(columns={"Metadata_Plate": "Metadata_number_of_singlecells"})
)

dp_df = dp_df.merge(cell_count_df, on="Metadata_Well")


# In[9]:


# Setup linear modeling framework
variables = ["Metadata_number_of_singlecells"]
X = dp_df.loc[:, variables]

# Add dummy matrix of categorical genotypes
genotype_x = pd.get_dummies(data=dp_df.Metadata_Genotype)

X = pd.concat([X, genotype_x], axis=1)

print(X.shape)
X.head()


# In[10]:


# Fit linear model for each feature
lm_results = []
for dp_feature in dp_features:
# Subset DP data to each individual feature (univariate test)
dp_subset_df = dp_df.loc[:, dp_feature]

# Fit linear model
lm = LinearRegression(fit_intercept=True)
lm_result = lm.fit(X=X, y=dp_subset_df)

# Extract Beta coefficients
# (contribution of feature to X covariates)
coef = lm_result.coef_

# Estimate fit (R^2)
r2_score = lm.score(X=X, y=dp_subset_df)

# Add results to a growing list
lm_results.append([dp_feature, r2_score] + list(coef))

# Convert results to a pandas DataFrame
lm_results = pd.DataFrame(
lm_results,
columns=["feature", "r2_score", "cell_count_coef", "Null_coef", "WT_coef"]
)

# Output file
lm_results.to_csv(output_dp_file, sep="\t", index=False)

print(lm_results.shape)
lm_results.head()


# In[11]:


# Small exploration visualization
lm_results.plot(x="cell_count_coef", y="WT_coef", kind="scatter")

378 changes: 374 additions & 4 deletions 5_analyze_data/notebooks/linear_model/lm_power_analysis.ipynb

Large diffs are not rendered by default.

87 changes: 87 additions & 0 deletions 5_analyze_data/notebooks/linear_model/lm_power_analysis.r
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ library(pwr)
suppressPackageStartupMessages(library(dplyr))

output_file <- file.path("results", "power_analysis_cp_features_lm.tsv")
output_dp_file <- file.path("results", "power_analysis_dp_cyto_features_lm.tsv")

# Load data
lm_results_file <- file.path("results", "linear_model_cp_features.tsv")
Expand Down Expand Up @@ -84,3 +85,89 @@ power_results_df %>%

print(dim(power_results_df))
head(power_results_df)

# Load data
lm_results_file <- file.path("results", "linear_model_dp_features.tsv")
lm_results_df <- readr::read_tsv(
lm_results_file,
col_types = readr::cols(.default="d", feature="c")
)

print(dim(lm_results_df))
head(lm_results_df)

# Load feature data (for calculating n)
data_dir <-file.path("..", "..", "..", "4_processing_features", "data")
dp_file <- file.path(data_dir, "nf1_sc_norm_deepprofiler_cyto.csv.gz")

dp_df <- readr::read_csv(
dp_file,
col_types = readr::cols(
.default="d",
Metadata_Plate="c",
Metadata_Well="c",
Metadata_Site="c",
Metadata_Plate_Map_Name="c",
Metadata_DNA="c",
Metadata_ER="c",
Metadata_Actin="c",
Metadata_Genotype="c",
Metadata_Genotype_Replicate="c",
Metadata_Model="c"
)
)
print(dim(dp_df))
head(dp_df, 3)

# Define constants for power analysis (n_samples is different from CP features)
n_conditions <- 2 # NF1 WT and Null
n_samples <- dim(dp_df)[1]

u <- n_conditions - 1
v <- n_samples - u - 1
sig_level <- 0.05 / dim(lm_results_df)[1]
power <- 0.8

print(c(u, v))
print(sig_level)

# Given all R2 values perform power analysis
all_power_results <- list()
for (dp_feature in lm_results_df$feature) {
# Subset to the given feature lm results
lm_result_subset_df <- lm_results_df %>%
dplyr::filter(feature == !!dp_feature)

# Pull out the estimated R2 value
r2_val <- lm_result_subset_df %>% dplyr::pull(r2_score)

# The power estimate is undefined for r2_val = 1, skip if so
if (r2_val == 1) {
all_power_results[[dp_feature]] <- c(dp_feature, u, v, sig_level, NULL, NULL)
next
}

# Transform R2 score to F2 effect size
f2_val <- r2_val / (1 - r2_val)

# Calculate power, note that v contains an estimate of sample size
power_result <- pwr.f2.test(u = u, v = NULL, f2 = f2_val, sig.level = sig_level, power = power)

# Calculate required sample size from the v formula
estimated_sample_size <- power_result$v + u + 1

# Save results for future visualization
all_power_results[[dp_feature]] <- c(dp_feature, u, v, sig_level, power, estimated_sample_size)

}

power_results_df <- do.call(rbind, all_power_results) %>% dplyr::as_tibble()

colnames(power_results_df) <- c("feature", "u", "v", "sig_level", "power", "estimated_sample_size")

# Output to file
power_results_df %>%
readr::write_tsv(output_dp_file)

print(dim(power_results_df))
head(power_results_df)
Loading

0 comments on commit a90feed

Please sign in to comment.