DP feature linear modeling and visualization (WayScience#31)

* perform linear model with DP features * add DP visualization for linear model * add DP lm figure and small tweak to CP lm fig * remove extra empty cell * add DP cyto feature power analysis and viz * add title to CP power analysis viz
d33bs · Jan 20, 2023 · a90feed · a90feed
1 parent 62170be
commit a90feed
Show file tree

Hide file tree

Showing 16 changed files with 9,420 additions and 43 deletions.
diff --git a/5_analyze_data/notebooks/Heatmap_analysis/nf1_complexheatmap.ipynb b/5_analyze_data/notebooks/Heatmap_analysis/nf1_complexheatmap.ipynb
@@ -387,14 +387,6 @@
     "dev.off()"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "20688dd7-b20e-4509-98d4-e607c8d7f3e8",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
   {
    "cell_type": "markdown",
    "id": "f0989e9b-604d-459d-bdda-bfe8752f5be3",

diff --git a/5_analyze_data/notebooks/Heatmap_analysis/nf1_complexheatmap.r b/5_analyze_data/notebooks/Heatmap_analysis/nf1_complexheatmap.r
@@ -87,8 +87,6 @@ png(paste0(cp_heatmap_file_noext, ".png"), width = 6.5, height = 6, units = "in"
 draw(ht)
 dev.off()
 
-
-
 # Load data
 dp_file <- file.path(input_data_dir, "nf1_sc_norm_fs_deepprofiler_nuc.csv.gz")
 

diff --git a/5_analyze_data/notebooks/linear_model/figures/linear_model_cp_features.png b/5_analyze_data/notebooks/linear_model/figures/linear_model_cp_features.png
diff --git a/5_analyze_data/notebooks/linear_model/figures/linear_model_dp_features.png b/5_analyze_data/notebooks/linear_model/figures/linear_model_dp_features.png
diff --git a/5_analyze_data/notebooks/linear_model/figures/power_analysis_cp_lm.png b/5_analyze_data/notebooks/linear_model/figures/power_analysis_cp_lm.png
diff --git a/5_analyze_data/notebooks/linear_model/figures/power_analysis_dp_lm.png b/5_analyze_data/notebooks/linear_model/figures/power_analysis_dp_lm.png
diff --git a/5_analyze_data/notebooks/linear_model/fit_linear_model.ipynb b/5_analyze_data/notebooks/linear_model/fit_linear_model.ipynb
diff --git a/5_analyze_data/notebooks/linear_model/fit_linear_model.py b/5_analyze_data/notebooks/linear_model/fit_linear_model.py
@@ -22,17 +22,19 @@
 # Define inputs and outputs
 data_dir = pathlib.Path("..", "..", "..", "4_processing_features", "data")
 cp_file = pathlib.Path(data_dir, "nf1_sc_norm_cellprofiler.csv.gz")
+dp_file = pathlib.Path(data_dir, "nf1_sc_norm_fs_deepprofiler_cyto.csv.gz")
 
 output_dir = pathlib.Path("results")
-output_file = pathlib.Path(output_dir, "linear_model_cp_features.tsv")
+output_cp_file = pathlib.Path(output_dir, "linear_model_cp_features.tsv")
+output_dp_file = pathlib.Path(output_dir, "linear_model_dp_features.tsv")
 
 
+# ## CellProfiler features
+
 # In[3]:
 
 
 # Load data
-
-
 cp_df = pd.read_csv(cp_file, index_col=0)
 
 # Define CellProfiler features
@@ -91,7 +93,7 @@
 )
 
 # Output file
-lm_results.to_csv(output_file, sep="\t", index=False)
+lm_results.to_csv(output_cp_file, sep="\t", index=False)
 
 print(lm_results.shape)
 lm_results.head()
@@ -103,3 +105,93 @@
 # Small exploration visualization
 lm_results.plot(x="cell_count_coef", y="WT_coef", kind="scatter")
 
+
+# ## DeepProfiler features
+
+# In[7]:
+
+
+# Load data
+dp_df = pd.read_csv(dp_file)
+
+# Define CellProfiler features
+dp_features = dp_df.columns[dp_df.columns.str.startswith("efficientnet")]
+
+print(f"We are testing {len(dp_features)} DeepProfiler features")
+print(dp_df.shape)
+dp_df.head()
+
+
+# In[8]:
+
+
+# Merge number of single cells per well data
+cell_count_df = (
+    dp_df
+    .groupby("Metadata_Well")["Metadata_Plate"]
+    .count()
+    .reset_index()
+    .rename(columns={"Metadata_Plate": "Metadata_number_of_singlecells"})
+)
+
+dp_df = dp_df.merge(cell_count_df, on="Metadata_Well")
+
+
+# In[9]:
+
+
+# Setup linear modeling framework
+variables = ["Metadata_number_of_singlecells"]
+X = dp_df.loc[:, variables]
+
+# Add dummy matrix of categorical genotypes
+genotype_x = pd.get_dummies(data=dp_df.Metadata_Genotype)
+
+X = pd.concat([X, genotype_x], axis=1)
+
+print(X.shape)
+X.head()
+
+
+# In[10]:
+
+
+# Fit linear model for each feature
+lm_results = []
+for dp_feature in dp_features:
+    # Subset DP data to each individual feature (univariate test)
+    dp_subset_df = dp_df.loc[:, dp_feature]
+
+    # Fit linear model
+    lm = LinearRegression(fit_intercept=True)
+    lm_result = lm.fit(X=X, y=dp_subset_df)
+
+    # Extract Beta coefficients
+    # (contribution of feature to X covariates)
+    coef = lm_result.coef_
+
+    # Estimate fit (R^2)
+    r2_score = lm.score(X=X, y=dp_subset_df)
+
+    # Add results to a growing list
+    lm_results.append([dp_feature, r2_score] + list(coef))
+
+# Convert results to a pandas DataFrame
+lm_results = pd.DataFrame(
+    lm_results,
+    columns=["feature", "r2_score", "cell_count_coef", "Null_coef", "WT_coef"]
+)
+
+# Output file
+lm_results.to_csv(output_dp_file, sep="\t", index=False)
+
+print(lm_results.shape)
+lm_results.head()
+
+
+# In[11]:
+
+
+# Small exploration visualization
+lm_results.plot(x="cell_count_coef", y="WT_coef", kind="scatter")
+
diff --git a/5_analyze_data/notebooks/linear_model/lm_power_analysis.ipynb b/5_analyze_data/notebooks/linear_model/lm_power_analysis.ipynb
diff --git a/5_analyze_data/notebooks/linear_model/lm_power_analysis.r b/5_analyze_data/notebooks/linear_model/lm_power_analysis.r
@@ -2,6 +2,7 @@ library(pwr)
 suppressPackageStartupMessages(library(dplyr))
 
 output_file <- file.path("results", "power_analysis_cp_features_lm.tsv")
+output_dp_file <- file.path("results", "power_analysis_dp_cyto_features_lm.tsv")
 
 # Load data
 lm_results_file <- file.path("results", "linear_model_cp_features.tsv")
@@ -84,3 +85,89 @@ power_results_df %>%
 
 print(dim(power_results_df))
 head(power_results_df)
+
+# Load data
+lm_results_file <- file.path("results", "linear_model_dp_features.tsv")
+lm_results_df <- readr::read_tsv(
+    lm_results_file,
+    col_types = readr::cols(.default="d", feature="c")
+)
+
+print(dim(lm_results_df))
+head(lm_results_df)
+
+# Load feature data (for calculating n)
+data_dir <-file.path("..", "..", "..", "4_processing_features", "data")
+dp_file <- file.path(data_dir, "nf1_sc_norm_deepprofiler_cyto.csv.gz")
+
+dp_df <- readr::read_csv(
+    dp_file,
+    col_types = readr::cols(
+        .default="d",
+        Metadata_Plate="c",
+        Metadata_Well="c",
+        Metadata_Site="c",
+        Metadata_Plate_Map_Name="c",
+        Metadata_DNA="c",
+        Metadata_ER="c",
+        Metadata_Actin="c",
+        Metadata_Genotype="c",
+        Metadata_Genotype_Replicate="c",
+        Metadata_Model="c"
+    )
+)
+print(dim(dp_df))
+head(dp_df, 3)
+
+# Define constants for power analysis (n_samples is different from CP features)
+n_conditions <- 2  # NF1 WT and Null
+n_samples <- dim(dp_df)[1]
+
+u <- n_conditions - 1
+v <- n_samples - u - 1
+sig_level <- 0.05 / dim(lm_results_df)[1]
+power <- 0.8
+
+print(c(u, v))
+print(sig_level)
+
+# Given all R2 values perform power analysis
+all_power_results <- list()
+for (dp_feature in lm_results_df$feature) {
+    # Subset to the given feature lm results
+    lm_result_subset_df <- lm_results_df %>%
+        dplyr::filter(feature == !!dp_feature)
+
+    # Pull out the estimated R2 value
+    r2_val <- lm_result_subset_df %>% dplyr::pull(r2_score)
+
+    # The power estimate is undefined for r2_val = 1, skip if so
+    if (r2_val == 1) {
+        all_power_results[[dp_feature]] <- c(dp_feature, u, v, sig_level, NULL, NULL)
+        next
+    }
+
+    # Transform R2 score to F2 effect size
+    f2_val <- r2_val / (1 - r2_val)
+
+    # Calculate power, note that v contains an estimate of sample size
+    power_result <- pwr.f2.test(u = u, v = NULL, f2 = f2_val, sig.level = sig_level, power = power)
+
+    # Calculate required sample size from the v formula
+    estimated_sample_size <- power_result$v + u + 1
+
+    # Save results for future visualization
+    all_power_results[[dp_feature]] <- c(dp_feature, u, v, sig_level, power, estimated_sample_size)
+
+}
+
+power_results_df <- do.call(rbind, all_power_results) %>% dplyr::as_tibble()
+
+colnames(power_results_df) <- c("feature", "u", "v", "sig_level", "power", "estimated_sample_size")
+
+# Output to file
+power_results_df %>%
+    readr::write_tsv(output_dp_file)
+
+print(dim(power_results_df))
+head(power_results_df)