Merge pull request WayScience#17 from jenna-tomkinson/edit_features_module

Edit 4_processing_features module
d33bs · Nov 28, 2022 · 1e70c20 · 1e70c20
2 parents 5536d86 + 861d71b
commit 1e70c20
Show file tree

Hide file tree

Showing 9 changed files with 167 additions and 37 deletions.
diff --git a/3_extracting_features/metadata/NF1_annotations.csv b/3_extracting_features/metadata/NF1_annotations.csv
@@ -1,9 +1,9 @@
-Plate,Well,Gene Identifier,Gene Symbol,Genotype,Channels
-1,C6,ENSG00000196712,NF1,WT,DAPI (nuclei); GFP (endoplasmic reticulum); RFP (actin/cytoplasm)
-1,C7,ENSG00000196712,NF1,Het,DAPI (nuclei); GFP (endoplasmic reticulum); RFP (actin/cytoplasm)
-1,D6,ENSG00000196712,NF1,WT,DAPI (nuclei); GFP (endoplasmic reticulum); RFP (actin/cytoplasm)
-1,D7,ENSG00000196712,NF1,Het,DAPI (nuclei); GFP (endoplasmic reticulum); RFP (actin/cytoplasm)
-1,E6,ENSG00000196712,NF1,WT,DAPI (nuclei); GFP (endoplasmic reticulum); RFP (actin/cytoplasm)
-1,E7,ENSG00000196712,NF1,Het,DAPI (nuclei); GFP (endoplasmic reticulum); RFP (actin/cytoplasm)
-1,F6,ENSG00000196712,NF1,WT,DAPI (nuclei); GFP (endoplasmic reticulum); RFP (actin/cytoplasm)
-1,F7,ENSG00000196712,NF1,Het,DAPI (nuclei); GFP (endoplasmic reticulum); RFP (actin/cytoplasm)
+Plate,Well,Gene Identifier,Gene Symbol,Genotype,Channels
+1,C6,ENSG00000196712,NF1,WT,DAPI (nuclei); GFP (endoplasmic reticulum); RFP (actin/cytoplasm)
+1,C7,ENSG00000196712,NF1,Null,DAPI (nuclei); GFP (endoplasmic reticulum); RFP (actin/cytoplasm)
+1,D6,ENSG00000196712,NF1,WT,DAPI (nuclei); GFP (endoplasmic reticulum); RFP (actin/cytoplasm)
+1,D7,ENSG00000196712,NF1,Null,DAPI (nuclei); GFP (endoplasmic reticulum); RFP (actin/cytoplasm)
+1,E6,ENSG00000196712,NF1,WT,DAPI (nuclei); GFP (endoplasmic reticulum); RFP (actin/cytoplasm)
+1,E7,ENSG00000196712,NF1,Null,DAPI (nuclei); GFP (endoplasmic reticulum); RFP (actin/cytoplasm)
+1,F6,ENSG00000196712,NF1,WT,DAPI (nuclei); GFP (endoplasmic reticulum); RFP (actin/cytoplasm)
+1,F7,ENSG00000196712,NF1,Null,DAPI (nuclei); GFP (endoplasmic reticulum); RFP (actin/cytoplasm)
diff --git a/4_processing_features/4.extract_sc_features.sh b/4_processing_features/4.extract_sc_features.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+jupyter nbconvert --to python extract_single_cell_features.ipynb
+python extract_single_cell_features.py
diff --git a/4_processing_features/README.md b/4_processing_features/README.md
@@ -0,0 +1,43 @@
+# 4. Processing Extracted Single Cell Features 
+
+In this module, we present our pipeline for processing the `.sqlite` file of single cell features output by CellProfiler.
+The processed features are saved as compressed `.csv.gz` files for use during statistical analysis.
+
+## Pycytominer
+
+We use [Pycytominer](https://github.com/cytomining/pycytominer) to perform the aggregation, merging, and normalization of the NF1 single cell features.
+
+For more information regarding the functions that we used, please see [the documentation](https://pycytominer.readthedocs.io/en/latest/pycytominer.cyto_utils.html#pycytominer.cyto_utils.cells.SingleCells.merge_single_cells) from the Pycytominer team.
+
+### Normalization
+
+CellProfiler features can display a variety of distributions across cells.
+To facilitate analysis, we standardize all features (z-score) to the same scale.
+
+---
+
+## Step 1: Setup Processing Feature Environment
+
+### Step 1a: Create Environment
+
+Make sure you are in the `4_processing_features` directory before running the command below.
+
+```sh
+# Run this command in terminal to create the conda environment
+conda env create -f 4.processing_features.yml
+```
+
+## Step 2: Normalize Single Cell Features
+
+### Step 2a: Set Up Paths
+
+Within the [extract_single_cell_features.ipynb](extract_single_cell_features.ipynb) notebook, you can change the paths for the various parameters (e.g. CellProfiler directory, output directory, path to sqlite file, etc.) to reflect the local paths or names on your machine (***IF* you changed anything from the original pipeline**).
+
+### Step 2b: Run Extract Single Cell Features
+
+Using the code below, run the notebook to extract and normalize single cell features from CellProfiler.
+
+```bash
+# Run this script in terminal
+bash 4.extract_sc_features.sh
+```
diff --git a/4_processing_features/data/nf1_sc_cellprofiler.csv.gz b/4_processing_features/data/nf1_sc_cellprofiler.csv.gz
diff --git a/4_processing_features/data/nf1_sc_norm_cellprofiler.csv.gz b/4_processing_features/data/nf1_sc_norm_cellprofiler.csv.gz
diff --git a/4_processing_features/extract_single_cell_features.ipynb b/4_processing_features/extract_single_cell_features.ipynb
@@ -5,7 +5,15 @@
    "id": "70a2251c-2a69-43f1-92bf-793095abc2cd",
    "metadata": {},
    "source": [
-    "## Process single cell morphology features for CellProfiler readouts"
+    "# Process single cell morphology features for CellProfiler readouts"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f7a50486",
+   "metadata": {},
+   "source": [
+    "## Import Libraries"
    ]
   },
   {
@@ -22,6 +30,14 @@
     "from pycytominer.cyto_utils import cells, output"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "5f3929d2",
+   "metadata": {},
+   "source": [
+    "## Set up paths to CellProfiler directory and output"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 2,
@@ -33,14 +49,24 @@
     "cp_dir = \"../CellProfiler_pipelines\"\n",
     "output_dir = \"data\"\n",
     "\n",
+    "# Set name and path of .sqlite file and path to metadata\n",
     "sql_file = \"NF1_data.sqlite\"\n",
     "single_cell_file = f\"sqlite:///{cp_dir}/Analysis_Output/{sql_file}\"\n",
     "platemap_file = f\"{cp_dir}/Metadata/platemap_NF1_CP.csv\"\n",
     "\n",
+    "# Set path with name for outputted data\n",
     "sc_output_file = pathlib.Path(f\"{output_dir}/nf1_sc_cellprofiler.csv.gz\")\n",
     "sc_norm_output_file = pathlib.Path(f\"{output_dir}/nf1_sc_norm_cellprofiler.csv.gz\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "728f5105",
+   "metadata": {},
+   "source": [
+    "## Set up names for linking columns between tables in the database file"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 3,
@@ -59,6 +85,14 @@
     "}"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "3884560c",
+   "metadata": {},
+   "source": [
+    "## Load and view platemap file"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 4,
@@ -108,7 +142,7 @@
        "      <td>7</td>\n",
        "      <td>C7</td>\n",
        "      <td>NF1</td>\n",
-       "      <td>Het</td>\n",
+       "      <td>Null</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
@@ -124,7 +158,7 @@
        "      <td>7</td>\n",
        "      <td>D7</td>\n",
        "      <td>NF1</td>\n",
-       "      <td>Het</td>\n",
+       "      <td>Null</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
@@ -140,7 +174,7 @@
        "      <td>7</td>\n",
        "      <td>E7</td>\n",
        "      <td>NF1</td>\n",
-       "      <td>Het</td>\n",
+       "      <td>Null</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>6</th>\n",
@@ -156,7 +190,7 @@
        "      <td>7</td>\n",
        "      <td>F7</td>\n",
        "      <td>NF1</td>\n",
-       "      <td>Het</td>\n",
+       "      <td>Null</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
@@ -165,13 +199,13 @@
       "text/plain": [
        "  WellRow  WellCol well_position gene_name genotype\n",
        "0       C        6            C6       NF1       WT\n",
-       "1       C        7            C7       NF1      Het\n",
+       "1       C        7            C7       NF1     Null\n",
        "2       D        6            D6       NF1       WT\n",
-       "3       D        7            D7       NF1      Het\n",
+       "3       D        7            D7       NF1     Null\n",
        "4       E        6            E6       NF1       WT\n",
-       "5       E        7            E7       NF1      Het\n",
+       "5       E        7            E7       NF1     Null\n",
        "6       F        6            F6       NF1       WT\n",
-       "7       F        7            F7       NF1      Het"
+       "7       F        7            F7       NF1     Null"
       ]
      },
      "execution_count": 4,
@@ -185,6 +219,14 @@
     "platemap_df"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "72daff37",
+   "metadata": {},
+   "source": [
+    "## Set up `SingleCells` class from Pycytominer"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 5,
@@ -195,7 +237,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/Users/waygr/miniforge3/envs/4.process-nf1-features/lib/python3.8/site-packages/pycytominer/cyto_utils/util.py:61: UserWarning: Non-canonical compartment detected: per_cells, per_cytoplasm, per_nuclei\n",
+      "/home/jenna/anaconda3/envs/4.process-nf1-features/lib/python3.8/site-packages/pycytominer/cyto_utils/util.py:61: UserWarning: Non-canonical compartment detected: per_cells, per_cytoplasm, per_nuclei\n",
       "  warnings.warn(warn_str)\n"
      ]
     }
@@ -214,6 +256,14 @@
     ")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "f6a9d05d",
+   "metadata": {},
+   "source": [
+    "## Merge single cells "
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 6,
@@ -492,6 +542,14 @@
     "sc_df.head()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "6debbc47",
+   "metadata": {},
+   "source": [
+    "## Normalize Data"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 7,
@@ -772,6 +830,8 @@
    "id": "480448ba-a0fc-4c4f-94e2-311543dce6df",
    "metadata": {},
    "source": [
+    "---\n",
+    "\n",
     "### Visualize basic count statistics"
    ]
   },
@@ -784,8 +844,8 @@
     {
      "data": {
       "text/plain": [
-       "Het    116\n",
-       "WT      33\n",
+       "Null    116\n",
+       "WT       33\n",
        "Name: Metadata_genotype, dtype: int64"
       ]
      },
@@ -848,7 +908,7 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>Het</th>\n",
+       "      <th>Null</th>\n",
        "      <td>0</td>\n",
        "      <td>12</td>\n",
        "      <td>0</td>\n",
@@ -876,7 +936,7 @@
       "text/plain": [
        "Metadata_Well      C6  C7  D6  D7  E6  E7  F6  F7\n",
        "Metadata_genotype                                \n",
-       "Het                 0  12   0  14   0  44   0  46\n",
+       "Null                0  12   0  14   0  44   0  46\n",
        "WT                 12   0   5   0   9   0   7   0"
       ]
      },
@@ -892,9 +952,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python [conda env:4.process-nf1-features] *",
+   "display_name": "Python 3.8.13 ('4.process-nf1-features')",
    "language": "python",
-   "name": "conda-env-4.process-nf1-features-py"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -907,6 +967,11 @@
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
    "version": "3.8.13"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "6e6aca846613de2bb537f4a3be07c319e65610ef8366b19567099a39e74b14d7"
+   }
   }
  },
  "nbformat": 4,

diff --git a/...converted/extract_single_cell_features.py → ..._features/extract_single_cell_features.py b/...converted/extract_single_cell_features.py → ..._features/extract_single_cell_features.py
@@ -1,7 +1,9 @@
 #!/usr/bin/env python
 # coding: utf-8
 
-# ## Process single cell morphology features for CellProfiler readouts
+# # Process single cell morphology features for CellProfiler readouts
+
+# ## Import Libraries
 
 # In[1]:
 
@@ -13,21 +15,27 @@
 from pycytominer.cyto_utils import cells, output
 
 
+# ## Set up paths to CellProfiler directory and output
+
 # In[2]:
 
 
 # Set file and directory constants
 cp_dir = "../CellProfiler_pipelines"
 output_dir = "data"
 
+# Set name and path of .sqlite file and path to metadata
 sql_file = "NF1_data.sqlite"
 single_cell_file = f"sqlite:///{cp_dir}/Analysis_Output/{sql_file}"
 platemap_file = f"{cp_dir}/Metadata/platemap_NF1_CP.csv"
 
+# Set path with name for outputted data
 sc_output_file = pathlib.Path(f"{output_dir}/nf1_sc_cellprofiler.csv.gz")
 sc_norm_output_file = pathlib.Path(f"{output_dir}/nf1_sc_norm_cellprofiler.csv.gz")
 
 
+# ## Set up names for linking columns between tables in the database file
+
 # In[3]:
 
 
@@ -42,6 +50,8 @@
 }
 
 
+# ## Load and view platemap file
+
 # In[4]:
 
 
@@ -50,6 +60,8 @@
 platemap_df
 
 
+# ## Set up `SingleCells` class from Pycytominer
+
 # In[5]:
 
 
@@ -66,6 +78,8 @@
 )
 
 
+# ## Merge single cells 
+
 # In[6]:
 
 
@@ -84,6 +98,8 @@
 sc_df.head()
 
 
+# ## Normalize Data
+
 # In[7]:
 
 
@@ -99,6 +115,8 @@
 normalize_sc_df.head()
 
 
+# ---
+# 
 # ### Visualize basic count statistics
 
 # In[8]: