edits and add figures

d33bs · Jan 17, 2023 · a52b9c7 · a52b9c7
1 parent f758340
commit a52b9c7
Show file tree

Hide file tree

Showing 6 changed files with 81 additions and 68 deletions.
diff --git a/5_analyze_data/notebooks/Heatmap_analysis/figures/correlation_clustermap_sc.png b/5_analyze_data/notebooks/Heatmap_analysis/figures/correlation_clustermap_sc.png
diff --git a/5_analyze_data/notebooks/Heatmap_analysis/figures/correlation_heatmap_sc.png b/5_analyze_data/notebooks/Heatmap_analysis/figures/correlation_heatmap_sc.png
diff --git a/5_analyze_data/notebooks/Heatmap_analysis/nf1_heatmap.ipynb b/5_analyze_data/notebooks/Heatmap_analysis/nf1_heatmap.ipynb
diff --git a/5_analyze_data/notebooks/Heatmap_analysis/nf1_heatmap.py b/5_analyze_data/notebooks/Heatmap_analysis/nf1_heatmap.py
@@ -5,7 +5,7 @@
 
 # ## Import libraries
 
-# In[1]:
+# In[8]:
 
 
 import matplotlib.pyplot as plt
@@ -16,7 +16,7 @@
 
 # ## Read in NF1 data `csv`
 
-# In[2]:
+# In[9]:
 
 
 norm_fs_data = pathlib.Path("../../../4_processing_features/data/nf1_sc_norm_fs_cellprofiler.csv.gz")
@@ -29,7 +29,7 @@
 
 # ## Helper function to split `csv` into metadata and features
 
-# In[3]:
+# In[10]:
 
 
 def split_data(pycytominer_output: pd.DataFrame):
@@ -60,7 +60,7 @@ def split_data(pycytominer_output: pd.DataFrame):
 
 # ## Split NF1 data `csv`
 
-# In[4]:
+# In[11]:
 
 
 metadata_dataframe, feature_data = split_data(data)
@@ -69,7 +69,7 @@ def split_data(pycytominer_output: pd.DataFrame):
 
 # ## Transpose the NF1 dataframe
 
-# In[5]:
+# In[12]:
 
 
 data_trans = feature_data.transpose()
@@ -78,20 +78,26 @@ def split_data(pycytominer_output: pd.DataFrame):
 
 # ## Create correlation heatmap
 
-# In[6]:
+# In[13]:
 
 
 data_trans_heatmap = sb.heatmap(data_trans.corr())
 
+save_path = pathlib.Path("figures/correlation_heatmap_sc.png")
+plt.savefig(save_path, bbox_inches="tight")  # save before plt.show(): saving afterwards can write an empty figure
+
 plt.show()
 
 
 # ## Create clustermap with correlation heatmap
 
-# In[7]:
+# In[14]:
 
 
 sb.clustermap(data_trans.corr(), 
             cmap='RdBu_r',
             )
 
+save_path = pathlib.Path("figures/correlation_clustermap_sc.png")
+plt.savefig(save_path, bbox_inches="tight")
+
diff --git a/5_analyze_data/notebooks/KS-test_analysis/nf1_ks_test.ipynb b/5_analyze_data/notebooks/KS-test_analysis/nf1_ks_test.ipynb
@@ -18,7 +18,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -38,7 +38,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -55,7 +55,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 17,
    "metadata": {},
    "outputs": [
     {
@@ -309,7 +309,7 @@
        "[5 rows x 1055 columns]"
       ]
      },
-     "execution_count": 10,
+     "execution_count": 17,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -333,17 +333,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [],
    "source": [
-    "def nf1_ks_test_two_sample(data: pd.DataFrame):\n",
-    "    \"\"\"seperate features by genotype and perform two sample ks-test on each feature\n",
+    "def nf1_ks_test_two_sample(normalized_data: pd.DataFrame):\n",
+    "    \"\"\"separate features by genotype and perform two sample ks-test on each feature\n",
     "\n",
     "    Parameters\n",
     "    ----------\n",
-    "    data : pd.Dataframe\n",
-    "        pycytominer output after normalization and feature selection\n",
+    "    normalized_data : pd.DataFrame\n",
+    "        pycytominer output after normalization\n",
     "\n",
     "    Returns\n",
     "    -------\n",
@@ -353,11 +353,11 @@
     "    feature_results = []\n",
     "\n",
     "    # divide the NF1 data based on genotype\n",
-    "    null_features = data[(data[\"Metadata_genotype\"] == \"Null\")]\n",
-    "    wt_features = data[(data[\"Metadata_genotype\"] == \"WT\")]\n",
+    "    null_features = normalized_data[(normalized_data[\"Metadata_genotype\"] == \"Null\")]\n",
+    "    wt_features = normalized_data[(normalized_data[\"Metadata_genotype\"] == \"WT\")]\n",
     "\n",
     "    # iterate through the columns in the data (both of the genotype dataframes will have the same columns)\n",
-    "    for column in data:\n",
+    "    for column in normalized_data:\n",
     "        # do not include metadata columns\n",
     "        if \"Metadata\" not in column:\n",
     "            # convert each individual column (feature) into numpy array\n",
@@ -366,8 +366,9 @@
     "            \n",
     "            # run two-sample ks-test for each feature \n",
     "            results = ks_2samp(wt_feature, null_feature)\n",
-    "            # have to seperate out namedtuple due to scipy hiding the last two results \n",
-    "            results = tuple([results.statistic, results.pvalue, results.statistic_location, results.statistic_sign])\n",
+    "            # scipy hides statistic_location and statistic_sign from plain tuple unpacking, so\n",
+    "            # read every field through _asdict() and keep the values as a tuple\n",
+    "            results = tuple(list(results._asdict().values()))\n",
     "            feature_results.append(results)\n",
     "\n",
     "    feature_results = pd.DataFrame(feature_results, columns=[\"statistic\", \"pvalue\", \"statistic_location\", \"statistic_sign\"])\n",
@@ -414,12 +415,12 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Peform two sample KS-test"
+    "## Perform two sample KS-test"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 19,
    "metadata": {},
    "outputs": [
     {
@@ -549,7 +550,7 @@
        "[1043 rows x 4 columns]"
       ]
      },
-     "execution_count": 12,
+     "execution_count": 19,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -569,7 +570,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 20,
    "metadata": {},
    "outputs": [
     {
@@ -663,7 +664,7 @@
        "[1043 rows x 1 columns]"
       ]
      },
-     "execution_count": 13,
+     "execution_count": 20,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -692,7 +693,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 21,
    "metadata": {},
    "outputs": [
     {
@@ -847,7 +848,7 @@
        "[1043 rows x 5 columns]"
       ]
      },
-     "execution_count": 14,
+     "execution_count": 21,
      "metadata": {},
      "output_type": "execute_result"
     }

diff --git a/5_analyze_data/notebooks/KS-test_analysis/nf1_ks_test.py b/5_analyze_data/notebooks/KS-test_analysis/nf1_ks_test.py
@@ -5,7 +5,7 @@
 
 # ## Import libraries
 
-# In[8]:
+# In[15]:
 
 
 import numpy as np
@@ -16,15 +16,15 @@
 
 # ## Set seed
 
-# In[9]:
+# In[16]:
 
 
 np.random.seed(0)
 
 
 # ## Load in NF1 data
 
-# In[10]:
+# In[17]:
 
 
 norm_fs_data = pathlib.Path("../../../4_processing_features/data/nf1_sc_norm_cellprofiler.csv.gz")
@@ -37,16 +37,16 @@
 
 # ## Helper functions to perform KS-test and create final `csv` file with results
 
-# In[11]:
+# In[18]:
 
 
-def nf1_ks_test_two_sample(data: pd.DataFrame):
-    """seperate features by genotype and perform two sample ks-test on each feature
+def nf1_ks_test_two_sample(normalized_data: pd.DataFrame):
+    """separate features by genotype and perform two sample ks-test on each feature
 
     Parameters
     ----------
-    data : pd.Dataframe
-        pycytominer output after normalization and feature selection
+    normalized_data : pd.DataFrame
+        pycytominer output after normalization
 
     Returns
     -------
@@ -56,11 +56,11 @@ def nf1_ks_test_two_sample(data: pd.DataFrame):
     feature_results = []
 
     # divide the NF1 data based on genotype
-    null_features = data[(data["Metadata_genotype"] == "Null")]
-    wt_features = data[(data["Metadata_genotype"] == "WT")]
+    null_features = normalized_data[(normalized_data["Metadata_genotype"] == "Null")]
+    wt_features = normalized_data[(normalized_data["Metadata_genotype"] == "WT")]
 
     # iterate through the columns in the data (both of the genotype dataframes will have the same columns)
-    for column in data:
+    for column in normalized_data:
         # do not include metadata columns
         if "Metadata" not in column:
             # convert each individual column (feature) into numpy array
@@ -69,8 +69,9 @@ def nf1_ks_test_two_sample(data: pd.DataFrame):
 
             # run two-sample ks-test for each feature 
             results = ks_2samp(wt_feature, null_feature)
-            # have to seperate out namedtuple due to scipy hiding the last two results 
-            results = tuple([results.statistic, results.pvalue, results.statistic_location, results.statistic_sign])
+            # scipy hides statistic_location and statistic_sign from plain tuple unpacking, so
+            # read every field through _asdict() and keep the values as a tuple
+            results = tuple(list(results._asdict().values()))
             feature_results.append(results)
 
     feature_results = pd.DataFrame(feature_results, columns=["statistic", "pvalue", "statistic_location", "statistic_sign"])
@@ -112,9 +113,9 @@ def merge_features_kstest(
     return merged_dataframe
 
 
-# ## Peform two sample KS-test
+# ## Perform two sample KS-test
 
-# In[12]:
+# In[19]:
 
 
 feature_results = nf1_ks_test_two_sample(data)
@@ -123,7 +124,7 @@ def merge_features_kstest(
 
 # ## Take feature columns from data and create a list
 
-# In[13]:
+# In[20]:
 
 
 # find feature names in the columns from the data
@@ -141,7 +142,7 @@ def merge_features_kstest(
 
 # ## Save the final `csv` file with merged features and results
 
-# In[14]:
+# In[21]:
 
 
 save_path = pathlib.Path("data/nf1_kstest_two_sample_results.csv")