diff --git a/5_analyze_data/notebooks/Heatmap_analysis/nf1_heatmap.ipynb b/5_analyze_data/notebooks/Heatmap_analysis/nf1_heatmap.ipynb index 6255103..363121e 100644 --- a/5_analyze_data/notebooks/Heatmap_analysis/nf1_heatmap.ipynb +++ b/5_analyze_data/notebooks/Heatmap_analysis/nf1_heatmap.ipynb @@ -41,6 +41,13 @@ "execution_count": 2, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(149, 444)\n" + ] + }, { "data": { "text/html": [ @@ -206,339 +213,111 @@ " -1.781201\n", " 0.518641\n", " \n", - " \n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " \n", - " \n", - " 144\n", - " F\n", - " 7\n", - " 46\n", - " NF1\n", - " Null\n", - " 32\n", - " 1\n", - " F7\n", - " 8\n", - " 12\n", - " ...\n", - " -0.730713\n", - " -3.721664\n", - " 0.344692\n", - " 0.526792\n", - " 0.444591\n", - " 0.664847\n", - " 1.259085\n", - " 0.745004\n", - " 0.968347\n", - " -0.322389\n", - " \n", - " \n", - " 145\n", - " F\n", - " 7\n", - " 46\n", - " NF1\n", - " Null\n", - " 32\n", - " 1\n", - " F7\n", - " 9\n", - " 13\n", - " ...\n", - " 0.925737\n", - " -1.410478\n", - " 0.047610\n", - " -0.030179\n", - " -0.897448\n", - " -1.032699\n", - " 0.849219\n", - " 0.627234\n", - " 0.096817\n", - " -0.326481\n", - " \n", - " \n", - " 146\n", - " F\n", - " 7\n", - " 46\n", - " NF1\n", - " Null\n", - " 32\n", - " 1\n", - " F7\n", - " 10\n", - " 14\n", - " ...\n", - " 1.010381\n", - " -1.976427\n", - " -1.631117\n", - " -0.338652\n", - " -0.706971\n", - " -1.383757\n", - " 0.484107\n", - " 0.874705\n", - " 0.836486\n", - " -0.367298\n", - " \n", - " \n", - " 147\n", - " F\n", - " 7\n", - " 46\n", - " NF1\n", - " Null\n", - " 32\n", - " 1\n", - " F7\n", - " 11\n", - " 16\n", - " ...\n", - " 0.816456\n", - " -0.562581\n", - " -0.583551\n", - " 0.145784\n", - " 0.008261\n", - " -0.761469\n", - " -0.490888\n", - " 0.058825\n", - " -0.088908\n", - " -0.304325\n", - " \n", - " \n", - " 148\n", - " F\n", - " 7\n", - " 46\n", - " NF1\n", - " Null\n", - " 32\n", - " 1\n", - " F7\n", - " 12\n", - " 18\n", - " ...\n", - " 1.049336\n", - " -0.086971\n", - " -1.339268\n", - " -0.283924\n", - " -1.436801\n", - " -1.305860\n", - " 1.200450\n", - " 0.431040\n", - " 0.486067\n", - " -0.341519\n", - " \n", " \n", "\n", - "

149 rows × 444 columns

\n", + "

5 rows × 444 columns

\n", "" ], "text/plain": [ - " Metadata_WellRow Metadata_WellCol Metadata_number_of_singlecells \\\n", - "0 C 6 12 \n", - "1 C 6 12 \n", - "2 C 6 12 \n", - "3 C 6 12 \n", - "4 C 6 12 \n", - ".. ... ... ... \n", - "144 F 7 46 \n", - "145 F 7 46 \n", - "146 F 7 46 \n", - "147 F 7 46 \n", - "148 F 7 46 \n", + " Metadata_WellRow Metadata_WellCol Metadata_number_of_singlecells \\\n", + "0 C 6 12 \n", + "1 C 6 12 \n", + "2 C 6 12 \n", + "3 C 6 12 \n", + "4 C 6 12 \n", "\n", - " Metadata_gene_name Metadata_genotype Metadata_ImageNumber \\\n", - "0 NF1 WT 1 \n", - "1 NF1 WT 1 \n", - "2 NF1 WT 1 \n", - "3 NF1 WT 1 \n", - "4 NF1 WT 4 \n", - ".. ... ... ... \n", - "144 NF1 Null 32 \n", - "145 NF1 Null 32 \n", - "146 NF1 Null 32 \n", - "147 NF1 Null 32 \n", - "148 NF1 Null 32 \n", + " Metadata_gene_name Metadata_genotype Metadata_ImageNumber Metadata_Plate \\\n", + "0 NF1 WT 1 1 \n", + "1 NF1 WT 1 1 \n", + "2 NF1 WT 1 1 \n", + "3 NF1 WT 1 1 \n", + "4 NF1 WT 4 1 \n", "\n", - " Metadata_Plate Metadata_Well Metadata_Cytoplasm_Parent_Cells \\\n", - "0 1 C6 1 \n", - "1 1 C6 2 \n", - "2 1 C6 3 \n", - "3 1 C6 4 \n", - "4 1 C6 1 \n", - ".. ... ... ... \n", - "144 1 F7 8 \n", - "145 1 F7 9 \n", - "146 1 F7 10 \n", - "147 1 F7 11 \n", - "148 1 F7 12 \n", + " Metadata_Well Metadata_Cytoplasm_Parent_Cells \\\n", + "0 C6 1 \n", + "1 C6 2 \n", + "2 C6 3 \n", + "3 C6 4 \n", + "4 C6 1 \n", "\n", - " Metadata_Cytoplasm_Parent_OrigNuclei ... \\\n", - "0 4 ... \n", - "1 5 ... \n", - "2 7 ... \n", - "3 8 ... \n", - "4 3 ... \n", - ".. ... ... \n", - "144 12 ... \n", - "145 13 ... \n", - "146 14 ... \n", - "147 16 ... \n", - "148 18 ... \n", + " Metadata_Cytoplasm_Parent_OrigNuclei ... \\\n", + "0 4 ... \n", + "1 5 ... \n", + "2 7 ... \n", + "3 8 ... \n", + "4 3 ... \n", "\n", - " Nuclei_Texture_InfoMeas1_RFP_3_03_256 \\\n", - "0 -1.197488 \n", - "1 0.188414 \n", - "2 -1.087258 \n", - "3 -1.250742 \n", - "4 -0.258815 \n", - ".. ... \n", - "144 -0.730713 \n", - "145 0.925737 \n", - "146 1.010381 \n", - "147 0.816456 \n", - "148 1.049336 \n", + " Nuclei_Texture_InfoMeas1_RFP_3_03_256 \\\n", + "0 -1.197488 \n", + "1 0.188414 \n", + "2 -1.087258 \n", + "3 -1.250742 \n", + "4 -0.258815 \n", "\n", - " Nuclei_Texture_InfoMeas2_GFP_3_00_256 \\\n", - "0 0.289091 \n", - "1 0.611666 \n", - "2 0.843883 \n", - "3 0.638684 \n", - "4 -2.222128 \n", - ".. ... \n", - "144 -3.721664 \n", - "145 -1.410478 \n", - "146 -1.976427 \n", - "147 -0.562581 \n", - "148 -0.086971 \n", + " Nuclei_Texture_InfoMeas2_GFP_3_00_256 \\\n", + "0 0.289091 \n", + "1 0.611666 \n", + "2 0.843883 \n", + "3 0.638684 \n", + "4 -2.222128 \n", "\n", - " Nuclei_Texture_InfoMeas2_RFP_3_00_256 \\\n", - "0 0.969456 \n", - "1 0.481954 \n", - "2 -0.214887 \n", - "3 1.163023 \n", - "4 -0.048779 \n", - ".. ... \n", - "144 0.344692 \n", - "145 0.047610 \n", - "146 -1.631117 \n", - "147 -0.583551 \n", - "148 -1.339268 \n", + " Nuclei_Texture_InfoMeas2_RFP_3_00_256 \\\n", + "0 0.969456 \n", + "1 0.481954 \n", + "2 -0.214887 \n", + "3 1.163023 \n", + "4 -0.048779 \n", "\n", - " Nuclei_Texture_InfoMeas2_RFP_3_01_256 \\\n", - "0 1.131385 \n", - "1 0.748184 \n", - "2 0.238299 \n", - "3 1.062039 \n", - "4 0.504843 \n", - ".. ... \n", - "144 0.526792 \n", - "145 -0.030179 \n", - "146 -0.338652 \n", - "147 0.145784 \n", - "148 -0.283924 \n", + " Nuclei_Texture_InfoMeas2_RFP_3_01_256 \\\n", + "0 1.131385 \n", + "1 0.748184 \n", + "2 0.238299 \n", + "3 1.062039 \n", + "4 0.504843 \n", "\n", - " Nuclei_Texture_InfoMeas2_RFP_3_02_256 \\\n", - "0 1.303680 \n", - "1 0.750277 \n", - "2 0.482832 \n", - "3 1.082605 \n", - "4 1.340830 \n", - ".. ... \n", - "144 0.444591 \n", - "145 -0.897448 \n", - "146 -0.706971 \n", - "147 0.008261 \n", - "148 -1.436801 \n", + " Nuclei_Texture_InfoMeas2_RFP_3_02_256 \\\n", + "0 1.303680 \n", + "1 0.750277 \n", + "2 0.482832 \n", + "3 1.082605 \n", + "4 1.340830 \n", "\n", - " Nuclei_Texture_InfoMeas2_RFP_3_03_256 \\\n", - "0 1.416917 \n", - "1 0.511083 \n", - "2 1.264950 \n", - "3 1.386850 \n", - "4 0.924382 \n", - ".. ... \n", - "144 0.664847 \n", - "145 -1.032699 \n", - "146 -1.383757 \n", - "147 -0.761469 \n", - "148 -1.305860 \n", + " Nuclei_Texture_InfoMeas2_RFP_3_03_256 \\\n", + "0 1.416917 \n", + "1 0.511083 \n", + "2 1.264950 \n", + "3 1.386850 \n", + "4 0.924382 \n", "\n", - " Nuclei_Texture_InverseDifferenceMoment_GFP_3_03_256 \\\n", - "0 -0.079438 \n", - "1 -0.065958 \n", - "2 -0.069749 \n", - "3 -0.272864 \n", - "4 0.612704 \n", - ".. ... \n", - "144 1.259085 \n", - "145 0.849219 \n", - "146 0.484107 \n", - "147 -0.490888 \n", - "148 1.200450 \n", + " Nuclei_Texture_InverseDifferenceMoment_GFP_3_03_256 \\\n", + "0 -0.079438 \n", + "1 -0.065958 \n", + "2 -0.069749 \n", + "3 -0.272864 \n", + "4 0.612704 \n", "\n", - " Nuclei_Texture_InverseDifferenceMoment_RFP_3_00_256 \\\n", - "0 -2.315521 \n", - "1 -1.460076 \n", - "2 -1.841707 \n", - "3 -1.789888 \n", - "4 -2.158178 \n", - ".. ... \n", - "144 0.745004 \n", - "145 0.627234 \n", - "146 0.874705 \n", - "147 0.058825 \n", - "148 0.431040 \n", + " Nuclei_Texture_InverseDifferenceMoment_RFP_3_00_256 \\\n", + "0 -2.315521 \n", + "1 -1.460076 \n", + "2 -1.841707 \n", + "3 -1.789888 \n", + "4 -2.158178 \n", "\n", - " Nuclei_Texture_InverseDifferenceMoment_RFP_3_03_256 \\\n", - "0 -1.693210 \n", - "1 -1.427579 \n", - "2 -0.798368 \n", - "3 -1.432404 \n", - "4 -1.781201 \n", - ".. ... \n", - "144 0.968347 \n", - "145 0.096817 \n", - "146 0.836486 \n", - "147 -0.088908 \n", - "148 0.486067 \n", + " Nuclei_Texture_InverseDifferenceMoment_RFP_3_03_256 \\\n", + "0 -1.693210 \n", + "1 -1.427579 \n", + "2 -0.798368 \n", + "3 -1.432404 \n", + "4 -1.781201 \n", "\n", - " Nuclei_Texture_SumVariance_RFP_3_01_256 \n", - "0 2.881199 \n", - "1 0.304121 \n", - "2 0.257284 \n", - "3 1.083761 \n", - "4 0.518641 \n", - ".. ... \n", - "144 -0.322389 \n", - "145 -0.326481 \n", - "146 -0.367298 \n", - "147 -0.304325 \n", - "148 -0.341519 \n", + " Nuclei_Texture_SumVariance_RFP_3_01_256 \n", + "0 2.881199 \n", + "1 0.304121 \n", + "2 0.257284 \n", + "3 1.083761 \n", + "4 0.518641 \n", "\n", - "[149 rows x 444 columns]" + "[5 rows x 444 columns]" ] }, "execution_count": 2, @@ -550,7 +329,9 @@ "norm_fs_data = pathlib.Path(\"../../../4_processing_features/data/nf1_sc_norm_fs_cellprofiler.csv.gz\")\n", "\n", "data = pd.read_csv(norm_fs_data, index_col=0)\n", - "data" + "\n", + "print(data.shape)\n", + "data.head()" ] }, { @@ -1654,7 +1435,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 7, diff --git a/5_analyze_data/notebooks/Heatmap_analysis/nf1_heatmap.py b/5_analyze_data/notebooks/Heatmap_analysis/nf1_heatmap.py index 9cbd505..3d90467 100644 --- a/5_analyze_data/notebooks/Heatmap_analysis/nf1_heatmap.py +++ b/5_analyze_data/notebooks/Heatmap_analysis/nf1_heatmap.py @@ -22,7 +22,9 @@ norm_fs_data = pathlib.Path("../../../4_processing_features/data/nf1_sc_norm_fs_cellprofiler.csv.gz") data = pd.read_csv(norm_fs_data, index_col=0) -data + +print(data.shape) +data.head() # ## Helper function to split `csv` into metadata and features diff --git a/5_analyze_data/notebooks/KS-test_analysis/nf1_ks_test.ipynb b/5_analyze_data/notebooks/KS-test_analysis/nf1_ks_test.ipynb index 45eeef3..6169a4e 100644 --- a/5_analyze_data/notebooks/KS-test_analysis/nf1_ks_test.ipynb +++ b/5_analyze_data/notebooks/KS-test_analysis/nf1_ks_test.ipynb @@ -18,7 +18,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -38,7 +38,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -55,13 +55,272 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(149, 1055)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Metadata_WellRowMetadata_WellColMetadata_number_of_singlecellsMetadata_gene_nameMetadata_genotypeMetadata_ImageNumberMetadata_PlateMetadata_WellMetadata_Cytoplasm_Parent_CellsMetadata_Cytoplasm_Parent_OrigNuclei...Nuclei_Texture_SumVariance_RFP_3_02_256Nuclei_Texture_SumVariance_RFP_3_03_256Nuclei_Texture_Variance_GFP_3_00_256Nuclei_Texture_Variance_GFP_3_01_256Nuclei_Texture_Variance_GFP_3_02_256Nuclei_Texture_Variance_GFP_3_03_256Nuclei_Texture_Variance_RFP_3_00_256Nuclei_Texture_Variance_RFP_3_01_256Nuclei_Texture_Variance_RFP_3_02_256Nuclei_Texture_Variance_RFP_3_03_256
0C612NF1WT11C614...3.1415403.202273-0.097356-0.096165-0.094202-0.1064563.3379693.3505283.2781683.310371
1C612NF1WT11C625...0.3159240.258633-0.087971-0.069493-0.065539-0.0953770.3147760.3139200.3484200.318693
2C612NF1WT11C637...0.2952330.3831610.0652510.005550-0.015212-0.0290870.3484920.3339400.3413120.347999
3C612NF1WT11C648...1.1517251.1599650.0234030.0519310.026268-0.0020941.1846951.2435191.2637511.167156
4C612NF1WT41C613...0.6997230.628294-0.428904-0.416992-0.429383-0.4209970.6902980.6620060.6858830.701466
\n", + "

5 rows × 1055 columns

\n", + "
" + ], + "text/plain": [ + " Metadata_WellRow Metadata_WellCol Metadata_number_of_singlecells \\\n", + "0 C 6 12 \n", + "1 C 6 12 \n", + "2 C 6 12 \n", + "3 C 6 12 \n", + "4 C 6 12 \n", + "\n", + " Metadata_gene_name Metadata_genotype Metadata_ImageNumber Metadata_Plate \\\n", + "0 NF1 WT 1 1 \n", + "1 NF1 WT 1 1 \n", + "2 NF1 WT 1 1 \n", + "3 NF1 WT 1 1 \n", + "4 NF1 WT 4 1 \n", + "\n", + " Metadata_Well Metadata_Cytoplasm_Parent_Cells \\\n", + "0 C6 1 \n", + "1 C6 2 \n", + "2 C6 3 \n", + "3 C6 4 \n", + "4 C6 1 \n", + "\n", + " Metadata_Cytoplasm_Parent_OrigNuclei ... \\\n", + "0 4 ... \n", + "1 5 ... \n", + "2 7 ... \n", + "3 8 ... \n", + "4 3 ... \n", + "\n", + " Nuclei_Texture_SumVariance_RFP_3_02_256 \\\n", + "0 3.141540 \n", + "1 0.315924 \n", + "2 0.295233 \n", + "3 1.151725 \n", + "4 0.699723 \n", + "\n", + " Nuclei_Texture_SumVariance_RFP_3_03_256 \\\n", + "0 3.202273 \n", + "1 0.258633 \n", + "2 0.383161 \n", + "3 1.159965 \n", + "4 0.628294 \n", + "\n", + " Nuclei_Texture_Variance_GFP_3_00_256 Nuclei_Texture_Variance_GFP_3_01_256 \\\n", + "0 -0.097356 -0.096165 \n", + "1 -0.087971 -0.069493 \n", + "2 0.065251 0.005550 \n", + "3 0.023403 0.051931 \n", + "4 -0.428904 -0.416992 \n", + "\n", + " Nuclei_Texture_Variance_GFP_3_02_256 Nuclei_Texture_Variance_GFP_3_03_256 \\\n", + "0 -0.094202 -0.106456 \n", + "1 -0.065539 -0.095377 \n", + "2 -0.015212 -0.029087 \n", + "3 0.026268 -0.002094 \n", + "4 -0.429383 -0.420997 \n", + "\n", + " Nuclei_Texture_Variance_RFP_3_00_256 Nuclei_Texture_Variance_RFP_3_01_256 \\\n", + "0 3.337969 3.350528 \n", + "1 0.314776 0.313920 \n", + "2 0.348492 0.333940 \n", + "3 1.184695 1.243519 \n", + "4 0.690298 0.662006 \n", + "\n", + " Nuclei_Texture_Variance_RFP_3_02_256 Nuclei_Texture_Variance_RFP_3_03_256 \n", + "0 3.278168 3.310371 \n", + "1 0.348420 0.318693 \n", + "2 0.341312 0.347999 \n", + "3 1.263751 1.167156 \n", + "4 0.685883 0.701466 \n", + "\n", + "[5 rows x 1055 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "norm_fs_data = pathlib.Path(\"../../../4_processing_features/data/nf1_sc_norm_cellprofiler.csv.gz\")\n", "\n", - "data = pd.read_csv(norm_fs_data, compression=\"gzip\", index_col=0)" + "data = pd.read_csv(norm_fs_data, compression=\"gzip\", index_col=0)\n", + "\n", + "print(data.shape)\n", + "data.head()" ] }, { @@ -74,7 +333,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -93,13 +352,19 @@ " \"\"\"\n", " feature_results = []\n", "\n", + " # divide the NF1 data based on genotype\n", " null_features = data[(data[\"Metadata_genotype\"] == \"Null\")]\n", " wt_features = data[(data[\"Metadata_genotype\"] == \"WT\")]\n", "\n", + " # iterate through the columns in the data (both of the genotype dataframes will have the same columns)\n", " for column in data:\n", + " # do not include metadata columns\n", " if \"Metadata\" not in column:\n", + " # convert each individual column (feature) into numpy array\n", " null_feature = null_features[column].to_numpy()\n", " wt_feature = wt_features[column].to_numpy()\n", + " \n", + " # run two-sample ks-test for each feature \n", " results = ks_2samp(wt_feature, null_feature)\n", " # have to seperate out namedtuple due to scipy hiding the last two results \n", " results = tuple([results.statistic, results.pvalue, results.statistic_location, results.statistic_sign])\n", @@ -109,10 +374,9 @@ "\n", " return feature_results\n", "\n", - "\n", "def merge_features_kstest(\n", " feature_results: pd.DataFrame,\n", - " column_names: list,\n", + " feature_names: list,\n", " save_path: pathlib.Path = None,\n", "):\n", " \"\"\"\n", @@ -133,7 +397,7 @@ " merged dataframe with features and ks-test results\n", " \"\"\"\n", " # put dataframes into list of where the columns should go\n", - " dataframes = [column_names, feature_results]\n", + " dataframes = [feature_names, feature_results]\n", "\n", " # merge dataframes together\n", " merged_dataframe = pd.concat(dataframes, axis=1)\n", @@ -155,7 +419,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -285,7 +549,7 @@ "[1043 rows x 4 columns]" ] }, - "execution_count": 5, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -305,7 +569,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -399,18 +663,23 @@ "[1043 rows x 1 columns]" ] }, - "execution_count": 6, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "features = data.iloc[:,12:]\n", - "column_names = features.columns.tolist()\n", - "column_names = pd.DataFrame(column_names)\n", - "column_names.columns = [\"Features\"]\n", + "# find feature names in the columns from the data\n", + "feature_names = [\n", + " col_name\n", + " for col_name in data.columns.tolist()\n", + " if \"Metadata\" not in col_name\n", + " ]\n", + "\n", + "feature_names = pd.DataFrame(feature_names)\n", + "feature_names.columns = [\"Features\"]\n", "\n", - "column_names" + "feature_names" ] }, { @@ -423,7 +692,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -578,7 +847,7 @@ "[1043 rows x 5 columns]" ] }, - "execution_count": 7, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -586,7 +855,7 @@ "source": [ "save_path = pathlib.Path(\"data/nf1_kstest_two_sample_results.csv\")\n", "\n", - "merge_features_kstest(feature_results, column_names, save_path)" + "merge_features_kstest(feature_results, feature_names, save_path)" ] } ], diff --git a/5_analyze_data/notebooks/KS-test_analysis/nf1_ks_test.py b/5_analyze_data/notebooks/KS-test_analysis/nf1_ks_test.py index 62f748a..e8bac74 100644 --- a/5_analyze_data/notebooks/KS-test_analysis/nf1_ks_test.py +++ b/5_analyze_data/notebooks/KS-test_analysis/nf1_ks_test.py @@ -5,7 +5,7 @@ # ## Import libraries -# In[1]: +# In[8]: import numpy as np @@ -16,7 +16,7 @@ # ## Set seed -# In[2]: +# In[9]: np.random.seed(0) @@ -24,17 +24,20 @@ # ## Load in NF1 data -# In[3]: +# In[10]: norm_fs_data = pathlib.Path("../../../4_processing_features/data/nf1_sc_norm_cellprofiler.csv.gz") data = pd.read_csv(norm_fs_data, compression="gzip", index_col=0) +print(data.shape) +data.head() + # ## Helper functions to perform KS-test and create final `csv` file with results -# In[4]: +# In[11]: def nf1_ks_test_two_sample(data: pd.DataFrame): @@ -52,13 +55,19 @@ def nf1_ks_test_two_sample(data: pd.DataFrame): """ feature_results = [] + # divide the NF1 data based on genotype null_features = data[(data["Metadata_genotype"] == "Null")] wt_features = data[(data["Metadata_genotype"] == "WT")] + # iterate through the columns in the data (both of the genotype dataframes will have the same columns) for column in data: + # do not include metadata columns if "Metadata" not in column: + # convert each individual column (feature) into numpy array null_feature = null_features[column].to_numpy() wt_feature = wt_features[column].to_numpy() + + # run two-sample ks-test for each feature results = ks_2samp(wt_feature, null_feature) # have to seperate out namedtuple due to scipy hiding the last two results results = tuple([results.statistic, results.pvalue, results.statistic_location, results.statistic_sign]) @@ -68,10 +77,9 @@ def nf1_ks_test_two_sample(data: pd.DataFrame): return feature_results - def merge_features_kstest( feature_results: pd.DataFrame, - column_names: list, + feature_names: list, save_path: pathlib.Path = None, ): """ @@ -92,7 +100,7 @@ def merge_features_kstest( merged dataframe with features and ks-test results """ # put dataframes into list of where the columns should go - dataframes = [column_names, feature_results] + dataframes = [feature_names, feature_results] # merge dataframes together merged_dataframe = pd.concat(dataframes, axis=1) @@ -106,7 +114,7 @@ def merge_features_kstest( # ## Peform two sample KS-test -# In[5]: +# In[12]: feature_results = nf1_ks_test_two_sample(data) @@ -115,23 +123,28 @@ def merge_features_kstest( # ## Take feature columns from data and create a list -# In[6]: +# In[13]: + +# find feature names in the columns from the data +feature_names = [ + col_name + for col_name in data.columns.tolist() + if "Metadata" not in col_name + ] -features = data.iloc[:,12:] -column_names = features.columns.tolist() -column_names = pd.DataFrame(column_names) -column_names.columns = ["Features"] +feature_names = pd.DataFrame(feature_names) +feature_names.columns = ["Features"] -column_names +feature_names # ## Save the final `csv` file with merged features and results -# In[7]: +# In[14]: save_path = pathlib.Path("data/nf1_kstest_two_sample_results.csv") -merge_features_kstest(feature_results, column_names, save_path) +merge_features_kstest(feature_results, feature_names, save_path)