# Support notebook for Basic Protocol 2
# Annotate, normalize and feature select features for Single-Cell table.

In [1]:
import pandas as pd
import numpy as np
import pycytominer

# Inputs

- Import metadata table containing the names of Compounds, MOAs, Concentrations, and Plate. 

- Import raw table as a dataframe (obtained in Samples_retrieval.ipynb notebook)

In [2]:
path = r'G:\.shortcut-targets-by-id\1KeRLnpHR83EZFglJAK4Lzvtpxq504dxs\2022_Fossa_Cruz_InterpretingProfiles'

In [3]:
metadata = pd.read_csv(path + r'\Notebooks\Metadata_SQ00015195.csv')

In [4]:
df_raw = pd.read_csv(path + r'\Data\backend\SingleCellRetrieving\SQ00015195_raw.csv')

In [5]:
df_raw.head()

Unnamed: 0,Image_Metadata_Well,TableNumber,ImageNumber,ObjectNumber,Cells_AreaShape_Area,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,Cells_AreaShape_Compactness,Cells_AreaShape_Eccentricity,Cells_AreaShape_EulerNumber,...,Image_Width_IllumAGP,Image_Width_IllumDNA,Image_Width_IllumER,Image_Width_IllumMito,Image_Width_IllumRNA,Image_Width_OrigAGP,Image_Width_OrigDNA,Image_Width_OrigER,Image_Width_OrigMito,Image_Width_OrigRNA
0,A02,1240fc906b760746bfaa4913fdb947b6,10,1,7334,642,0,1.196197,0.732293,1,...,2160,2160,2160,2160,2160,2160,2160,2160,2160,2160
1,A02,1240fc906b760746bfaa4913fdb947b6,10,2,7084,1952,1,1.288208,0.865021,1,...,2160,2160,2160,2160,2160,2160,2160,2160,2160,2160
2,A02,1240fc906b760746bfaa4913fdb947b6,10,3,13696,2159,40,1.116869,0.632134,1,...,2160,2160,2160,2160,2160,2160,2160,2160,2160,2160
3,A02,1240fc906b760746bfaa4913fdb947b6,10,4,9814,1123,36,1.147676,0.773804,1,...,2160,2160,2160,2160,2160,2160,2160,2160,2160,2160
4,A02,1240fc906b760746bfaa4913fdb947b6,10,5,16297,1672,78,1.071092,0.708822,1,...,2160,2160,2160,2160,2160,2160,2160,2160,2160,2160


# Annotate table using metadata information

In [6]:
df = pycytominer.annotate(profiles = df_raw, platemap = metadata, join_on =  ["Metadata_Well", "Image_Metadata_Well"])

In [7]:
df.head()

Unnamed: 0,Metadata_Concentration,Metadata_moa,Metadata_Compound,Metadata_Well,Metadata_Plate,TableNumber,ImageNumber,ObjectNumber,Cells_AreaShape_Area,Cells_AreaShape_Center_X,...,Image_Width_IllumAGP,Image_Width_IllumDNA,Image_Width_IllumER,Image_Width_IllumMito,Image_Width_IllumRNA,Image_Width_OrigAGP,Image_Width_OrigDNA,Image_Width_OrigER,Image_Width_OrigMito,Image_Width_OrigRNA
0,0.0,DMSO,DMSO,A02,SQ00015195,1240fc906b760746bfaa4913fdb947b6,10,1,7334,642,...,2160,2160,2160,2160,2160,2160,2160,2160,2160,2160
1,0.0,DMSO,DMSO,A02,SQ00015195,1240fc906b760746bfaa4913fdb947b6,10,2,7084,1952,...,2160,2160,2160,2160,2160,2160,2160,2160,2160,2160
2,0.0,DMSO,DMSO,A02,SQ00015195,1240fc906b760746bfaa4913fdb947b6,10,3,13696,2159,...,2160,2160,2160,2160,2160,2160,2160,2160,2160,2160
3,0.0,DMSO,DMSO,A02,SQ00015195,1240fc906b760746bfaa4913fdb947b6,10,4,9814,1123,...,2160,2160,2160,2160,2160,2160,2160,2160,2160,2160
4,0.0,DMSO,DMSO,A02,SQ00015195,1240fc906b760746bfaa4913fdb947b6,10,5,16297,1672,...,2160,2160,2160,2160,2160,2160,2160,2160,2160,2160


# List columns to keep and select Location columns

- Location columns can't be normalized, because their values are locations (in pixels) of the nuclei center across the images.

- cols_keep are columns that we want to maintain, which will be used in the Basic Protocol 2 notebook.

In [8]:
#if you want to select specific columns just copy and paste here
cols_keep = ['Metadata_Plate', 'Image_FileName_OrigAGP',
             'Image_FileName_OrigDNA','Image_FileName_OrigER',
             'Image_FileName_OrigMito','Image_FileName_OrigRNA',
            'Metadata_Well', 'Metadata_Concentration',
             'Metadata_moa', 'Metadata_Compound', 'Image_Width_OrigDNA']

In [9]:
df_loc = df[['Nuclei_Location_Center_X', 'Nuclei_Location_Center_Y']]

In [10]:
df_norm = df.drop(columns=['Nuclei_Location_Center_X', 'Nuclei_Location_Center_Y'])

# Normalize features values

- Normalize the feature values using mad_robustize. For more details https://carpenter-singh-lab.broadinstitute.org/blog/how-normalize-cell-painting-data

In [18]:
df_norm2 = pycytominer.normalize(df_norm, meta_features = cols_keep, samples = "Metadata_Compound == 'DMSO'", method='mad_robustize', mad_robustize_epsilon = 0)

In [19]:
df_all = pd.concat([df_norm2, df_loc], axis = 1) #join the location_columns with the normalized df

# Feature selection

- Remove features based on operation methods below.

In [20]:
df_selected = pycytominer.feature_select(df_all, operation = ['correlation_threshold', 'variance_threshold', 'drop_na_columns', 'blocklist','drop_outliers'], outlier_cutoff = 500)

In [23]:
print('df shape before feature selection:', df_norm.shape, 'df shape after feature selection:', df_selected.shape)
print('Number of columns removed:', df_norm.shape[1] - df_selected.shape[1])

df shape before feature selection: (13895, 2443) df shape after feature selection: (13895, 763)
Number of columns removed: 1680


# Export dataframe as csv file

In [24]:
df_selected.to_csv('BasicProtocols2_Example.csv')