# DataFrame normalization and feature selection

In [1]:
import pandas as pd
import numpy as np
import pycytominer

## Import Metadata table

In [2]:
# Load the metadata table and round its numeric columns to two decimals.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
metadata_csv = r"G:\.shortcut-targets-by-id\1KeRLnpHR83EZFglJAK4Lzvtpxq504dxs\2022_Fossa_Cruz_InterpretingProfiles\Data\metadata\Metadata_SQ00015195.csv"
metadata = pd.read_csv(metadata_csv).round(2)

## Import the csv tables

- Import each plate's table separately, annotate it with the metadata, and normalize it relative to the DMSO wells (our negative control).

In [3]:
# Folder holding one raw CSV per plate; each file is named "<plate id>.csv".
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
path = r"G:\.shortcut-targets-by-id\1KeRLnpHR83EZFglJAK4Lzvtpxq504dxs\2022_Fossa_Cruz_InterpretingProfiles\Data\backend\Raw tables"

# Plate identifiers to process, in the order they are loaded and stacked.
plates = [
    "SQ00015195",
    "SQ00015218",
    "SQ00015219",
    "SQ00015220",
    "SQ00015221",
]

In [4]:
# Build one annotated, normalized profile table per plate, then stack them into `df`.
# Normalization runs inside the loop, so each plate is scaled against the DMSO
# rows of that same plate rather than against controls pooled across plates.
df_lst = []
for plate in plates:  # named `plate` (not `plt`) so the usual matplotlib alias is never clobbered
    df_temp = pd.read_csv(f"{path}\\{plate}.csv", low_memory=False)
    print(df_temp.shape)

    # Copy the image-level plate/well identifiers into Metadata_-prefixed columns;
    # Metadata_Well is the key used for the join below.
    df_temp['Metadata_Plate'] = df_temp['Image_Metadata_Plate']
    df_temp['Metadata_Well'] = df_temp['Image_Metadata_Well']

    # The same `metadata` table is joined onto every plate, matching on well only.
    # join_on is [column in metadata, column in profiles].
    df_temp = pycytominer.annotate(df_temp, metadata, join_on=['Metadata_Well', 'Metadata_Well'])

    # Reference population for the robust scaling is the rows where Metadata_Compound == 'DMSO'
    # (presumably a column brought in by the annotation step — confirm).
    # NOTE(review): mad_robustize_epsilon=0 adds nothing to the MAD in the denominator, so a
    # feature with zero MAD among the DMSO rows would presumably come out as inf/NaN — confirm
    # this is intended.
    df_norm = pycytominer.normalize(
        df_temp,
        method='mad_robustize',
        mad_robustize_epsilon=0,
        samples="Metadata_Compound == 'DMSO'",
    )
    df_lst.append(df_norm)

# ignore_index=True gives the stacked table a fresh 0..N-1 index; without it every plate
# keeps its own 0-based labels and the combined frame ends up with duplicated row labels.
df = pd.concat(df_lst, ignore_index=True)


(384, 1785)
(384, 1785)
(384, 1785)
(384, 1785)
(384, 1785)


## Feature selection using pycytominer methods

In [5]:
# Feature selection on the combined, normalized table.
# An earlier variant of this cell used 'drop_outliers' instead of 'blocklist' and
# passed blocklist_file='blocklist_features.txt'.
# NOTE(review): no blocklist_file is given here, so 'blocklist' presumably falls back
# to pycytominer's default list — confirm.
selection_ops = [
    'correlation_threshold',
    'variance_threshold',
    'drop_na_columns',
    'blocklist',
]
df_selected = pycytominer.feature_select(df, operation=selection_ops)

In [6]:
# Column count before feature selection minus the count after it.
n_before = df.shape[1]
n_after = df_selected.shape[1]
print('How many columns were dropped?', n_before - n_after)

How many columns were dropped? 1186


## Remove duplicated columns

In [7]:
# Keep only the first occurrence of each column name; later repeats are discarded.
is_repeat = df_selected.columns.duplicated()
# .copy() makes df_final an independent frame rather than a selection of df_selected.
df_final = df_selected.loc[:, ~is_repeat].copy()

## Export to csv

In [8]:
# Write the final table next to the raw plate files; the row index is written too.
joined_csv = path + "\\" + "Joined_tables.csv"
df_final.to_csv(joined_csv)