In [1]:
import pandas as pd
import utilities as ut
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [2]:
# Folder (relative to the notebook) that holds every input and output file.
folder_name = "data"

# Resolve both input locations first, then read them in the same order as before.
feat_path = f"./{folder_name}/2020_11_04_CPJUMP1_normalized_feature_select_negcon_batch.parquet.gz"
pca_path = f'./{folder_name}/pca_df.parquet.gzip'

# `df` is the table at `feat_path`; `pca_df` is the table stored as pca_df.parquet.gzip.
df, pca_df = (pd.read_parquet(path) for path in (feat_path, pca_path))

In [3]:
# Neighbourhood sizes handed to ut.knn_map_loocv: k = 1, 3, 5.
ks = list(range(1, 7, 2))

# Experiment types present in `df`, and the cell lines observed for each of them.
unique_experiments = df['Metadata_experiment_type'].unique()
unique_cells = {
    exp: df.loc[df['Metadata_experiment_type'] == exp, 'Metadata_cell_line'].unique()
    for exp in unique_experiments
}

# Feature representations that are scored one after another.
modalities = ['Eq_scores', 'CellProfiler', 'PCA']

# Never filled in by this cell; kept so the names stay defined exactly as before.
df_all_ap = pd.DataFrame()
df_all_q2 = pd.DataFrame()

# One result table per (perturbation, cell line, timepoint, modality). They are
# collected in a list and concatenated once after the loops: DataFrame.append
# was removed in pandas 2.0, and growing a frame inside a loop re-copies all
# previously accumulated rows on every iteration.
prec_recall_frames = []

for modality_1_perturbation in unique_experiments:
    # Column holding the class label; it only depends on the perturbation type.
    if modality_1_perturbation == 'Compound':
        target_var = 'Metadata_pert_iname'
    else:
        target_var = 'Metadata_broad_sample'

    for cell in unique_cells[modality_1_perturbation]:
        timepoints = df.query(
            f'Metadata_cell_line == "{cell}" & Metadata_experiment_type == "{modality_1_perturbation}"'
        ).Metadata_timepoint.unique()

        for modality_1_timepoint in timepoints:
            print(modality_1_perturbation, cell, modality_1_timepoint)

            # Row filter for this combination, restricted to the 'Standard' condition.
            comp_cond_query = (
                f"Metadata_experiment_type == '{modality_1_perturbation}'"
                f" & Metadata_cell_line == '{cell}'"
                f" & Metadata_timepoint == {modality_1_timepoint}"
                " & Metadata_experiment_condition == 'Standard'"
            )

            for modality in modalities:
                if modality == 'Eq_scores':
                    # Read from a per-combination file; the row filter above is not applied to it.
                    mod_df = pd.read_parquet(f'./{folder_name}/{modality_1_perturbation}_with_{modality_1_perturbation}_eq_cv_{cell}_{modality_1_timepoint}.parquet.gzip')
                elif modality == 'CellProfiler':
                    mod_df = df.query(comp_cond_query).copy()
                elif modality == 'PCA':
                    mod_df = pca_df.query(comp_cond_query).copy()

                # Every column whose name does not start with 'Metadata' is used as a feature.
                feature_cols = [name for name in mod_df.columns if not name.startswith('Metadata')]
                X = mod_df[feature_cols]
                # Rows flagged as negative controls all get the single label 'negcon'.
                y = mod_df[target_var].where(mod_df['Metadata_control_type'] != 'negcon', 'negcon')

                # NOTE(review): assumes ut.knn_map_loocv returns a DataFrame - confirm.
                df_prec_recall = ut.knn_map_loocv(X, y, ks)

                # Tag the results with the combination they were computed for.
                df_prec_recall['Modality'] = modality
                df_prec_recall['Perturbation'] = modality_1_perturbation
                df_prec_recall['Cell Line'] = cell
                df_prec_recall['Timepoint'] = modality_1_timepoint

                prec_recall_frames.append(df_prec_recall)

# Single concatenation; row indices are kept as-is, as the repeated append did.
df_all_prec_recall = pd.concat(prec_recall_frames) if prec_recall_frames else pd.DataFrame()

Compound A549 48
Compound A549 24
Compound U2OS 24
Compound U2OS 48
ORF A549 48
ORF A549 96
ORF U2OS 48
ORF U2OS 96
CRISPR U2OS 144
CRISPR U2OS 96
CRISPR A549 144
CRISPR A549 96


In [4]:
def categorize_time(modality, timepoint):
    """Label a timepoint as 'Long' or 'Short' for a perturbation modality.

    Each known modality has exactly one timepoint value that counts as
    'Long' (CRISPR: 144, ORF: 96, Compound: 48); any other timepoint for
    that modality is 'Short'.

    Returns:
        'Long' or 'Short' for the three known modalities, 'Unknown' for
        any other modality value.
    """
    long_timepoint = {'CRISPR': 144, 'ORF': 96, 'Compound': 48}
    if modality not in long_timepoint:
        return 'Unknown'
    return 'Long' if timepoint == long_timepoint[modality] else 'Short'

# Derive the 'Time' label of every result row from its Perturbation/Timepoint pair.
time_column = df_all_prec_recall.apply(
    lambda result: categorize_time(result['Perturbation'], result['Timepoint']),
    axis=1,
)
df_all_prec_recall['Time'] = time_column

# Write the labelled results to CSV, leaving the row index out of the file.
output_csv = f'./{folder_name}/all_prec_recall_values.csv'
df_all_prec_recall.to_csv(output_csv, index=False)

### Random data benchmark

In [6]:
# Fixed seed so the random benchmark features are the same on every run;
# without it each kernel restart produced a different benchmark.
RANDOM_SEED = 0
rng = np.random.default_rng(RANDOM_SEED)

# The random matrix takes the shape of whatever ut.get_featuredata returns for `df`.
row, col = ut.get_featuredata(df).shape
# Uniform samples in [0, 1), the same distribution np.random.rand draws from.
ar = rng.random((row, col))

In [7]:
metadata = ut.get_metadata(df)

# pd.concat(axis=1) aligns rows by index label, not by position. Giving the
# random block the metadata's own index keeps row i of `ar` next to row i of
# the metadata even when that index is not 0..n-1, and raises immediately if
# the two row counts differ instead of silently padding with NaN.
random_features = pd.DataFrame(data=ar, index=metadata.index)
rand_df = pd.concat([metadata, random_features], axis=1)

# The random columns are labelled with integers; cast every label to str so
# that later `col.startswith('Metadata')` checks work on all columns.
rand_df.columns = rand_df.columns.astype(str)

In [8]:
# From here on `df` refers to an independent copy of the random-feature frame;
# the table originally loaded into `df` is no longer reachable under that name,
# so cells above this one give different results if re-run afterwards.
df = rand_df.copy(deep=True)

In [9]:
# Replace `pca_df` with the table stored as rand_pca_df.parquet.gzip.
rand_pca_path = f'./{folder_name}/rand_pca_df.parquet.gzip'
pca_df = pd.read_parquet(rand_pca_path)

In [10]:
# Neighbourhood sizes handed to ut.knn_map_loocv: k = 1, 3, 5.
ks = list(range(1, 7, 2))

# The random benchmark is only run for one perturbation type.
unique_experiments = ['Compound']
unique_cells = {
    exp: rand_df.loc[rand_df['Metadata_experiment_type'] == exp, 'Metadata_cell_line'].unique()
    for exp in unique_experiments
}

# Feature representations that are scored one after another.
modalities = ['Eq_scores', 'CellProfiler', 'PCA']

# Never filled in by this cell; kept so the names stay defined exactly as before.
df_all_ap = pd.DataFrame()
df_all_q2 = pd.DataFrame()

# One result table per (perturbation, cell line, timepoint, modality). They are
# collected in a list and concatenated once after the loops: DataFrame.append
# was removed in pandas 2.0, and growing a frame inside a loop re-copies all
# previously accumulated rows on every iteration.
prec_recall_frames = []

for modality_1_perturbation in unique_experiments:
    # Column holding the class label; it only depends on the perturbation type.
    if modality_1_perturbation == 'Compound':
        target_var = 'Metadata_pert_iname'
    else:
        target_var = 'Metadata_broad_sample'

    for cell in unique_cells[modality_1_perturbation]:
        timepoints = rand_df.query(
            f'Metadata_cell_line == "{cell}" & Metadata_experiment_type == "{modality_1_perturbation}"'
        ).Metadata_timepoint.unique()

        for modality_1_timepoint in timepoints:
            print(modality_1_perturbation, cell, modality_1_timepoint)

            # Row filter for this combination, restricted to the 'Standard' condition.
            comp_cond_query = (
                f"Metadata_experiment_type == '{modality_1_perturbation}'"
                f" & Metadata_cell_line == '{cell}'"
                f" & Metadata_timepoint == {modality_1_timepoint}"
                " & Metadata_experiment_condition == 'Standard'"
            )

            for modality in modalities:
                if modality == 'Eq_scores':
                    # Read from a per-combination '_random' file; the row filter above is not applied to it.
                    mod_df = pd.read_parquet(f'./{folder_name}/{modality_1_perturbation}_with_{modality_1_perturbation}_eq_cv_{cell}_{modality_1_timepoint}_random.parquet.gzip')
                elif modality == 'CellProfiler':
                    mod_df = rand_df.query(comp_cond_query).copy()
                elif modality == 'PCA':
                    mod_df = pca_df.query(comp_cond_query).copy()

                # Every column whose name does not start with 'Metadata' is used as a feature.
                feature_cols = [name for name in mod_df.columns if not name.startswith('Metadata')]
                X = mod_df[feature_cols]
                # Rows flagged as negative controls all get the single label 'negcon'.
                y = mod_df[target_var].where(mod_df['Metadata_control_type'] != 'negcon', 'negcon')

                # This call used to sit one space deeper than its neighbours, which
                # Python rejects as an unexpected indent.
                # NOTE(review): assumes ut.knn_map_loocv returns a DataFrame - confirm.
                df_prec_recall = ut.knn_map_loocv(X, y, ks)

                # Tag the results with the combination they were computed for.
                df_prec_recall['Modality'] = modality
                df_prec_recall['Perturbation'] = modality_1_perturbation
                df_prec_recall['Cell Line'] = cell
                df_prec_recall['Timepoint'] = modality_1_timepoint

                prec_recall_frames.append(df_prec_recall)

# Single concatenation; row indices are kept as-is, as the repeated append did.
df_all_prec_recall = pd.concat(prec_recall_frames) if prec_recall_frames else pd.DataFrame()



Compound A549 48
Compound A549 24
Compound U2OS 24
Compound U2OS 48


In [11]:
# Derive the 'Time' label of every result row from its Perturbation/Timepoint pair.
time_column = df_all_prec_recall.apply(
    lambda result: categorize_time(result['Perturbation'], result['Timepoint']),
    axis=1,
)
df_all_prec_recall['Time'] = time_column

# Write the labelled random-benchmark results to CSV, leaving the row index out of the file.
output_csv = f'./{folder_name}/all_prec_recall_values_random.csv'
df_all_prec_recall.to_csv(output_csv, index=False)