In [1]:
import pandas as pd
import utilities as ut
import plotly.express as px
import pyPLS
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import warnings
warnings.simplefilter(action='ignore')

In [7]:
# Read the profile table from its parquet file into `df`; every later cell
# derives its inputs from this frame.
cp_feat_path = "./data/2020_11_04_CPJUMP1_normalized_feature_select_negcon_batch_Standard.parquet.gz"
df = pd.read_parquet(path=cp_feat_path)

In [4]:
# Build a random control frame: the metadata columns of `df` next to a block
# of uniform [0, 1) noise with the same shape as the real feature matrix.
row, col = ut.get_featuredata(df).shape

# Seeded generator so the random baseline (and everything derived from it)
# is identical after Restart Kernel -> Run All.
rng = np.random.default_rng(0)
ar = rng.random((row, col))

# Give the noise frame the same index as the metadata so the column-wise
# concat pairs rows one-to-one even if the index is not a plain 0..n-1 range
# (a mismatched index would otherwise silently introduce NaN rows).
rand_metadata = ut.get_metadata(df)
rand_features = pd.DataFrame(data=ar, index=rand_metadata.index)
rand_df = pd.concat([rand_metadata, rand_features], axis=1)

# The noise columns are labelled with integers; cast all labels to str.
rand_df.columns = rand_df.columns.astype(str)

In [5]:
# Project the random feature matrix onto its first `n_components` principal
# components and store the scores as columns T1..T{n_components} next to the
# metadata columns.
n_components = 100
# random_state only matters if scikit-learn selects a randomized/arpack
# solver; fixing it keeps the scores reproducible in that case.
pca = PCA(n_components=n_components, random_state=0)
X = pca.fit_transform(ut.get_featuredata(rand_df).values.astype(float))

# Attach all score columns in one concat instead of 100 single-column inserts
# (which fragment the frame). The metadata is taken from `df`, as before.
rand_pca_meta = ut.get_metadata(df)
rand_pca_scores = pd.DataFrame(
    X,
    index=rand_pca_meta.index,
    columns=[f'T{i+1}' for i in range(n_components)],
)
rand_pca_df = pd.concat([rand_pca_meta, rand_pca_scores], axis=1)

In [6]:
# Persist the PCA scores of the random control data as gzip-compressed parquet.
rand_pca_df.to_parquet(path='./data/rand_pca_df.parquet.gzip', compression='gzip')

In [7]:
# Project the real feature matrix onto its first `n_components` principal
# components and store the scores as columns T1..T{n_components} next to the
# metadata columns.
n_components = 100
# random_state only matters if scikit-learn selects a randomized/arpack
# solver; fixing it keeps the scores reproducible in that case.
pca = PCA(n_components=n_components, random_state=0)
X = pca.fit_transform(ut.get_featuredata(df).values.astype(float))

# Attach all score columns in one concat instead of 100 single-column inserts
# (which fragment the frame).
pca_meta = ut.get_metadata(df)
pca_scores = pd.DataFrame(
    X,
    index=pca_meta.index,
    columns=[f'T{i+1}' for i in range(n_components)],
)
pca_df = pd.concat([pca_meta, pca_scores], axis=1)

In [7]:
# Persist the PCA scores of the real data as gzip-compressed parquet.
pca_df.to_parquet(path='./data/pca_df.parquet.gzip', compression='gzip')

In [8]:
## Calc Eq_score data for real data
# For every (perturbation type, cell line, timepoint) combination found in
# `df`, select the 'Standard'-condition rows, log-transform the feature
# columns, score the subset against itself with
# ut.calc_eq_score_df_with_cv_optimized, and write the three returned frames
# (eq, sse, q2) to ./data as gzip-compressed parquet.
folder_name = "data"
for modality_1_perturbation in df.Metadata_experiment_type.unique():
    # The reference column depends only on the perturbation type, so pick it
    # once per outer iteration.
    if modality_1_perturbation == 'Compound':
        reference_col = 'Metadata_pert_iname'
    else:
        reference_col = 'Metadata_broad_sample'
    for cell in df.Metadata_cell_line.unique():
        for modality_1_timepoint in df.query(f'Metadata_cell_line == "{cell}" & Metadata_experiment_type == "{modality_1_perturbation}"').Metadata_timepoint.unique():
            print(modality_1_perturbation, cell, modality_1_timepoint)
            cell_line = cell
            comp_time = modality_1_timepoint
            comp_mod = modality_1_perturbation
            comp_cond_query = f"Metadata_experiment_type == '{comp_mod}' \
                        & Metadata_cell_line == '{cell_line}' \
                        & Metadata_timepoint == {comp_time} \
                        & Metadata_experiment_condition == 'Standard'"
            modality_1_df = df.query(comp_cond_query)
            if modality_1_df.empty:
                # The timepoint list above is not restricted to the 'Standard'
                # condition, so a combination can have no rows here; skip it
                # instead of scoring an empty frame.
                continue
            # Log transform CP features
            # NOTE: DataFrame.applymap is deprecated in pandas >= 2.1 in favour
            # of DataFrame.map; kept so the cell still runs on older pandas.
            modality_1_df = pd.concat([ut.get_metadata(modality_1_df), ut.get_featuredata(modality_1_df).applymap(ut.log_transform)], axis=1)
            # Calculate Eq. scores
            comp_comp_eq_df, comp_comp_sse_df, q2_df = ut.calc_eq_score_df_with_cv_optimized(modality_1_df, modality_1_df, reference_col=reference_col)
            eq_filename = f'{modality_1_perturbation}_with_{modality_1_perturbation}_eq_cv_{cell}_{modality_1_timepoint}'
            sse_filename = f'{modality_1_perturbation}_with_{modality_1_perturbation}_sse_cv_{cell}_{modality_1_timepoint}'
            q2_filename = f'{modality_1_perturbation}_with_{modality_1_perturbation}_q2_cv_{cell}_{modality_1_timepoint}'
            # Files are named *.parquet.gzip, so request gzip explicitly (the
            # default codec is snappy); this matches how the PCA frames are
            # saved elsewhere in this notebook.
            comp_comp_eq_df.to_parquet(f"./{folder_name}/{eq_filename}.parquet.gzip", compression='gzip')
            comp_comp_sse_df.to_parquet(f"./{folder_name}/{sse_filename}.parquet.gzip", compression='gzip')
            q2_df.to_parquet(f"./{folder_name}/{q2_filename}.parquet.gzip", compression='gzip')

Compound A549 48
Compound A549 24
Compound U2OS 24
Compound U2OS 48
ORF A549 48
ORF A549 96
ORF U2OS 48
ORF U2OS 96
CRISPR A549 144
CRISPR A549 96
CRISPR U2OS 144
CRISPR U2OS 96


In [11]:
## Calc Eq_scores for random data
# Same procedure as for the real data, run on the random control frame
# `rand_df`: for each (cell line, timepoint) of the 'Compound' perturbation,
# select the 'Standard'-condition rows, log-transform the feature columns,
# score the subset against itself, and write the eq / sse / q2 frames to
# ./data with a `_random` suffix as gzip-compressed parquet.
folder_name = "data"
for modality_1_perturbation in ['Compound']: ## Only for Compound is enough
    # The reference column depends only on the perturbation type, so pick it
    # once per outer iteration.
    if modality_1_perturbation == 'Compound':
        reference_col = 'Metadata_pert_iname'
    else:
        reference_col = 'Metadata_broad_sample'
    for cell in rand_df.Metadata_cell_line.unique():
        for modality_1_timepoint in rand_df.query(f'Metadata_cell_line == "{cell}" & Metadata_experiment_type == "{modality_1_perturbation}"').Metadata_timepoint.unique():
            print(modality_1_perturbation, cell, modality_1_timepoint)
            cell_line = cell
            comp_time = modality_1_timepoint
            comp_mod = modality_1_perturbation
            comp_cond_query = f"Metadata_experiment_type == '{comp_mod}' \
                        & Metadata_cell_line == '{cell_line}' \
                        & Metadata_timepoint == {comp_time} \
                        & Metadata_experiment_condition == 'Standard'"
            modality_1_df = rand_df.query(comp_cond_query)
            if modality_1_df.empty:
                # The timepoint list above is not restricted to the 'Standard'
                # condition, so a combination can have no rows here; skip it
                # instead of scoring an empty frame.
                continue
            # Log transform CP features
            # NOTE: DataFrame.applymap is deprecated in pandas >= 2.1 in favour
            # of DataFrame.map; kept so the cell still runs on older pandas.
            modality_1_df = pd.concat([ut.get_metadata(modality_1_df), ut.get_featuredata(modality_1_df).applymap(ut.log_transform)], axis=1)
            # Calculate Eq. scores
            comp_comp_eq_df, comp_comp_sse_df, q2_df = ut.calc_eq_score_df_with_cv_optimized(modality_1_df, modality_1_df, reference_col=reference_col)
            eq_filename = f'{modality_1_perturbation}_with_{modality_1_perturbation}_eq_cv_{cell}_{modality_1_timepoint}_random'
            sse_filename = f'{modality_1_perturbation}_with_{modality_1_perturbation}_sse_cv_{cell}_{modality_1_timepoint}_random'
            q2_filename = f'{modality_1_perturbation}_with_{modality_1_perturbation}_q2_cv_{cell}_{modality_1_timepoint}_random'
            # Files are named *.parquet.gzip, so request gzip explicitly (the
            # default codec is snappy); this matches how the PCA frames are
            # saved elsewhere in this notebook.
            comp_comp_eq_df.to_parquet(f"./{folder_name}/{eq_filename}.parquet.gzip", compression='gzip')
            comp_comp_sse_df.to_parquet(f"./{folder_name}/{sse_filename}.parquet.gzip", compression='gzip')
            q2_df.to_parquet(f"./{folder_name}/{q2_filename}.parquet.gzip", compression='gzip')

Compound A549 48
Compound A549 24
Compound U2OS 24
Compound U2OS 48
