## Process and apply models to single cell profiles from other data batches

We will apply classifiers to these data to prioritize samples that we predict to have specific drug-tolerance mechanisms.

In [1]:
import sys
import joblib
import pathlib
import sqlite3
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

from pycytominer.cyto_utils import infer_cp_features

from utils.ml_utils import model_apply
from utils.single_cell_utils import process_sites, normalize_sc

sys.path.append("../0.generate-profiles")
from scripts.profile_util import load_config

In [2]:
np.random.seed(1234)

  """Entry point for launching an IPython kernel.


In [3]:
# Set constants
batch = "2019_03_20_Batch2"
plate = "207106_exposure320"

feature_filter = ["Object", "Location", "Count", "Parent"]
scaler_method = "standard"
seed = 123
n_sample_sites_per_well = 3

In [4]:
# Load locations of single cell files
config = pathlib.Path("../0.generate-profiles/profile_config.yaml")
pipeline, single_cell_files = load_config(config, append_sql_prefix=False, local=False)

In [5]:
# Load models
model_file = pathlib.Path("models", "multiclass_cloneAE_wildtype.joblib")
top_model = joblib.load(model_file)

shuffle_model_file = pathlib.Path("models", "multiclass_cloneAE_wildtype_shuffled.joblib")
top_shuffle_model = joblib.load(shuffle_model_file)

In [6]:
# Load platemap and metadata
workspace_dir = pipeline["workspace_dir"]
batch_dir = pathlib.Path(workspace_dir, "backend", batch)
metadata_dir = pathlib.Path("../0.generate-profiles", "metadata", batch)

barcode_plate_map_file = pathlib.Path(metadata_dir, "barcode_platemap.csv")
barcode_plate_map_df = pd.read_csv(barcode_plate_map_file)

plate_map_name = (
    barcode_plate_map_df
    .query("Assay_Plate_Barcode == @plate")
    .Plate_Map_Name
    .values[0]
)

plate_map_file = pathlib.Path(metadata_dir, "platemap", f"{plate_map_name}.txt")
plate_map_df = pd.read_csv(plate_map_file, sep="\t")
plate_map_df.columns = [x if x.startswith("Metadata_") else f"Metadata_{x}" for x in plate_map_df.columns]
plate_map_df.head()

Unnamed: 0,Metadata_plate_map_name,Metadata_well_position,Metadata_CellLine,Metadata_Dosage
0,PlateMap_207106_exposure320,B02,WT,0.0
1,PlateMap_207106_exposure320,B03,WT,0.0
2,PlateMap_207106_exposure320,B04,WT,0.0
3,PlateMap_207106_exposure320,B05,CloneA,0.0
4,PlateMap_207106_exposure320,B06,CloneA,0.0


## Load single cell data

In [7]:
plate_column = pipeline["aggregate"]["plate_column"]
well_column = pipeline["aggregate"]["well_column"]

In [8]:
# Establish connection to sqlite file
single_cell_sqlite = single_cell_files[batch]["plates"][str(plate)]
conn = sqlite3.connect(single_cell_sqlite)

In [9]:
image_cols = f"TableNumber, ImageNumber, {plate_column}, {well_column}"
image_query = f"select {image_cols} from image"
image_df = (
    pd.read_sql_query(image_query, conn)
    .merge(
        plate_map_df,
        left_on=well_column,
        right_on="Metadata_well_position"
    )
    .drop(["Metadata_well_position"], axis="columns")
)

print(image_df.shape)
image_df.head()

(324, 7)


Unnamed: 0,TableNumber,ImageNumber,Metadata_Plate,Metadata_Well,Metadata_plate_map_name,Metadata_CellLine,Metadata_Dosage
0,35063824554719371464203761079871132620,1,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0
1,55321079170264086709741025581452256546,37,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0
2,156575971115499494274828396611545171867,73,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0
3,177833145266349265724759827001312244688,109,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0
4,322613752142964989790892398074175721670,145,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0


In [10]:
# Assert that image number is unique
assert len(image_df.ImageNumber.unique()) == image_df.shape[0]

In [11]:
# Randomly sample three sites per well to reduce number of single cells to store
sampled_image_df = image_df.groupby("Metadata_Well").apply(pd.DataFrame.sample, n=n_sample_sites_per_well)

sampled_image_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,TableNumber,ImageNumber,Metadata_Plate,Metadata_Well,Metadata_plate_map_name,Metadata_CellLine,Metadata_Dosage
Metadata_Well,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
B02,8,305660431977815877849633354263997856075,289,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0
B02,2,156575971115499494274828396611545171867,73,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0
B02,1,55321079170264086709741025581452256546,37,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0
B03,15,191579149614056690500293134755529854267,218,207106_exposure320,B03,PlateMap_207106_exposure320,WT,0.0
B03,13,47577814904125996899647167415927575113,146,207106_exposure320,B03,PlateMap_207106_exposure320,WT,0.0


In [12]:
%%time
sc_df = process_sites(
    connection=conn,
    imagenumbers=sampled_image_df.ImageNumber.tolist(),
    image_df=image_df,
    feature_filter=feature_filter,
    seed=seed,
    scaler_method=scaler_method,
    normalize=True
)

CPU times: user 9min 16s, sys: 2min 53s, total: 12min 9s
Wall time: 14min 27s


In [13]:
print(sc_df.shape)
sc_df.head()

(83248, 3422)


Unnamed: 0,Metadata_TableNumber,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_plate_map_name,Metadata_CellLine,Metadata_Dosage,Cells_AreaShape_Area,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,...,Nuclei_Texture_Variance_RNA_10_02,Nuclei_Texture_Variance_RNA_10_03,Nuclei_Texture_Variance_RNA_20_00,Nuclei_Texture_Variance_RNA_20_01,Nuclei_Texture_Variance_RNA_20_02,Nuclei_Texture_Variance_RNA_20_03,Nuclei_Texture_Variance_RNA_5_00,Nuclei_Texture_Variance_RNA_5_01,Nuclei_Texture_Variance_RNA_5_02,Nuclei_Texture_Variance_RNA_5_03
0,305660431977815877849633354263997856075,289,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0,-0.038891,-1.235026,-1.721307,...,-0.187873,-0.182338,-0.099601,-0.139078,-0.133357,0.009567,-0.178961,-0.179179,-0.177022,-0.182817
1,305660431977815877849633354263997856075,289,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0,0.012099,-0.57474,-1.713141,...,-0.013204,-0.010766,0.060531,-0.178687,0.089074,0.184089,-0.023213,-0.009299,-0.025929,-0.047081
2,305660431977815877849633354263997856075,289,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0,-0.009134,0.955031,-1.753971,...,0.195357,0.243322,0.610701,0.114445,0.270754,0.648666,0.139362,0.156686,0.176658,0.169338
3,305660431977815877849633354263997856075,289,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0,-0.12436,-0.14817,-1.704974,...,-0.380355,-0.404005,-0.434148,-0.393991,-0.442776,-0.406503,-0.349596,-0.340229,-0.332098,-0.34318
4,305660431977815877849633354263997856075,289,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0,0.009411,-1.133695,-1.721307,...,-0.335985,-0.362422,-0.241079,-0.225911,-0.255972,-0.272176,-0.3415,-0.339134,-0.339213,-0.337372


In [14]:
# Load test set data and reindex to match feature list
test_file = pathlib.Path("data", "single_cell_test.tsv.gz")
test_df = pd.read_csv(test_file, sep="\t")

cp_feature_order = infer_cp_features(test_df)

print(test_df.shape)
test_df.head()

(2225, 1965)


Unnamed: 0,Metadata_TableNumber,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_plate_map_name,Metadata_clone_number,Metadata_plate_ID,Metadata_plate_filename,Metadata_treatment,Metadata_treatment_time,...,Nuclei_Texture_Variance_Mito_5_02,Nuclei_Texture_Variance_Mito_5_03,Nuclei_Texture_Variance_RNA_10_00,Nuclei_Texture_Variance_RNA_10_01,Nuclei_Texture_Variance_RNA_10_02,Nuclei_Texture_Variance_RNA_10_03,Nuclei_Texture_Variance_RNA_5_00,Nuclei_Texture_Variance_RNA_5_01,Nuclei_Texture_Variance_RNA_5_02,Nuclei_Texture_Variance_RNA_5_03
0,213615935970490117812207546998927362843,265,218360,D06,218360,Clone E,218360,20200626-WTpAE-Lo,0.1% DMSO,13 hr,...,0.954049,0.834088,1.084193,1.078415,1.137907,1.058255,1.053407,1.123878,1.149669,1.139936
1,337567195667998632376188922851239700308,61,218360,B02,218360,WT parental,218360,20200626-WTpAE-Lo,0.1% DMSO,13 hr,...,0.134245,0.043566,-0.173681,-0.112904,-0.23151,-0.269831,-0.235633,-0.269732,-0.219162,-0.227415
2,44853145860897853858828255259732821643,925,218360,D06,218360,Clone E,218360,20200626-WTpAE-Lo,0.1% DMSO,13 hr,...,-0.612768,-0.598612,-0.480425,-0.481877,-0.53821,-0.517666,-0.500248,-0.49874,-0.52691,-0.518281
3,287065407067703193955139587947415917934,217,218360,E08,218360,WT parental,218360,20200626-WTpAE-Lo,0.1% DMSO,13 hr,...,-0.353493,-0.352171,-0.407358,-0.445175,-0.423596,-0.355138,-0.440462,-0.439416,-0.419995,-0.413208
4,208385314166099803906004723782294996203,865,218360,D06,218360,Clone E,218360,20200626-WTpAE-Lo,0.1% DMSO,13 hr,...,-0.68843,-0.692637,-0.588585,-0.592409,-0.570305,-0.575362,-0.5828,-0.577815,-0.580289,-0.575444


In [15]:
coef_file = pathlib.Path("coefficients/single_cell_multiclass_coefficients.tsv")
coef_df = pd.read_csv(coef_file, sep="\t")

print(coef_df.shape)
coef_df.head()

(1954, 4)


Unnamed: 0,feature,WT parental,Clone A,Clone E
0,Cells_AreaShape_Center_X,-0.004546,-0.012042,-0.011397
1,Cells_AreaShape_Center_Y,-0.028817,-0.032763,0.048197
2,Cells_AreaShape_Compactness,0.142783,-0.06295,-0.035459
3,Cells_AreaShape_Eccentricity,0.00736,-0.021722,0.018757
4,Cells_AreaShape_Extent,0.009453,0.016111,-0.012926


In [16]:
# Assert the feature order and the model are equivalent
assert cp_feature_order == coef_df.feature.tolist()

In [17]:
# Reindex features in the proper order before saving
meta_features = infer_cp_features(sc_df, metadata=True)
reindex_features = meta_features + cp_feature_order
sc_reindexed_df = sc_df.reindex(reindex_features, axis="columns")

print(sc_reindexed_df.shape)
sc_reindexed_df.head()

(83248, 1961)


Unnamed: 0,Metadata_TableNumber,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_plate_map_name,Metadata_CellLine,Metadata_Dosage,Cells_AreaShape_Center_X,Cells_AreaShape_Center_Y,Cells_AreaShape_Compactness,...,Nuclei_Texture_Variance_Mito_5_02,Nuclei_Texture_Variance_Mito_5_03,Nuclei_Texture_Variance_RNA_10_00,Nuclei_Texture_Variance_RNA_10_01,Nuclei_Texture_Variance_RNA_10_02,Nuclei_Texture_Variance_RNA_10_03,Nuclei_Texture_Variance_RNA_5_00,Nuclei_Texture_Variance_RNA_5_01,Nuclei_Texture_Variance_RNA_5_02,Nuclei_Texture_Variance_RNA_5_03
0,305660431977815877849633354263997856075,289,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0,-1.235026,-1.721307,-0.725126,...,0.633446,0.581916,-0.18672,-0.202274,-0.187873,-0.182338,-0.178961,-0.179179,-0.177022,-0.182817
1,305660431977815877849633354263997856075,289,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0,-0.57474,-1.713141,-0.41097,...,0.296044,0.270217,-0.013348,-0.036462,-0.013204,-0.010766,-0.023213,-0.009299,-0.025929,-0.047081
2,305660431977815877849633354263997856075,289,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0,0.955031,-1.753971,-0.625757,...,0.585797,0.570853,0.182656,-0.046836,0.195357,0.243322,0.139362,0.156686,0.176658,0.169338
3,305660431977815877849633354263997856075,289,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0,-0.14817,-1.704974,-0.506209,...,-0.059298,-0.053522,-0.343957,-0.351943,-0.380355,-0.404005,-0.349596,-0.340229,-0.332098,-0.34318
4,305660431977815877849633354263997856075,289,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0,-1.133695,-1.721307,-0.684017,...,-0.147103,-0.136848,-0.335371,-0.324811,-0.335985,-0.362422,-0.3415,-0.339134,-0.339213,-0.337372


In [18]:
# Output file
sc_output_file = pathlib.Path(f"data/single_cell_{batch}_plate_{plate}_random_cells.tsv.gz")
sc_reindexed_df.to_csv(sc_output_file, sep="\t", compression="gzip", index=False)

## Apply Models

In [19]:
y_recode = {"WT parental": 0, "Clone A": 1, "Clone E": 2}
y_recode_reverse = {y: x for x, y in y_recode.items()}

In [20]:
sc_df = sc_reindexed_df.reindex(cp_feature_order, axis="columns")
meta_df = sc_reindexed_df.reindex(meta_features, axis="columns")

In [21]:
real_scores_df = model_apply(
    model=top_model,
    x_df=sc_df.fillna(0),
    meta_df=meta_df,
    y_recode=y_recode_reverse,
    data_fit="other_batch",
    shuffled=False,
    predict_proba=False
)

output_file = pathlib.Path(f"scores/{batch}_{plate}_othersinglecells.tsv.gz")
real_scores_df.to_csv(output_file, sep="\t", compression="gzip", index=False)

print(real_scores_df.shape)
real_scores_df.head()

(83248, 10)


Unnamed: 0,WT parental,Metadata_TableNumber,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_plate_map_name,Metadata_CellLine,Metadata_Dosage,data_fit,shuffled
0,2,305660431977815877849633354263997856075,289,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0,other_batch,False
1,0,305660431977815877849633354263997856075,289,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0,other_batch,False
2,1,305660431977815877849633354263997856075,289,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0,other_batch,False
3,1,305660431977815877849633354263997856075,289,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0,other_batch,False
4,1,305660431977815877849633354263997856075,289,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0,other_batch,False


In [22]:
shuffled_scores_df = model_apply(
    model=top_shuffle_model,
    x_df=sc_df.fillna(0),
    meta_df=meta_df,
    y_recode=y_recode_reverse,
    data_fit="other_batch",
    shuffled=True,
    predict_proba=True
)

output_file = pathlib.Path(f"scores/{batch}_{plate}_shuffled_model_othersinglecells.tsv.gz")
shuffled_scores_df.to_csv(output_file, sep="\t", compression="gzip", index=False)

print(shuffled_scores_df.shape)
shuffled_scores_df.head()

(83248, 12)


Unnamed: 0,WT parental,Clone A,Clone E,Metadata_TableNumber,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_plate_map_name,Metadata_CellLine,Metadata_Dosage,data_fit,shuffled
0,0.3252,0.358208,0.316592,305660431977815877849633354263997856075,289,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0,other_batch,True
1,0.336868,0.333202,0.329929,305660431977815877849633354263997856075,289,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0,other_batch,True
2,0.333176,0.344637,0.322187,305660431977815877849633354263997856075,289,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0,other_batch,True
3,0.338977,0.35353,0.307492,305660431977815877849633354263997856075,289,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0,other_batch,True
4,0.30815,0.336989,0.354861,305660431977815877849633354263997856075,289,207106_exposure320,B02,PlateMap_207106_exposure320,WT,0.0,other_batch,True
