In [None]:
from pathlib import Path
import sys, os

import copy
import itertools
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from collections import defaultdict


### Set random seed
# Seed both the stdlib and the NumPy global generators with the same value
# so that stochastic steps are repeatable across runs.
seed = 101
random.seed(seed)
np.random.seed(seed)

# Data directory (absolute path on a mounted Google Drive).
# NOTE(review): hard-coded absolute path — adjust when running elsewhere.
data_path = Path("/content/drive/MyDrive/LMAB/LMAB2/phenotype project/data")
# NOTE(review): the label says "proj_path" but the value printed is data_path.
print(f"proj_path: {data_path}")
# Put the parent of the data directory on the import search path.
sys.path.append(str(data_path.parent))

proj_path: /content/drive/MyDrive/LMAB/LMAB2/phenotype project/data


<b>Prepare experimental dataset</b>

In [None]:
# Generate drug combinations
def generate_drug_combs(settings):
    """
    Build a concentration table where only ``num_drugs_used`` drug slots are filled per row.

    Every row has ``num_drugs`` columns; the drugs selected for that row get a
    concentration from a two-fold dilution series and all other columns stay 0.

    Parameters
    ----------
    settings : dict
        Must contain:
        ``"num_drugs"`` – total number of drugs (columns);
        ``"num_drugs_used"`` – number of drug slots filled per row;
        ``"dilution_start_2^x"`` / ``"dilution_stop_2^x"`` – base-2 exponents of the
        first and last concentration (both inclusive). The series may run in
        either direction (e.g. 7 -> -2 or -2 -> 7).

    Returns
    -------
    dilutions_combs : np.ndarray, shape (num_drugs_used, n_dilutions ** num_drugs_used)
        Every combination of concentrations for the used slots (one column each).
    drug_combs : np.ndarray, shape (num_drugs ** num_drugs_used, num_drugs_used)
        Every tuple of drug indices, repeats and permutations included.
    final_combs : pd.DataFrame
        De-duplicated concentration rows (first occurrence kept, original row
        numbers kept as index).
    """
    # Extract settings
    num_drugs = settings["num_drugs"]
    num_drug_used = settings["num_drugs_used"]    # number of drug slots per row
    dilution_start = settings["dilution_start_2^x"]
    dilution_stop = settings["dilution_stop_2^x"]

    # 1. Dilution series. The count comes from the absolute exponent gap so
    #    that an ascending range (start < stop) also yields a valid series.
    n_gradient = abs(dilution_start - dilution_stop) + 1
    dilutions = np.logspace(dilution_start, dilution_stop, n_gradient, base=2)
    dilutions = np.tile(dilutions, num_drug_used).reshape((num_drug_used, -1))
    print(f"{n_gradient} dilutions:")
    # One row per used slot, one column per concentration combination.
    dilutions_combs = np.array([grid.flatten() for grid in np.meshgrid(*dilutions)])

    # 2. Every tuple of drug indices: (n_combinations, num_drugs_used)
    drug_combs = np.concatenate(
        np.meshgrid(*[range(num_drugs)] * num_drug_used)
    ).reshape((num_drug_used, -1)).T
    print(f"Dilutions combinations:{dilutions_combs.shape}")

    # 3. For each index tuple, fill a whole block of rows at once.
    n_cons = dilutions_combs.shape[1]
    blocks = []
    for comb in drug_combs:
        block = np.zeros((n_cons, num_drugs))
        # Slots are written in order, so if a drug index repeats within the
        # tuple the value of the last slot is the one that remains.
        for slot, drug_idx in enumerate(comb):
            block[:, drug_idx] = dilutions_combs[slot]
        blocks.append(block)

    final_combs = pd.DataFrame(np.concatenate(blocks)).drop_duplicates()
    return dilutions_combs, drug_combs, final_combs

def full_cartesian_comb(settings):
    """
    Build the full Cartesian grid of concentrations over all drugs.

    Each drug takes every value of a two-fold dilution series plus 0 (drug absent).

    Parameters
    ----------
    settings : dict
        Must contain ``"num_drugs"``, ``"dilution_start_2^x"`` and
        ``"dilution_stop_2^x"`` (base-2 exponents of the first and last
        concentration, both inclusive; either direction is accepted).

    Returns
    -------
    pd.DataFrame
        One row per combination, one column per drug;
        ``(n_dilutions + 1) ** num_drugs`` rows.
    """
    num_drugs = settings["num_drugs"]
    dilution_start = settings["dilution_start_2^x"]
    dilution_stop = settings["dilution_stop_2^x"]

    # Number of non-zero concentrations. Using the absolute gap keeps the
    # count positive when the exponents are given in ascending order.
    n_dilutions = abs(dilution_start - dilution_stop) + 1
    series = np.logspace(dilution_start, dilution_stop, n_dilutions, base=2)
    # Append 0 so that "drug not present" is part of the grid.
    dilutions = np.concatenate((series, [0]))
    print(f"Number of dilutions: {n_dilutions}, {dilutions}")

    return pd.DataFrame(list(itertools.product(dilutions, repeat=num_drugs)))

In [None]:
# Settings that define the concentration design space. The same dict is
# consumed by generate_drug_combs / full_cartesian_comb.
drug_comb_settings = {"num_drugs": 4,
                      "num_drugs_used": 4,
                      "dilution_start_2^x": 7,  # base-2 exponent of the first concentration (2**7)
                      "dilution_stop_2^x": -2   # base-2 exponent of the last concentration (2**-2)
                      }
# Option A (disabled): fill only `num_drugs_used` drug slots per row
#dilutions_combs , drug_combs, drug_conc_plate = generate_drug_combs(drug_comb_settings)
# Option B (active): all (cartesian) combinations, including concentration 0
drug_conc_plate = full_cartesian_comb(drug_comb_settings)
print(drug_conc_plate.shape)
# Last expression of the cell: rendered as the cell output.
drug_conc_plate.head(2)


Number of dilutions: 10, [128.    64.    32.    16.     8.     4.     2.     1.     0.5    0.25
   0.  ]
(14641, 4)


Unnamed: 0,0,1,2,3
0,128.0,128.0,128.0,128.0
1,128.0,128.0,128.0,64.0


In [None]:
### Save the design table
# File annotation: each setting is rendered as "key@value", joined by commas.
drug_annot_string = ",".join(f"{key}@{value}" for key, value in drug_comb_settings.items())
save_name = "SET-UP-" + drug_annot_string + ".csv"
drug_conc_plate.to_csv(data_path / save_name)
print(f"Saved to{data_path/save_name}")

Saved to/content/drive/MyDrive/LMAB/LMAB2/phenotype project/data/SET-UP-num_drugs@4,num_drugs_used@4,dilution_start_2^x@7,dilution_stop_2^x@-2.csv


In [None]:
# Reload the design table written to data_path/save_name; the first CSV
# column is used as the row index.
all_combinations = pd.read_csv(data_path/save_name, index_col=0)
# NOTE(review): this is not the last expression of the cell, so the preview
# is evaluated but never displayed.
all_combinations.head()
#all_combinations["viability"]
# Concentration table as a plain NumPy array.
X = all_combinations.to_numpy()

### Dataset

Planned Experimental concentrations

In [None]:
# Uncomment to set the file name by hand when `save_name` is not defined in this session:
#save_name="SET-UP-num_drugs@4,num_drugs_used@4,dilution_start_2^x@7,dilution_stop_2^x@-2.csv"
# Full planned concentration grid (first CSV column is the row index).
experiment_space_setup = pd.read_csv(data_path/save_name, index_col=0)
# Set-up table of the initial batch (per the file name).
batch_0_setup = pd.read_csv(data_path/"SET-UP-Initial_Experiment-Batch.csv", index_col=0)
# Sort the rows by every column, left to right; the original index labels are kept.
batch_0_setup = batch_0_setup.sort_values(by=list(batch_0_setup.columns))

In [None]:
# Report the size of both tables.
print("Experiment space:", experiment_space_setup.shape, "Initial batch:", batch_0_setup.shape)
# NOTE(review): only the last expression of a cell is rendered, so this first
# preview is evaluated but not shown.
experiment_space_setup.head(2)
batch_0_setup.head(2)

Experiment space: (14641, 4) Initial batch: (54, 4)


Unnamed: 0,0,1,2,3
1710086,0.0,0.0,0.5,2.0
918006,0.0,0.5,128.0,2.0


Real experimental result

In [None]:
# Load the measured results.
batch_0_result = pd.read_csv(data_path/"EXP_RESULT_batch0.csv",index_col=0)
# HOLD OUT set for testing model performance.
# NOTE(review): read without index_col=0, unlike the batch file — confirm that
# the extra column in its shape is not just the saved row index.
test_exp_result = pd.read_csv(data_path/"RECORD_TEST_results.csv")
# This cell only reads files, so the message reports a load rather than a save.
print(f"Loaded experimental results from {data_path}")
print("Batch 0:", batch_0_result.shape, "|| Test Hold-out:", test_exp_result.shape)
# Sort the batch result by every column. sort_values returns a new frame, so
# the result has to be assigned back for the sort to take effect.
batch_0_result = batch_0_result.sort_values(by=list(batch_0_result.columns))
batch_0_result.head(2)
#test_exp_result.head(2)

Saved additonal dataset as csv to /content/drive/MyDrive/LMAB/LMAB2/phenotype project/data
Batch 0: (54, 5) || Test Hold-out: (54, 6)


Unnamed: 0,(+)-Griseofulvin,(S)-(+)-Camptothecin,Chloramphenicol,Metformin,Cell viability
0,1.0,2.0,0.0,0.5,0.707
1,64.0,2.0,3.125,128.0,0.636


### Organize data
We need to organize the data as follows:


*   Experiment Sample Space needs to be the data to be queried (X_query)
*   Experiment sets:
    *   Training data: ground truth
    *   Test data: (same every time) used to evaluate model performance

*   To-do in each iteration:
    *   Remove (drop) the rows that have already been used (i.e., present in the training data)
    *   Save the rows that were used for query to output the next batch of experiment
    *   Save the learner, model as it goes



In [None]:
### We have initial training data, remove it from experiment space
def remove_rows_upon_query(exp_space_df: pd.DataFrame, X_queried_df: pd.DataFrame):
    """
    Return the rows of ``exp_space_df`` that do not occur in ``X_queried_df``.

    Rows are compared by value over all columns of ``exp_space_df``; index
    labels are ignored for the comparison and preserved in the result.
    Queried rows that are not part of the experiment space are ignored, i.e.
    this is a set difference and never adds rows.

    Parameters
    ----------
    exp_space_df : pd.DataFrame
        Remaining experiment (query) space.
    X_queried_df : pd.DataFrame
        Rows that have been queried. Must contain every column of
        ``exp_space_df``; extra columns are ignored.

    Returns
    -------
    pd.DataFrame
        ``exp_space_df`` without the queried rows. If ``X_queried_df`` lacks
        one of the required columns, a message is printed and ``exp_space_df``
        is returned unchanged.
    """
    key_cols = list(exp_space_df.columns)
    try:
        # De-duplicate so the left merge below cannot multiply rows.
        queried_keys = X_queried_df[key_cols].drop_duplicates()
    except KeyError:
        print("DataFrame Unchanged as Keys not all found.")
        return exp_space_df

    # A left merge keeps the order and number of the left rows; the indicator
    # column says, per row, whether a matching queried row exists.
    membership = exp_space_df.merge(
        queried_keys, how="left", on=key_cols, indicator=True
    )["_merge"]
    not_queried = (membership == "left_only").to_numpy()
    return exp_space_df.loc[not_queried]

def update_query_round(data_path: Path):
    """
    Create the missing ``QUERY_SPACE_batch{n}.csv`` files from the query rounds.

    For every file in ``data_path`` whose name starts with ``QUERY_ROUND`` and
    whose stem ends in a batch number ``n >= 1``, the rows of that round are
    removed from ``QUERY_SPACE_batch{n-1}.csv`` and the result is written to
    ``QUERY_SPACE_batch{n}.csv``. Rounds are processed in increasing batch
    order so that each step can build on the file written by the previous one.

    A batch is skipped when its output file already exists, or when the
    previous query space is missing (a message is printed in that case).
    The function assumes ``QUERY_SPACE_batch0.csv`` has already been created.

    Parameters
    ----------
    data_path : Path
        Directory holding the ``QUERY_ROUND*`` and ``QUERY_SPACE_batch*`` files.
    """
    def _trailing_batch_number(file_name):
        # Integer formed by the digits at the end of the file stem, or None.
        stem = Path(file_name).stem
        digits = stem[len(stem.rstrip("0123456789")):]
        return int(digits) if digits else None

    rounds = []
    for f in os.listdir(data_path):
        if not f.startswith("QUERY_ROUND"):
            continue
        batch = _trailing_batch_number(f)
        if batch is None or batch == 0:   # don't do 0
            continue
        rounds.append((batch, f))

    # Numeric order (batch 2 before batch 10), not lexicographic.
    for batch, f in sorted(rounds):
        print(f"Batch:{batch}")
        prev_space_fp = data_path / f"QUERY_SPACE_batch{batch - 1}.csv"
        new_space_fp = data_path / f"QUERY_SPACE_batch{batch}.csv"
        if new_space_fp.exists():
            continue   # already generated
        if not prev_space_fp.exists():
            print(f"Skipped: {prev_space_fp.name} not found.")
            continue
        batch_setup = pd.read_csv(data_path / f, index_col=0)
        batch_query_space = pd.read_csv(prev_space_fp, index_col=0)
        new_query_space = remove_rows_upon_query(batch_query_space, batch_setup)
        new_query_space.to_csv(new_space_fp)

# Naming: QUERY SPACE (in place of experiment space)
# 1. Manual: remove the initial batch from the full experiment space and save
#    the remainder as the batch-0 query space.
batch_0_query_space = remove_rows_upon_query(experiment_space_setup, batch_0_setup)
batch_0_query_space.to_csv(data_path/"QUERY_SPACE_batch0.csv")
print(batch_0_query_space.shape)
## 2. Automatic: process the QUERY_ROUND files found in data_path.
## Note that both steps run; step 2 relies on the batch-0 file written in step 1.
update_query_round(data_path)

(14587, 4)
Batch:1


In [None]:
# add batch results to all train data
def append_to_record(record_df_fp: Path, batch_df):
    """
    Append a batch of results to the cumulative record CSV.

    If the record file does not exist yet it is created from ``batch_df``.
    Otherwise the existing record is read (first column as index), the batch
    is appended, rows whose column values duplicate an earlier row are dropped
    (first occurrence kept) and the file is overwritten.

    Parameters
    ----------
    record_df_fp : Path
        Location of the record CSV.
    batch_df : pd.DataFrame
        Rows to add; expected to have the same columns as the record.
    """
    if not os.path.exists(record_df_fp):
        print(f"New record create for batch: {record_df_fp}")
        batch_df.to_csv(record_df_fp)
        return

    record_df = pd.read_csv(record_df_fp, index_col=0)
    updated_df = pd.concat([record_df, batch_df]).drop_duplicates(keep="first")
    updated_df.to_csv(record_df_fp)
    # Report the shape that was actually written, not the pre-append shape.
    print(f"appended to path: {str(record_df_fp)} \nRecord Shape: {updated_df.shape}")

def update_train_records(data_path: Path):
    """
    Fold every ``EXP_RESULT*`` file in ``data_path`` into the training record.

    Each matching file is read (first column as index) and appended to
    ``RECORD_TRAIN_result.csv`` in the same directory via ``append_to_record``,
    which drops rows that are already present.

    Parameters
    ----------
    data_path : Path
        Directory containing the ``EXP_RESULT*`` files and the record file.
    """
    # Sorted so that the processing order does not depend on os.listdir.
    exp_results_fns = sorted(f for f in os.listdir(data_path) if f.startswith("EXP_RESULT"))
    record_fp = data_path / "RECORD_TRAIN_result.csv"
    for f in exp_results_fns:
        # Read the individual result file, not the directory itself.
        batch_result = pd.read_csv(data_path / f, index_col=0)
        append_to_record(record_fp, batch_result)
    

# Create the record dataset that accumulates all training results
# 1. Manual: add the batch-0 results to RECORD_TRAIN_result.csv
append_to_record(data_path/"RECORD_TRAIN_result.csv", batch_0_result)
# 2. Automatic: add every EXP_RESULT* file found in data_path
update_train_records(data_path)

appended to path: /content/drive/MyDrive/LMAB/LMAB2/phenotype project/data/RECORD_TRAIN_result.csv 
Record Shape: (54, 5)


### Final data remarks
CATEGORY:

*   RECORD: contains all the data accumulated so far, in particular the training data (train/test)
*   QUERY_SPACE: the remaining experimental space to query
*   EXP: Unprocessed raw data (No modification needed)
*   SET-UP: planned experimental concentrations (No modification needed)

Working dataset (changing each batch):
*   EXP_RESULT_batch{x}.csv
    *    Added each time
*   RECORD_TRAIN_result.csv
    *    Append new experiment results to it
*   QUERY_SPACE_batch{x}.csv
    *    Each batch shrinks from previous 