In [1]:
# Exploring statistics of the training data, using the drug-to-sample mapping.
#
# Goal: look for evidence supporting the assertion that a fixed number of 'plates' was used
# in the experimental process when the perturbations were made.
#
# Input data sets are in ../data
# New data sets produced by this work are in the data folder, below the current folder

In [2]:
import pandas as pd
import numpy as np

In [3]:
# sample -> drug mapping for the training set (drug ids are NOT provided for the test set)
dsdf = pd.read_csv(filepath_or_buffer='../data/train_drug.csv')

In [4]:
# training features, one row per sig_id
train_df = pd.read_csv(filepath_or_buffer='../data/train_features.csv')

In [5]:
dsdf.head()

Unnamed: 0,sig_id,drug_id
0,id_000644bb2,b68db1d53
1,id_000779bfc,df89a8e5a
2,id_000a6266a,18bb41b2c
3,id_0015fd391,8c7f86626
4,id_001626bd3,7cbed3131


In [6]:
# number of distinct drug_ids in the train set
dsdf["drug_id"].nunique()

3289

In [7]:
# the 20 drug_ids with the most rows in train (value_counts sorts by count, descending)
dsdf["drug_id"].value_counts().iloc[:20]

cacb2b860    1866
87d714366     718
9f80f3f77     246
8b87a7a83     203
5628cb3ee     202
d08af5d4b     196
292ab2c28     194
d50f18348     186
d1b47f29d     178
67c879e79      19
d488d031d      18
83a9ea167      18
52d1e6f43      18
30aa2f709      14
6b8b675cc      14
a7c2673c1      14
11f66c124      14
5d9bb0ebe      14
1a52478dc      14
8c91d6909      13
Name: drug_id, dtype: int64

In [8]:
# distribution of group sizes (a count of counts): for each "rows per drug" value,
# how many drugs have exactly that many rows
# takeaways from this info:
# 1866 - this is the actual ctl_vehicle - it is all over the samples, so many per plate
# 6 - this is a single occurrence of a DRUG (6 because there are 2 dose values and 3 time periods: 2x3 = 6)
(
    dsdf["drug_id"]
    .value_counts()   # rows per drug_id
    .value_counts()   # number of drug_ids sharing each row count
    .sort_values()    # ascending, rarest group sizes first
)

202        1
718        1
246        1
19         1
196        1
194        1
186        1
203        1
178        1
1866       1
3          2
18         3
2          3
4          3
8          4
11         4
14         6
13        25
12        64
5         66
1        129
7        196
6       2774
Name: drug_id, dtype: int64

In [9]:
# possible controls have occurrences > 100 and < 1000
# the idea is that these drugs appear on many plates, and possibly at multiple locations on each plate
# you would do that if you needed other 'control' variables

# rows per drug_id, computed once and reused for both bounds
drug_counts = dsdf["drug_id"].value_counts()
possible_control_ids = drug_counts.gt(100) & drug_counts.lt(1000)
# keep only the drug_ids whose flag is True
poss_control_values = possible_control_ids[possible_control_ids].index.tolist()

In [10]:
poss_control_values

['87d714366',
 '9f80f3f77',
 '8b87a7a83',
 '5628cb3ee',
 'd08af5d4b',
 '292ab2c28',
 'd50f18348',
 'd1b47f29d']

In [11]:
# attach each sample's drug_id to its training features (inner join on sig_id)
train_df2 = pd.merge(train_df, dsdf, how="inner", on="sig_id")

In [12]:
train_df2.head()

Unnamed: 0,sig_id,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,...,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99,drug_id
0,id_000644bb2,trt_cp,24,D1,1.062,0.5577,-0.2479,-0.6208,-0.1944,-1.012,...,0.2584,0.8076,0.5523,-0.1912,0.6584,-0.3981,0.2139,0.3801,0.4176,b68db1d53
1,id_000779bfc,trt_cp,72,D1,0.0743,0.4087,0.2991,0.0604,1.019,0.5207,...,0.7543,0.4708,0.023,0.2957,0.4899,0.1522,0.1241,0.6077,0.7371,df89a8e5a
2,id_000a6266a,trt_cp,48,D1,0.628,0.5817,1.554,-0.0764,-0.0323,1.239,...,-0.6297,0.6103,0.0223,-1.324,-0.3174,-0.6417,-0.2187,-1.408,0.6931,18bb41b2c
3,id_0015fd391,trt_cp,48,D1,-0.5138,-0.2491,-0.2656,0.5288,4.062,-0.8095,...,-0.6441,-5.63,-1.378,-0.8632,-1.288,-1.621,-0.8784,-0.3876,-0.8154,8c7f86626
4,id_001626bd3,trt_cp,72,D2,-0.3254,-0.4009,0.97,0.6919,1.418,-0.8244,...,0.0048,0.667,1.069,0.5523,-0.3031,0.1094,0.2885,-0.3786,0.7125,7cbed3131


In [13]:
# boolean for the drug_ids that have less than 100 occurrences
# dsdf["drug_id"].value_counts().lt(100)

In [14]:
# name that boolean (series)
# actual_drugids = dsdf["drug_id"].value_counts().lt(100)

In [15]:
# make a list of the drugs that return from applying that boolean to the full set
# actual_drugs_list = actual_drugids[dsdf["drug_id"]].tolist()

In [16]:
# here is the train data with controls removed
# train_drugs_only = train_df[actual_drugs_list]

In [17]:
# len(train_drugs_only)

In [18]:
# what are the samples for the drug known as "2289e7c53"?
# d_2289e7c53 = dsdf["drug_id"] == "2289e7c53"
# train_df[dsdf["drug_id"] == "2289e7c53"]

In [19]:
# show training rows for samples that are from drug: did
def get_train_drug(did, df=None):
    """Return the training rows whose drug_id equals `did`.

    The rows are selected using the frame's own "drug_id" column, so the
    result does not depend on any other frame sharing the same row order
    or index.

    Parameters
    ----------
    did : str
        The drug_id to select.
    df : pandas.DataFrame, optional
        Frame containing a "drug_id" column. When omitted, the notebook-level
        `train_df2` (training features merged with drug ids) is used.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels. Empty if no row
        has that drug_id.
    """
    if df is None:
        # resolved at call time, so the merge cell must have been run first
        df = train_df2
    return df.loc[df["drug_id"] == did]

In [20]:
# this just shows how that function works when you input a drug_id
d1 = get_train_drug('5628cb3ee')
d1.head()

Unnamed: 0,sig_id,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,...,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99,drug_id
48,id_008b48693,trt_cp,24,D1,2.52,-2.02,1.171,2.042,-1.807,0.5109,...,0.0071,-0.3901,0.5272,-1.065,0.0289,-0.2725,-0.1511,-0.4635,-0.1365,5628cb3ee
56,id_009379beb,trt_cp,24,D1,4.746,-3.991,1.395,1.806,-0.7316,3.249,...,0.612,-0.1706,-0.2873,0.9902,0.6887,-0.7908,0.0828,-1.372,0.4935,5628cb3ee
236,id_02758a23e,trt_cp,72,D1,2.825,-1.278,0.1994,0.1683,-1.466,0.2878,...,-0.792,-4.065,-0.754,-0.8609,-0.419,-4.299,-0.5327,-2.569,0.1003,5628cb3ee
309,id_034af0742,trt_cp,48,D1,1.866,-0.8712,0.6766,0.2664,-0.9588,0.9508,...,1.125,-0.1679,0.035,0.1707,-0.3857,0.6929,0.6949,0.6394,0.4659,5628cb3ee
506,id_05487da6c,trt_cp,48,D2,-0.3399,-1.786,-0.7389,0.6163,-2.514,-0.693,...,0.2141,-0.073,0.6851,1.244,0.5276,-0.8585,0.0389,0.5957,1.412,5628cb3ee


In [21]:
# dose levels and time points; every (time, dose) pair is checked per drug below
cp_dose = ['D1', 'D2']
cp_time = [24, 48, 72]

In [22]:
# For every candidate control drug, report how many samples fall into each
# (cp_time, cp_dose) combination. The row count per combination hints at how many
# plates there are, and whether a drug may appear more than once on a single plate.
for drug in poss_control_values:
    d1 = get_train_drug(drug)
    print("\n")
    for t in cp_time:
        # the time mask does not depend on the dose, so build it once per time point
        at_time = d1["cp_time"] == t
        for d in cp_dose:
            at_dose = d1["cp_dose"] == d
            s = d1[at_time & at_dose]
            print(f"drug_id: {drug}, cp_time: {t}, cp_dose: {d}: {len(s)} rows.")



drug_id: 87d714366, cp_time: 24, cp_dose: D1: 121 rows.
drug_id: 87d714366, cp_time: 24, cp_dose: D2: 114 rows.
drug_id: 87d714366, cp_time: 48, cp_dose: D1: 131 rows.
drug_id: 87d714366, cp_time: 48, cp_dose: D2: 116 rows.
drug_id: 87d714366, cp_time: 72, cp_dose: D1: 123 rows.
drug_id: 87d714366, cp_time: 72, cp_dose: D2: 113 rows.


drug_id: 9f80f3f77, cp_time: 24, cp_dose: D1: 41 rows.
drug_id: 9f80f3f77, cp_time: 24, cp_dose: D2: 38 rows.
drug_id: 9f80f3f77, cp_time: 48, cp_dose: D1: 45 rows.
drug_id: 9f80f3f77, cp_time: 48, cp_dose: D2: 41 rows.
drug_id: 9f80f3f77, cp_time: 72, cp_dose: D1: 40 rows.
drug_id: 9f80f3f77, cp_time: 72, cp_dose: D2: 41 rows.


drug_id: 8b87a7a83, cp_time: 24, cp_dose: D1: 34 rows.
drug_id: 8b87a7a83, cp_time: 24, cp_dose: D2: 31 rows.
drug_id: 8b87a7a83, cp_time: 48, cp_dose: D1: 36 rows.
drug_id: 8b87a7a83, cp_time: 48, cp_dose: D2: 34 rows.
drug_id: 8b87a7a83, cp_time: 72, cp_dose: D1: 34 rows.
drug_id: 8b87a7a83, cp_time: 72, cp_dose: D2: 34 rows