In [1]:
# Theory: 'possible control' drugs are likely placed on every plate.
# Goal: find out whether the MOA targets are 'similar' for these drugs (same dose and time) across their multiple samples,
# under the assumption that the multiple samples come from different plates.
#
# Input data sets are in ../data
# New data sets produced by this work are in the data folder, below the current folder

In [2]:
import pandas as pd
import numpy as np

In [3]:
# 'possible control' drug ids, carried over from a previous exercise
pc_drugids = [
    '87d714366',
    '9f80f3f77',
    '8b87a7a83',
    '5628cb3ee',
    'd08af5d4b',
    '292ab2c28',
    'd50f18348',
    'd1b47f29d',
]

In [4]:
# cp_time values used as group keys further down
times = [24, 48, 72]
# cp_dose values used as group keys further down
doses = ["D1", "D2"]

In [5]:
# sponsor-provided extra file that associates each sig_id with a drug_id
drug_map_path = '../data/train_drug.csv'
dsdf = pd.read_csv(drug_map_path)

In [6]:
# identifier columns to keep; the input features are not used in this exercise
id_cols = ['sig_id', 'cp_time', 'cp_dose']
# training data (full file)
train_df = pd.read_csv('../data/train_features.csv')
# independent copy holding only the identifier columns
train_df2 = train_df.loc[:, id_cols].copy()

In [7]:
# scored training targets
targets_path = '../data/train_targets_scored.csv'
train_targets_df = pd.read_csv(targets_path)

In [8]:
# train_targets_df.head()

In [9]:
# attach the drug_id to every training row by joining on sig_id
# (inner join: rows without a match on either side are dropped)
train_df3 = pd.merge(train_df2, dsdf, how='inner', on='sig_id')

In [10]:
# keep only the rows whose drug_id is in the pc_drugids list
is_pc_drug = train_df3['drug_id'].isin(pc_drugids)
train_df4 = train_df3.loc[is_pc_drug]

In [11]:
# bring in the target columns for the retained rows, joining on sig_id
# (inner join: rows without a match on either side are dropped)
train_df5 = pd.merge(train_df4, train_targets_df, how='inner', on='sig_id')

In [12]:
# one group per (drug_id, cp_time, cp_dose) combination
group_keys = ['drug_id', 'cp_time', 'cp_dose']
grouped = train_df5.groupby(group_keys)

In [13]:
# pull out a single drug / time / dose group to inspect by hand
example_key = ('9f80f3f77', 72, 'D1')
dg = grouped.get_group(example_key)

In [14]:
# an example column name
# (181 is a positional index into dg.columns, so the name it returns depends on the column order)
dg.columns[181]

'serotonin_receptor_antagonist'

In [15]:
# mean of one example target column over all the samples in this group
# (note: this is "11-beta-hsd1_inhibitor", not the column name displayed in the previous cell)
dg["11-beta-hsd1_inhibitor"].mean()

0.0

In [16]:
# mean of another target column for this group
# assuming 0/1 labels: the mean is 1.0 only if ALL the samples are 1, and 0.0 only if none are
dg["egfr_inhibitor"].mean()

0.0

In [17]:
# number of rows (samples) in this drug-time-dose group
dg.shape[0]

40

In [32]:
# add this column to the list of columns to skip when calculating the mean
# (guarded so that re-running this cell does not append a duplicate entry)
if 'drug_id' not in id_cols:
    id_cols.append('drug_id')

In [36]:
# go through all the possible control drug_ids, time, dose combos...
# columns excluded from the mean; 'drug_id' is added here explicitly so this cell
# does not depend on an earlier cell having appended it to id_cols
skip_cols = set(id_cols) | {'drug_id'}
for n in pc_drugids:
    for t in times:
        for d in doses:
            # get the group from the grouped dataframe
            try:
                dg = grouped.get_group((n,t,d))
            except KeyError:
                # no samples for this drug/time/dose combo - report it and keep going
                print(f"{n} - {t} - {d}    0 rows")
                continue
            # print header
            print(f"{n} - {t} - {d}    {len(dg)} rows")
            # mean of every column that isn't an id column, in one vectorized call
            target_cols = [col for col in dg.columns if col not in skip_cols]
            means = dg[target_cols].mean()
            # i is the position of the column among the non-id columns
            for i, (c, m) in enumerate(means.items()):
                # if the mean is greater than 0.0, then this column is an MOA
                if m > 0.0:
                    # so print it
                    print(f"{i} - {m} - {c}")

87d714366 - 24 - D1    121 rows
136 - 1.0 - nfkb_inhibitor
163 - 1.0 - proteasome_inhibitor
87d714366 - 24 - D2    114 rows
136 - 1.0 - nfkb_inhibitor
163 - 1.0 - proteasome_inhibitor
87d714366 - 48 - D1    131 rows
136 - 1.0 - nfkb_inhibitor
163 - 1.0 - proteasome_inhibitor
87d714366 - 48 - D2    116 rows
136 - 1.0 - nfkb_inhibitor
163 - 1.0 - proteasome_inhibitor
87d714366 - 72 - D1    123 rows
136 - 1.0 - nfkb_inhibitor
163 - 1.0 - proteasome_inhibitor
87d714366 - 72 - D2    113 rows
136 - 1.0 - nfkb_inhibitor
163 - 1.0 - proteasome_inhibitor
9f80f3f77 - 24 - D1    41 rows
109 - 1.0 - hmgcr_inhibitor
9f80f3f77 - 24 - D2    38 rows
109 - 1.0 - hmgcr_inhibitor
9f80f3f77 - 48 - D1    45 rows
109 - 1.0 - hmgcr_inhibitor
9f80f3f77 - 48 - D2    41 rows
109 - 1.0 - hmgcr_inhibitor
9f80f3f77 - 72 - D1    40 rows
109 - 1.0 - hmgcr_inhibitor
9f80f3f77 - 72 - D2    41 rows
109 - 1.0 - hmgcr_inhibitor
8b87a7a83 - 24 - D1    34 rows
63 - 1.0 - cdk_inhibitor
8b87a7a83 - 24 - D2    31 rows
63 - 1.