In [69]:
import pandas as pd
import numpy as np
import datetime

In [70]:
# Load the raw drug table from a tab-separated file.
drugs_df = pd.read_csv(
    '../data/raw/POG500_drug_data.tsv',
    sep='\t',
)

## Utility functions

In [71]:
def zscore(X, mu, sigma):
    """Return the standard score ``(X - mu) / sigma``.

    Works on scalars and on anything that supports ``-`` and ``/``
    (NumPy arrays, pandas Series).
    """
    return (X - mu)/sigma


def vect_zscore(X, mu, sigma):
    """Element-wise z-score of array-likes, returned as a NumPy array.

    The arithmetic already broadcasts, so no ``np.vectorize`` wrapper is
    needed. Unlike ``np.vectorize`` without ``otypes``, this also accepts
    empty inputs and returns an empty array for them.

    Parameters
    ----------
    X, mu, sigma : array-like or scalar
        Observed values, means and standard deviations. They are broadcast
        against each other; pandas indexes are ignored (values are matched
        by position).

    Returns
    -------
    numpy.ndarray
        ``(X - mu) / sigma`` for every element. A ``sigma`` of zero yields
        ``inf``/``nan`` under NumPy's division rules; NaN inputs propagate.
    """
    values = np.asarray(X)
    means = np.asarray(mu)
    stds = np.asarray(sigma)
    # np.asarray on the result keeps the return type an ndarray even when
    # all three inputs are scalars (NumPy would otherwise return a scalar).
    return np.asarray(zscore(values, means, stds))

In [72]:
def get_days_since_biopsy(row):
    """Count the days of a treatment course that fall after the biopsy.

    Parameters
    ----------
    row : mapping
        Must provide 'start_date', 'end_date' and 'biopsy_date' as ISO
        formatted date strings (YYYY-MM-DD).

    Returns
    -------
    int
        -1 when the treatment ended before the biopsy; otherwise the number
        of days from the later of (treatment start, biopsy) to treatment end.
    """
    parse_date = datetime.date.fromisoformat
    tx_start = parse_date(row['start_date'])
    tx_end = parse_date(row['end_date'])
    biopsy = parse_date(row['biopsy_date'])

    # Treatment was over before the biopsy was taken: flag with a sentinel.
    if tx_end < biopsy:
        return -1

    # Treatment that began before the biopsy only counts from the biopsy on.
    counted_from = biopsy if tx_start < biopsy else tx_start
    return (tx_end - counted_from).days

## Determine days_on_tx_since_biopsy

In [73]:
# Row-wise: each row's start/end/biopsy dates go through get_days_since_biopsy.
drugs_df['days_on_tx_since_biopsy'] = drugs_df.apply(
    get_days_since_biopsy,
    axis='columns',
)

Remove entries with days <= 0 (treatments that ended before the biopsy are marked -1; zero-day entries are dropped as well)

In [74]:
# Keep only rows with a strictly positive day count (drops the -1 sentinel and zeros).
drugs_df = drugs_df.loc[drugs_df['days_on_tx_since_biopsy'].gt(0)]

## Determine zscores for days_on_tx

In [75]:
# Per-drug spread and centre of days_on_tx_since_biopsy.
# Select the column *before* aggregating: aggregating the whole frame would
# also try to reduce the string/date columns, and passing np.std / np.mean
# as the aggregator relies on pandas substituting its own implementation.
days_by_drug = drugs_df.groupby('drug_name')['days_on_tx_since_biopsy']
drugs_stds = days_by_drug.std()    # sample standard deviation (ddof=1)
drugs_means = days_by_drug.mean()

# Both Series share the name of an existing column, so rsuffix produces
# 'days_on_tx_since_biopsy_std' and 'days_on_tx_since_biopsy_mean'.
drugs_df = drugs_df.join(drugs_stds, on='drug_name', rsuffix='_std')
drugs_df = drugs_df.join(drugs_means, on='drug_name', rsuffix='_mean')
# Drugs that occur only once get a NaN std here (and hence a NaN z-score).

## Calculate zscores

In [78]:
# Standardise each row's day count against its drug's mean and std.
drugs_df['zscore'] = vect_zscore(
    drugs_df['days_on_tx_since_biopsy'],
    drugs_df['days_on_tx_since_biopsy_mean'],
    drugs_df['days_on_tx_since_biopsy_std'],
)


## Filter out NaN zscores

In [96]:
# Keep only rows whose z-score is defined (not NaN).
drugs_df = drugs_df.loc[drugs_df['zscore'].notna()]

## Print to file

In [97]:
# Write the filtered table as TSV.
# NOTE(review): the row index is written too (to_csv default), which shows up
# as an unnamed first column on re-read -- confirm downstream readers expect it.
drugs_df.to_csv(
    '../data/processed/drugs_filtered.tsv',
    sep='\t',
)

In [98]:
# Display the final filtered table for a visual sanity check.
drugs_df

Unnamed: 0,pog_id,drug_name,start_date,end_date,days_on_tx,treatment_number,biopsy_date,before_biop,cancer_cohort,cancer_type,...,bx_loc_radiated,total_brachy_dose,total_radio_dose,class_1,class_2,pathway_class,days_on_tx_since_biopsy,days_on_tx_since_biopsy_std,days_on_tx_since_biopsy_mean,zscore
0,POG864,TRAMETINIB,2017-10-26,2018-01-17,83,4,2017-05-10,f,OV,Low-Grade Serous Ovarian Cancer,...,f,0,0,MEK inhibitor (broad),kinase inhibitor,MAPK pathway,83,32.233523,102.000000,-0.589448
2,POG122,IRINOTECAN,2016-03-30,2016-07-20,112,5,2014-09-12,f,COLO,Colorectal Adenocarcinoma,...,f,0,2000,topoisomerase I inhibitor,DNA synthesis inhibitor,DNA synthesis,112,203.928693,186.488372,-0.365267
3,POG878,IRINOTECAN,2018-01-17,2018-01-31,14,4,2017-05-17,f,PANC,Pancreatic Adenocarcinoma,...,t,0,0,topoisomerase I inhibitor,DNA synthesis inhibitor,DNA synthesis,14,203.928693,186.488372,-0.845827
4,POG175,GEMCITABINE,2015-02-23,2015-03-23,28,3,2015-02-04,f,BRCA,Breast Invasive Ductal Carcinoma,...,f,0,5040,DNA synthesis inhibitor,antimetabolite,DNA synthesis,28,102.607759,100.456522,-0.706151
5,POG270,GEMCITABINE,2015-12-14,2016-05-10,148,2,2015-05-22,f,HCC,Hepatocellular Carcinoma,...,f,0,6400,DNA synthesis inhibitor,antimetabolite,DNA synthesis,148,102.607759,100.456522,0.463352
6,POG804,GEMCITABINE,2016-04-06,2017-04-18,377,1,2017-04-03,t,BRCA,Breast Invasive Ductal Carcinoma,...,t,0,6000,DNA synthesis inhibitor,antimetabolite,DNA synthesis,15,102.607759,100.456522,-0.832847
9,POG049,FLUDROCORTISONE,2013-12-11,2014-06-03,174,1,2013-11-05,f,ACC,Adrenocortical Carcinoma,...,f,0,0,Chemotherapy adjuvant,misc,misc,174,22.627417,190.000000,-0.707107
13,POG638,BMS-986205,2016-11-01,2016-11-29,28,5,2016-08-23,f,LYMP,B-Cell Lymphoma,...,f,0,0,IDO inhibitor (targeted),immunotherapy,Immune system,28,57.775427,70.000000,-0.726953
14,POG381,GEFITINIB,2016-05-24,2017-08-29,462,1,2016-02-10,f,LUNG,Lung Adenocarcinoma,...,f,0,0,EGFR inhibitor (broad),kinase inhibitor,Receptor Kinase,462,167.253401,188.200000,1.637037
17,POG092,PACLITAXEL,2014-06-16,2014-08-14,59,6,2014-05-21,f,BRCA,Breast Invasive Ductal Carcinoma,...,t,0,8790,taxanes,mitotic inhibitor,mitotic inhibitor,59,90.130799,107.369048,-0.536654
