In [20]:
import pandas as pd
import numpy as np
import datetime

In [21]:
# Load the raw drug table (tab-separated); the path is relative to the
# notebook's working directory.
drugs_df = pd.read_csv(
    '../data/raw/POG500_drug_data.tsv',
    sep='\t',
)

## Utility functions

In [22]:
def zscore(X, mu, sigma):
    """Return the standard score ``(X - mu) / sigma``.

    Args:
        X: Observed value.
        mu: Mean to centre on.
        sigma: Standard deviation to scale by.

    Returns:
        ``(X - mu) / sigma``. With float operands, a NaN in any argument
        yields NaN.
    """
    return (X - mu) / sigma


# Element-wise wrapper around ``zscore``. ``otypes`` is passed explicitly:
# without it np.vectorize infers the output dtype by calling the function on
# the first element, which raises ValueError when the inputs are empty.
vect_zscore = np.vectorize(zscore, otypes=[float])

In [23]:
def get_days_since_biopsy(row):
    """Count the days a treatment ran from the biopsy onwards.

    Args:
        row: Mapping (e.g. a DataFrame row) holding ISO-format date strings
            under 'start_date', 'end_date' and 'biopsy_date'.

    Returns:
        -1 if the treatment ended before the biopsy; otherwise the number of
        days from the later of (treatment start, biopsy) to the treatment end.
    """
    parse_iso = datetime.date.fromisoformat
    tx_start = parse_iso(row['start_date'])
    tx_end = parse_iso(row['end_date'])
    biopsy = parse_iso(row['biopsy_date'])

    # Treatment finished before the biopsy: flag with the -1 sentinel.
    if tx_end < biopsy:
        return -1

    # Only time on or after the biopsy counts, so clip an earlier start.
    window_start = biopsy if tx_start < biopsy else tx_start
    return (tx_end - window_start).days

## Determine days_on_tx_since_biopsy

In [24]:
# Apply get_days_since_biopsy to every row; rows whose treatment ended before
# the biopsy receive the -1 sentinel.
drugs_df['days_on_tx_since_biopsy'] = drugs_df.apply(
    get_days_since_biopsy,
    axis='columns',
)

In [25]:
# 'response' is a copy of the raw day count computed above (not the z-score).
drugs_df['response'] = drugs_df['days_on_tx_since_biopsy']

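Keep only rows whose `days_on_tx_since_biopsy` is strictly greater than 0. This drops treatments that ended before the biopsy (flagged as -1) and treatments with a zero-day duration.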

In [26]:
# Boolean mask: True where the treatment ran for at least one day after biopsy.
drugs_df = drugs_df.loc[drugs_df['days_on_tx_since_biopsy'].gt(0)]

## Compute per-drug mean and standard deviation of days_on_tx_since_biopsy

In [27]:
# Per-drug standard deviation and mean of the treatment duration.
# The duration column is selected *before* aggregating and the pandas groupby
# methods are called directly. Aggregating the whole frame with np.std/np.mean
# pushes every other column through the aggregation as well (recent pandas
# versions raise TypeError on non-numeric columns), and passing NumPy callables
# to agg is deprecated. GroupBy.std is the sample standard deviation (ddof=1),
# so a drug with a single row gets NaN.
durations_by_drug = drugs_df.groupby('drug_name')['days_on_tx_since_biopsy']
drugs_stds = durations_by_drug.std()
drugs_means = durations_by_drug.mean()

# Discard statistics columns left by an earlier run of this cell so that
# re-running it does not collide with the suffixed names created below.
drugs_df = drugs_df.drop(
    columns=['days_on_tx_since_biopsy_std', 'days_on_tx_since_biopsy_mean'],
    errors='ignore',
)

# Both Series are named 'days_on_tx_since_biopsy', which clashes with the
# existing column; rsuffix renames the incoming one to *_std / *_mean.
drugs_df = drugs_df.join(drugs_stds, on='drug_name', rsuffix='_std')
drugs_df = drugs_df.join(drugs_means, on='drug_name', rsuffix='_mean')

## Calculate zscores

In [28]:
# Standardise each duration against the mean and std of its own drug.
drugs_df['zscore'] = vect_zscore(
    drugs_df['days_on_tx_since_biopsy'],
    drugs_df['days_on_tx_since_biopsy_mean'],
    drugs_df['days_on_tx_since_biopsy_std'],
)

## Filter out NaN zscores

In [29]:
# Keep rows with a defined z-score (a NaN mean or std yields a NaN z-score).
drugs_df = drugs_df[drugs_df['zscore'].notna()]

## Print to file

In [30]:
# Write the filtered table as TSV. The DataFrame index is written as the
# first column (to_csv default).
drugs_df.to_csv(
    '../data/processed/drugs_filtered.tsv',
    sep='\t',
)

In [33]:
# IPython automagic (%pwd): shows the kernel's working directory, i.e. the
# directory the relative '../data/...' paths in this notebook resolve against.
pwd

'/projects/eerhan_prj/eerhan_prj_results/pog500_expression/notebooks'

In [34]:
# Preview the final filtered table. This replaces a stray reference to the
# undefined name `X`, which raised NameError and halted Restart & Run All.
drugs_df.head()