In [None]:
import pandas as pd
from pathlib import Path
from drevalpy.datasets.curvecurator import preprocess, _exec_curvecurator, postprocess

## Load Data

In [None]:
# Path to the raw input table, relative to the working directory.
# The next cell parses it with pandas as a tab-separated file.
RAW_FILE_PATH = Path("RawDataDrugsSingleAgents.txt")

In [None]:
# Parse the tab-separated raw table.
raw_df = pd.read_csv(RAW_FILE_PATH, sep="\t")

# Lowest and highest concentration per row across the five dose columns
# D1_CONC ... D5_CONC.
conc_columns = [f'D{k}_CONC' for k in range(1, 6)]
dose_values = raw_df[conc_columns]
raw_df['MIN_DOSIS'] = dose_values.min(axis=1)
raw_df['MAX_DOSIS'] = dose_values.max(axis=1)

# The BARCODE column is not carried any further.
raw_df = raw_df.drop(columns='BARCODE')
raw_df

In [None]:
# Reshape to long format: every (CONC<n>, INTENSITY<n>) pair becomes its own row,
# numbered by MEASURE_NUM, while the identifying columns are kept as the index.
# NOTE(review): the columns are renamed purely by position, so this relies on the
# raw file's column order matching the list below — confirm against the input file.
id_columns = ['Model', 'ID', 'DRUG_ID']
plate_columns = ['Control', 'Blank', 'mindose', 'maxdose']
raw_df.columns = (
    id_columns
    + [f'CONC{k}' for k in range(1, 6)]
    + [f'INTENSITY{k}' for k in range(1, 6)]
    + plate_columns
)
raw_df_long = pd.wide_to_long(
    df=raw_df,
    stubnames=['CONC', 'INTENSITY'],
    i=id_columns + plate_columns,
    j="MEASURE_NUM",
)

## Creating replicate information

In [None]:
# 'ID' is treated as the cell line ID. 'Model' identifies the original tumor sample;
# 'ID' identifies the mouse, i.e. the same tumor sample may be implanted into
# different mice (passages). Technical replicates are numbered per
# (Model, ID, DRUG_ID, CONC) combination.
replicate_keys = ['Model', 'ID', 'DRUG_ID', 'CONC']
raw_df_long['replicate'] = raw_df_long.groupby(replicate_keys).cumcount()

# NOTE(review): 'replicate' is zero-based, so 'nreplicates' holds the highest replicate
# index per (Model, ID, DRUG_ID), i.e. one less than the number of replicates —
# confirm that this is the intended meaning.
curve_keys = replicate_keys[:-1]
raw_df_long['nreplicates'] = raw_df_long.groupby(curve_keys)['replicate'].transform('max')

# Order by ID and DRUG_ID, then turn the index levels back into ordinary columns.
raw_df_long = raw_df_long.sort_values(by=['ID', 'DRUG_ID']).reset_index()

In [None]:
# One row per distinct (ID, DRUG_ID, mindose, maxdose) combination, exported as CSV.
dosage_columns = ["ID", "DRUG_ID", "mindose", "maxdose"]
drug_dosages = raw_df_long.loc[:, dosage_columns].drop_duplicates()
drug_dosages.to_csv("drug_dosages_Bruna.csv", index=False)

### Drug name to PubChem mapping

In [None]:
import pubchempy as pcp

# Lookup tables, all keyed by the drug name as it appears in DRUG_ID.
results = {}        # drug name -> CID of the first PubChem hit
chembls = {}        # drug name -> first synonym starting with "CHEMBL", or None
manual_review = {}  # drug name -> placeholder for names without any PubChem hit
inchis = {}         # drug name -> InChI of the first PubChem hit

# Compute the list of distinct drug names once instead of on every progress print.
drug_names = raw_df_long['DRUG_ID'].unique()
n_drugs = len(drug_names)

for i, drug in enumerate(drug_names):
    # Progress output every tenth drug.
    if i % 10 == 0:
        print(i, '/', n_drugs)
    compounds = pcp.get_compounds(identifier=drug, namespace="name")
    if len(compounds) == 0:
        # No hit: remember the name so that an identifier can be assigned by hand later.
        manual_review[drug] = [compounds]
        continue
    best_hit = compounds[0]
    results[drug] = best_hit.cid
    # Store the InChI per drug. The previous plain assignment (`inchis = ...`)
    # replaced the whole dictionary with the string of the latest hit.
    inchis[drug] = best_hit.inchi
    # First CHEMBL-like synonym, or None when there is none (or no synonyms at all).
    chembls[drug] = next(
        (syn for syn in (best_hit.synonyms or []) if syn.startswith("CHEMBL")),
        None,
    )

In [None]:
# Report the lookup outcome before adding any further entries.
print(f'{len(results)} matches, {len(manual_review)} not found')

# Names that were matched but whose CID is None also need a manual identifier.
missing_cid = {name: cid for name, cid in results.items() if cid is None}
manual_review.update(missing_cid)
manual_review

In [None]:
# Hand-assigned identifiers for the drug names listed for manual review.
manual_review.update({
    '681640': 'Bruna_681640',
    'GW843682X (AN-13)': '9826308',
    'Olaparib(1495) + Temozolomide(1375)': '23725625_5394',
})

In [None]:
def sanitize(val):
    """Convert an identifier value to text where a conversion is defined.

    Lists are joined into one comma-separated string of their elements,
    integers are turned into their string form, and every other value is
    returned unchanged.
    """
    if isinstance(val, list):
        return ','.join(map(str, val))
    return str(val) if isinstance(val, int) else val
# Combine automatic and manual identifiers; manual entries win on duplicate names.
combined_ids = {**results, **manual_review}
cids = pd.Series(combined_ids, name="CID").apply(sanitize).astype(str)
cids

In [None]:
# Attach the CID (inner join) and the CHEMBL synonym (left join) to every measurement,
# rename the columns for the export, and write the table to disk.
chembls = pd.Series(chembls, name="CHEMBL")
export_names = {
    'Model': 'Mouse_ID',
    'ID': 'sample',
    'DRUG_ID': 'drug_name',
    'CONC': 'dose',
    'INTENSITY': 'response',
    'CID': 'drug',
}
with_cids = raw_df_long.merge(cids, left_on='DRUG_ID', right_index=True)
with_chembl = with_cids.merge(chembls, left_on='DRUG_ID', right_index=True, how='left')
full_df = with_chembl.rename(columns=export_names)
full_df.to_csv('viabilities_processed.csv', index=False)
full_df

### DrEvalPy: run CurveCurator

In [None]:
# Pass the exported viability table to drevalpy's CurveCurator preprocessing step.
# NOTE(review): what exactly is written into 'curvecurator' is defined by drevalpy and
# not visible here — check the drevalpy documentation; `cores=6` is a machine-specific choice.
preprocess(input_file='viabilities_processed.csv', output_dir='curvecurator', dataset_name='Bruna', cores=6)

In [None]:
# Run drevalpy's CurveCurator execution helper on the directory used as output_dir above.
# NOTE(review): `_exec_curvecurator` is a private drevalpy helper (leading underscore);
# its behaviour may change between versions — confirm against the installed release.
_exec_curvecurator(Path('curvecurator'))

In [None]:
# Run drevalpy's postprocessing step for the 'Bruna' dataset on the same directory.
# NOTE(review): presumably this produces 'curvecurator/Bruna.csv', which is read
# further below — confirm against drevalpy.
postprocess('curvecurator', dataset_name='Bruna')

### Postprocess for Zenodo

In [None]:
# Curve-level results (presumably written by the postprocess step); both join keys
# are read as strings.
viability_df = pd.read_csv(
    'curvecurator/Bruna.csv',
    converters={'cell_line_name': str, 'pubchem_id': str},
)

# Per (sample, drug) annotations from the exported viability table.
# 'sample' and 'drug' are both read as strings so that they have the same dtype as
# the keys they are joined against (cell_line_name / pubchem_id). Previously
# 'sample' was left to dtype inference, so numeric-looking IDs would not line up
# with the string-typed cell_line_name.
annotation_columns = ['sample', 'drug', 'drug_name', 'CHEMBL', 'mindose', 'maxdose']
full_df = pd.read_csv(
    'viabilities_processed.csv',
    converters={'sample': str, 'drug': str},
)[annotation_columns].drop_duplicates()

viability_df = viability_df.merge(
    full_df,
    left_on=['cell_line_name', 'pubchem_id'],
    right_on=['sample', 'drug'],
)
viability_df

In [None]:
# Join the fitted curves with the originally published measures.
# 'ID' is read as a string so that it has the same dtype as cell_line_name, which
# is read as a string when the curve results are loaded; leaving it to dtype
# inference would break the join for numeric-looking IDs.
original_measures_df = pd.read_csv(
    'DrugResponsesAUCSamples.txt',
    sep='\t',
    converters={'ID': str},
)

# Columns of the original table that are not carried into the merged result.
unused_columns = ['Model', 'D1_CONC', 'D5_CONC', 'perc.iC50', 'cluster.superv']
original_measures_df = original_measures_df.drop(columns=unused_columns)

# Inner join: only (sample, drug name) pairs present in both tables are kept.
viability_df = viability_df.merge(
    original_measures_df,
    left_on=['cell_line_name', 'drug_name'],
    right_on=['ID', 'Drug'],
)
viability_df