In [1]:
import numpy as np
import pandas as pd
import math

In [3]:
# The GDSC1 and GDSC2 fitted dose-response tables (comma-separated files).
gdsc1, gdsc2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "GDSC1_fitted_dose_response_25Feb20.csv",
        "GDSC2_fitted_dose_response_25Feb20.csv",
    )
)
# The drugsCom train/test splits are tab-separated, hence sep="\t".
train = pd.read_csv("drugsCom_raw/drugsComTrain_raw.tsv", sep="\t")
test = pd.read_csv("drugsCom_raw/drugsComTest_raw.tsv", sep="\t")

## Drug Review Datasets

In [4]:
# Stack the train and test review splits into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it each
# split keeps its own row labels, so the same label can occur twice (once per
# split) and label-based lookups on the combined frame become ambiguous.
drug_review = pd.concat([train, test], axis=0, ignore_index=True)

In [6]:
# Preview the first rows of the combined review frame.
drug_review.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37


In [7]:
# Discard the "Unnamed: 0" column that was read in from the raw files.
drug_review = drug_review.drop(columns=["Unnamed: 0"])

In [10]:
# Remove every row that has a missing value in any column
# (the original author recorded 213869 rows remaining after this step).
drug_review = drug_review.dropna(axis=0, how="any")
drug_review.head()

Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27
1,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192
2,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17
3,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10
4,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37


In [12]:
# Show a sample of rows whose condition contains "Cance". This matches the
# truncated spelling "Cance" as well as the correct "Cancer", because "Cance"
# is a prefix of it.
drug_review.loc[lambda reviews: reviews["condition"].str.contains("Cance")].head()

Unnamed: 0,drugName,condition,review,rating,date,usefulCount
54,Opdivo,Non-Small Cell Lung Cance,"""My mother died from lung cancer. Her last hop...",1.0,"March 2, 2017",6
181,Ibrance,"Breast Cancer, Metastatic","""I was prompted to write this after reading so...",9.0,"November 23, 2016",48
317,Anastrozole,Breast Cance,"""I have been on anastozole for a year and half...",6.0,"June 27, 2016",40
904,Femara,"Breast Cancer, Adjuvant","""joint pain, no energy, no enthusiasm for soci...",10.0,"May 13, 2016",39
1059,Xalkori,Non-Small Cell Lung Cance,"""My Father was given 6-8 weeks to live after h...",10.0,"December 19, 2015",11


In [14]:
# Repair conditions where "Cancer" was truncated to "Cance".
# Only a whole-word "Cance" is rewritten: a plain substring replace would also
# hit the "Cance" inside a correctly spelled "Cancer" and turn it into "Cancerr".
# The pattern no longer matches once the "r" is present, so re-running this
# cell leaves the column unchanged.
drug_review["condition"] = drug_review["condition"].str.replace(
    r"\bCance\b", "Cancer", regex=True
)

In [16]:
# Upper-case every drug name in the reviews so the values can be compared
# with the upper-cased DRUG_NAME values of the cancer drug dataset.
drug_review["drugName"] = drug_review.drugName.str.upper()

In [30]:
# to CSV (export is disabled; uncomment the line below to write the frame to disk)
# NOTE: to_csv also writes the row index as an unnamed first column unless
# index=False is passed.
# drug_review.to_csv("drugReviews.csv")

## Cancer Drug Dataset

In [17]:
# Upper-case DRUG_NAME in both GDSC tables (each frame is modified in place).
for gdsc_table in (gdsc1, gdsc2):
    gdsc_table["DRUG_NAME"] = gdsc_table["DRUG_NAME"].str.upper()

In [18]:
# Plain Python lists of the DRUG_NAME values (one entry per row, duplicates kept).
drugs1 = gdsc1["DRUG_NAME"].tolist()
drugs2 = gdsc2["DRUG_NAME"].tolist()

In [20]:
# Shape of the subset of reviews whose drug name occurs in at least one of
# the two GDSC drug lists.
drug_review.loc[
    lambda reviews: reviews["drugName"].isin(drugs1) | reviews["drugName"].isin(drugs2)
].shape

(913, 6)

In [21]:
# Report (rows, columns) of each GDSC table, one per line.
print(gdsc1.shape, gdsc2.shape, sep="\n")

(310904, 19)
(135242, 19)


In [22]:
# Stack GDSC1 on top of GDSC2 (row-wise) to get a single cancer drug table.
gdsc = pd.concat((gdsc1, gdsc2), axis=0)

In [23]:
# Dimensions (rows, columns) of the combined GDSC table.
gdsc.shape

(446146, 19)

In [25]:
# Give each putative target its own row.
# The 18 columns listed below are moved into the index, so the split/explode
# only touches what is left as data (presumably just PUTATIVE_TARGET, given
# the 19 columns of the combined table), while the index values are repeated
# for every target of a row.
gdsc = gdsc.set_index(['DATASET', 'NLME_RESULT_ID', 'NLME_CURVE_ID', 'COSMIC_ID',
       'CELL_LINE_NAME', 'SANGER_MODEL_ID', 'TCGA_DESC', 'DRUG_ID',
       'DRUG_NAME', 'PATHWAY_NAME', 'COMPANY_ID',
       'WEBRELEASE', 'MIN_CONC', 'MAX_CONC', 'LN_IC50', 'AUC', 'RMSE',
       'Z_SCORE']).apply(
    # Split on commas, emit one row per piece, then strip surrounding
    # whitespace so a value written as "A, B" yields "A" and "B" rather than
    # "A" and " B" (which would otherwise count as a different target).
    lambda column: column.str.split(',').explode().str.strip()
)

In [26]:
# to CSV (export is disabled; uncomment the line below to write the frame to disk)
# reset_index() turns the index levels back into ordinary columns before writing.
# gdsc.reset_index().to_csv("gdsc.csv")