In [1]:
import os
import json

import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Relative location of the discrete-label JSON. Despite the `_dir` suffix this
# names a file, which is why the check below uses isfile.
label_path_parts = ["..", "data", "cleaned", "targets", "discrete-labels.json"]
label_dir = os.path.join(*label_path_parts)

# Cell result: True when the file is present.
os.path.isfile(label_dir)

True

In [3]:
# Relative location of the continuous-label JSON. Despite the `_dir` suffix
# this names a file, which is why the check below uses isfile.
continuous_path_parts = ["..", "data", "cleaned", "targets", "continuous-labels.json"]
continuous_dir = os.path.join(*continuous_path_parts)

# Cell result: True when the file is present.
os.path.isfile(continuous_dir)

True

In [4]:
# Load both target mappings. The encoding is passed explicitly so the JSON is
# decoded as UTF-8 on every platform instead of whatever the locale default is.
with open(label_dir, "r", encoding="utf-8") as f:
    labels = json.load(f)

with open(continuous_dir, "r", encoding="utf-8") as f:
    ic50 = json.load(f)

In [5]:
# True when every key of `labels` is also a key of `ic50` (key views compare like sets).
labels.keys() <= ic50.keys()

True

In [6]:
# Build one row per key of `labels`. Both value columns are looked up through
# the same key list so the three columns stay aligned.
pair_ids = list(labels)

data_dict = {
    "pair_id": pair_ids,
    "label": [labels[pid] for pid in pair_ids],
    "ic50": [ic50[pid] for pid in pair_ids],
}

df = pd.DataFrame(data_dict)

df.head()

Unnamed: 0,pair_id,label,ic50
0,ACH-000070_ALL_176870,0,0.693305
1,ACH-000137_GBM_176870,0,2.580268
2,ACH-000008_SKCM_176870,0,2.557837
3,ACH-000740_HNSC_176870,0,0.290013
4,ACH-000697_DLBC_176870,0,1.11025


In [7]:
# Split every pair_id on "_" once, then pull the individual fields out of the
# resulting token lists: first token, last token and second token respectively.
id_tokens = df["pair_id"].map(lambda pair: pair.split("_"))

df["cell_line_id"] = id_tokens.map(lambda tokens: tokens[0])
df["drug_id"] = id_tokens.map(lambda tokens: tokens[-1])
df["cancer_type"] = id_tokens.map(lambda tokens: tokens[1])

df.head()

Unnamed: 0,pair_id,label,ic50,cell_line_id,drug_id,cancer_type
0,ACH-000070_ALL_176870,0,0.693305,ACH-000070,176870,ALL
1,ACH-000137_GBM_176870,0,2.580268,ACH-000137,176870,GBM
2,ACH-000008_SKCM_176870,0,2.557837,ACH-000008,176870,SKCM
3,ACH-000740_HNSC_176870,0,0.290013,ACH-000740,176870,HNSC
4,ACH-000697_DLBC_176870,0,1.11025,ACH-000697,176870,DLBC


In [8]:
# Two-stage split, stratified on cancer_type both times: 30% of the rows are
# held out first, then that holdout is halved into validation and test.
train, holdout = train_test_split(
    df,
    test_size=0.30,
    stratify=df["cancer_type"],
    random_state=42,
)
val, test = train_test_split(
    holdout,
    test_size=0.50,
    stratify=holdout["cancer_type"],
    random_state=42,
)

In [9]:
# Report the (rows, columns) shape of each split.
for caption, frame in (("train shape:", train), ("val shape:", val), ("test shape:", test)):
    print(caption, frame.shape)

train shape: (70744, 6)
val shape: (15159, 6)
test shape: (15160, 6)


In [10]:
# `x in series` tests the Series *index labels*, not its values, so the check
# has to compare the pair_id values themselves.
# True when train and val share no pair_id.
set(train["pair_id"]).isdisjoint(val["pair_id"])

True

In [11]:
# `x in series` tests the Series *index labels*, not its values, so the check
# has to compare the pair_id values themselves.
# True when val and test share no pair_id.
set(val["pair_id"]).isdisjoint(test["pair_id"])

True

In [12]:
dest_dir = os.path.join("..", "data", "cleaned")

# Write one CSV per split, without the DataFrame index column.
for file_name, split in (("train.csv", train), ("validation.csv", val), ("test.csv", test)):
    split.to_csv(os.path.join(dest_dir, file_name), index=False)

In [13]:
# Preview the first five rows of the training split (row labels are carried over from df).
train.head(n=5)

Unnamed: 0,pair_id,label,ic50,cell_line_id,drug_id,cancer_type
84261,ACH-000198_LAML_11282283,0,0.21077,ACH-000198,11282283,LAML
8231,ACH-000191_THCA_216326,0,3.225571,ACH-000191,216326,THCA
6608,ACH-000767_SCLC_6918289,0,-1.497495,ACH-000767,6918289,SCLC
81445,ACH-000572_SKCM_56965967,1,2.489474,ACH-000572,56965967,SKCM
13297,ACH-000810_SKCM_300471,1,-2.64763,ACH-000810,300471,SKCM


In [14]:
# drug_id of the first training row; it is a string because it was sliced out of pair_id.
train["drug_id"].iloc[0]

'11282283'