# Format the data matrix for the REP1k dataset

In [1]:
import pandas as pd
import numpy as np
import os
import sys
from tqdm import tqdm

In [2]:
# Locations of the three raw input tables.
# NOTE(review): the compound table path is tagged "v7" while the other two are
# tagged "v9" -- confirm these releases are meant to be combined.
matrix_csv = "../../../data/raw/rep-1m-public-tentative_v9-rep1m-data-matrix.csv"
cell_line_csv = "../../../data/raw/rep-1m-public-tentative_v9-rep1m-cell-line-meta-data.csv"
compound_csv = "../../../data/raw/rep-1m-public-tentative_v7-compound-dashboard-table.csv"

# The data matrix uses its first CSV column as the row index; the two metadata
# tables are read with a default integer index.
df = pd.read_csv(matrix_csv, index_col=0)
cell_line_df = pd.read_csv(cell_line_csv)
drug_df = pd.read_csv(compound_csv)

In [3]:
# Show the first row of the compound table to see which columns it provides.
drug_df.iloc[:1]


Unnamed: 0,BroadID,ColumnName,Name,PearsonScore,BimodalityCoefficient,ModelType,TopBiomarker(s),NumberOfSensitiveLines,Dose,ReplicateCorrelation,Target,DiseaseArea,ClinicalPhase,DrugType,MOA
0,BRD-K70642949-001-03-7,BRD-K70642949-001-03-7::2.5::PREP051,GSK2334470,,0.295389,,,10,2.5,0.43286,"AURKA, AURKB, PDPK1",,Preclinical,,phosphoinositide dependent kinase inhibitor


In [4]:
# Reshape the wide data matrix into long format: one row per
# (row_name, ColumnName) pair, with the matrix value stored in "LFC.cb".
#
# - var_name is passed as a plain string. A list is only meaningful when the
#   columns are a MultiIndex, and newer pandas releases reject a list otherwise.
# - The index is named "row_name" explicitly before reset_index(), so the
#   resulting column name does not depend on the index being unnamed (which is
#   what makes reset_index() emit a column called "index").
df_melt = (
    pd.melt(df, var_name="ColumnName", value_name="LFC.cb", ignore_index=False)
    .rename_axis(index="row_name")
    .reset_index()
)

In [5]:
# Trim both metadata tables down to the columns used from here on:
# the join key plus the annotations carried into the output.
cell_line_columns = ["ccle_name", "row_name"]
drug_columns = ["ColumnName", "Name", "BroadID"]

cell_line_df = cell_line_df[cell_line_columns]
drug_df = drug_df[drug_columns]

In [6]:
# Attach cell-line and compound annotations to every measurement.
# The join keys are spelled out instead of being inferred from whichever
# columns the frames happen to share, so an unexpected extra shared column
# cannot silently change the join. Left joins keep every measurement row;
# rows without matching metadata get NaN in the annotation columns.
out = (
    df_melt
    .merge(cell_line_df, how="left", on="row_name")
    .merge(drug_df, how="left", on="ColumnName")
)

In [7]:
# Print the row count, discard every row that has a missing value in any
# column, then print the row count again to show how many rows were removed.
rows_before = len(out)
print(rows_before)
out = out.dropna()
rows_after = len(out)
print(rows_after)

1082880
1050221


In [8]:
# Add three columns that hold the same constant value on every row.
# NOTE(review): the dose is hard-coded to 2.5; the ColumnName values shown in
# the compound table preview embed a dose field (e.g. "...::2.5::...") --
# confirm 2.5 applies to every column before relying on it.
# NOTE(review): "fake_culture" looks like a placeholder value -- confirm what
# downstream code expects in this column.
out = out.assign(
    pert_idose=2.5,
    culture="fake_culture",
    pert_dose_unit="uM",
)

In [9]:
# rename columns
out.rename(columns={"Name": "pert_name", "BroadID": "pert_mfc_id"}, inplace=True)

In [10]:
# Keep only the columns written to the response files, in this order.
output_columns = [
    "LFC.cb",
    "ccle_name",
    "pert_name",
    "pert_idose",
    "pert_mfc_id",
    "culture",
    "pert_dose_unit",
]
out = out[output_columns]

In [51]:
# Save one response CSV per individual drug.
response_dir = "../../../data/responses"
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(response_dir, exist_ok=True)

# Characters replaced by "_" when turning a drug name into a file name.
filename_table = str.maketrans({ch: "_" for ch in " /(),'+"})

# A single groupby pass replaces building a full-length boolean mask for every
# drug. sort=False yields the drugs in order of first appearance, matching the
# order of out.pert_name.unique(), and each group keeps its original row order.
for drug, out_drug in tqdm(out.groupby("pert_name", sort=False)):
    drug_name = drug.translate(filename_table)
    out_drug.to_csv(
        "{}/{}.csv".format(response_dir, drug_name), index=False)

  0%|          | 0/1270 [00:00<?, ?it/s]

100%|██████████| 1270/1270 [00:58<00:00, 21.76it/s]


# Example Command

In [None]:
# Example shell command -- run it from a terminal, not in this kernel.
# It is kept as a comment because a bare shell command in a code cell is not
# valid Python and would abort "Run All" with a SyntaxError.
# NOTE(review): the paths below are absolute and user-specific; adjust them
# for your own environment.
#
# python /scratch/users/nphill22/projects/corsello_lab/rep1k_analysis/refract/scripts/run_training.py \
#     --response_path /scratch/users/nphill22/projects/corsello_lab/rep1k_analysis/data/responses/mivebresib.csv \
#     --feature_path /scratch/users/nphill22/projects/corsello_lab/final_xgboost_model/data/pkl_depmap_public-22q1-305b_v24/x-all.pkl \
#     --output_dir /scratch/users/nphill22/projects/corsello_lab/rep1k_analysis/output

# Scramble data for all drugs

In [22]:
# Save one CSV per individual drug with the "LFC.cb" values shuffled among
# that drug's rows; every other column is left untouched.
scrambled_dir = "../../../data/scrambled_responses"
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(scrambled_dir, exist_ok=True)

# Fixed seed so that re-running this cell regenerates identical scrambled files.
scramble_rng = np.random.default_rng(0)

# Characters replaced by "_" when turning a drug name into a file name.
filename_table = str.maketrans({ch: "_" for ch in " /(),'+"})

# A single groupby pass replaces building a full-length boolean mask for every
# drug; sort=False yields the drugs in order of first appearance.
for drug, drug_rows in tqdm(out.groupby("pert_name", sort=False)):
    # assign() returns a new frame, so the shuffled values never touch `out`.
    out_drug = drug_rows.assign(
        **{"LFC.cb": scramble_rng.permutation(drug_rows["LFC.cb"].values)}
    )
    drug_name = drug.translate(filename_table)
    out_drug.to_csv(
        "{}/{}.csv".format(scrambled_dir, drug_name), index=False)

  0%|          | 0/1270 [00:00<?, ?it/s]

100%|██████████| 1270/1270 [00:58<00:00, 21.63it/s]
