In [None]:
import pandas as pd
import scanpy as sc
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

### X

In [None]:
# Read the clustered mean-expression table (first CSV column as the index), then
# discard the first two rows and the first remaining column.
# NOTE(review): presumably the dropped rows/column carry annotations rather than
# expression values; if so the remaining columns may still be object dtype --
# confirm the values are numeric before they are averaged below.
df = pd.read_csv(
    "../data/perturb/clustered_mean_gene_expression_figs2-4.csv",
    sep=",",
    index_col=0,
    low_memory=False,
).iloc[2:, 1:]

# Relabel every column with the second "_"-separated token of its name
# (perturbation name -> perturbed gene name).
df.columns = pd.Series(df.columns).str.split("_", expand=True)[1].values

# Put perturbations on the rows and average the experiments that share the
# same perturbed gene.
df = df.T
df.index.name = 'perturb'
df = df.groupby('perturb').mean().sort_index()

# Sort the columns by label as well.
col_names = sorted(df.columns)
df = df.loc[:, col_names]

# Replace every missing value with 0 (the perturbed gene is set to have 0
# expression), then transpose back so that perturbations are the columns.
df = df.fillna(0).T

# Clear the axis names before saving.
df.index.name = None
df.columns.name = None

# Persist the labelled table, the bare matrix, and the row labels of the matrix.
df.to_csv("../result/data/df_gene_perturb", sep="\t")
X = df.values
np.savetxt("../result/data/X_gene_perturb", X)
pd.Series(df.index).to_csv("../result/data/genes_gene_perturb", header=False, index=False)

### Y

In [None]:
# Load the AnnData object with the normalized bulk expression profiles.
data = sc.read_h5ad("../data/perturb/K562_gwps_normalized_bulk_01.h5ad")

# Observations flagged in the 'core_control' column of .obs.
ctrl_id = list(data.obs.loc[data.obs['core_control']].index)

# Average the control experiments: one mean value per variable.
ctrl_exp = data.to_df().loc[ctrl_id].mean()

# Relabel the variables with the 'gene_name' annotation from .var.
id_to_name = data.var['gene_name'].to_dict()
ctrl_exp.index = ctrl_exp.index.map(id_to_name)

# Keep the genes that label the rows of `df`, average entries that ended up
# with the same label, and return them in exactly the row order of `df`.
# The groupby key 'gene_id' is the name of the Series index (the relabelling
# above keeps the original index name).
gene_order = list(df.index)
ctrl_selected = ctrl_exp[gene_order]
ctrl_by_gene = ctrl_selected.groupby('gene_id').mean()
Y = ctrl_by_gene[gene_order].values
np.savetxt("../result/data/Y_gene_perturb", Y)

### train_test_split

In [None]:
# Reload the saved matrices from the same relative location they were written
# to, instead of a hard-coded absolute path that only exists on one machine.
X = np.loadtxt("../result/data/X_gene_perturb")
Y = np.loadtxt("../result/data/Y_gene_perturb")

# Two-stage split with a fixed seed for reproducibility:
#   1) hold out 10% of the rows as the test set;
#   2) take 22.22% of the remaining 90% (about 20% of all rows) as validation.
# The result is roughly a 70 / 20 / 10 train / valid / test partition.
X2, X_test, Y2, Y_test = train_test_split(X, Y, test_size=0.1, random_state=100)
X_train, X_valid, Y_train, Y_valid = train_test_split(X2, Y2, test_size=0.2222, random_state=100)

In [None]:
# Write each split as a tab-separated file with no header row and no index
# column. The files are written in the same order as listed here.
split_outputs = [
    (X_train, "../result/data/X_train"),
    (X_valid, "../result/data/X_valid"),
    (X_test, "../result/data/X_test"),
    (Y_train, "../result/data/Y_train"),
    (Y_valid, "../result/data/Y_valid"),
    (Y_test, "../result/data/Y_test"),
]
for split_array, out_path in split_outputs:
    pd.DataFrame(split_array).to_csv(out_path, sep="\t", header=False, index=False)

### Save gene names in the row order of X used by the model (train, then valid, then test)

In [None]:
# Row labels of X, as written next to the matrix.
genes = pd.read_csv("../result/data/genes_gene_perturb", header=None)[0].values

# Reload the matrices from the same relative location they were written to,
# instead of a hard-coded absolute path that only exists on one machine.
X = np.loadtxt("../result/data/X_gene_perturb")
Y = np.loadtxt("../result/data/Y_gene_perturb")

# Attach the gene names as the row index so they travel with the rows
# through the split.
X_df = pd.DataFrame(X)
X_df.index = genes

# Repeat the two-stage split with test_size 0.1 then 0.2222 and
# random_state=100 on both calls.
# NOTE(review): this is presumably meant to reproduce the row assignment of
# the unlabeled array split; that only holds if both use identical
# settings and the same number of rows -- keep them in sync.
X2_2, X_test_2, Y2_2, Y_test_2 = train_test_split(X_df, Y, test_size=0.1, random_state=100)
X_train_2, X_valid_2, Y_train_2, Y_valid_2 = train_test_split(X2_2, Y2_2, test_size=0.2222, random_state=100)

# Gene names in train, then valid, then test order, one per line.
genes = list(X_train_2.index) + list(X_valid_2.index) + list(X_test_2.index)
pd.DataFrame(genes).to_csv("../result/X1.gene", header=False, index=False)