In [1]:
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr

In [2]:
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression

### lasso

In [5]:
mse_final_list = []
p_final_list = []

for rs in range(10):
    # read data
    X_train = pd.read_csv("../../result/input_perturb_phyloP/%d/X_train_stratified" % rs, sep="\t", index_col=0).values
    X_valid = pd.read_csv("../../result/input_perturb_phyloP/%d/X_valid_stratified" % rs, sep="\t", index_col=0).values
    X_test = pd.read_csv("../../result/input_perturb_phyloP/%d/X_test_stratified" % rs, sep="\t", index_col=0).values
    Y_train = pd.read_csv("../../result/input_perturb_phyloP/%d/Y_train_stratified" % rs, sep="\t", index_col=0).values.reshape(-1)
    Y_valid = pd.read_csv("../../result/input_perturb_phyloP/%d/Y_valid_stratified" % rs, sep="\t", index_col=0).values.reshape(-1)
    Y_test = pd.read_csv("../../result/input_perturb_phyloP/%d/Y_test_stratified" % rs, sep="\t", index_col=0).values.reshape(-1)
    Y_test_gene = pd.read_csv("../../result/input_perturb_phyloP/%d/Y_test_stratified" % rs, sep="\t", index_col=0).index

    dag = pd.read_csv("../../result/network_perturb_phyloP/DAGMA_thresholdAdaptive.tsv", sep="\t", header=None)
    id2genes = pd.read_csv("../../result/network_perturb_phyloP/valid_genes", sep="\t").set_index("ID")['genes'].to_dict()
    dag[0] = dag[0].map(id2genes)
    dag[1] = dag[1].map(id2genes)
    dag_genes = list(set.union(set(dag[0]), set(dag[1])))
    test_inDAG = Y_test_gene.isin(dag_genes)
    
    # model
    para_list = np.arange(0.01,0.2,0.01)

    mse_list = []
    for alpha in para_list:
        lr = Lasso(alpha=alpha)
        lr.fit(X_train, Y_train)
        y_valid_pred = lr.predict(X_valid)
        mse_list.append(mean_squared_error(Y_valid, y_valid_pred))
    best_para = para_list[np.argmin(mse_list)]

    lr = Lasso(alpha=best_para)
    lr.fit(np.concatenate([X_train, X_valid]), np.concatenate([Y_train, Y_valid]))
    Y_pred = lr.predict(X_test)
    mse_final_list.append(mean_squared_error(Y_test, Y_pred))
    p_final_list.append(pearsonr(Y_test, Y_pred))

In [6]:
np.mean(mse_final_list), np.std(mse_final_list)

(0.16361379220214434, 0.061838294547170075)

In [7]:
np.mean(p_final_list), np.std(p_final_list)

(0.1612640836099545, 0.164969210231839)

### linear

In [8]:
mse_final_list = []
p_final_list = []

for rs in range(10):
    # read data
    X_train = pd.read_csv("../../result/input_perturb_phyloP/%d/X_train_stratified" % rs, sep="\t", index_col=0).values
    X_valid = pd.read_csv("../../result/input_perturb_phyloP/%d/X_valid_stratified" % rs, sep="\t", index_col=0).values
    X_test = pd.read_csv("../../result/input_perturb_phyloP/%d/X_test_stratified" % rs, sep="\t", index_col=0).values
    Y_train = pd.read_csv("../../result/input_perturb_phyloP/%d/Y_train_stratified" % rs, sep="\t", index_col=0).values.reshape(-1)
    Y_valid = pd.read_csv("../../result/input_perturb_phyloP/%d/Y_valid_stratified" % rs, sep="\t", index_col=0).values.reshape(-1)
    Y_test = pd.read_csv("../../result/input_perturb_phyloP/%d/Y_test_stratified" % rs, sep="\t", index_col=0).values.reshape(-1)
    Y_test_gene = pd.read_csv("../../result/input_perturb_phyloP/%d/Y_test_stratified" % rs, sep="\t", index_col=0).index

    dag = pd.read_csv("../../result/network_perturb_phyloP/DAGMA_thresholdAdaptive.tsv", sep="\t", header=None)
    id2genes = pd.read_csv("../../result/network_perturb_phyloP/valid_genes", sep="\t").set_index("ID")['genes'].to_dict()
    dag[0] = dag[0].map(id2genes)
    dag[1] = dag[1].map(id2genes)
    dag_genes = list(set.union(set(dag[0]), set(dag[1])))
    test_inDAG = Y_test_gene.isin(dag_genes)
    
    # model
    
    lr = LinearRegression()
    lr.fit(np.concatenate([X_train, X_valid]), np.concatenate([Y_train, Y_valid]))
    Y_pred = lr.predict(X_test)
    mse_final_list.append(mean_squared_error(Y_test, Y_pred))
    p_final_list.append(pearsonr(Y_test, Y_pred))

In [9]:
np.mean(mse_final_list), np.std(mse_final_list)

(2.091870480229457, 0.2278779535009667)

In [10]:
np.mean(p_final_list), np.std(p_final_list)

(0.177622474275748, 0.2249316084587924)