In [None]:
from pathlib import Path

import feyn
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
import pysr

In [None]:
from feyn.plots import plot_model_summary

# Functions

In [None]:
def get_train_test_types(dataset, random_seed=1024, target="phenotype_reg"):
    """Prepare a dataset for modelling and split it into train and test parts.

    The columns "run" and "phenotype" are removed when present, rows that
    contain any missing value are dropped, and the remaining rows are split
    67/33 with stratification on ``target``.

    Args:
        dataset: pandas DataFrame holding the features and the ``target`` column.
        random_seed: seed forwarded to ``train_test_split`` as ``random_state``.
        target: name of the column used to stratify the split.

    Returns:
        A ``(train, test, stypes)`` tuple where ``stypes`` maps every
        non-numeric (object, string or categorical) column name to ``"c"``.
        Columns that are not listed are left out of the mapping.
    """
    UNWANTED_COLUMNS_FOR_TRAINING = ["run", "phenotype"]
    # Index.difference silently ignores names that are absent and returns the
    # remaining column names sorted.
    data = dataset[dataset.columns.difference(UNWANTED_COLUMNS_FOR_TRAINING)].dropna()

    def _is_categorical(dtype):
        # Plain object columns, pandas string dtypes and explicit categoricals
        # all hold labels rather than numbers; checking only for "object"
        # would miss the latter two.
        return (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )

    # Record the categorical columns; every other column gets no entry.
    stypes = {
        column: "c" for column, dtype in data.dtypes.items() if _is_categorical(dtype)
    }

    # Stratified split so both parts keep the class proportions of `target`.
    train, test = train_test_split(
        data, test_size=0.33, stratify=data[target], random_state=random_seed
    )

    return train, test, stypes

In [None]:
def get_best_model(
    training_data, stypes, target="phenotype_reg", epochs=20, random_seed=1024
):
    """Run a seeded QLattice classification search on the training data.

    Args:
        training_data: data handed to ``auto_run`` as ``data``.
        stypes: semantic-type mapping handed to ``auto_run`` as ``stypes``.
        target: name of the output column (``output_name``).
        epochs: number of epochs (``n_epochs``).
        random_seed: seed used to create the ``feyn.QLattice``.

    Returns:
        Whatever ``auto_run`` returns. Despite the function name this is the
        full result, not a single model; callers in this notebook take
        element ``[0]`` when they need one model.
    """
    search_settings = dict(
        data=training_data,
        output_name=target,
        kind="classification",
        stypes=stypes,
        n_epochs=epochs,
        criterion="bic",
    )
    lattice = feyn.QLattice(random_seed=random_seed)
    return lattice.auto_run(**search_settings)

#    best = models[0]
#    return best


In [None]:
def modsum(models, train, test):
    """Build a summary table with one row per model.

    Args:
        models: iterable of models exposing ``roc_auc_score``,
            ``accuracy_score``, ``bic``, ``features``, ``sympify`` and
            ``loss_value``.
        train: data used for the "Train" metrics.
        test: data used for the "Test" metrics.

    Returns:
        A pandas DataFrame. "Model" is the 1-based position as a string; the
        AUC, accuracy and BIC columns are rounded to two decimals and stored
        as strings; "N. Features" is ``len(model.features)``; "Functional
        form" is the ``sympify`` result; "Loss" is ``model.loss_value``.
    """
    column_names = [
        'Model', 'AUC Train', 'AUC Test', 'Accuracy Train', 'Accuracy Test',
        'BIC', 'N. Features', 'Functional form', 'Loss',
    ]

    rows = []
    for position, candidate in enumerate(models, start=1):
        rows.append(
            (
                str(position),
                str(candidate.roc_auc_score(train).round(2)),
                str(candidate.roc_auc_score(test).round(2)),
                str(candidate.accuracy_score(train).round(2)),
                str(candidate.accuracy_score(test).round(2)),
                str(candidate.bic.round(2)),
                len(candidate.features),
                candidate.sympify(symbolic_lr=False, symbolic_cat=True, include_weights=False),
                candidate.loss_value,
            )
        )

    return pd.DataFrame(rows, columns=column_names)

In [None]:
def save_model(model, train, test, filename, output_dir="./Symbolic_results"):
    """Write a model's summary plot, signal plot and serialized form to disk.

    Three files are produced inside ``output_dir``:
    ``{filename}_summary.html``, ``{filename}_signal.svg`` and
    ``{filename}_model.json``.

    Args:
        model: object exposing ``plot``, ``plot_signal`` and ``save``.
        train: data passed to ``plot`` and ``plot_signal``.
        test: data passed to ``plot`` as the second positional argument.
        filename: prefix shared by the three output files.
        output_dir: destination folder; created if it does not exist yet.
    """
    # Without this, the very first run fails when the results folder is missing.
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    model.plot(train, test, filename=f"{output_dir}/{filename}_summary.html")
    model.plot_signal(train, filename=f"{output_dir}/{filename}_signal.svg")
    model.save(f"{output_dir}/{filename}_model.json")

# Dataset

In [None]:
# Load the source table. Later cells in this notebook read its "phenotype"
# and "phenotype_reg" columns.
dataset = pd.read_csv("./crpc_run2_normalizado.csv")

In [None]:
# CRPC vs Metastatic subset with "phenotype_reg" recoded to a 0/1 target
# (1 -> 0, then 2 -> 1).
# Built from `dataset` in a single expression so the cell is idempotent:
# with the recoding spread over self-reassigning cells, re-running the
# 1 -> 0 step after the 2 -> 1 step would collapse both classes to 0.
# NOTE(review): presumably CRPC is coded 1 and Metastatic 2 in the raw
# "phenotype_reg" column - confirm against the source table.
dataset_prim_met = (
    dataset.loc[dataset['phenotype'].isin(["CRPC", "Metastatic"])]
    .replace({"phenotype_reg": 1}, 0)
    .replace({"phenotype_reg": 2}, 1)
)

In [None]:
# Display the CRPC-vs-Metastatic subset for a visual check.
dataset_prim_met

In [None]:
# List the distinct target labels present in this subset.
pd.unique(dataset_prim_met["phenotype_reg"])

In [None]:
# Solid_Tissue_Normal vs Metastatic subset with "phenotype_reg" recoded so
# that the value 2 becomes 1.
# Built from `dataset` in a single expression so the cell can be re-run
# safely instead of reassigning the same name across cells.
# NOTE(review): presumably Solid_Tissue_Normal is already coded 0 and
# Metastatic 2 in the raw "phenotype_reg" column - confirm.
dataset_norm_met = (
    dataset.loc[dataset['phenotype'].isin(['Solid_Tissue_Normal', 'Metastatic'])]
    .replace({"phenotype_reg": 2}, 1)
)

In [None]:
# List the distinct target labels present in this subset.
pd.unique(dataset_norm_met["phenotype_reg"])

In [None]:
# Display the Normal-vs-Metastatic subset for a visual check.
dataset_norm_met

In [None]:
# Solid_Tissue_Normal vs CRPC subset. No recoding of "phenotype_reg" is
# applied here, unlike the other two subsets.
# NOTE(review): presumably these two classes are already coded 0 and 1 in
# "phenotype_reg" - confirm before using it as a binary target.
# NOTE(review): the variable name says "prim" while the filter label is
# "CRPC" - confirm the naming is intended.
dataset_norm_prim = dataset.loc[dataset['phenotype'].isin(["Solid_Tissue_Normal","CRPC"])]

In [None]:
# List the distinct phenotype names present in this subset.
# NOTE(review): the other two subsets inspect "phenotype_reg" (the column
# used as the modelling target) at this point - confirm "phenotype" is the
# intended column here.
pd.unique(dataset_norm_prim["phenotype"])

In [None]:
# Display the Normal-vs-CRPC subset for a visual check.
dataset_norm_prim

# Models

## MetNorm

In [None]:
# Stratified train/test split of the Normal-vs-Metastatic subset, plus the
# categorical column types needed by the model search.
training_NM, test_NM, stypes_NM = get_train_test_types(
    dataset=dataset_norm_met,
    random_seed=1024,
    target="phenotype_reg",
)

In [None]:
# Run the model search on the training part; the result holds all returned
# models, not just one.
model_NM = get_best_model(
    training_data=training_NM,
    stypes=stypes_NM,
    target="phenotype_reg",
    epochs=20,
    random_seed=1024,
)

In [None]:
# Write the plots and the serialized form of the first returned model.
save_model(
    model=model_NM[0],
    train=training_NM,
    test=test_NM,
    filename="norm_met",
)

In [None]:
# Summarise every returned model, export the table as CSV and display it.
df_summary_NM = modsum(models=model_NM, train=training_NM, test=test_NM)
df_summary_NM.to_csv("./Symbolic_results/model_MN_summary.csv", index=False)
df_summary_NM

## MetPri

In [None]:
# Stratified train/test split of the CRPC-vs-Metastatic subset, plus the
# categorical column types needed by the model search.
training_PM, test_PM, stypes_PM = get_train_test_types(
    dataset=dataset_prim_met,
    random_seed=1024,
    target="phenotype_reg",
)

In [None]:
# Run the model search on the training part; the result holds all returned
# models, not just one.
model_PM = get_best_model(
    training_data=training_PM,
    stypes=stypes_PM,
    target="phenotype_reg",
    epochs=20,
    random_seed=1024,
)

In [None]:
# Write the plots and the serialized form of the first returned model.
save_model(
    model=model_PM[0],
    train=training_PM,
    test=test_PM,
    filename="prim_met",
)

In [None]:
# Summarise every returned model, export the table as CSV and display it.
df_summary_PM = modsum(models=model_PM, train=training_PM, test=test_PM)
df_summary_PM.to_csv(
    "./Symbolic_results/model_PM_summary.csv",
    index=False,
)
df_summary_PM

## PriNorm

In [None]:
# Stratified train/test split of the Normal-vs-CRPC subset, plus the
# categorical column types needed by the model search.
training_NP, test_NP, stypes_NP = get_train_test_types(
    dataset=dataset_norm_prim,
    random_seed=1024,
    target="phenotype_reg",
)

In [None]:
# Run the model search on the training part; the result holds all returned
# models, not just one.
model_NP = get_best_model(
    training_data=training_NP,
    stypes=stypes_NP,
    target="phenotype_reg",
    epochs=20,
    random_seed=1024,
)

In [None]:
# Write the plots and the serialized form of the first returned model.
save_model(
    model=model_NP[0],
    train=training_NP,
    test=test_NP,
    filename="norm_prim",
)

In [None]:
# Summarise every returned model, export the table as CSV and display it.
df_summary_NP = modsum(models=model_NP, train=training_NP, test=test_NP)
df_summary_NP.to_csv(
    "./Symbolic_results/model_NP_summary.csv",
    index=False,
)
df_summary_NP