# Feature importance - supervised learning evaluation, transcriptomics

This notebook sets up the basis for a supervised learning-driven approach to evaluate feature importance, leveraging the multiple omics types available.

### Import Packages

In [None]:
import pickle

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from xgboost import XGBClassifier

import sparc_multiomics.constants as const
from sparc_multiomics.machine_learning import train_model

### Load data

Load the transcriptomics, genomics, proteomics and metadata tables. Merge them, keeping only colon samples with a UC or CD diagnosis.

In [None]:
# Transcriptomics: drop the `original_batch` column, then prefix every column
# with the omic name so column names stay distinguishable once tables are merged.
transcriptomics_data = (
    pd.read_parquet("transcriptomics_batch_corrected.parquet")
    .drop(columns=["original_batch"])
    .add_prefix("transcriptomics:")
)
# Feature columns are everything except the (prefixed) sample identifier.
transcriptomics_columns = transcriptomics_data.columns.drop(
    "transcriptomics:sample_id"
)

# Genomics and proteomics follow the same "prefix, then list features" pattern.
genomics = pd.read_parquet("genomics_annotated.parquet").add_prefix("genomics:")
genomics_columns = genomics.columns.drop("genomics:sample_id")

proteomics = pd.read_parquet("proteomics_processed.parquet").add_prefix("proteomics:")
proteomics_columns = proteomics.columns.drop("proteomics:sample_id")

# Metadata: keep colon rows that are not "ibd_unclassified" and that carry an
# identifier for each of the three omics layers.
metadata = pd.read_parquet("collapsed_metadata.parquet").loc[
    lambda frame: frame["diagnosis"].ne("ibd_unclassified")
    & frame[["transcriptomics", "genomics_array", "proteomics"]].notna().all(axis=1)
    & frame["simple_tissue"].eq("colon")
]

In [None]:
# Attach each omics table to the metadata in turn. Every join is a left join
# keyed on the metadata column holding that omic's sample identifier, so all
# metadata rows are retained.
merged_transcriptomics = metadata.merge(
    transcriptomics_data,
    how="left",
    left_on="transcriptomics",
    right_on="transcriptomics:sample_id",
)

merged_transcriptomics_proteomics = merged_transcriptomics.merge(
    proteomics,
    how="left",
    left_on="proteomics",
    right_on="proteomics:sample_id",
)

merged_transcriptomics_proteomics_genomics = merged_transcriptomics_proteomics.merge(
    genomics,
    how="left",
    left_on="genomics_array",
    right_on="genomics:sample_id",
)

### Processing data subsets

The data is split according to the available disease scores, and each subset is saved into a dictionary holding its features, labels, label mapping and sample identifiers. Furthermore, columns with low variance (**<5%**) are excluded, and ANOVA is applied against the target variable to identify the most useful features.

In [None]:
data_dictionary = {}
features_columns = [
    *transcriptomics_columns,
    *proteomics_columns,
    *genomics_columns,
]

# Label encoding for the binary diagnosis task.
mapping_dictionary = {"uc": 0, "cd": 1}

# Keep only rows whose diagnosis has a label in the mapping. Filtering on
# `notna()` alone would let any other diagnosis value through, and `.map` would
# then turn it into a NaN label. Missing diagnoses are excluded here as well,
# since NaN is not one of the mapped classes.
current_table = merged_transcriptomics_proteomics_genomics[
    merged_transcriptomics_proteomics_genomics["diagnosis"].isin(
        list(mapping_dictionary)
    )
]

print(
    f"\n=====diagnosis:{current_table['diagnosis'].value_counts()}=====\n",
)

# Bar plot of the class distribution.
sns.countplot(x="diagnosis", data=current_table)
plt.show()

features_table = current_table[features_columns]
labels_vector = current_table["diagnosis"].map(mapping_dictionary)

# Indices are reset so features, labels and ids share one positional index.
data_dictionary["diagnosis"] = {
    "features": features_table.reset_index(drop=True),
    "labels": labels_vector.reset_index(drop=True),
    "mapping": mapping_dictionary,
    "ids": current_table[["patient_id", "interval_id"]].reset_index(drop=True),
}

### Machine Learning

Train an Extreme Gradient Boosting classifier on our data, performing nested k-fold cross-validation.

In [None]:
# Each entry pairs an unfitted estimator with the hyperparameter grid handed to
# `train_model`.
# NOTE(review): `eval_metric` presumably does not change the fitted model unless
# `train_model` uses an evaluation set / early stopping; if so, tuning it only
# multiplies the search cost - confirm against `train_model`.
methods_dictionary_classification = {
    "XGB": [
        XGBClassifier(random_state=const.RANDOM_SEED),
        {
            "n_estimators": [100, 250, 500],
            "max_depth": [3, 5, 7],
            "eval_metric": ["logloss", "error", "auc", "aucpr"],
        },
    ],
}

output_file_name = "results/feature_importance_diagnosis.csv"

diagnosis_dataset = data_dictionary["diagnosis"]
observed_labels = diagnosis_dataset["labels"].unique()
unique_classes = len(observed_labels)
class_names = ", ".join(map(str, diagnosis_dataset["mapping"]))

print("$=$=$=$Processing diagnosis with XGB$=$=$=$")
print(f"Number of classes: {unique_classes}, which are: {class_names}")

estimator, predictions, features_importances, scaler = train_model(
    methods_dictionary_classification["XGB"],
    diagnosis_dataset,
    writing_tag="classification_diagnosis",
    identifier_column_tag="patient_id",
)

# Persist the fitted estimator to disk.
with open("results/uc_cd_prediction_model.pkl", "wb") as output_file:
    pickle.dump(estimator, output_file)

# NOTE(review): no code visible in this cell writes `output_file_name` before
# this message is printed; verify whether `train_model` produces that file.
print(f"Successfully generated:{output_file_name}")

def select_top_features(importances, omic, top_n=10):
    """Return the `top_n` rows of `importances` for one omic, highest first.

    Parameters
    ----------
    importances : pd.DataFrame
        Table with at least the columns "omic" and "Importance".
    omic : str
        Value of the "omic" column to keep (e.g. "transcriptomics").
    top_n : int, default 10
        Maximum number of rows to return.

    Returns
    -------
    pd.DataFrame
        Rows of the requested omic sorted by descending "Importance".
    """
    return (
        importances.loc[importances["omic"] == omic]
        .sort_values("Importance", ascending=False)
        .head(top_n)
    )


# One row per feature, indexed by the prefixed feature name ("<omic>:<name>").
importances_dataframe = pd.DataFrame(
    features_importances,
    index=list(data_dictionary["diagnosis"]["features"].columns),
)
importances_dataframe.columns = ["Importance"]
importances_dataframe["full_name"] = importances_dataframe.index

# Split on the FIRST ":" only (n=1). Without the limit, a feature name that
# itself contains ":" would expand into more than two columns and the
# two-column assignment below would fail.
importances_dataframe[["omic", "Gene Name"]] = importances_dataframe[
    "full_name"
].str.split(":", n=1, expand=True)

transcriptomics_importances = select_top_features(
    importances_dataframe, "transcriptomics"
)
proteomics_importances = select_top_features(importances_dataframe, "proteomics")
genomics_importances = select_top_features(importances_dataframe, "genomics")

# Make three bar plots, one for each omic, side by side in the same figure.
panel_specs = [
    # (title, top-feature table, omic prefix, bar colour, strip "_suffix" from label)
    ("Transcriptomics", transcriptomics_importances, "transcriptomics:", "#16EB96", False),
    ("Proteomics", proteomics_importances, "proteomics:", "#F4B183", True),
    ("Genomics", genomics_importances, "genomics:", "#3B3838", True),
]

fig, ax = plt.subplots(1, 3, figsize=(10, 10))
for current_ax, (title, omic_importances, prefix, bar_color, strip_suffix) in zip(
    ax, panel_specs
):
    # Human-readable tick labels: drop the omic prefix and, where requested,
    # everything after the first "_".
    tick_labels = [name.replace(prefix, "") for name in omic_importances.index]
    if strip_suffix:
        tick_labels = [label.split("_")[0] for label in tick_labels]

    # Plot against the full feature names and relabel afterwards: seaborn
    # aggregates rows sharing an x category, so using the shortened labels as x
    # would merge distinct features that collapse to the same label.
    # A single `color` replaces the per-bar palette list, which was built from
    # the wrong table for the genomics panel.
    sns.barplot(
        x=list(omic_importances.index),
        y=omic_importances["Importance"],
        ax=current_ax,
        color=bar_color,
    )
    current_ax.set_title(title)

    # Fix the tick positions before assigning labels, then style the y ticks
    # through tick_params so their text is still generated by the formatter.
    current_ax.set_xticks(range(len(tick_labels)))
    current_ax.set_xticklabels(tick_labels, rotation=90)
    current_ax.tick_params(axis="y", labelrotation=45, labelsize=6)

fig.subplots_adjust(wspace=0.5)
# Add a title to the figure
fig.suptitle("Feature importance for diagnosis using XGB")
fig.savefig("results/feature_importance_diagnosis.png")
plt.show()
# Release the figure instead of clearing whatever pyplot considers current.
plt.close(fig)

# Persist the full importance ranking and the model predictions.
importances_dataframe.sort_values(by=["Importance"], ascending=False).to_csv(
    "results/features_importance.csv", index=False
)
np.savetxt("results/uc_cd_predictions.csv", predictions, delimiter=",")