# MOFA for multi-omics data segmentation on ulcerative colitis (UC) colon tissue

### Imports

In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from mofapy2.run.entry_point import entry_point
from sparc_multiomics.utils import (
    prepare_table_MOFA,
    prepare_covariates_MOFA,
)

### Data Loading

In [4]:
# Input tables: one per omics layer plus the collapsed sample metadata.
PARQUET_PATHS = {
    "proteomics": "proteomics_processed.parquet",
    "transcriptomics": "transcriptomics_batch_corrected.parquet",
    "genomics": "genomics_annotated.parquet",
    "metadata": "collapsed_metadata.parquet",
}

# Read order is kept as proteomics -> transcriptomics -> genomics -> metadata.
proteomics = pd.read_parquet(PARQUET_PATHS["proteomics"])
transcriptomics = pd.read_parquet(PARQUET_PATHS["transcriptomics"])
genomics = pd.read_parquet(PARQUET_PATHS["genomics"])
metadata = pd.read_parquet(PARQUET_PATHS["metadata"])

INFO:aiobotocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


### Data processing

Subset the data to include only colon samples from ulcerative colitis (UC) patients.

In [8]:
# Row masks: UC diagnosis and colon tissue; only rows satisfying both are kept.
is_uc_patient = metadata["diagnosis"] == "uc"
is_colon_sample = metadata["simple_tissue"] == "colon"
subset_metadata = metadata[is_uc_patient & is_colon_sample]

# Column labels of the metadata subset.
metadata_columns = subset_metadata.columns

Columns of interest for later analysis

In [10]:
# Metadata columns that are one-hot encoded further down in the notebook.
# The grouping below is by column name only; the resulting order is unchanged.
clinical_columns = (
    # Patient and sample descriptors
    [
        "sex",
        "diagnosis",
        "endo_category",
        "sub_location",
        "characteristics_bio_material",
        "macroscopic_appearance",
    ]
    # Activity and symptom variables
    + [
        "mayo_6_score",
        "scdai_score",
        "mayo_9_score",
        "disease_activity_60",
        "perianal",
        "abdominal_pain_score",
        "global_assessment_score",
        "rectal_bleeding_score",
        "stool_freq_score",
    ]
    # Batch and tissue labels
    + ["batch", "simple_tissue"]
)

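Keep only the samples that have all three omics measurements (proteomics, transcriptomics and genomics array); rows missing any of them are dropped.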

In [11]:
# A row is kept only if none of the three omics reference columns is missing.
required_omics_columns = ["proteomics", "transcriptomics", "genomics_array"]
no_na_metadata = subset_metadata.dropna(subset=required_omics_columns)

One-hot encode the clinical metadata variables for later use as covariates, and build `patient_interval_id` (patient ID + interval ID) as the unique sample identifier.

In [12]:
# One dummy-encoded frame per clinical column (each prefixed with the name of
# its source column), joined column-wise into a single frame.
one_hot_encoded_metadata = pd.concat(
    [
        pd.get_dummies(no_na_metadata[column_name], prefix=column_name)
        for column_name in clinical_columns
    ],
    axis=1,
)
one_hot_encoded_metadata_columns = one_hot_encoded_metadata.columns.tolist()

# Original metadata columns followed by the dummy columns.
encoded_metadata = pd.concat([no_na_metadata, one_hot_encoded_metadata], axis=1)

# Patient ids are turned into strings with a "P" prefix; the sample identifier
# is "<patient_id>_<interval_id>".
prefixed_patient_id = "P" + encoded_metadata["patient_id"].astype(str)
interval_label = encoded_metadata["interval_id"].astype(str)
encoded_metadata["patient_id"] = prefixed_patient_id
encoded_metadata["patient_interval_id"] = (
    prefixed_patient_id.astype(str) + "_" + interval_label
)

Standardize the transcriptomics features

In [14]:
# Feature matrix: every column except `sample_id` and `original_batch`.
transcriptomics_features = transcriptomics.drop(columns=["sample_id", "original_batch"])
transcriptomics_columns = transcriptomics_features.columns.tolist()

# Standardize the features and rebuild a frame with the same index and columns.
scaler = StandardScaler()
transcriptomics_features = pd.DataFrame(
    data=scaler.fit_transform(transcriptomics_features),
    index=transcriptomics_features.index,
    columns=transcriptomics_features.columns,
)

# Re-attach the sample identifier (aligned on the index) so the table can be merged later.
transcriptomics_features["sample_id"] = transcriptomics["sample_id"]
print(transcriptomics_features.shape, transcriptomics.shape)

(3304, 19442) (3304, 19443)


Standardize the proteomics features

In [15]:
# Feature matrix: every column except `sample_id`.
proteomics_features = proteomics.drop(columns=["sample_id"])

# Standardize the features and rebuild a frame with the same index and columns.
scaler = StandardScaler()
proteomics_features = pd.DataFrame(
    data=scaler.fit_transform(proteomics_features),
    index=proteomics_features.index,
    columns=proteomics_features.columns,
)

# Feature names are recorded before the identifier column is added back.
proteomics_columns = proteomics_features.columns.tolist()
proteomics_features["sample_id"] = proteomics["sample_id"]
print(proteomics_features.shape, proteomics.shape)

(2250, 2940) (2250, 2940)


Standardize the genomics features (only needed as input for MOFA) and drop the features that have zero variance.

In [16]:
# Feature matrix: every column except `sample_id`. The name `genomics_features`
# is reused at each step so no extra copy of this wide table is kept alive.
genomics_features = genomics.drop(columns=["sample_id"])

# Standardize the features and rebuild a frame with the same index and columns.
scaler = StandardScaler()
genomics_features = pd.DataFrame(
    data=scaler.fit_transform(genomics_features),
    index=genomics_features.index,
    columns=genomics_features.columns,
)

# Keep only the features whose variance is strictly positive.
varying_feature_mask = genomics_features.var().gt(0.0)
genomics_features = genomics_features.loc[:, varying_feature_mask]

# Feature names are recorded before the identifier column is added back.
genomics_columns = genomics_features.columns.tolist()
genomics_features["sample_id"] = genomics["sample_id"]

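Merge all omics types onto the metadata one after another, then keep only the samples whose macroscopic appearance is `normal` or `inflamed`.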

In [21]:
# Step 1: attach the transcriptomics features; the metadata column
# `transcriptomics` is matched against the feature table's `sample_id`.
merged_transcriptomics = pd.merge(
    encoded_metadata,
    transcriptomics_features,
    how="left",
    left_on="transcriptomics",
    right_on="sample_id",
)

# Step 2: attach the proteomics features. The `sample_id` key left over from
# the previous merge is removed first, and the new one is removed afterwards.
merged_transcriptomics_proteomics = pd.merge(
    merged_transcriptomics.drop(columns=["sample_id"]),
    proteomics_features,
    how="left",
    left_on="proteomics",
    right_on="sample_id",
).drop(columns=["sample_id"])

# Step 3: attach the genomics features (the `sample_id` key is kept here).
merged_transcriptomics_proteomics_genomics = pd.merge(
    merged_transcriptomics_proteomics,
    genomics_features,
    how="left",
    left_on="genomics_array",
    right_on="sample_id",
)

# Keep only rows labelled "normal" or "inflamed".
is_normal_or_inflamed = merged_transcriptomics_proteomics_genomics[
    "macroscopic_appearance"
].isin(["normal", "inflamed"])
merged_transcriptomics_proteomics_genomics = (
    merged_transcriptomics_proteomics_genomics.loc[is_normal_or_inflamed]
)

### Format the data for MOFA

In [25]:
# Keyword arguments shared by every formatting call below: the sample
# identifier column and the column used to define the groups.
shared_mofa_kwargs = {
    "sample_id_column_name": "patient_interval_id",
    "group_column_name": "macroscopic_appearance",
}

# One block per omics view, each built from the same merged table.
transcriptomics_block = prepare_table_MOFA(
    merged_transcriptomics_proteomics_genomics,
    view_column_name="transcriptomics",
    features_columns_names=transcriptomics_columns,
    **shared_mofa_kwargs,
)
proteomics_block = prepare_table_MOFA(
    merged_transcriptomics_proteomics_genomics,
    view_column_name="proteomics",
    features_columns_names=proteomics_columns,
    **shared_mofa_kwargs,
)
genomics_block = prepare_table_MOFA(
    merged_transcriptomics_proteomics_genomics,
    view_column_name="genomics",
    features_columns_names=genomics_columns,
    **shared_mofa_kwargs,
)

# Covariates: the two one-hot encoded sex columns.
covariates = prepare_covariates_MOFA(
    merged_transcriptomics_proteomics_genomics,
    covariates_columns_names=["sex_female", "sex_male"],
    **shared_mofa_kwargs,
)

# Run MOFA

In [29]:
# Initialise the mofapy2 entry point. `ent` is the object on which the data,
# model and training options are set, and through which the model is built,
# run and saved in the following cells.
ent = entry_point()


        #########################################################
        ###           __  __  ____  ______                    ### 
        ###          |  \/  |/ __ \|  ____/\    _             ### 
        ###          | \  / | |  | | |__ /  \ _| |_           ### 
        ###          | |\/| | |  | |  __/ /\ \_   _|          ###
        ###          | |  | | |__| | | / ____ \|_|            ###
        ###          |_|  |_|\____/|_|/_/    \_\              ###
        ###                                                   ### 
        ######################################################### 
       
 
        


In [30]:
# scale_views=True scales each view to unit variance, which is good practice
# when the views have different ranges/variances. The default is False.
ent.set_data_options(scale_views=True)

Scaling views to unit variance...



In [31]:
# Stack the three per-view blocks row-wise, then order the rows by view,
# group and sample.
omics_blocks = [transcriptomics_block, proteomics_block, genomics_block]
ready_data = pd.concat(omics_blocks, axis=0)
ready_data = ready_data.sort_values(by=["view", "group", "sample"])

In [32]:
# Hand the stacked table to MOFA, with a Gaussian likelihood for each of the
# three views.
view_likelihoods = ["gaussian"] * 3
ent.set_data_df(ready_data, likelihoods=view_likelihoods)



Loaded group='inflamed' view='genomics' with N=46 samples and D=166896 features...
Loaded group='inflamed' view='proteomics' with N=46 samples and D=2939 features...
Loaded group='inflamed' view='transcriptomics' with N=46 samples and D=19441 features...
Loaded group='normal' view='genomics' with N=130 samples and D=166896 features...
Loaded group='normal' view='proteomics' with N=130 samples and D=2939 features...
Loaded group='normal' view='transcriptomics' with N=130 samples and D=19441 features...




In [33]:
model_options = {
    # The example uses 10 factors and the paper 25; the starting value does not
    # matter much because the number of factors goes down automatically anyway.
    "factors": 25,
    "spikeslab_weights": True,
    "ard_weights": True,
    "ard_factors": False,
}
train_options = {
    "convergence_mode": "fast",
    # When this option is left out, factors are not dropped; set to 2% because
    # that is the value referred to in the paper.
    "dropR2": 0.02,
    "gpu_mode": False,
    "seed": 42,
    "weight_views": True,
}
covariate_names = ["sex_female", "sex_male"]

# The call order is kept: model options, train options, covariates, smooth options.
ent.set_model_options(**model_options)
ent.set_train_options(**train_options)
ent.set_covariates(covariates, covariates_names=covariate_names)
# model_groups is set to False because it is not used here; the default (True)
# usually leads to an error.
ent.set_smooth_options(scale_cov=False, model_groups=False)


Model options:
- Automatic Relevance Determination prior on the factors: False
- Automatic Relevance Determination prior on the weights: True
- Spike-and-slab prior on the factors: False
- Spike-and-slab prior on the weights: True
Likelihoods:
- View 0 (genomics): gaussian
- View 1 (proteomics): gaussian
- View 2 (transcriptomics): gaussian



weight_views set to True. Weighting the ELBO (the objective function) based on the number of features per view

Loaded 2 covariate(s) for each sample...




In [34]:
# Build the model on `ent` once the data, options and covariates have been set.
ent.build()

In [35]:
# Train the model. The log prints, for every iteration, the ELBO, its change
# and the number of factors that remain.
ent.run()



######################################
## Training the model with seed 42 ##
######################################


ELBO before training: -63750969.73 

Iteration 1: time=9.44, ELBO=-6112203.44, deltaELBO=57638766.291 (90.41237575%), Factors=24
Iteration 2: time=8.98, ELBO=-5496251.98, deltaELBO=615951.452 (0.96618366%), Factors=23
Iteration 3: time=8.44, ELBO=-5237566.93, deltaELBO=258685.049 (0.40577430%), Factors=22
Iteration 4: time=8.28, ELBO=-5118076.33, deltaELBO=119490.605 (0.18743339%), Factors=21
Iteration 5: time=7.76, ELBO=-5041806.41, deltaELBO=76269.917 (0.11963727%), Factors=20
Iteration 6: time=7.44, ELBO=-4995609.84, deltaELBO=46196.575 (0.07246411%), Factors=19
Iteration 7: time=7.00, ELBO=-4962076.35, deltaELBO=33533.491 (0.05260075%), Factors=18
Iteration 8: time=6.78, ELBO=-4941093.44, deltaELBO=20982.908 (0.03291386%), Factors=17
Iteration 9: time=6.52, ELBO=-4919781.70, deltaELBO=21311.743 (0.03342968%), Factors=16
Iteration 10: time=5.95, ELBO=-4911521.81, d

In [36]:
# Save the model to an .h5 file; the relative path resolves against the
# current working directory.
ent.save(outfile="mofa_inflammation_uc_colon.h5")

Saving model in mofa_inflammation_uc_colon.h5...
upload: ./mofa_inflammation_cd_colon.h5 to s3://enveda-data-dx/mofa/mofa_inflammation_uc_colon.h5
