In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports all
# modules before each cell executes so that edits to local source files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import os
import sys
# Add the parent directory to the module search path. Presumably the
# pyMultiOmics package imported by later cells lives one level above this
# notebook — verify against the repository layout.
sys.path.append('..')

In [None]:
from pyMultiOmics.common import set_log_level_warning
# The return value is bound to `_` so it is not echoed as the cell output.
# NOTE(review): presumably this raises the logging threshold to WARNING to
# keep the report free of info/debug messages — confirm in pyMultiOmics.common.
_ = set_log_level_warning()

In [None]:
from pyMultiOmics.loader import load_affinity_data
from pyMultiOmics.base import SingleOmicsData, MultiOmicsData
from pyMultiOmics.constants import CIC_COMPOUNDS, INFERENCE_T_TEST
from pyMultiOmics.analysis import AnalysisPipeline

# Analysis of CiC Affinity Biomarkers Data

In [None]:
# This cell is tagged 'parameters'
# file_name: the input file handed to load_affinity_data() in a later cell.
# None is only a placeholder default; the real value is presumably injected
# at execution time through the 'parameters' tag (e.g. by papermill) — TODO confirm.
file_name = None

In [None]:
# Echo the input file name so the executed notebook records which file was analysed.
print('Input file name is', file_name)

In [None]:
# Load the affinity dataset. The loader returns three objects, in this order:
# the measurement table, the per-sample metadata and the per-feature metadata.
loaded_tables = load_affinity_data(file_name)
data_df, sample_metadata_df, feature_metadata_df = loaded_tables

The following are characteristics of the data. Note that some analytes were dropped due to low detection in the samples.

In [None]:
# Summarise the loaded data: the first axis of data_df is reported as
# analytes and the second as samples, followed by the distinct values of the
# 'group' column in the sample metadata.
data_shape = data_df.shape
print('Number of analytes =', data_shape[0])
print('Number of samples =', data_shape[1])

group_names = sample_metadata_df['group'].unique().tolist()
print('Groups =', group_names)

In [None]:
# Wrap the measurements and sample metadata as a single-omics dataset of type
# CIC_COMPOUNDS, then register it with a (so far empty) multi-omics container.
cic_data = SingleOmicsData(
    CIC_COMPOUNDS,
    data_df,
    sample_metadata_df,
)

mo = MultiOmicsData()
mo.add_data([cic_data])

## 1. Data Processing

To process the data, first we perform log transformation followed by min-max scaling. Below is the heatmap of the data after processing.

In [None]:
# This cell is tagged 'parameters'
# normalise and log are forwarded unchanged to the heatmap, clustering and
# PCA calls in the cells below.
# NOTE(review): papermill appears to inject overrides after the FIRST
# 'parameters'-tagged cell only; if so, the defaults assigned here would
# overwrite any injected values for these names — confirm this is intended.
normalise = 'minmax'
log = True

In [None]:
# Settings reused by every analysis call in the rest of the notebook.
dtype = CIC_COMPOUNDS
return_fig = True

# Build the analysis pipeline on top of the multi-omics container.
# NOTE(review): the meaning of the second positional argument (None) is not
# visible in this notebook — check AnalysisPipeline's signature.
ap = AnalysisPipeline(mo, None)

# Draw the heatmap of the processed data; the return value is bound to `_`
# so that only the figure appears in the cell output.
_ = ap.heatmap(
    dtype,
    normalise=normalise,
    log=log,
    return_fig=return_fig,
)

## 2. Clustering using k-means

K-means clustering is used to group samples that are similar to each other. The best K (number of clusters) is picked automatically using silhouette scores. This is reported below.

In [None]:
# Cluster the processed data. The call returns the cluster labels, the
# centroids and the silhouette scores, in that order.
cluster_result = ap.cluster(
    dtype,
    normalise=normalise,
    log=log,
    return_fig=return_fig,
)
cluster_labels, centroids, silhouette_scores = cluster_result

## 3. Principal Component Analysis (PCA)

The plot below shows the PCA projection of samples, coloured by their cluster labels. Marker shapes distinguish samples belonging to different groups.

In [None]:
# Fetch the design (sample metadata) table for this data type; the first
# element returned by get_dfs() is not needed here.
_, design_df = ap.multi_omics_data.get_dfs(dtype)

# PCA with five components: marker style follows the sample group and
# colour follows the cluster label assigned by the clustering step.
pc1, pc2 = ap.PCA(
    dtype,
    normalise=normalise,
    log=log,
    n_components=5,
    style=design_df['group'],
    hue=cluster_labels,
    return_fig=return_fig,
)

## 4. Case-vs-control Analysis

Here we perform case-vs-control analysis. T-tests are performed to compare the means of the case and control groups specified below, with correction for multiple testing using the Benjamini–Hochberg method.

In [None]:
# This cell is tagged 'parameters'
# case_group and control_group name the two groups compared in the
# differential analysis below; they are also used to build the result
# column names ('padj_<case>_vs_<control>', 'FC_<case>_vs_<control>').
# NOTE(review): papermill appears to inject overrides after the FIRST
# 'parameters'-tagged cell only; if so, the defaults assigned here would
# overwrite any injected values for these names — confirm this is intended.
case_group = 'disease'
control_group = 'control'

In [None]:
# Run the case-vs-control comparison with the t-test inference method and
# store the results inside the pipeline.
de_method = INFERENCE_T_TEST
ap.run_de(
    de_method,
    dtype,
    case_group,
    control_group,
)

# Retrieve the results table for this comparison and display it.
de_df = ap.get_de_results(
    dtype,
    case_group,
    control_group,
    de_method,
)
de_df

### a. Volcano Plot

The following shows a volcano plot of the test results.

In [None]:
# Names of the adjusted p-value and fold-change columns in the results table
# for the chosen case-vs-control comparison.
group_pair = (case_group, control_group)
p_value_colname = 'padj_%s_vs_%s' % group_pair
fc_colname = 'FC_%s_vs_%s' % group_pair

In [None]:
# This cell is tagged 'parameters'
# p_value_thresh, fc_iqr_thresh and top_n are passed to the volcano plot and
# to the sort-and-filter calls in the cells below.
# NOTE(review): papermill appears to inject overrides after the FIRST
# 'parameters'-tagged cell only; if so, the defaults assigned here would
# overwrite any injected values for these names — confirm this is intended.
p_value_thresh = 0.05
fc_iqr_thresh = 1.5
top_n = 10

In [None]:
# Volcano plot of the differential-analysis results, using the thresholds
# and top-N setting defined in the parameters cell.
ap.volcano(
    de_df,
    p_value_colname,
    p_value_thresh,
    fc_colname,
    fc_iqr_thresh=fc_iqr_thresh,
    top_n=top_n,
)

### b. Significantly-changing Analytes Ordered by Fold Changes (Descending)

The following is a list of significantly-changing analytes sorted by their fold changes in descending order.

In [None]:
# Filter the results with the p-value and fold-change thresholds and list
# the top_n analytes with fold changes sorted in DESCENDING order.
# The result was previously stored as `sorted_df_asc`, which contradicted
# the 'desc' sort order used here; the name now matches the ordering.
fc_sort_order = 'desc'
sorted_df_fc_desc = ap.de_sort_and_filter(
    de_df, p_value_colname, p_value_thresh, fc_colname,
    fc_sort_order=fc_sort_order, top_n=top_n, fc_iqr_thresh=fc_iqr_thresh,
)
sorted_df_fc_desc

### c. Significantly-changing Analytes Ordered by Fold Changes (Ascending)

The following is a list of significantly-changing analytes sorted by their fold changes in ascending order.

In [None]:
# Filter the results with the p-value and fold-change thresholds and list
# the top_n analytes with fold changes sorted in ASCENDING order.
# The result was previously stored as `sorted_df_desc`, which contradicted
# the 'asc' sort order used here; the name now matches the ordering.
fc_sort_order = 'asc'
sorted_df_fc_asc = ap.de_sort_and_filter(
    de_df, p_value_colname, p_value_thresh, fc_colname,
    fc_sort_order=fc_sort_order, top_n=top_n, fc_iqr_thresh=fc_iqr_thresh,
)
sorted_df_fc_asc