In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import os, sys

In [3]:
import pylab as plt
import matplotlib

import numpy as np
import pandas as pd

In [4]:
sys.path.append('..')

from pyMultiOmics.base import *
from pyMultiOmics.mapping import Mapper
from pyMultiOmics.common import set_log_level_info, set_log_level_debug
from pyMultiOmics.constants import *

2021-08-13 16:29:12.047 | INFO     | pyMultiOmics.reactome:get_neo4j_driver:24 - Created graph database driver for bolt://localhost:7687 (neo4j)


# Demonstration of pyMultiOmics base classes

### Load the processed aging data from [1]

[1] [Ahadi, Sara, et al. "Personal aging markers and ageotypes revealed by deep longitudinal profiling." Nature medicine 26.1 (2020): 83-90.](https://www.nature.com/articles/s41591-019-0719-5)

In [5]:
# Absolute location of the aging dataset. The relative part is resolved against
# the current working directory, so the displayed value is machine-specific.
DATA_FOLDER = os.path.join(os.path.abspath('test_data'), 'aging_data')
DATA_FOLDER

'C:\\Users\\joewa\\Work\\git\\pyMultiOmics\\notebooks\\test_data\\aging_data'

### Read the individual dataframes

In [6]:
def get_measurements(data_folder, filename, sep='\t', index_col='SampleID'):
    """Load one omics table and orient it as identifiers (rows) x samples (columns).

    The file is read with one row per sample, indexed by ``index_col``, and is
    then transposed so that each row is a measured entity and each column is a
    sample.

    Args:
        data_folder: Directory that contains the file.
        filename: Name of the delimited text file inside ``data_folder``.
        sep: Field delimiter. Defaults to a tab, which is what the original
            hard-coded call used.
        index_col: Name of the column holding the sample IDs. Defaults to
            ``'SampleID'``, which is what the original hard-coded call used.

    Returns:
        A ``pandas.DataFrame`` whose index is named ``IDENTIFIER_COL`` and whose
        columns axis is named ``SAMPLE_COL``.

    Errors raised by ``pandas.read_csv`` (missing file, unknown ``index_col``)
    propagate unchanged.
    """
    path = os.path.join(data_folder, filename)
    samples_by_feature = pd.read_csv(path, sep=sep, index_col=index_col)

    # NOTE(review): every non-index column becomes a row after the transpose.
    # The rendered `protein_data.data_df` in this notebook shows rows such as
    # SubjectID / CL1 / CL2 / CL3, which suggests the input files also carry
    # sample-annotation columns that end up among the measurements -- confirm
    # and drop them upstream if that is not intended.
    feature_by_sample = samples_by_feature.transpose()
    feature_by_sample.index.name = IDENTIFIER_COL
    feature_by_sample.columns.name = SAMPLE_COL
    return feature_by_sample

In [7]:
# The four measurement tables share one layout, so load them through the same
# helper in a single pass; the order of the file names matches the targets.
transcript_df, protein_df, compound_df, targeted_assays_df = (
    get_measurements(DATA_FOLDER, measurement_file)
    for measurement_file in (
        'transcriptomic.txt',
        'proteomics.txt',
        'metabolomics.txt',
        'targeted_assays.txt',
    )
)

# The clinical table is read as-is (no transpose) and is keyed by the `sample` column.
clinical_path = os.path.join(DATA_FOLDER, 'clinic.txt')
clinical_df = pd.read_csv(clinical_path, sep='\t', index_col='sample')

### Create single omics data container objects

Some data cleaning is done upon loading in `SingleOmicsData`:
- Duplicate values are removed from the rows and columns
- Duplicate sample names are removed
- Measurements with missing metadata are removed
- Metadata with missing measurements are removed too

In [8]:
# Bundle the transcript measurements with the clinical table in one container,
# labelled with the TRANSCRIPTOMICS constant.
transcript_data = SingleOmicsData(TRANSCRIPTOMICS, transcript_df, clinical_df)
# Bare name as the last expression: renders the container's one-line summary.
transcript_data



Transcriptomics data with (10346, 795) measurements

In [9]:
# Same construction for the protein measurements; the clinical table is shared
# across all containers.
protein_data = SingleOmicsData(PROTEOMICS, protein_df, clinical_df)
# Bare name as the last expression: renders the container's one-line summary.
protein_data



Proteomics data with (307, 901) measurements

In [10]:
# Same construction for the metabolite measurements, labelled with METABOLOMICS.
compound_data = SingleOmicsData(METABOLOMICS, compound_df, clinical_df)
# Bare name as the last expression: renders the container's one-line summary.
compound_data



Metabolomics data with (729, 889) measurements

In [11]:
# Unlike the three cells above, the label here is a plain string rather than a
# module constant; it shows up verbatim in the container's summary.
targeted_assay_data = SingleOmicsData('targeted_assay', targeted_assays_df, clinical_df)
# Bare name as the last expression: renders the container's one-line summary.
targeted_assay_data



targeted_assay data with (66, 921) measurements

### Getting values

You can get data out of the container by using the `data_df` and `design_df` attributes.

Notice that after the data is loaded and cleaned, the number of samples in the measurement dataframe (`data_df`) is the same as the number of rows in the sample metadata dataframe (`design_df`).

In [12]:
# Measurement matrix held by the container (identifiers as rows, samples as columns).
# NOTE(review): the tail of the rendered frame shows non-numeric rows
# (SubjectID, CL1, CL2, CL3), so sample annotations appear to be mixed in with
# the protein measurements -- confirm and filter them out before analysis.
protein_data.data_df

sample,ZOZOW1T-1013,ZOZOW1T-1015,ZOZOW1T-1021,ZOZOW1T-1022,ZOZOW1T-1023,ZOZOW1T-1025,ZOZOW1T-1042,ZOZOW1T-1043,ZOZOW1T-2001,ZOZOW1T-2002,...,ZJTKAE3-04,ZJTKAE3-06,ZJTKAE3-07,ZJTKAE3-10,ZJTKAE3-1011,ZJTKAE3-2012,ZJTKAE3-2013,ZJTKAE3-2015,ZJTKAE3-6021,ZJTKAE3-6031
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
IGLL5,2.889091,3.557581,3.007525,0.863756,0.824791,2.400602,2.199302,1.753843,2.309603,2.504444,...,1.946944,2.256987,2.21693,2.05761,0.501687,2.500616,2.392775,0.408994,6.597984,2.136551
MASP2,-3.48524,-5.187031,-3.538002,-3.620376,-3.374524,-3.472215,-3.682544,-2.585078,-1.681774,-2.925132,...,-3.771724,-3.78852,-3.505324,-3.594691,-5.597678,-3.522353,-3.323587,-5.489279,-3.821952,-3.320252
APOL1,2.132138,2.344245,2.075924,1.965186,1.733176,2.317851,2.306059,2.426814,2.002156,1.736642,...,2.014235,1.760715,2.056368,1.560633,2.039669,2.020096,1.259214,2.259605,1.000675,1.934733
CEP290,-4.976074,-3.159375,-5.796386,-6.757326,-4.988127,-5.464828,-6.994283,-6.979972,-4.257054,-7.527584,...,-4.94051,-4.754507,-4.353314,-4.547183,-4.349794,-4.83113,-5.601685,-4.252906,-4.090896,-4.82384
CD5L,-1.264688,-1.788122,-0.099187,1.682255,-1.224756,-0.404611,-0.883587,-0.57593,0.316327,-0.316505,...,-0.577816,-0.958755,-0.535393,-0.408866,-0.097291,-0.722484,-0.495903,-0.508294,-0.930555,-0.899162
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SubjectID,69-001,69-001,69-001,69-001,69-001,69-001,69-001,69-001,69-001,69-001,...,70-1015,70-1015,70-1015,70-1015,70-1015,70-1015,70-1015,70-1015,70-1015,70-1015
CL1,D7,D30,D1,D3,D15,D35,D1,D4,,,...,,,,,,D1,D4,D22,,
CL2,,,,,,,parainfluenza 3,,,,...,,,,,,flu?,flu?,flu?,,
CL3,Infection_Late,Infection_Recovery_Late,Infection_Early,Infection_Middle,Infection_Recovery_Early,Infection_Recovery_Late,Infection_Early,Infection_Middle,,,...,,,,,,Imz_Early,Imz_Middle,Imz_Recovery_Early,,


In [13]:
# Sample metadata held by the container (one row per sample).
# NOTE(review): some columns in the rendered frame (e.g. UALB, UALBCR) contain
# censored strings such as '<5' / '<30', so they are presumably not numeric
# dtype -- confirm before using them quantitatively.
protein_data.design_df

Unnamed: 0_level_0,A1C,AG,ALB,ALCRU,ALKP,ALT,AST,BASO,BASOAB,BUN,...,TGL,TP,UALB,UALBCR,WBC,SubjectID,CL1,CL2,CL3,CL4
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ZOZOW1T-1013,6.0,8,4.0,,96.0,48,22,0.6,0.04,19.0,...,43.0,6.3,,,6.0,69-001,D7,,Infection_Late,Infection
ZOZOW1T-1015,5.9,8,4.2,,103.0,77,120,0.9,0.04,21.0,...,75.0,6.5,,,5.0,69-001,D30,,Infection_Recovery_Late,Infection_L
ZOZOW1T-1021,6.3,,,173.5,,,,1.0,0.09,,...,46.0,,7,<30,8.9,69-001,D1,,Infection_Early,Infection
ZOZOW1T-1022,6.1,7,4.2,278.2,69.0,40,27,0.5,0.05,15.0,...,41.0,6.6,16,<30,10.8,69-001,D3,,Infection_Middle,Infection
ZOZOW1T-1023,6.3,13,4.2,412.8,66.0,53,31,0.6,0.04,19.0,...,57.0,6.7,18,<30,7.0,69-001,D15,,Infection_Recovery_Early,Infection_L
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZJTKAE3-2012,5.1,5,4.2,,83.0,26,20,1.1,0.05,14.0,...,96.0,7.5,,,4.9,70-1015,D1,flu?,Imz_Early,Imz
ZJTKAE3-2013,5.0,9,4.2,,73.0,28,24,0.6,0.04,18.0,...,85.0,7.7,,,5.8,70-1015,D4,flu?,Imz_Middle,Imz
ZJTKAE3-2015,5.0,6,4.1,,80.0,26,27,1.0,0.05,12.0,...,46.0,7.3,,,5.1,70-1015,D22,flu?,Imz_Recovery_Early,Imz_L
ZJTKAE3-6021,5.1,10,3.9,25.6,68.0,31,24,1.2,0.06,12.0,...,62.0,7.2,<5,<30,5.5,70-1015,,,,Healthy


### Create a multi-omics data container object

In [14]:
# Provenance of the dataset: citation text and landing page of the source paper.
publication = (
    'Ahadi, Sara, et al. '
    '"Personal aging markers and ageotypes revealed by deep longitudinal profiling." '
    'Nature medicine 26.1 (2020): 83-90.'
)
url = 'https://www.nature.com/articles/s41591-019-0719-5'

In [15]:
# Start from a container that carries only the provenance metadata ...
mo = MultiOmicsData(publication=publication, url=url)
# ... then register the four single-omics containers; the summary lists the
# views in this order.
mo.add_data([transcript_data, protein_data, compound_data, targeted_assay_data])
# Bare name as the last expression: renders the multi-omics summary.
mo

Multi-omics data container
- publication: Ahadi, Sara, et al. "Personal aging markers and ageotypes revealed by deep longitudinal profiling." Nature medicine 26.1 (2020): 83-90.
- URL: https://www.nature.com/articles/s41591-019-0719-5
- Views: 4 modalities
	 - Transcriptomics data with (10346, 795) measurements
	 - Proteomics data with (307, 901) measurements
	 - Metabolomics data with (729, 889) measurements
	 - targeted_assay data with (66, 921) measurements

### Run MOFA on the `mo` object

In [16]:
# Export all views as a single long-format table for MOFA.
# NOTE(review): in the rendered output the `feature` column holds sample IDs
# (e.g. ZOZOW1T-1013) and there is no separate sample column -- confirm that
# to_mofa() labels its columns the way the MOFA loader expects.
df = mo.to_mofa()
df

Unnamed: 0,feature,value,view
0,ZOZOW1T-1013,3.815272,Transcriptomics
1,ZOZOW1T-1013,2.346694,Transcriptomics
2,ZOZOW1T-1013,0.0,Transcriptomics
3,ZOZOW1T-1013,9.86533,Transcriptomics
4,ZOZOW1T-1013,0.0,Transcriptomics
...,...,...,...
60781,ZJTKAE3-6031,181.152491,targeted_assay
60782,ZJTKAE3-6031,10719.26959,targeted_assay
60783,ZJTKAE3-6031,805.705435,targeted_assay
60784,ZJTKAE3-6031,1358.829364,targeted_assay


TODO:
- Keqing: modify the methods in https://github.com/glasgowcompbio/pyMultiOmics/blob/keqingw/pyMultiOmics/mofax.py, so it can run MOFA on the `mo` object above.
- Joe: modify other code in pyMultiOmics to use this object too.