In [1]:
# Enable autoreload so that edits to imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import os, sys

In [3]:
import pylab as plt
import matplotlib

import numpy as np
import pandas as pd

In [4]:
# Inspect the working directory and the import search path.
# NOTE(review): the os.getcwd() result is discarded here — it is not the last
# expression of the cell, so only the printed sys.path appears in the output.
os.getcwd()
print(sys.path)

['/Users/wangkeqing/pyMultiOmics/notebooks', '/Library/Frameworks/Python.framework/Versions/3.8/lib/python38.zip', '/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8', '/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/lib-dynload', '', '/Users/wangkeqing/.local/share/virtualenvs/pyMultiOmics-hFnbBj-T/lib/python3.8/site-packages', '/Users/wangkeqing/.local/share/virtualenvs/pyMultiOmics-hFnbBj-T/lib/python3.8/site-packages/IPython/extensions', '/Users/wangkeqing/.ipython']


In [5]:
# Add the parent directory to the import path so the local pyMultiOmics package can be imported
# (assumes the notebook is run from a direct subdirectory of the repository root — TODO confirm).
sys.path.append('..')

# NOTE(review): the star imports presumably provide SingleOmicsData, MultiOmicsData and the
# constants used below (GENES, PROTEINS, COMPOUNDS, IDENTIFIER_COL, SAMPLE_COL) — confirm,
# and consider importing those names explicitly.
from pyMultiOmics.base import *
from pyMultiOmics.mapping import Mapper
from pyMultiOmics.common import set_log_level_info, set_log_level_debug
from pyMultiOmics.constants import *

2021-08-24 23:26:27.301 | INFO     | pyMultiOmics.reactome:get_neo4j_driver:24 - Created graph database driver for bolt://localhost:7687 (neo4j)


# Demonstration of pyMultiOmics base classes

### Load the processed aging data from [1]

[1] [Ahadi, Sara, et al. "Personal aging markers and ageotypes revealed by deep longitudinal profiling." Nature medicine 26.1 (2020): 83-90.](https://www.nature.com/articles/s41591-019-0719-5)

In [6]:
# Absolute path of the aging dataset directory, resolved relative to the current working directory.
DATA_FOLDER = os.path.abspath(os.path.join('test_data', 'aging_data'))
DATA_FOLDER

'/Users/wangkeqing/pyMultiOmics/notebooks/test_data/aging_data'

### Read the individual dataframes

In [7]:
def get_measurements(data_folder: str, filename: str) -> pd.DataFrame:
    """Load one tab-separated measurement file and transpose it.

    The file is read with its ``SampleID`` column as the index, the known
    non-measurement columns are removed, and the table is transposed so that
    the file's remaining columns become rows and its sample IDs become columns.

    Args:
        data_folder: Directory that contains the file.
        filename: Name of the tab-separated file inside ``data_folder``.

    Returns:
        The transposed ``pandas.DataFrame``; its index is named
        ``IDENTIFIER_COL`` and its columns are named ``SAMPLE_COL``.
    """
    metadata_columns = ['SubjectID', 'CL1', 'CL2', 'CL3', 'CL4']

    table = pd.read_csv(os.path.join(data_folder, filename), sep='\t', index_col='SampleID')

    # errors='ignore' drops whichever of these columns are present. Catching
    # KeyError around a plain drop() is all-or-nothing: one missing column
    # would leave every other metadata column in the table as a bogus row.
    table = table.drop(columns=metadata_columns, errors='ignore')

    table = table.transpose()
    table.index.name = IDENTIFIER_COL
    table.columns.name = SAMPLE_COL
    return table

In [8]:
# Read the four measurement tables in one pass; each file is loaded with
# get_measurements and bound to its own name, in the order listed.
(transcript_df,
 protein_df,
 compound_df,
 targeted_assays_df) = [
    get_measurements(DATA_FOLDER, measurement_file)
    for measurement_file in (
        'transcriptomic.txt',
        'proteomics.txt',
        'metabolomics.txt',
        'targeted_assays.txt',
    )
]

# The clinical table is read as-is (not transposed), indexed by its 'sample' column.
clinical_df = pd.read_csv(
    os.path.join(DATA_FOLDER, 'clinic.txt'),
    sep='\t',
    index_col='sample',
)

### Create single omics data container objects

Some data cleaning is done upon loading in `SingleOmicsData`:
- Duplicate values are removed from the rows and columns
- Duplicate sample names are removed
- Measurements with missing metadata are removed
- Metadata with missing measurements are removed too

In [9]:
# Build the container for the transcript measurements, passing the clinical table alongside them.
transcript_data = SingleOmicsData(GENES, transcript_df, clinical_df)
transcript_data



genes data with (10346, 795) measurements

In [10]:
# Build the container for the protein measurements, passing the same clinical table.
protein_data = SingleOmicsData(PROTEINS, protein_df, clinical_df)
protein_data



proteins data with (302, 901) measurements

In [11]:
# Build the container for the metabolomics (compound) measurements, passing the same clinical table.
compound_data = SingleOmicsData(COMPOUNDS, compound_df, clinical_df)
compound_data



compounds data with (724, 889) measurements

In [12]:
# Build the container for the targeted assay measurements.
# Unlike the cells above, the data type is given as a plain string rather than a named constant.
targeted_assay_data = SingleOmicsData('targeted_assay', targeted_assays_df, clinical_df)
targeted_assay_data



targeted_assay data with (66, 921) measurements

### Getting values

You can get data out of the container by using the `data_df` and `design_df` attributes.

Notice that after the data is loaded and cleaned, the number of samples in the measurement dataframe (`data_df`) is the same as the number of rows in the sample metadata dataframe (`design_df`).

In [13]:
# Display the measurement table held by the protein container.
protein_data.data_df

sample,ZOZOW1T-1013,ZOZOW1T-1015,ZOZOW1T-1021,ZOZOW1T-1022,ZOZOW1T-1023,ZOZOW1T-1025,ZOZOW1T-1042,ZOZOW1T-1043,ZOZOW1T-2001,ZOZOW1T-2002,...,ZJTKAE3-04,ZJTKAE3-06,ZJTKAE3-07,ZJTKAE3-10,ZJTKAE3-1011,ZJTKAE3-2012,ZJTKAE3-2013,ZJTKAE3-2015,ZJTKAE3-6021,ZJTKAE3-6031
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
IGLL5,2.889091,3.557581,3.007525,0.863756,0.824791,2.400602,2.199302,1.753843,2.309603,2.504444,...,1.946944,2.256987,2.216930,2.057610,0.501687,2.500616,2.392775,0.408994,6.597984,2.136551
MASP2,-3.485240,-5.187031,-3.538002,-3.620376,-3.374524,-3.472215,-3.682544,-2.585078,-1.681774,-2.925132,...,-3.771724,-3.788520,-3.505324,-3.594691,-5.597678,-3.522353,-3.323587,-5.489279,-3.821952,-3.320252
APOL1,2.132138,2.344245,2.075924,1.965186,1.733176,2.317851,2.306059,2.426814,2.002156,1.736642,...,2.014235,1.760715,2.056368,1.560633,2.039669,2.020096,1.259214,2.259605,1.000675,1.934733
CEP290,-4.976074,-3.159375,-5.796386,-6.757326,-4.988127,-5.464828,-6.994283,-6.979972,-4.257054,-7.527584,...,-4.940510,-4.754507,-4.353314,-4.547183,-4.349794,-4.831130,-5.601685,-4.252906,-4.090896,-4.823840
CD5L,-1.264688,-1.788122,-0.099187,1.682255,-1.224756,-0.404611,-0.883587,-0.575930,0.316327,-0.316505,...,-0.577816,-0.958755,-0.535393,-0.408866,-0.097291,-0.722484,-0.495903,-0.508294,-0.930555,-0.899162
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AFG3L2,-4.092226,-2.354904,-4.657650,-4.457742,-4.352034,-4.439872,-4.843580,-4.500824,-4.983054,-3.499346,...,-3.225100,-4.213418,-3.465116,-3.811641,-4.036045,-3.431323,-5.292956,-3.861400,-3.963952,-3.241324
LYVE1,-3.221125,-4.209470,-2.812472,-3.100214,-3.225586,-3.158289,-3.871134,-3.532022,-2.661305,-4.178078,...,-4.134120,-3.953778,-3.154791,-3.175768,-3.089301,-3.600972,-3.795883,-3.036173,-5.263563,-3.819436
FCGBP,-1.842757,-3.957724,-1.663895,-1.893479,-1.946335,-1.297884,-1.544130,-1.890414,-1.405691,-0.909679,...,-3.798137,-3.516523,-2.155853,-2.270864,-1.416367,-2.425012,-3.887947,-1.470154,-2.481273,-2.353275
ZNF10,-4.067586,-5.360170,-1.682703,-4.537362,-4.634648,-4.164654,-4.263330,-4.330606,-3.239887,-4.721896,...,-7.047778,-4.841547,-4.631941,-3.898807,-5.269417,-5.150124,-3.760686,-5.475474,-3.685620,-6.722829


In [14]:
# Display the sample metadata table held by the protein container.
protein_data.design_df

Unnamed: 0_level_0,A1C,AG,ALB,ALCRU,ALKP,ALT,AST,BASO,BASOAB,BUN,...,TGL,TP,UALB,UALBCR,WBC,SubjectID,CL1,CL2,CL3,CL4
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ZOZOW1T-1013,6.0,8,4.0,,96.0,48,22,0.6,0.04,19.0,...,43.0,6.3,,,6.0,69-001,D7,,Infection_Late,Infection
ZOZOW1T-1015,5.9,8,4.2,,103.0,77,120,0.9,0.04,21.0,...,75.0,6.5,,,5.0,69-001,D30,,Infection_Recovery_Late,Infection_L
ZOZOW1T-1021,6.3,,,173.5,,,,1.0,0.09,,...,46.0,,7,<30,8.9,69-001,D1,,Infection_Early,Infection
ZOZOW1T-1022,6.1,7,4.2,278.2,69.0,40,27,0.5,0.05,15.0,...,41.0,6.6,16,<30,10.8,69-001,D3,,Infection_Middle,Infection
ZOZOW1T-1023,6.3,13,4.2,412.8,66.0,53,31,0.6,0.04,19.0,...,57.0,6.7,18,<30,7.0,69-001,D15,,Infection_Recovery_Early,Infection_L
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZJTKAE3-2012,5.1,5,4.2,,83.0,26,20,1.1,0.05,14.0,...,96.0,7.5,,,4.9,70-1015,D1,flu?,Imz_Early,Imz
ZJTKAE3-2013,5.0,9,4.2,,73.0,28,24,0.6,0.04,18.0,...,85.0,7.7,,,5.8,70-1015,D4,flu?,Imz_Middle,Imz
ZJTKAE3-2015,5.0,6,4.1,,80.0,26,27,1.0,0.05,12.0,...,46.0,7.3,,,5.1,70-1015,D22,flu?,Imz_Recovery_Early,Imz_L
ZJTKAE3-6021,5.1,10,3.9,25.6,68.0,31,24,1.2,0.06,12.0,...,62.0,7.2,<5,<30,5.5,70-1015,,,,Healthy


### Create a multi-omics data container object

In [15]:
# Citation and URL of the study the data come from.
publication = 'Ahadi, Sara, et al. "Personal aging markers and ageotypes revealed by deep longitudinal profiling." Nature medicine 26.1 (2020): 83-90.'
url = 'https://www.nature.com/articles/s41591-019-0719-5'

In [16]:
# Create the multi-omics container with the publication details,
# then add the four single-omics containers to it.
mo = MultiOmicsData(publication=publication, url=url)
mo.add_data([transcript_data, protein_data, compound_data, targeted_assay_data])
mo

Multi-omics data container
- publication: Ahadi, Sara, et al. "Personal aging markers and ageotypes revealed by deep longitudinal profiling." Nature medicine 26.1 (2020): 83-90.
- URL: https://www.nature.com/articles/s41591-019-0719-5
- Views: 4 modalities
	 - genes data with (10346, 795) measurements
	 - proteins data with (302, 901) measurements
	 - compounds data with (724, 889) measurements
	 - targeted_assay data with (66, 921) measurements

### Run MOFA on the `mo` object

In [23]:
# Show the containers held by `mo`, keyed by data type.
# NOTE(review): the recorded output reports 796 columns for genes, versus 795 when the
# container was created, and the execution counts around this cell are out of order —
# looks like the genes table was modified in place between runs; confirm with a fresh
# Restart & Run All.
mo.views

{'genes': genes data with (10346, 796) measurements,
 'proteins': proteins data with (302, 901) measurements,
 'compounds': compounds data with (724, 889) measurements,
 'targeted_assay': targeted_assay data with (66, 921) measurements}

In [20]:
# Bind `df` explicitly so this cell runs on a fresh kernel; no earlier cell defines it.
# NOTE(review): the recorded output (gene identifiers as rows) suggests `df` was the
# transcript measurement table — confirm this is the intended source.
# A copy is taken so that later changes to `df` cannot alter the container's own table.
df = transcript_data.data_df.copy()
df

sample,ZOZOW1T-1013,ZOZOW1T-1015,ZOZOW1T-1021,ZOZOW1T-1022,ZOZOW1T-1023,ZOZOW1T-1025,ZOZOW1T-1042,ZOZOW1T-1043,ZOZOW1T-2001,ZOZOW1T-2002,...,ZJTKAE3-06,ZJTKAE3-07,ZJTKAE3-10,ZJTKAE3-1011,ZJTKAE3-2012,ZJTKAE3-2013,ZJTKAE3-2014,ZJTKAE3-2015,ZJTKAE3-6021,ZJTKAE3-6031
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,3.815272,4.137146,3.955534,2.881936,2.995792,3.187035,3.041129,3.219981,3.274713,3.369378,...,4.476458,4.915436,5.249926,3.844098,4.296846,4.089736,4.729858,2.953600,4.069730,4.827457
A1BG-AS1,2.346694,1.507901,1.692865,2.269881,2.498541,2.493848,1.382372,0.000000,0.000000,2.430992,...,4.996630,5.137870,5.282991,1.847195,0.716583,0.849071,2.443865,1.505909,4.069730,2.217970
A1CF,0.000000,1.507901,0.000000,0.711177,1.112889,1.729615,0.000000,1.129777,0.000000,1.905120,...,3.126202,2.814934,1.466973,0.800981,0.000000,2.538052,2.114611,3.514358,3.769122,1.498860
A2M,9.865330,10.531241,9.138813,9.958461,8.763937,9.519908,8.877236,8.574555,7.724120,7.589681,...,5.053715,4.593434,4.683796,8.918436,7.758228,8.116149,10.617362,9.634210,12.231276,9.873193
A2M-AS1,0.000000,0.000000,0.000000,0.000000,0.000000,1.109826,0.000000,0.000000,0.000000,0.000000,...,7.497880,7.674131,7.160227,0.455379,0.000000,0.849071,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZXDB,0.000000,0.000000,0.000000,0.711177,1.112889,1.109826,0.850786,0.000000,0.000000,0.000000,...,1.552295,1.004426,0.000000,0.800981,0.000000,1.379998,0.000000,0.690055,0.000000,0.000000
ZXDC,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.717454,0.000000,0.912425,0.800981,0.716583,0.000000,0.000000,0.000000,0.000000,0.000000
ZYX,2.093986,2.707498,0.000000,2.608126,2.498541,3.187035,2.541242,3.219981,0.000000,2.102484,...,2.462038,0.000000,2.179137,2.345558,2.076022,3.513165,1.077410,2.704896,2.056051,2.217970
ZZEF1,0.000000,0.000000,0.000000,0.711177,0.000000,0.000000,0.850786,0.000000,0.000000,0.631412,...,0.717454,2.329004,0.000000,0.000000,1.193186,0.849071,0.000000,1.154901,0.000000,0.000000


In [21]:
# Reshape the wide table into long format: one row per (row label, column label) pair.
# `assign` returns a new frame, so the table that `df` currently refers to is left
# untouched; assigning the column directly would add it to that table in place.
# NOTE(review): column naming is kept as in the recorded output, where 'sample' holds the
# row labels and 'feature' holds the former column labels — confirm this orientation is intended.
df = df.assign(sample=df.index).melt(id_vars='sample', var_name='feature')

In [22]:
# Display the long-format table.
df

Unnamed: 0,sample,feature,value
0,A1BG,ZOZOW1T-1013,3.815272
1,A1BG-AS1,ZOZOW1T-1013,2.346694
2,A1CF,ZOZOW1T-1013,0.000000
3,A2M,ZOZOW1T-1013,9.865330
4,A2M-AS1,ZOZOW1T-1013,0.000000
...,...,...,...
8225065,ZXDB,ZJTKAE3-6031,0.000000
8225066,ZXDC,ZJTKAE3-6031,0.000000
8225067,ZYX,ZJTKAE3-6031,2.217970
8225068,ZZEF1,ZJTKAE3-6031,0.000000


In [24]:
# Export the contents of `mo` as a single long table; per the recorded output it has
# sample / feature / value / view columns (presumably the input layout expected by MOFA — confirm).
# NOTE(review): this rebinds `df`, replacing whatever it referred to before.
df = mo.to_mofa()
df

Unnamed: 0,sample,feature,value,view
0,A1BG,ZOZOW1T-1013,3.815272,genes
1,A1BG-AS1,ZOZOW1T-1013,2.346694,genes
2,A1CF,ZOZOW1T-1013,0.000000,genes
3,A2M,ZOZOW1T-1013,9.865330,genes
4,A2M-AS1,ZOZOW1T-1013,0.000000,genes
...,...,...,...,...
60781,ENA78,ZJTKAE3-6031,181.152491,targeted_assay
60782,CHEX1,ZJTKAE3-6031,10719.269590,targeted_assay
60783,CHEX2,ZJTKAE3-6031,805.705435,targeted_assay
60784,CHEX3,ZJTKAE3-6031,1358.829364,targeted_assay


TODO:
- Keqing: modify the methods in https://github.com/glasgowcompbio/pyMultiOmics/blob/keqingw/pyMultiOmics/mofax.py so that they can run MOFA on the `mo` object above.
- Joe: modify the other code in pyMultiOmics to use this object too.