# Association between covariates and PCs of gene expression data

Same pipeline as `exp/PCA/ipynb/02_PCs_vs_covariates.ipynb` but using YARN-normalized GTEx RNA-Seq data.

Most of the code blocks are identical to those in `exp/PCA/ipynb/02_PCs_vs_covariates.ipynb` except for "Load expression data".

In [3]:
import numpy as np
import pandas as pd
from omics.stats.PCA import run_pca
from omics.stats.MI import normalized_MI_matrix, MI

%reload_ext version_information
%version_information numpy, pandas, sklearn, omics

Software,Version
Python,2.7.12 64bit [GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]
IPython,5.1.0
OS,Linux 2.6.32 431.3.1.el6.x86_64 x86_64 with centos 6.8 Final
numpy,1.11.2
pandas,0.19.1
sklearn,0.18.1
omics,0.1.4
Wed Jan 04 10:50:03 2017 EST,Wed Jan 04 10:50:03 2017 EST


# Setup

In [113]:
# Input files from dbGaP
# NOTE(review): absolute, site-specific path; the notebook only runs where this
# filesystem is mounted.
EXPRESSION = "/ifs/labs/cccb/projects/gtex/GTexV4/joey/src/hdf5/gtex_sub.hdf5"

# Data to use in this study
PDATA       = "../../PCA/data/pData.pickle"
GENE_LIST   = "../../PCA/data/GTEx.genes_to_use.tsv"
SAMPLE_LIST = "../../PCA/data/GTEx.samples_to_use.tsv"

# Output files (run on all genes of the YARN-normalized matrix)
TISSUE_nMI = "../reports/Tissue-specific_MI_between_PCs_and_covariates.xlsx"  # based on YARN 30333 genes
TISSUE_PCA = "../reports/Tissue-specific_PCA_explained_variance_ratios.tsv"   # based on YARN 30333 genes
MAPPED_PCA = "../data/samples_x_mapped_PCs.tsv"  # based on YARN 30333 genes

# Output files (run restricted to the genes listed in GENE_LIST)
TISSUE_nMI2 = "../reports/Tissue-specific_MI_between_PCs_and_covariates.n21088.xlsx"  # YARN subset of 21088 genes in GENE_LIST
TISSUE_PCA2 = "../reports/Tissue-specific_PCA_explained_variance_ratios.n21088.tsv"   # YARN subset of 21088 genes in GENE_LIST
MAPPED_PCA2 = "../data/samples_x_mapped_PCs.n21088.tsv"  # YARN subset of 21088 genes in GENE_LIST

# Others
RND_SEED = 161126  # for calculating MI (passed as `seed` to normalized_MI_matrix)

# Load input data

## Load expression data

In [46]:
# Read the list of samples to analyse: the first column is used as the index
# and squeeze=True asks pandas to return a Series when only one data column remains.
samples = pd.read_table(SAMPLE_LIST, index_col=0, squeeze=True)  # a series (SAMPID -> SMTS)
print len(samples)

8525


In [50]:
# Open the HDF5 store read-only (it is an input file) and always release the
# file handle, even when reading fails; otherwise the store stays open for the
# whole life of the kernel.
hdf = pd.HDFStore(EXPRESSION, mode='r')
try:
    # Keep only the columns (samples) listed in `samples`.
    exprs = hdf['exprs'][samples.index]
finally:
    hdf.close()
exprs.shape

(30333, 8525)

In [51]:
# Test: spot-check which sample IDs are (not) among the column labels of `exprs`
must_be_absent = (
    'GTEX-1117F-0226-SM-5GZZ7',        # not in data freeze due to Poor D-statistic
    'GTEX-YEC3-1426-101806-SM-5PNXX',  # sample for DNA (SMNOTE), no sample-associated variables
    'GTEX-YF7O-2326-101833-SM-5CVN9',  # sample for DNA (SMNOTE), no sample-associated variables
)
must_be_present = (
    'GTEX-111CU-1826-SM-5GZYN',
)

for sample_id in must_be_absent:
    assert sample_id not in exprs
for sample_id in must_be_present:
    assert sample_id in exprs

## Load phenotype data

In [116]:
# Load the phenotype table from a pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load PDATA from a
# trusted source.
pData = pd.read_pickle(PDATA)
print 'Now we have', pData.shape[1], 'variables:'
print pData.dtypes.value_counts()  # number of columns per dtype

Now we have 207 variables:
category    151
float64      56
dtype: int64


In [72]:
def impute_pData(pData):
    """Fill missing values of numeric columns with the column mean.

    Only columns for which `mean(numeric_only=True)` yields a value are
    filled; missing values in all other columns are left untouched.
    A new dataframe is returned; the input is not modified.
    """
    column_means = pData.mean(numeric_only=True)
    return pData.fillna(value=column_means)

def clean_pData(pData, cutoff=0.1):
    n, k = pData.shape  # n samples x k variables
    
    is_enough = lambda x: len(x.dropna()) > n * (1-cutoff)  # %missing values < cutoff
    is_stateful = lambda x: n > len(set(x.dropna())) > 1
    is_informative = lambda x: MI(x, x) > cutoff  # information content (entropy) > cutoff
    is_variant = lambda x: np.std(x) > 0
    
    pass1 = [i for i,x in pData.select_dtypes(['category']).iteritems() if is_enough(x) and is_stateful(x) and is_informative(x)]
    pass2 = [i for i,x in pData.select_dtypes([np.number]).iteritems() if is_enough(x) and is_variant(x)]
    print k, 'variables ->', len(pass1), 'categorical variables and', len(pass2), 'numeric variables.'
        
    return pData[sorted(pass1 + pass2)]

# Tissue-specific covariates

In [107]:
def pc_vs_covariates_MI(exprs, pData, npc=10):
    # Run PCA
    pca, exprs_new = run_pca(exprs.T, pc=npc)
    print "Total explained variance of top %d PCs: %.2f" % (npc, pca.explained_variance_ratio_.sum())
    
    # Joined feature dataframe (PCs + variables)
    assert all(exprs_new.index == pData.index)  # check data integrity
    df = pd.concat([exprs_new, pData], axis=1)
    
    # Compute normalized MI matrix
    mat, nmat = normalized_MI_matrix(df, seed=RND_SEED, verbose=True)  # MI, nMI

    return pca, exprs_new, mat, nmat


def run_all_tissues(exprs, samples, pData, TISSUE_nMI, TISSUE_PCA, MAPPED_PCA):
    xlsx = pd.ExcelWriter(TISSUE_nMI)  # output: tissue-specific MI matrix as a multi-sheet excel file
    pca_var = dict()  # explained variance ratio of top n PCs in each tissue
    pc_dfs = []  # mapped PC dataframes

    for tissue in samples.astype('category').cat.categories:
        sub_samples = samples[samples == tissue].index
        sub_exprs = exprs[sub_samples]
        sub_pData = pData.loc[sub_samples]
        sub_pData2 = impute_pData(clean_pData(sub_pData))

        print '================================'
        print tissue
        print '================================'
        print 'Expression data:', sub_exprs.shape
        print 'Phenotype data:', sub_pData.shape
        print 'Imputed pData:', sub_pData2.shape

        pca, exprs_new, mat, mat2 = pc_vs_covariates_MI(sub_exprs, sub_pData2)

        pca_var[tissue] = pca.explained_variance_ratio_
        pc_dfs.append(exprs_new)
        mat2.to_excel(xlsx, tissue)
        print

    xlsx.save()
    xlsx.close()
    pd.DataFrame(pca_var).to_csv(TISSUE_PCA2, sep="\t")
    pd.concat(pc_dfs).to_csv(MAPPED_PCA, sep="\t")

## Using YARN expression (n=30333)

In [75]:
# Per-tissue run on the current `exprs`, passing the output paths reserved for
# the full YARN gene set.
run_all_tissues(exprs, samples, pData, TISSUE_nMI, TISSUE_PCA, MAPPED_PCA)

207 variables -> 98 categorical variables and 53 numeric variables.
Adipose Tissue
Expression data: (30333, 577)
Phenotype data: (577, 207)
Imputed pData: (577, 151)
Data dimensions (samples-by-features): (577, 30333)
Variance explained by top 10 PCs: [ 0.12976429  0.08151015  0.03790651  0.03354869  0.02493951  0.02312883
  0.01815523  0.01712829  0.01472719  0.0129264 ]
Total explained variance of top 10 PCs: 0.39
Input dataframe: 577 samples x 161 features
98 features are categorical
Computing pair-wise MI ...
Normalize MI ...

207 variables -> 88 categorical variables and 55 numeric variables.
Adrenal Gland
Expression data: (30333, 145)
Phenotype data: (145, 207)
Imputed pData: (145, 143)
Data dimensions (samples-by-features): (145, 30333)
Variance explained by top 10 PCs: [ 0.13044141  0.07513339  0.04586832  0.02815608  0.02352348  0.02217572
  0.01827786  0.01689078  0.01622014  0.01464933]
Total explained variance of top 10 PCs: 0.39
Input dataframe: 145 samples x 153 features


## Intersection with GTEx-portal RPKM expression (n=21088)

In [111]:
genes = pd.read_table(GENE_LIST, index_col=0, squeeze=True)  # a series (ENSG ID -> Description)
print len(genes), 'genes'  # 21146 genes
exprs = exprs.loc[[i.split('.')[0] for i in genes.index]]  # remove version number in the Ensembl gene ID
exprs = exprs.dropna()  # some samples are not in YARN-normalized expression profiles
print exprs.shape

(21088, 8525)


In [90]:
# Per-tissue run on the gene-subset `exprs`, passing the ".n21088" output paths.
run_all_tissues(exprs, samples, pData, TISSUE_nMI2, TISSUE_PCA2, MAPPED_PCA2)

207 variables -> 98 categorical variables and 53 numeric variables.
Adipose Tissue
Expression data: (21088, 577)
Phenotype data: (577, 207)
Imputed pData: (577, 151)
Data dimensions (samples-by-features): (577, 21088)
Variance explained by top 10 PCs: [ 0.15055937  0.09318292  0.04387981  0.0391703   0.02665359  0.02515286
  0.02033328  0.01835177  0.01706872  0.0159765 ]
Total explained variance of top 10 PCs: 0.45
Input dataframe: 577 samples x 161 features
98 features are categorical
Computing pair-wise MI ...
Normalize MI ...

207 variables -> 88 categorical variables and 55 numeric variables.
Adrenal Gland
Expression data: (21088, 145)
Phenotype data: (145, 207)
Imputed pData: (145, 143)
Data dimensions (samples-by-features): (145, 21088)
Variance explained by top 10 PCs: [ 0.15495104  0.08179164  0.05225373  0.03210874  0.02803493  0.02530738
  0.02036338  0.01838142  0.01737991  0.01637349]
Total explained variance of top 10 PCs: 0.45
Input dataframe: 145 samples x 153 features
