In [1]:
def plot_pc_explained_variance_ratio(pca, order, figsize=(3, 1.5)):
    """Plot the explained variance ratio of each PC as bars, in a given order.

    Parameters
    ----------
    pca : object
        Fitted decomposition exposing ``explained_variance_ratio_``.
    order : list of str
        PC labels (``'PC1'``, ``'PC2'``, ...) in the order the bars should
        be drawn, left to right.
    figsize : tuple, optional
        Figure size forwarded to matplotlib.

    Returns
    -------
    matplotlib.figure.Figure
        The newly created figure.
    """
    fig, ax = plt.subplots(figsize=figsize)

    ratios = pca.explained_variance_ratio_
    labels = ['PC%d' % i for i in range(1, len(ratios) + 1)]
    ordered = pd.Series(ratios, index=labels)[order]

    # Keyword arguments: positional x/y are rejected by newer seaborn releases.
    sns.barplot(x=ordered.index, y=ordered.values, ax=ax)

    # Flip the y-axis so the bars hang downwards from the top edge.
    ax.invert_yaxis()
    ax.set_ylabel('Explained variance ratio')
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=90)
    sns.despine(ax=ax, bottom=True)
    return fig

In [2]:
def _pc_covariate_mi(covariates, pcs, discrete):
    """Return a PCs-by-covariates DataFrame of mutual information estimates."""
    rows = [
        mutual_info_regression(covariates, pcs.iloc[:, j],
                               discrete_features=discrete,
                               random_state=RND_SEED)
        for j in range(pcs.shape[1])
    ]
    return pd.DataFrame(rows, columns=covariates.columns, index=pcs.columns)


def run(tissue_name, exprs, pData, meta_pData, npc=10, output=True,
        mi_threshold=0.15):
    """Run PCA on one tissue and relate the top PCs to sample covariates.

    Steps: PCA via ``run_pca``; mutual information (MI) between every PC and
    every continuous / discrete covariate; hierarchical clustering heatmap of
    the covariates that reach ``mi_threshold`` with at least one PC; and a
    bar plot of explained variance ratios ordered like the heatmap columns.

    Parameters
    ----------
    tissue_name : str
        Tissue name; spaces are replaced by underscores to build file labels.
    exprs : pandas.DataFrame
        Expression matrix. It is transposed before PCA, so it is presumably
        features-by-samples -- confirm against ``run_pca``.
    pData : pandas.DataFrame
        Per-sample covariates; its index must equal the index of the
        PC score table returned by ``run_pca``.
    meta_pData : pandas.DataFrame
        Covariate metadata, indexed by covariate name.
    npc : int, optional
        Number of principal components requested from ``run_pca``.
    output : bool, optional
        If True, each figure is written as a PDF under ``../plots/`` and
        closed; if False, figures are left open.
    mi_threshold : float, optional
        Covariates are kept for the heatmap unless all of their MI values
        are below this threshold.

    Returns
    -------
    tuple
        ``(pca, exprs_new, MI, cg, covariates)``: the fitted PCA object, the
        PC score table, the full PCs-by-covariates MI matrix, the seaborn
        cluster grid, and the ``meta_pData`` rows of the plotted covariates.
    """
    # Use tissue name as output label
    label = tissue_name.replace(' ', '_')

    # Run PCA
    pca, exprs_new = run_pca(exprs.T, pc=npc)
    assert all(exprs_new.index == pData.index), \
        "PC scores and pData are not aligned on the same samples"
    print("Total explained variance of top %d PCs: %.2f"
          % (npc, pca.explained_variance_ratio_.sum()))

    # Plot: distribution of explained variance ratios of top nPCs
    sns.set(font_scale=1.0, style='white')
    plot_explained_variance_ratio(pca, figsize=(3, 1.5))
    if output:
        plt.savefig("../plots/explained_variance_ratio_dist.%s.pdf" % label, bbox_inches='tight')
        plt.close()

    # MI for PCs vs continuous covariates (missing values replaced by the mean)
    X_cont = select_continuous_variables(pData, meta_pData)
    X_cont = X_cont.fillna(X_cont.mean())
    MI1 = _pc_covariate_mi(X_cont, exprs_new, discrete=False)

    # MI for PCs vs discrete covariates (missing values become a new category: 0)
    X_disc = select_discrete_variables(pData, meta_pData).rank().fillna(0)
    MI2 = _pc_covariate_mi(X_disc, exprs_new, discrete=True)
    print("%d continuous and %d discrete variables are qualified."
          % (MI1.shape[1], MI2.shape[1]))

    # Combined MI matrix
    MI = pd.concat([MI1, MI2], axis=1)

    # Subset the MI matrix: drop a variable only if all its MI values are
    # below mi_threshold (NaN never counts as "below", so it is kept).
    keep = (~(MI < mi_threshold).all(axis=0)).values
    to_show = list(MI.columns[keep])
    sub_MI = MI.loc[:, keep]
    print("%d variables have MI >= %g w/ at least one PC."
          % (len(to_show), mi_threshold))

    # Plot: hclust of MI matrix (figure height grows with the variable count)
    sns.set(font_scale=0.8, style='white')
    cg = sns.clustermap(sub_MI.T, linewidths=.1, linecolor='gray', method="average",
                        figsize=(3, 3 + (len(to_show) / 10.)))
    plt.setp(cg.ax_heatmap.xaxis.get_majorticklabels(), rotation=90)
    plt.setp(cg.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
    if output:
        plt.savefig("../plots/PCs_vs_covariates_hclust.%s.pdf" % label, bbox_inches='tight')
        plt.close()

    # Plot: Explained variance ratio for the PCs at the bottom of hclust,
    # in the column order chosen by the clustering.
    sns.set(font_scale=0.8, style='white')
    order = [t.get_text() for t in cg.ax_heatmap.xaxis.get_majorticklabels()]
    bottom_fig = plot_pc_explained_variance_ratio(pca, order)
    if output:
        bottom_fig.savefig("../plots/PCs_vs_covariates_hclust.%s.bottom.pdf" % label,
                           bbox_inches='tight')
        plt.close(bottom_fig)

    # Metadata: list of covariates at the right of hclust.
    # NOTE(review): tick labels are read in reversed order; whether that is
    # top-to-bottom depends on the seaborn version -- confirm.
    variables = [t.get_text() for t in cg.ax_heatmap.yaxis.get_majorticklabels()[::-1]]
    covariates = meta_pData.loc[variables]

    # All done!
    return pca, exprs_new, MI, cg, covariates

In [3]:
================================
Adipose Tissue
================================
Expression data: (21146, 577)
Phenotype data: (577, 209)
Data dimensions (samples-by-features): (577, 21146)
Variance explained by top 10 PCs: [ 0.15892685  0.14077131  0.06445746  0.0487673   0.038848    0.03683041
  0.0302921   0.02412104  0.02005551  0.01581508]
Total explained variance of top 10 PCs: 0.58
54 continuous and 148 discrete variables are qualified.
26 variables have MI >= 0.15 w/ at least one PC.

================================
Adrenal Gland
================================
Expression data: (21146, 145)
Phenotype data: (145, 209)
Data dimensions (samples-by-features): (145, 21146)
Variance explained by top 10 PCs: [ 0.15831179  0.09821024  0.0833971   0.05271907  0.03567548  0.03123482
  0.02979179  0.02467147  0.02407092  0.01935289]
Total explained variance of top 10 PCs: 0.56
54 continuous and 118 discrete variables are qualified.
31 variables have MI >= 0.15 w/ at least one PC.

================================
Blood
================================
Expression data: (21146, 511)
Phenotype data: (511, 209)
Data dimensions (samples-by-features): (511, 21146)
Variance explained by top 10 PCs: [ 0.60855129  0.10981379  0.07044027  0.0239757   0.01473755  0.01193065
  0.00888041  0.00702562  0.00596075  0.00513306]
Total explained variance of top 10 PCs: 0.87
54 continuous and 147 discrete variables are qualified.
50 variables have MI >= 0.15 w/ at least one PC.

================================
Blood Vessel
================================
Expression data: (21146, 689)
Phenotype data: (689, 209)
Data dimensions (samples-by-features): (689, 21146)
Variance explained by top 10 PCs: [ 0.20792625  0.10927092  0.07067095  0.04952433  0.03916346  0.03096583
  0.02909217  0.02356022  0.02283135  0.01851984]
Total explained variance of top 10 PCs: 0.60
54 continuous and 148 discrete variables are qualified.
32 variables have MI >= 0.15 w/ at least one PC.

================================
Brain
================================
Expression data: (21146, 1259)
Phenotype data: (1259, 209)
Data dimensions (samples-by-features): (1259, 21146)
Variance explained by top 10 PCs: [ 0.37116357  0.1668999   0.08150254  0.05850247  0.03341506  0.02605313
  0.02170531  0.01758905  0.01300181  0.01158625]
Total explained variance of top 10 PCs: 0.80
53 continuous and 113 discrete variables are qualified.
36 variables have MI >= 0.15 w/ at least one PC.

================================
Breast
================================
Expression data: (21146, 214)
Phenotype data: (214, 209)
Data dimensions (samples-by-features): (214, 21146)
Variance explained by top 10 PCs: [ 0.38297673  0.07291543  0.0494132   0.04169519  0.03175749  0.02791998
  0.0210118   0.01965641  0.01789709  0.01566845]
Total explained variance of top 10 PCs: 0.68
54 continuous and 116 discrete variables are qualified.
37 variables have MI >= 0.15 w/ at least one PC.

================================
Colon
================================
Expression data: (21146, 345)
Phenotype data: (345, 209)
Data dimensions (samples-by-features): (345, 21146)
Variance explained by top 10 PCs: [ 0.62321204  0.04983025  0.03778193  0.02722459  0.01897565  0.01414732
  0.01245682  0.01230728  0.00944069  0.00790672]
Total explained variance of top 10 PCs: 0.81
53 continuous and 132 discrete variables are qualified.
41 variables have MI >= 0.15 w/ at least one PC.

================================
Esophagus
================================
Expression data: (21146, 686)
Phenotype data: (686, 209)
Data dimensions (samples-by-features): (686, 21146)
Variance explained by top 10 PCs: [ 0.66603837  0.05357799  0.03011641  0.01731243  0.01327291  0.01273346
  0.00954508  0.00904935  0.00862757  0.00710514]
Total explained variance of top 10 PCs: 0.83
54 continuous and 135 discrete variables are qualified.
37 variables have MI >= 0.15 w/ at least one PC.

================================
Heart
================================
Expression data: (21146, 412)
Phenotype data: (412, 209)
Data dimensions (samples-by-features): (412, 21146)
Variance explained by top 10 PCs: [ 0.27898528  0.14364135  0.07700613  0.06603975  0.03068185  0.02690536
  0.02220405  0.01619941  0.0148801   0.01431865]
Total explained variance of top 10 PCs: 0.69
54 continuous and 128 discrete variables are qualified.
44 variables have MI >= 0.15 w/ at least one PC.

================================
Kidney
================================
Expression data: (21146, 32)
Phenotype data: (32, 209)
Data dimensions (samples-by-features): (32, 21146)
Variance explained by top 10 PCs: [ 0.30606432  0.149109    0.12831078  0.0630421   0.04705671  0.03953202
  0.0278944   0.02483211  0.020072    0.01670803]
Total explained variance of top 10 PCs: 0.82
53 continuous and 84 discrete variables are qualified.
74 variables have MI >= 0.15 w/ at least one PC.

================================
Liver
================================
Expression data: (21146, 119)
Phenotype data: (119, 209)
Data dimensions (samples-by-features): (119, 21146)
Variance explained by top 10 PCs: [ 0.23230774  0.12775748  0.07261617  0.0494887   0.03872814  0.03006265
  0.02703963  0.02047753  0.01944197  0.01728602]
Total explained variance of top 10 PCs: 0.64
53 continuous and 105 discrete variables are qualified.
36 variables have MI >= 0.15 w/ at least one PC.

================================
Lung
================================
Expression data: (21146, 320)
Phenotype data: (320, 209)
Data dimensions (samples-by-features): (320, 21146)
Variance explained by top 10 PCs: [ 0.18019859  0.10409909  0.07300221  0.06328673  0.0407999   0.03425103
  0.02891706  0.02545798  0.02283415  0.01753878]
Total explained variance of top 10 PCs: 0.59
55 continuous and 130 discrete variables are qualified.
34 variables have MI >= 0.15 w/ at least one PC.

================================
Muscle
================================
Expression data: (21146, 430)
Phenotype data: (430, 209)
Data dimensions (samples-by-features): (430, 21146)
Variance explained by top 10 PCs: [ 0.23038935  0.12191202  0.05834562  0.03688385  0.03532838  0.03164254
  0.02863597  0.02248285  0.02117688  0.01682046]
Total explained variance of top 10 PCs: 0.60
55 continuous and 145 discrete variables are qualified.
34 variables have MI >= 0.15 w/ at least one PC.

================================
Nerve
================================
Expression data: (21146, 304)
Phenotype data: (304, 209)
Data dimensions (samples-by-features): (304, 21146)
Variance explained by top 10 PCs: [ 0.15474089  0.09383491  0.05986417  0.05538895  0.04842558  0.03609104
  0.03103859  0.02939175  0.02207717  0.01816644]
Total explained variance of top 10 PCs: 0.55
53 continuous and 144 discrete variables are qualified.
29 variables have MI >= 0.15 w/ at least one PC.

================================
Ovary
================================
Expression data: (21146, 97)
Phenotype data: (97, 209)
Data dimensions (samples-by-features): (97, 21146)
Variance explained by top 10 PCs: [ 0.15828339  0.08988181  0.08217282  0.07461444  0.04596085  0.04007252
  0.03253675  0.0279226   0.02420507  0.0235101 ]
Total explained variance of top 10 PCs: 0.60
53 continuous and 104 discrete variables are qualified.
41 variables have MI >= 0.15 w/ at least one PC.

================================
Pancreas
================================
Expression data: (21146, 171)
Phenotype data: (171, 209)
Data dimensions (samples-by-features): (171, 21146)
Variance explained by top 10 PCs: [ 0.30863817  0.0733996   0.05888756  0.04580272  0.02907166  0.0250746
  0.02015778  0.01895487  0.01580202  0.01476973]
Total explained variance of top 10 PCs: 0.61
54 continuous and 122 discrete variables are qualified.
28 variables have MI >= 0.15 w/ at least one PC.

================================
Pituitary
================================
Expression data: (21146, 103)
Phenotype data: (103, 209)
Data dimensions (samples-by-features): (103, 21146)
Variance explained by top 10 PCs: [ 0.16331973  0.11262466  0.07750244  0.04849239  0.04484352  0.03737949
  0.03152417  0.02732353  0.02638934  0.021969  ]
Total explained variance of top 10 PCs: 0.59
53 continuous and 101 discrete variables are qualified.
21 variables have MI >= 0.15 w/ at least one PC.

================================
Prostate
================================
Expression data: (21146, 106)
Phenotype data: (106, 209)
Data dimensions (samples-by-features): (106, 21146)
Variance explained by top 10 PCs: [ 0.19571979  0.11206671  0.08279523  0.0588004   0.04684694  0.0316169
  0.02799182  0.02557162  0.02348231  0.02035524]
Total explained variance of top 10 PCs: 0.63
53 continuous and 103 discrete variables are qualified.
18 variables have MI >= 0.15 w/ at least one PC.

================================
Salivary Gland
================================
Expression data: (21146, 57)
Phenotype data: (57, 209)
Data dimensions (samples-by-features): (57, 21146)
Variance explained by top 10 PCs: [ 0.27693142  0.19750982  0.10815303  0.04901397  0.03663115  0.02913625
  0.0257539   0.02065363  0.01838468  0.01741151]
Total explained variance of top 10 PCs: 0.78
55 continuous and 89 discrete variables are qualified.
36 variables have MI >= 0.15 w/ at least one PC.

================================
Skin
================================
Expression data: (21146, 890)
Phenotype data: (890, 209)
Data dimensions (samples-by-features): (890, 21146)
Variance explained by top 10 PCs: [ 0.64137536  0.04183168  0.03153911  0.02644172  0.01892003  0.0170332
  0.01361835  0.01155517  0.01063514  0.00806067]
Total explained variance of top 10 PCs: 0.82
53 continuous and 150 discrete variables are qualified.
31 variables have MI >= 0.15 w/ at least one PC.

================================
Small Intestine
================================
Expression data: (21146, 88)
Phenotype data: (88, 209)
Data dimensions (samples-by-features): (88, 21146)
Variance explained by top 10 PCs: [ 0.40356365  0.21663036  0.05465075  0.03220703  0.02438987  0.0211678
  0.01599082  0.01452808  0.01285568  0.01051126]
Total explained variance of top 10 PCs: 0.81
57 continuous and 91 discrete variables are qualified.
29 variables have MI >= 0.15 w/ at least one PC.

================================
Spleen
================================
Expression data: (21146, 104)
Phenotype data: (104, 209)
Data dimensions (samples-by-features): (104, 21146)
Variance explained by top 10 PCs: [ 0.1320359   0.07642316  0.07235886  0.06173541  0.0527229   0.04324267
  0.03538127  0.03326936  0.02596979  0.02219679]
Total explained variance of top 10 PCs: 0.56
56 continuous and 108 discrete variables are qualified.
25 variables have MI >= 0.15 w/ at least one PC.

================================
Stomach
================================
Expression data: (21146, 192)
Phenotype data: (192, 209)
Data dimensions (samples-by-features): (192, 21146)
Variance explained by top 10 PCs: [ 0.52947631  0.06754673  0.04826835  0.03342268  0.02457803  0.01937811
  0.01640197  0.01283187  0.01245772  0.00989528]
Total explained variance of top 10 PCs: 0.77
53 continuous and 122 discrete variables are qualified.
33 variables have MI >= 0.15 w/ at least one PC.

================================
Testis
================================
Expression data: (21146, 172)
Phenotype data: (172, 209)
Data dimensions (samples-by-features): (172, 21146)
Variance explained by top 10 PCs: [ 0.35814179  0.0536551   0.04846404  0.03896193  0.03162606  0.02430365
  0.0211702   0.01729217  0.01457395  0.01391962]
Total explained variance of top 10 PCs: 0.62
54 continuous and 117 discrete variables are qualified.
20 variables have MI >= 0.15 w/ at least one PC.

================================
Thyroid
================================
Expression data: (21146, 323)
Phenotype data: (323, 209)
Data dimensions (samples-by-features): (323, 21146)
Variance explained by top 10 PCs: [ 0.12482604  0.07911093  0.07527505  0.05437503  0.0496161   0.03887301
  0.0361938   0.02763849  0.02291551  0.01909058]
Total explained variance of top 10 PCs: 0.53
54 continuous and 126 discrete variables are qualified.
32 variables have MI >= 0.15 w/ at least one PC.

================================
Uterus
================================
Expression data: (21146, 83)
Phenotype data: (83, 209)
Data dimensions (samples-by-features): (83, 21146)
Variance explained by top 10 PCs: [ 0.22071374  0.0971462   0.06501026  0.0571242   0.04590214  0.03559982
  0.03369837  0.02695024  0.02346204  0.02010126]
Total explained variance of top 10 PCs: 0.63
53 continuous and 105 discrete variables are qualified.
46 variables have MI >= 0.15 w/ at least one PC.

================================
Vagina
================================
Expression data: (21146, 96)
Phenotype data: (96, 209)
Data dimensions (samples-by-features): (96, 21146)
Variance explained by top 10 PCs: [ 0.48736293  0.07202512  0.0551418   0.03224417  0.02596113  0.01991697
  0.0187017   0.017764    0.0132256   0.01200228]
Total explained variance of top 10 PCs: 0.75
53 continuous and 99 discrete variables are qualified.
30 variables have MI >= 0.15 w/ at least one PC.

SyntaxError: invalid syntax (<ipython-input-3-9f4ead63382e>, line 1)