In [None]:
%pylab inline
# this magic does some imports for us, including
#   import numpy as np
#   import matplotlib.pyplot as plt
# (it also star-imports the pylab and numpy namespaces into the notebook,
#  so names such as `sum` or `all` may refer to the numpy versions)
#
# and `inline` tells the notebook to show us plots as we make them (as opposed to saving them out to files)

In [None]:
import seaborn as sns
import pandas as pd

In [None]:
# Load the table: tab-separated, gzip-compressed; the first two lines of the
# file are skipped before the header row is read.
# (presumably those two lines are the .gct version/dimension preamble - confirm)
x = pd.read_csv(
    'GTEx_Analysis_v6p_RNA-seq_RNA-SeQCv1.1.8_gene_median_rpkm.gct.gz',
    sep='\t',
    skiprows=2,
    compression='gzip',
)

In [None]:
# show the table as loaded
x

In [None]:
# (number of rows, number of columns)
x.shape

In [None]:
# preview the first few rows
x.head()

In [None]:
# Use the 'Description' column as the row index so rows can be looked up by label
# (e.g. x.loc['TP53']).
# Guarded: once 'Description' is the index it is no longer a column, so re-running
# an unguarded set_index would raise a KeyError. With the guard a re-run is a no-op.
if 'Description' in x.columns:
    x = x.set_index('Description')

In [None]:
# same preview, now with 'Description' as the row index
x.head()

In [None]:
# look up the row(s) whose index label is 'TP53'
x.loc['TP53']

In [None]:
# keep all rows, and every column except the first one
# (presumably the first remaining column is a non-numeric identifier - confirm)
vals = x.loc[:, x.columns[1:]]

In [None]:
# preview the selected value columns
vals.head()

In [None]:
# find, for each gene (row), the standard deviation of its values across the columns
# add a pseudocount (0.1) to each value so we don't have any zeros
# (built from `x` rather than as `vals + 0.1`, so that re-running this cell
#  does not stack another 0.1 on top of an already-shifted `vals`)
vals = x[x.columns[1:]] + 0.1
# axis = 1 means take the standard dev over axis#1 (columns): one number per row
vals_std = vals.std(axis=1)

# what would we get with axis=0 in the above statement?  

In [None]:
# shape of the value table, then of the per-row std series, one per line
print(vals.shape, vals_std.shape, sep='\n')

In [None]:
# order the standard deviations from smallest to largest (ascending is the default)
vals_std = vals_std.sort_values()

In [None]:
# vals_std is sorted ascending, so positions -1000..-801 from the end are the
# genes ranked roughly 800th to 1000th most variable (feel free to customize)
vals_most_var = vals.loc[vals_std.iloc[-1000:-800].index]

In [None]:
# preview the selected high-variance rows
vals_most_var.head()

In [None]:
# (rows selected, number of columns)
vals_most_var.shape

In [None]:
# log-transform (base 2), element-wise
# the 0.1 pseudocount was already added to `vals`, so zeros in the raw data
# do not become log2(0) here (assuming the raw values are non-negative)
vals_most_var_log = np.log2(vals_most_var)

In [None]:
# preview the log2-transformed values
vals_most_var_log.head()

In [None]:
# clustered heatmap of the log2 values for the selected genes
cg = sns.clustermap(data=vals_most_var_log)
# keep the row tick labels horizontal; assigning to `_` hides setp's return value
_ = plt.setp(cg.ax_heatmap.get_yticklabels(), rotation=0)

In [None]:
# that was a bit hard to read, so let's cluster on just the first 40 of these genes
cg = sns.clustermap(data=vals_most_var_log.iloc[:40])
# keep the row tick labels horizontal; assigning to `_` hides setp's return value
_ = plt.setp(cg.ax_heatmap.get_yticklabels(), rotation=0)

In [None]:
# make a scatterplot comparing all genes' expression in two selected tissues

# see seaborn jointplot 
# http://seaborn.pydata.org/generated/seaborn.jointplot.html

In [None]:
# joint scatter + marginal distributions of log2 values for two brain tissues
# x / y are passed by keyword: recent seaborn releases make them keyword-only
# (the only positional argument is `data`), so the positional form fails there
sns.jointplot(
    data=np.log2(vals),
    x='Brain - Cerebellum',
    y='Brain - Substantia nigra',
    alpha=0.1,
)

In [None]:
# joint scatter + marginal distributions of log2 values, cerebellum vs liver
# x / y are passed by keyword: recent seaborn releases make them keyword-only
# (the only positional argument is `data`), so the positional form fails there
sns.jointplot(
    data=np.log2(vals),
    x='Brain - Cerebellum',
    y='Liver',
    alpha=0.1,
)

In [None]:
# the same comparison using matplotlib directly
# linear scale: cerebellum on the horizontal axis, liver on the vertical axis
plt.scatter(x=vals['Brain - Cerebellum'], y=vals['Liver'], alpha=0.1)

In [None]:
# log scale: same plot after a log2 transform of both columns
plt.scatter(
    x=np.log2(vals['Brain - Cerebellum']),
    y=np.log2(vals['Liver']),
    alpha=0.1,
)

In [None]:
# sometimes with expression data it's helpful to visualize M vs A
# (log ratio against the mean of the log2 values)

# A: average of the two log2-scaled values
A = (np.log2(vals['Brain - Cerebellum']) + np.log2(vals['Liver'])) / 2

# M: log ratio, written as a difference of log2 values
M = np.log2(vals['Brain - Cerebellum']) - np.log2(vals['Liver'])

plt.scatter(x=A, y=M, alpha=0.05)
plt.xlabel('A')
plt.ylabel('M')

In [None]:
# pandas practice, if you're already here - what's the most 
# differentially over-expressed gene in  "Brain - Cerebellum" vs "Liver"?



In [None]:
# one way - build a copy with an extra log-ratio column (cerebellum/liver),
# then order the rows by that column, largest ratio first
vals2 = vals.assign(
    cerebellum_vs_liver=np.log2(vals['Brain - Cerebellum']) - np.log2(vals['Liver'])
).sort_values(by='cerebellum_vs_liver', ascending=False)
vals2.head()

In [None]:
#
# The TUBB4A gene encodes a brain-specific member of the beta-tubulin family that is most highly expressed in the cerebellum, putamen, and white matter ...
# SNAP25 - Synaptosomal-associated protein 25 (SNAP-25) is a t-SNARE protein that is encoded by the SNAP25 gene in humans.
# Gamma-aminobutyric acid (GABA) is the major inhibitory neurotransmitter in the mammalian brain where it acts 
#

In [None]:
# more compactly: log2 of the cerebellum/liver ratio, sorted with the largest first
# (the original line fused the variable name into `np`, giving `sorted_bynp`,
#  which raises a NameError; the assignment is restored here)
sorted_by = np.log2(vals['Brain - Cerebellum'] / vals['Liver']).sort_values(ascending=False)
sorted_by.head()