In [1]:
import data
import seaborn as sns
import numpy as np
import pandas as pd
import patsy
%matplotlib inline

In [2]:
import pystan
import stanity

## load files for all cell types

In [3]:
# List the distinct cell subsets named in the filename metadata.
# `subsets` is kept because the loading cell passes it on as `metadata=`.
subsets = data.prep_filename_metadata()
all_types = list(subsets['SubSet'].unique())
print(all_types)

['CD4_Th2', 'CD8_Effector', 'B_CD5', 'B_Memory', 'CD4_Th1', 'CD4_Naive', 'B_Naive', 'CD4_Effector_Memory', 'CD8_Central_Memory', 'CD4_Central_Memory', 'CD4_Treg', 'CD8_Naive', 'CD4_Th17']


In [4]:
# Call the project-local `data` loader for every subset in `all_types`,
# handing it the filename metadata built earlier.
# NOTE(review): presumably returns one long-format frame of per-gene
# expression rows -- confirm against data.load_by_cell_type.
df = data.load_by_cell_type(all_types, metadata=subsets)

In [5]:
# Overwrites `df` with its annotated version. Because input and output share
# a name, re-running only this cell feeds already-prepped data back through
# prep_annotated_data; re-run the loading cell first.
df = data.prep_annotated_data(df)

In [None]:
# Sanity check: the rescaled expression column must contain no missing values.
assert df['log1p_tpm_rescaled'].notnull().all()

## prep gene ids

For model estimation, we first need to map each `gene_name` to a numeric ID. The IDs start at 1 (category code + 1) because the Stan model indexes genes from 1.


In [None]:
# Encode gene names as a categorical, then shift its zero-based codes by one
# so that gene_id starts at 1.
gene_categories = df['gene_name'].astype('category')
df['gene_cat'] = gene_categories
df['gene_id'] = gene_categories.cat.codes + 1

In [None]:
# 0/1 indicator columns derived from cell_type. T_cell is defined as
# "anything that is not B", so the two columns always sum to one.
is_b_cell = df['cell_type'] == 'B'
df['B_cell'] = is_b_cell.astype(int)
df['T_cell'] = (~is_b_cell).astype(int)

In [None]:
# Preview the first rows of the working frame.
df.head()

## sample genes for first pass

In [22]:
# Draw 100 distinct genes for a quick first-pass fit and keep only their rows.
# The fixed random_state makes Restart & Run All select the same genes every
# time; change the seed to draw a different subset.
sampled_genes = (df.drop_duplicates(subset='gene_name')
                   .sample(n=100, random_state=1234)
                   .loc[:, 'gene_name'])
sample_df = pd.merge(df, pd.DataFrame(sampled_genes), on='gene_name', how='inner')

In [23]:
# Preview the first rows of the gene-sampled frame.
sample_df.head()

Unnamed: 0,sample_id,filename,gene_name,est_counts,tpm,log1p_tpm,log1p_counts,CCR6,CCR7,CD127,...,CXCR3,SubSet,cell_type,log1p_tpm_rescaled_type,log1p_tpm_rescaled_subset,log1p_tpm_rescaled,gene_cat,gene_id,B_cell,T_cell
0,1,ERR431566,ABRACL,342.0,105.56336,4.66874,5.83773,,,,...,-,CD4_Th2,CD4,6.746109,6.616746,3.897148,ABRACL,147,0,1
1,2,ERR431579,ABRACL,386.99988,98.88583,4.604028,5.961005,,,,...,-,CD4_Th2,CD4,5.976567,-4.287081,3.698313,ABRACL,147,0,1
2,3,ERR431600,ABRACL,353.0,97.8337,4.593439,5.869297,,,,...,-,CD4_Th2,CD4,5.850642,-6.071339,3.665777,ABRACL,147,0,1
3,4,ERR431615,ABRACL,368.0,105.79735,4.670933,5.910797,,,,...,-,CD4_Th2,CD4,6.772193,6.986325,3.903887,ABRACL,147,0,1
4,5,ERR431628,ABRACL,352.9998,99.55527,4.610708,5.869296,,,,...,-,CD4_Th2,CD4,6.056001,-3.161566,3.718837,ABRACL,147,0,1


In [39]:
def prep_stan_data(df, sample_n=None, y_col='est_counts', y_type=int, random_state=None):
    """Assemble the data dictionary passed to the Stan model.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame. The columns read here are ``gene_name``,
        ``sample_id``, ``B_cell``, ``T_cell`` and ``y_col``.
    sample_n : int, optional
        If truthy, keep only the rows of ``sample_n`` randomly chosen distinct
        genes; otherwise every row of ``df`` is used.
    y_col : str
        Name of the column used as the outcome ``y``.
    y_type : type
        Type the outcome column is cast to via ``astype``.
    random_state : int or numpy.random.RandomState, optional
        Forwarded to ``DataFrame.sample`` so the gene subset can be
        reproduced. The default ``None`` leaves the draw unseeded.

    Returns
    -------
    dict
        Keys ``N`` (rows), ``G`` (distinct genes), ``S`` (distinct samples),
        ``C`` (design-matrix columns), ``gene`` and ``sample`` (1-based ids,
        one per row), ``x`` (design matrix) and ``y`` (outcome).

    Notes
    -----
    ``df`` is never modified: the gene/sample ids are computed as standalone
    arrays instead of being written back as helper columns.
    """
    if sample_n:
        sampled_genes = (df.drop_duplicates(subset='gene_name')
                           .sample(n=sample_n, random_state=random_state)
                           .loc[:, 'gene_name'])
        sample_df = pd.merge(df, pd.DataFrame(sampled_genes), on='gene_name', how='inner')
    else:
        sample_df = df

    # Re-number genes and samples within the rows actually used. Category
    # codes are zero-based, so shift by one to obtain ids starting at 1.
    gene_ids = sample_df['gene_name'].astype('category').cat.codes.values + 1
    sample_ids = sample_df['sample_id'].astype('category').cat.codes.values + 1

    # No-intercept design matrix: one column per class indicator.
    x = patsy.dmatrix('0 + B_cell + T_cell', data=sample_df, return_type='dataframe')

    # NOTE(review): with y_type=int, astype truncates fractional values
    # (e.g. 386.99988 -> 386) rather than rounding -- confirm this is intended
    # for estimated counts.
    y = sample_df[y_col].astype(y_type).values

    stan_data = {'N': len(sample_df.index),
                 'G': len(pd.unique(gene_ids)),
                 'S': len(pd.unique(sample_ids)),
                 # Taken from the design matrix so C always agrees with x.
                 'C': x.shape[1],
                 'gene': gene_ids,
                 'sample': sample_ids,
                 'x': x,
                 'y': y,
                 }
    return stan_data

    

## fit model at level of cell-type

In [40]:
# Build the Stan inputs from the gene-sampled frame.
# NOTE(review): sample_df was already restricted to 100 genes, so
# sample_n=100 presumably re-selects all of them -- confirm whether a
# smaller subset (or sample_n=None) was intended.
stan_data = prep_stan_data(sample_df, sample_n=100)

In [41]:
# Stan program for a Poisson model on the log scale. For observation n the
# log-rate is
#     log_sample_base[sample[n]] + log_gene_base[gene[n]] + x[n] * log_theta[gene[n]]
# and y is modelled with poisson_log.
# NOTE(review): log_theta is declared as a simplex (non-negative entries that
# sum to one), so despite its name it is not an unconstrained log-scale
# coefficient -- confirm this is intended.
# NOTE(review): no priors are given for log_gene_base / log_sample_base, and a
# constant can be moved from one to the other without changing log_exp, so
# they are identified only up to an additive shift -- consider priors or a
# sum-to-zero constraint.
stan_code = '''
data {
    // dimensions
    int<lower=1> N;  // N obs
    int<lower=1> G;  // N genes
    int<lower=1> S;  // N samples
    int<lower=0> C;  // N classes (e.g. B-cell, T-cell, B_Naive, CD5, CD45RO, etc)
   
    // data
    int<lower=1, upper=G> gene[N];    // gene id for each obs
    int<lower=1, upper=S> sample[N];  // sample id for each obs
    matrix<lower=0, upper=1>[N, C] x;      // map each obs to each class (0:'- or ?', 1:'+')
    int<lower=0> y[N];                   // count/tpm for each obs
}
parameters {
    simplex[C] log_theta[G];  // loading factors for each gene, for each cell type
    real log_gene_base[G];    // constant intercept expression level for each gene, irrespective of cell type
    real log_sample_base[S];  // constant intercept expression level for each sample
}
model {
    real log_exp[N];
    
    for (n in 1:N) {
        log_exp[n] = log_sample_base[sample[n]] + log_gene_base[gene[n]] + x[n,]*log_theta[gene[n],];
    }
    y ~ poisson_log(log_exp);
}
'''

In [None]:
# Compile the model and sample 4 chains of 5000 iterations each.
# NOTE(review): the output stored with this cell is an interleaved
# multiprocessing traceback, so this run appears not to have finished
# cleanly -- re-run and clear the log before sharing.
fit = stanity.fit(model_code=stan_code, data=stan_data, iter=5000, chains=4)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_a2ad53befabf6809fecb196221109a6a NOW.


NOT reusing model.


Process PoolWorker-68:
Process PoolWorker-69:
Process PoolWorker-75:
Process PoolWorker-76:
Process PoolWorker-70:
Process PoolWorker-74:
Process PoolWorker-77:
Traceback (most recent call last):
Process PoolWorker-73:
Process PoolWorker-79:
Process PoolWorker-66:
Process PoolWorker-78:
Process PoolWorker-80:
Traceback (most recent call last):
Process PoolWorker-67:
Process PoolWorker-72:
Traceback (most recent call last):
  File "/usr/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
Process PoolWorker-65:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
Process PoolWorker-71:
Traceback (most recent call last):
Traceback (most recent call last):
    self.run()
Traceback (most recent call last):
  File "/usr/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
Traceback (most recent call last):
  File "/usr/lib/python2.7/

In [29]:
# Print the posterior summary table for the fitted model.
# NOTE(review): the stored output reports 2 chains with iter=100, which does
# not match an iter=5000, chains=4 fit -- it looks stale; re-run to refresh.
print(fit)

Inference for Stan model: anon_model_eb02d03197699017f5cd6b62a7033a4f.
2 chains, each with iter=100; warmup=50; thin=1; 
post-warmup draws per chain=50, total post-warmup draws=100.

                      mean se_mean     sd   2.5%    25%    50%    75%  97.5%  n_eff   Rhat
log_theta[0,0]      3.5e-4  5.1e-5 5.1e-4 3.2e-6 4.1e-5 2.0e-4 4.6e-4 1.6e-3  100.0    nan
log_theta[1,0]        0.37  5.6e-3   0.05   0.28   0.33   0.37   0.41   0.46   87.0   1.02
log_theta[2,0]        0.23    0.02   0.19 3.2e-3   0.09   0.18    0.3   0.76   60.0   1.03
log_theta[3,0]        0.18  2.1e-3   0.02   0.15   0.16   0.18   0.19   0.21   64.0    1.0
log_theta[4,0]        0.84  3.1e-3   0.03   0.79   0.82   0.84   0.86    0.9   86.0   1.01
log_theta[5,0]        0.55  6.9e-3   0.07   0.41   0.51   0.55   0.58    0.7  100.0   0.98
log_theta[6,0]        0.86  7.6e-3   0.08   0.71    0.8   0.85   0.91   0.99  100.0   0.99
log_theta[7,0]         0.4  7.1e-4 7.1e-3   0.39    0.4    0.4   0.41   0.42   98.0   1.0

## plot loading-factors for each gene location

In [34]:
# Pull the posterior draws of the per-gene loading vectors out of the fit.
log_theta = fit.extract('log_theta')['log_theta']

In [36]:
# Presumably (posterior draws, genes G, classes C); the stored output
# (10000, 100, 2) is consistent with 100 sampled genes and 2 classes.
log_theta.shape

(10000, 100, 2)