In [12]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import pylab as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display, HTML

import sys
basedir = '/Users/simon/Dropbox/beer_analysis/input/urine/13_files_separate_mode_method1/POS/'
sys.path.append(basedir)

sys.path.append('/Users/simon/git/lda/code/')

from multifile_feature import SparseFeatureExtractor

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
def extract_features(input_set, fragment_grouping_tol, loss_grouping_tol, 
                     loss_threshold_min_count, loss_threshold_max_val,
                     scaling_factor=None):
    """Build a SparseFeatureExtractor for `input_set` and populate its counts.

    Parameters
    ----------
    input_set : list of (ms1_path, ms2_path) tuples
        Forwarded to SparseFeatureExtractor with input_type='filename'.
    fragment_grouping_tol, loss_grouping_tol :
        Grouping tolerances forwarded to the extractor and then used when
        grouping the fragment and loss queues respectively.
    loss_threshold_min_count, loss_threshold_max_val :
        Loss thresholds forwarded to the extractor unchanged.
    scaling_factor : optional
        Passed to `extractor.create_counts`. When omitted, the notebook-level
        global `scaling_factor` is used, which is what this function always
        did before the parameter existed.

    Returns
    -------
    The populated SparseFeatureExtractor.

    Raises
    ------
    NameError
        If `scaling_factor` is not given and no global of that name exists.
    """
    if scaling_factor is None:
        # The parameter shadows the global of the same name, so the fallback
        # has to go through globals() explicitly.
        try:
            scaling_factor = globals()['scaling_factor']
        except KeyError:
            raise NameError("scaling_factor was not passed and no global "
                            "'scaling_factor' is defined")

    extractor = SparseFeatureExtractor(input_set, fragment_grouping_tol, loss_grouping_tol, 
                                       loss_threshold_min_count, loss_threshold_max_val,
                                       input_type='filename')

    # create the grouping for the fragments
    fragment_q = extractor.make_fragment_queue()
    fragment_groups = extractor.group_features(fragment_q, extractor.fragment_grouping_tol)

    # create the grouping for the losses (only here is check_threshold enabled)
    loss_q = extractor.make_loss_queue()
    loss_groups = extractor.group_features(loss_q, extractor.loss_grouping_tol, 
                                           check_threshold=True)

    # populate the counts
    extractor.create_counts(fragment_groups, loss_groups, scaling_factor)

    return extractor

In [14]:
# Feature-extraction parameters for this run.
# The four tolerance/threshold values are passed explicitly to
# extract_features(); scaling_factor is not passed by the call in this
# notebook and is picked up from this global instead, so this cell must run
# before extract_features() is called.
scaling_factor=100             # previously set to 100 in the single file LDA 
fragment_grouping_tol=7        # grouping tolerance in ppm for the fragment
loss_grouping_tol=7            # grouping tolerance in ppm for the neutral loss
loss_threshold_min_count=5     # min. counts of loss values to occur
loss_threshold_max_val=200     # max. loss values

In [15]:
# File-name prefixes of the samples, in the order they are fed to the
# extractor (the pooled sample comes first). Each sample has an MS1 and an
# MS2 csv under `basedir`.
sample_prefixes = [
    'Pooled_Urine',
    'Urine5', 'Urine19', 'Urine20', 'Urine37', 'Urine44', 'Urine61',
    'Urine64', 'Urine73', 'Urine74', 'Urine87', 'Urine90', 'Urine91',
]

# One (ms1_path, ms2_path) tuple per sample.
input_set = [
    (basedir + prefix + '_pos_Method1_ms1.csv',
     basedir + prefix + '_pos_Method1_ms2.csv')
    for prefix in sample_prefixes
]




In [16]:
# Load every (ms1, ms2) pair and build the shared fragment/loss features.
extractor = extract_features(
    input_set=input_set,
    fragment_grouping_tol=fragment_grouping_tol,
    loss_grouping_tol=loss_grouping_tol,
    loss_threshold_min_count=loss_threshold_min_count,
    loss_threshold_max_val=loss_threshold_max_val)

Loading /Users/simon/Dropbox/beer_analysis/input/urine/13_files_separate_mode_method1/POS/Pooled_Urine_pos_Method1_ms1.csv
Loading /Users/simon/Dropbox/beer_analysis/input/urine/13_files_separate_mode_method1/POS/Pooled_Urine_pos_Method1_ms2.csv
Loading /Users/simon/Dropbox/beer_analysis/input/urine/13_files_separate_mode_method1/POS/Urine5_pos_Method1_ms1.csv
Loading /Users/simon/Dropbox/beer_analysis/input/urine/13_files_separate_mode_method1/POS/Urine5_pos_Method1_ms2.csv
Loading /Users/simon/Dropbox/beer_analysis/input/urine/13_files_separate_mode_method1/POS/Urine19_pos_Method1_ms1.csv
Loading /Users/simon/Dropbox/beer_analysis/input/urine/13_files_separate_mode_method1/POS/Urine19_pos_Method1_ms2.csv
Loading /Users/simon/Dropbox/beer_analysis/input/urine/13_files_separate_mode_method1/POS/Urine20_pos_Method1_ms1.csv
Loading /Users/simon/Dropbox/beer_analysis/input/urine/13_files_separate_mode_method1/POS/Urine20_pos_Method1_ms2.csv
Loading /Users/simon/Dropbox/beer_analysis/input

In [17]:
# Sanity check: for each file, show the count-matrix shape, the vocabulary
# size and the number of MS1 / MS2 rows, followed by a blank line.
for f in range(extractor.F):
    mat, vocab, ms1, ms2 = extractor.get_entry(f)
    report = [
        'File {:d}'.format(f),
        'Count matrix {} {}'.format(type(mat), mat.shape),
        'Vocab {} words'.format(len(vocab)),
        'MS1 rows {} {}'.format(type(ms1), ms1.shape[0]),
        'MS2 rows {} {}'.format(type(ms2), ms2.shape[0]),
        '',   # trailing empty entry gives the blank separator line
    ]
    print('\n'.join(report))

File 0
Count matrix <class 'scipy.sparse.lil.lil_matrix'> (810, 5766)
Vocab 5766 words
MS1 rows <class 'pandas.core.frame.DataFrame'> 810
MS2 rows <class 'pandas.core.frame.DataFrame'> 7804

File 1
Count matrix <class 'scipy.sparse.lil.lil_matrix'> (605, 5766)
Vocab 5766 words
MS1 rows <class 'pandas.core.frame.DataFrame'> 605
MS2 rows <class 'pandas.core.frame.DataFrame'> 6035

File 2
Count matrix <class 'scipy.sparse.lil.lil_matrix'> (621, 5766)
Vocab 5766 words
MS1 rows <class 'pandas.core.frame.DataFrame'> 621
MS2 rows <class 'pandas.core.frame.DataFrame'> 5306

File 3
Count matrix <class 'scipy.sparse.lil.lil_matrix'> (731, 5766)
Vocab 5766 words
MS1 rows <class 'pandas.core.frame.DataFrame'> 731
MS2 rows <class 'pandas.core.frame.DataFrame'> 6987

File 4
Count matrix <class 'scipy.sparse.lil.lil_matrix'> (715, 5766)
Vocab 5766 words
MS1 rows <class 'pandas.core.frame.DataFrame'> 715
MS2 rows <class 'pandas.core.frame.DataFrame'> 6556

File 5
Count matrix <class 'scipy.sparse.lil.

In [18]:
# Map every vocabulary word to its position in the vocabulary list.
# The vocabulary is fetched explicitly instead of relying on the `vocab`
# variable left behind by the summary loop in the previous cell, so this cell
# no longer depends on that loop having been run. The last file is used
# because that is the entry the leftover variable used to hold.
vocab = extractor.get_entry(extractor.F - 1)[1]
global_word_index = dict((word, position) for position, word in enumerate(vocab))

In [19]:
# Build one corpus per file: {doc_name: {word: count}}, where doc_name is
# "<mz>_<rt>" of the MS1 row and the words are the vocabulary entries with a
# positive count in the matching row of the count matrix.
# NOTE: two MS1 rows with identical mz and rt would share a doc_name and the
# later one would replace the earlier one.
corpus_list = []
for f in range(extractor.F):
    print("Processing file {}".format(f))
    mat, vocab, ms1, ms2 = extractor.get_entry(f)
    n_docs, n_words = mat.shape
    print("{} {}".format(n_docs, n_words))

    corpus = {}
    # MS1 rows are matched to matrix rows by position, not by index label.
    for d_pos, indexed_row in enumerate(ms1.iterrows()):
        ms1_row = indexed_row[1]
        doc_name = "{}_{}".format(ms1_row['mz'], ms1_row['rt'])
        # Slice the sparse row once; the original sliced it twice per document.
        doc_row = mat[d_pos, :]
        corpus[doc_name] = dict(
            (vocab[word_index], count)
            for word_index, count in zip(doc_row.rows[0], doc_row.data[0])
            if count > 0)
    corpus_list.append(corpus)

Processing file 0
810 5766
Processing file 1
605 5766
Processing file 2
621 5766
Processing file 3
731 5766
Processing file 4
715 5766
Processing file 5
652 5766
Processing file 6
558 5766
Processing file 7
649 5766
Processing file 8
681 5766
Processing file 9
807 5766
Processing file 10
656 5766
Processing file 11
674 5766
Processing file 12
667 5766


In [20]:
from lda import VariationalLDA,MultiFileVariationalLDA

In [None]:
# Multi-file LDA over all per-file corpora, sharing one word index.
mf_lda = MultiFileVariationalLDA(
    corpus_list=corpus_list,
    word_index=global_word_index,
    K=300,
    alpha=1,
    eta=0.1)

Object created with 810 documents
Object created with 605 documents
Object created with 621 documents
Object created with 731 documents
Object created with 715 documents
Object created with 652 documents
Object created with 558 documents
Object created with 649 documents
Object created with 681 documents
Object created with 807 documents
Object created with 656 documents
Object created with 674 documents
Object created with 667 documents


In [None]:
# Fit the multi-file model: 500 iterations, with initialise=True.
# The captured output shows one value per iteration that shrinks over the
# first iterations (presumably a convergence measure; confirm in lda.py).
mf_lda.run_vb(n_its=500,initialise=True)

Iteration: 0
382.628814432
Iteration: 1
7.52493054113
Iteration: 2
5.16881634687
Iteration: 3
4.22370252736
Iteration: 4
3.63471692611
Iteration: 5
3.23560365594
Iteration: 6
2.9547365027
Iteration: 7
2.75736819649
Iteration: 8
2.63233730848
Iteration: 9


In [None]:
from lda import MultiFileVariationalLDAPlotter

# Short sample names ('Pooled', 'Urine5', ...) taken from the MS1 file names.
# `names` was previously only assigned in a later cell, so this cell raised a
# NameError on a fresh kernel run top to bottom; derive it here.
names = [ms1_path.split('/')[-1].split('_')[0] for ms1_path, ms2_path in input_set]

mp = MultiFileVariationalLDAPlotter(mf_lda)
mp.multi_alpha(normalise=True,names = names)

In [None]:
from sklearn.decomposition import PCA

# Stack the per-file alpha vectors: one row per file, one column per topic.
n_files = len(corpus_list)
al = np.zeros((n_files, mf_lda.K))
for i in range(n_files):
    al[i,:] = mf_lda.individual_lda[i].alpha

# Project the files onto the first two whitened principal components.
pca = PCA(n_components=2, whiten=True)
pca.fit(al)
X = pca.transform(al)
print(X.shape)

import plotly as plotly
from plotly.graph_objs import *
plotly.offline.init_notebook_mode()

# Short sample names ('Pooled', 'Urine5', ...) taken from the MS1 file names.
names = [m1.split('/')[-1].split('_')[0] for m1, m2 in input_set]

# One marker per file, hover text = sample name (marker sizes left at default).
data = [
    Scatter(
        x=X[:,0],
        y=X[:,1],
        mode='markers',
        text=names,
    ),
]

plotly.offline.iplot(dict(data=data))

Load the metadata


In [None]:
import csv

# Read urine_metadata.csv into {sample_id: {column_name: float_value}}.
# The first column is the sample id; empty cells are skipped, so a missing
# value simply has no key in that sample's dict.
# The csv module replaces the earlier hand-rolled split(',') so that quoted
# fields containing commas parse correctly, and blank lines are skipped
# instead of creating a bogus '' sample.
urine_metadata = {}
with open('urine_metadata.csv','r') as f:
    # Trailing whitespace/newlines are stripped per line, as before.
    rows = csv.reader(line.rstrip() for line in f)
    split_heads = next(rows, [])
    for split_line in rows:
        if not split_line:
            continue
        sample_id = split_line[0]
        urine_metadata[sample_id] = dict(
            (split_heads[i], float(v))
            for i, v in enumerate(split_line)
            if i > 0 and len(v) > 0)

# The pooled sample has no metadata of its own; give it an empty record so
# later lookups by sample name do not fail.
urine_metadata['Pooled'] = {}

In [None]:
# Drop the leading 'Urine' from each sample name ('Urine5' -> '5'); names
# without that prefix (e.g. 'Pooled') are kept unchanged.
urine_names_short = [
    n[len('Urine'):] if n.startswith('Urine') else n
    for n in names
]

In [None]:
# Marker size per sample: 50 plus 50 times the sample's 'ARB' value.
# Samples without an 'ARB' entry fall back to 50.0 for that value.
sizes = [
    50.0 + 50.0 * urine_metadata[u].get('ARB', 50.0)
    for u in urine_names_short
]

In [None]:
# Entry 0 is the pooled sample (first in input_set). It has no 'ARB' value,
# so the default used when building `sizes` gave it 50 + 50*50; reset it to
# the base size.
sizes[0] = 50.0

In [None]:
# Show the marker sizes computed above.
print(sizes)

In [None]:
# Metadata column used to split the samples into two groups.
# Other columns tried previously: 'Sex F=1', 'ACE I', 'Any Diabetes',
# 'Alcohol Excess', 'Diuretic', '_blocker'.
drug = 'Ca Antag'

# find the indices of the files that we have and if they have this drug or not
# (status 1 -> have, 0 -> havenot; samples with no value for the column are
# left out of both groups)
have = []
havenot = []
for i,u in enumerate(urine_names_short):
    status = urine_metadata[u].get(drug,-1)
    if status == 1:
        have.append(i)
    elif status == 0:
        havenot.append(i)

print(have)
print(havenot)
total_found = len(have) + len(havenot)

# Per-file alpha vectors, one row per file. The builtin `float` replaces
# `np.float`, which was only an alias for it and was removed in NumPy 1.24.
almat = np.zeros((len(mf_lda.individual_lda),mf_lda.K),float)
for i,l in enumerate(mf_lda.individual_lda):
    almat[i,:] = l.alpha.copy()

# Per-topic separation score: |difference of group means| / (sum of group stds).
# NOTE: the score is inf/nan for a topic whose std is zero in both groups, and
# nan everywhere if either group is empty.
have_mean = almat[have,:].mean(axis=0)
havenot_mean = almat[havenot,:].mean(axis=0)
have_std = almat[have,:].std(axis=0)
havenot_std = almat[havenot,:].std(axis=0)
score = np.abs(have_mean-havenot_mean)/(have_std + havenot_std)

# Group means against each other, one marker per topic, sized by the score.
data = [
    Scatter(
        x = have_mean,
        y = havenot_mean,
        mode = 'markers',
        marker = dict(
            size = 20*score,
            ),
    )
]
plotly.offline.iplot({'data':data})

best_topics = zip(range(mf_lda.K),score)
best_topics = sorted(best_topics,key=lambda x: x[1],reverse=True)

# Bar chart of alpha for each of the (up to) five best-scoring topics, with
# the "have" samples first and the "havenot" samples after them.
# Slicing instead of range(5) avoids an IndexError when K < 5.
for best_topic, topic_score in best_topics[:5]:
    data = [
        Bar(
            x = list(range(total_found)),
            y = almat[have + havenot,best_topic],
        )
    ]

    plotly.offline.iplot({'data':data})

Compute the standard deviation of each topic's alpha across files and display the 10 topics with the highest standard deviation

In [None]:
# Per-file alpha vectors, one row per file. The builtin `float` replaces
# `np.float`, which was only an alias for it and was removed in NumPy 1.24.
almat = np.zeros((len(mf_lda.individual_lda),mf_lda.K),float)
for i,l in enumerate(mf_lda.individual_lda):
    almat[i,:] = l.alpha.copy()

# Topic-word matrix and word index are taken from the first file's model.
first_lda = mf_lda.individual_lda[0]
bmat = first_lda.beta_matrix.copy()
topic_word_index = first_lda.word_index

# Rank topics by how much their alpha varies across files.
topic_std = almat.std(axis=0)
best_topics = zip(range(mf_lda.K),topic_std)
best_topics = sorted(best_topics,key = lambda x:x[1],reverse=True)

# Slicing instead of range(10) avoids an IndexError when K < 10.
for best_topic, this_topic_std in best_topics[:10]:
    print("Topic {}, std {}".format(best_topic,this_topic_std))

    # Files ordered by their alpha for this topic, largest first.
    n_al = zip(names,almat[:,best_topic])
    n_al = sorted(n_al,key=lambda x:x[1],reverse=True)
    for n,a in n_al:
        print("\t{} alpha = {}".format(n,a))
    print('')

    # PCA scatter of the files (X from the PCA cell), markers scaled so the
    # file with the largest alpha for this topic gets size 50.
    al = almat[:,best_topic]
    scale_fac = 50.0/al.max()
    data = [
        Scatter(
            x = X[:,0],
            y = X[:,1],
            mode = 'markers',
            text = names,
            marker = dict(
                size = scale_fac*al
            )
        )
    ]

    # Ten most probable words of this topic with their running total.
    word_prob = []
    for w in topic_word_index:
        word_prob.append((w,bmat[best_topic,topic_word_index[w]]))
    word_prob = sorted(word_prob,key = lambda x:x[1],reverse = True)

    cum_prob = 0
    for w,p in word_prob[:10]:
        cum_prob += p
        print("\t\t{}: {} ({})".format(w,p,cum_prob))

    plotly.offline.iplot({'data':data})

    print('')
    print('')
    