<h1>Feature Extraction for Multifile LDA</h1>

In [33]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import pylab as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display, HTML

import sys
# Machine-specific root folder of the input data; the MS1/MS2 csv paths in
# `input_set` are built relative to it. It is also put on the import path.
basedir = '/Users/simon/Dropbox/beer_analysis/frans stuff/'
sys.path.append(basedir)
# Machine-specific folder added to the import path
# (presumably where the project modules imported by this notebook live -- TODO confirm).
codedir = '/Users/simon/git/lda/code/'
sys.path.append(codedir)

from multifile_feature import SparseFeatureExtractor

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Feature extraction method

In [34]:
def extract_features(input_set, fragment_grouping_tol, loss_grouping_tol,
                     loss_threshold_min_count, loss_threshold_max_val,
                     scaling_factor=None):
    """Build a SparseFeatureExtractor for `input_set` and populate its counts.

    Parameters
    ----------
    input_set
        Passed straight to SparseFeatureExtractor together with
        input_type='filename' (presumably a collection of file paths --
        confirm against SparseFeatureExtractor).
    fragment_grouping_tol, loss_grouping_tol
        Grouping tolerances forwarded to the extractor; the extractor's own
        copies are then used when grouping the fragment and loss queues.
    loss_threshold_min_count, loss_threshold_max_val
        Loss thresholds forwarded to the extractor.
    scaling_factor : optional
        Value handed to extractor.create_counts(). When omitted, the
        module-level variable `scaling_factor` is used, which is what this
        function always did before the parameter existed.

    Returns
    -------
    The SparseFeatureExtractor after create_counts() has been called.

    Raises
    ------
    NameError
        If `scaling_factor` is neither passed nor defined at module level.
    """
    if scaling_factor is None:
        # Backward-compatible fallback to the notebook-level setting. Resolve
        # it up front so a missing value fails before any file is loaded.
        try:
            scaling_factor = globals()['scaling_factor']
        except KeyError:
            raise NameError("scaling_factor was not passed to extract_features() "
                            "and no global 'scaling_factor' is defined")

    extractor = SparseFeatureExtractor(input_set, fragment_grouping_tol, loss_grouping_tol,
                                       loss_threshold_min_count, loss_threshold_max_val,
                                       input_type='filename')

    # create the grouping for the fragments
    fragment_q = extractor.make_fragment_queue()
    fragment_groups = extractor.group_features(fragment_q, extractor.fragment_grouping_tol)

    # create the grouping for the losses (only the loss grouping asks for the
    # threshold check)
    loss_q = extractor.make_loss_queue()
    loss_groups = extractor.group_features(loss_q, extractor.loss_grouping_tol,
                                           check_threshold=True)

    # populate the counts
    extractor.create_counts(fragment_groups, loss_groups, scaling_factor)

    return extractor

## Set parameters

In [35]:
# Settings consumed by the feature extraction in this notebook.

# Same value that was previously used in the single-file LDA.
scaling_factor = 100

# Grouping tolerances in ppm: one for the fragments, one for the neutral losses.
fragment_grouping_tol = loss_grouping_tol = 7

# Loss thresholds: minimum count of occurrences and maximum loss value.
loss_threshold_min_count, loss_threshold_max_val = 5, 200

In [36]:
# Each entry of input_set is an (MS1 csv path, MS2 csv path) pair for one
# sample. The two paths of a sample only differ in their _ms1/_ms2 suffix, so
# they are derived from the sample's folder and file-name stem.
input_set = []
for sample_folder, sample_stem in [
    ('beer', 'Beer1pos_MS1filter_Method3'),
    ('beer', 'Beer2pos_MS1filter_Method3'),
    ('beer', 'Beer3pos_MS1filter_Method3'),
    ('urine', 'Urine37_pos'),
    ('urine', 'Urine44_pos'),
    ('urine', 'Urine64_pos'),
]:
    sample_prefix = basedir + 'input/' + sample_folder + '/' + sample_stem
    input_set.append((sample_prefix + '_ms1.csv', sample_prefix + '_ms2.csv'))



## Extract features

In [37]:
# Run the extraction over every file pair using the settings chosen above.
extractor = extract_features(
    input_set=input_set,
    fragment_grouping_tol=fragment_grouping_tol,
    loss_grouping_tol=loss_grouping_tol,
    loss_threshold_min_count=loss_threshold_min_count,
    loss_threshold_max_val=loss_threshold_max_val,
)

Loading /Users/simon/Dropbox/beer_analysis/frans stuff/input/beer/Beer1pos_MS1filter_Method3_ms1.csv
Loading /Users/simon/Dropbox/beer_analysis/frans stuff/input/beer/Beer1pos_MS1filter_Method3_ms2.csv
Loading /Users/simon/Dropbox/beer_analysis/frans stuff/input/beer/Beer2pos_MS1filter_Method3_ms1.csv
Loading /Users/simon/Dropbox/beer_analysis/frans stuff/input/beer/Beer2pos_MS1filter_Method3_ms2.csv
Loading /Users/simon/Dropbox/beer_analysis/frans stuff/input/beer/Beer3pos_MS1filter_Method3_ms1.csv
Loading /Users/simon/Dropbox/beer_analysis/frans stuff/input/beer/Beer3pos_MS1filter_Method3_ms2.csv
Loading /Users/simon/Dropbox/beer_analysis/frans stuff/input/urine/Urine37_pos_ms1.csv
Loading /Users/simon/Dropbox/beer_analysis/frans stuff/input/urine/Urine37_pos_ms2.csv
Loading /Users/simon/Dropbox/beer_analysis/frans stuff/input/urine/Urine44_pos_ms1.csv
Loading /Users/simon/Dropbox/beer_analysis/frans stuff/input/urine/Urine44_pos_ms2.csv
Loading /Users/simon/Dropbox/beer_analysis/fra

In [38]:
# extractor.F is used below as the number of files to loop over.
print(extractor.F)

6


In [39]:
# Show, per input file, the type/shape of the count matrix, the vocabulary
# size and the number of MS1 and MS2 rows.
for f in range(extractor.F):
    mat, vocab, ms1, ms2 = extractor.get_entry(f)
    print('File %d' % f)
    print('Count matrix {} {}'.format(type(mat), mat.shape))
    print('Vocab {} words'.format(len(vocab)))
    print('MS1 rows {} {}'.format(type(ms1), ms1.shape[0]))
    print('MS2 rows {} {}'.format(type(ms2), ms2.shape[0]))
    print('')

File 0
Count matrix <class 'scipy.sparse.lil.lil_matrix'> (1282, 14313)
Vocab 14313 words
MS1 rows <class 'pandas.core.frame.DataFrame'> 1282
MS2 rows <class 'pandas.core.frame.DataFrame'> 24612

File 1
Count matrix <class 'scipy.sparse.lil.lil_matrix'> (1567, 14313)
Vocab 14313 words
MS1 rows <class 'pandas.core.frame.DataFrame'> 1567
MS2 rows <class 'pandas.core.frame.DataFrame'> 30643

File 2
Count matrix <class 'scipy.sparse.lil.lil_matrix'> (1422, 14313)
Vocab 14313 words
MS1 rows <class 'pandas.core.frame.DataFrame'> 1422
MS2 rows <class 'pandas.core.frame.DataFrame'> 27128

File 3
Count matrix <class 'scipy.sparse.lil.lil_matrix'> (2756, 14313)
Vocab 14313 words
MS1 rows <class 'pandas.core.frame.DataFrame'> 2756
MS2 rows <class 'pandas.core.frame.DataFrame'> 34929

File 4
Count matrix <class 'scipy.sparse.lil.lil_matrix'> (1690, 14313)
Vocab 14313 words
MS1 rows <class 'pandas.core.frame.DataFrame'> 1690
MS2 rows <class 'pandas.core.frame.DataFrame'> 34112

File 5
Count matrix 

In [None]:
# Map every vocabulary word to its position in `vocab`.
# NOTE(review): `vocab` is whatever an earlier cell's extractor.get_entry()
# loop left behind, so this cell depends on that cell having been run first.
global_word_index = dict((word, position) for position, word in enumerate(vocab))

In [None]:
# Turn each file's sparse count matrix into a nested dict
# {document name: {word: count}}; document names are "<mz>_<rt>" of the MS1 row
# and row d_pos of the matrix is taken to belong to the d_pos-th MS1 row.
corpus_list = []
for f in range(extractor.F):
    print("Processing file {}".format(f))
    mat, vocab, ms1, ms2 = extractor.get_entry(f)
    n_docs, n_words = mat.shape
    print("{} {}".format(n_docs, n_words))

    corpus = {}
    for d_pos, (row_label, ms1_row) in enumerate(ms1.iterrows()):
        doc_name = "{}_{}".format(ms1_row['mz'], ms1_row['rt'])
        # Slice the row once; .rows[0] holds its column indices and .data[0]
        # the matching stored values.
        sparse_row = mat[d_pos, :]
        corpus[doc_name] = dict(
            (vocab[word_index], count)
            for word_index, count in zip(sparse_row.rows[0], sparse_row.data[0])
            if count > 0
        )
    corpus_list.append(corpus)
                

Processing file 0
1282 14313
Processing file 1
1567 14313
Processing file 2
1422 14313
Processing file 3
2756 14313
Processing file 4
1690 14313
Processing file 5
1796 14313


In [None]:
from lda import MultiFileVariationalLDA

In [None]:
# One multi-file LDA over all corpora, sharing the global word index.
mf_lda = MultiFileVariationalLDA(
    corpus_list=corpus_list,
    word_index=global_word_index,
    K=300,
    alpha=1,
    eta=0.1,
)

In [None]:
# Run 500 iterations, asking run_vb to initialise first.
mf_lda.run_vb(
    n_its=500,
    initialise=True,
)

In [None]:
import pickle

# Persist the fitted model. Protocol -1 selects the highest (binary) pickle
# protocol, so the file has to be opened in binary mode: text mode 'w' fails
# on Python 3 and can corrupt the data on platforms that translate newlines.
with open('beer_v_urine.lda', 'wb') as model_file:
    pickle.dump(mf_lda, model_file, pickle.HIGHEST_PROTOCOL)


In [None]:
# Wrap the fitted model in its plotter and call multi_alpha with normalisation
# switched on (presumably a plot of the per-file alpha values -- confirm in
# lda_plotters).
from lda_plotters import MultiFileVariationalLDAPlotter
mp = MultiFileVariationalLDAPlotter(mf_lda)
mp.multi_alpha(normalise=True)

Run PCA on the per-file alpha values and plot each file on the first two principal components.

In [None]:
from sklearn.decomposition import PCA

# One row of alpha values per input file.
n_files = len(corpus_list)
al = np.zeros((n_files, mf_lda.K))
for i in range(n_files):
    al[i, :] = mf_lda.individual_lda[i].alpha

# Project the files onto two whitened principal components.
pca = PCA(n_components=2, whiten=True)
pca.fit(al)
X = pca.transform(al)
print(X.shape)

import plotly as plotly
from plotly.graph_objs import *
plotly.offline.init_notebook_mode()

# Label every point with the file name of its MS1 csv.
names = [ms1_path.split('/')[-1] for ms1_path, ms2_path in input_set]

# Red for the first three files and green for the last three, i.e. beer vs.
# urine given the order of input_set.
r = 'rgb(255,0,0)'
g = 'rgb(0,255,0)'
point_colours = [r] * 3 + [g] * 3

data = [
    Scatter(
        x=X[:, 0],
        y=X[:, 1],
        mode='markers',
        text=names,
        marker=dict(color=point_colours),
    ),
]

plotly.offline.iplot({'data': data})

In [None]:
# Positions of the two groups in mf_lda.individual_lda (same order as
# input_set): files 0-2 are the beer samples, files 3-5 the urine samples.
# The groups are hard-coded here rather than looked up from metadata.
have = [0,1,2]
havenot = [3,4,5]

print(have)
print(havenot)
total_found = len(have) + len(havenot)

# One row of alpha values per file. The builtin float is used as dtype because
# the np.float alias was deprecated in NumPy 1.20 and removed in 1.24; the
# resulting array dtype is the same.
almat = np.zeros((len(mf_lda.individual_lda),mf_lda.K),float)
for i,l in enumerate(mf_lda.individual_lda):
    almat[i,:] = l.alpha.copy()

have_mean = almat[have,:].mean(axis=0)
havenot_mean = almat[havenot,:].mean(axis=0)
have_std = almat[have,:].std(axis=0)
havenot_std = almat[havenot,:].std(axis=0)
# Per-topic separation score: absolute difference of the group means divided
# by the sum of the group standard deviations.
# NOTE(review): if both standard deviations are zero for a topic the division
# produces inf/nan, which then feeds the marker sizes and the ranking below.
score = np.abs(have_mean-havenot_mean)/(have_std + havenot_std)

# Scatter of group means per topic; marker size grows with the score.
data = []
data.append(
    Scatter(
        x = have_mean,
        y = havenot_mean,
        mode = 'markers',
        text = [str(i) for i in range(mf_lda.K)],
        marker = dict(
            size = 2*score,
            ),
    )

)
# Reference line y = x from the origin up to the largest "have" mean.
data.append(
    Scatter(
        x = [0,have_mean.max()],
        y = [0,have_mean.max()],
    )
)

layout = Layout(
    xaxis = dict(
        title = 'Mean alpha value in beer',
    ),
    yaxis = dict(
        title = 'Mean alpha value in urine',
    )
)
plotly.offline.iplot({'data':data,'layout':layout})

# Topics ordered from highest to lowest score, as (topic index, score) pairs.
best_topics = zip(range(mf_lda.K),score)
best_topics = sorted(best_topics,key=lambda x: x[1],reverse=True)

# Bar chart of the per-file alpha values for each of the five top topics.
for i in range(5):
    best_topic = best_topics[i][0]
    data = []
    data.append(
        Bar(
            # list(...) keeps x a plain list on Python 3 as well
            x = list(range(len(have) + len(havenot))),
            y = almat[have + havenot,best_topic],
        )
    )

    plotly.offline.iplot({'data':data})
    print(best_topics[i][0])

In [None]:

# For each of the ten best-scoring topics, list its ten most probable words
# together with the running total of their probabilities. The word
# probabilities are read from the first file's beta matrix.
beta_mat = mf_lda.individual_lda[0].beta_matrix.copy()
for i in range(10):
    best_topic, topic_score = best_topics[i][0], best_topics[i][1]
    print("Topic {}, score {}".format(best_topic, topic_score))

    # (word, probability) pairs for this topic, most probable first
    word_probs = sorted(
        ((word, beta_mat[best_topic, mf_lda.word_index[word]])
         for word in mf_lda.word_index),
        key=lambda pair: pair[1],
        reverse=True
    )

    cum_prob = 0.0
    for j in range(10):
        top_word, top_prob = word_probs[j]
        cum_prob += top_prob
        print("\t{}:{} ({})".format(top_word, top_prob, cum_prob))
    print("")
    print("")
    