# Converts from the old LDA project into the new dict format

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import gzip
import os
import cPickle
import pickle
import csv

import sys
sys.path.append('/Users/joewandy/git/MS2LDA/')
sys.path.append('/Users/joewandy/git/lda/code')

from lda import VariationalLDA

## Define some helper functions

In [2]:
def load_proj(project_in):
    """Load a pickled project object from a gzip-compressed file.

    project_in -- path of the gzip-compressed pickle to read.

    Returns the object that cPickle reconstructs from the file contents.
    """
    # NOTE: unpickling can execute arbitrary code; only open trusted files.
    with gzip.GzipFile(project_in, 'rb') as handle:
        return cPickle.load(handle)

In [3]:
def copy_mat_to_dict(mat, row_labels, col_labels):
    """Convert a 2-D matrix into a nested dict of its positive entries.

    Args:
        mat: 2-D array-like exposing ``shape`` and ``mat[i, j]`` indexing.
        row_labels: sequence holding one label per row of ``mat``.
        col_labels: sequence holding one label per column of ``mat``.

    Returns:
        dict mapping each row label to a dict ``{col_label: value}`` that
        contains only the entries of that row strictly greater than zero.
        A row without positive entries maps to an empty dict.

    Raises:
        AssertionError: if the number of labels does not match ``mat.shape``.

    Side effect: prints "<kept entries> / <total entries>" as a sparsity
    report.
    """
    n_row, n_col = mat.shape
    assert n_row == len(row_labels)
    assert n_col == len(col_labels)

    result = {}
    nnz = 0
    for i, row_label in enumerate(row_labels):
        dist = {}
        for j, col_label in enumerate(col_labels):
            val = mat[i, j]
            if val > 0:
                dist[col_label] = val
                nnz += 1
        result[row_label] = dist

    # single-argument parenthesised form prints identically on Python 2 and 3
    print('%d / %d' % (nnz, n_row * n_col))

    # the nested dict was previously built but never handed back to the caller
    return result

In [4]:
def convert_proj_to_dict(proj, min_prob_to_keep_beta, min_prob_to_keep_theta):
    """Convert an old-style LDA project object into the new dict format.

    Args:
        proj: loaded project object. This function reads ``proj.ms1``
            (iterated with ``iterrows()``; each row must provide 'mz', 'rt',
            'intensity' and 'peakID'), ``proj.vocab``, ``proj.df.values``,
            ``proj.model.K`` and ``proj.model.posterior_alpha``; it calls
            ``proj.do_thresholding(...)`` and then reads ``proj.topic_word``
            and ``proj.doc_topic``; it calls
            ``proj.get_motif_contributions(peak_id)`` once per document.
        min_prob_to_keep_beta: passed to ``do_thresholding`` as
            ``th_topic_word``.
        min_prob_to_keep_theta: passed to ``do_thresholding`` as
            ``th_doc_topic``.

    Returns:
        dict with the keys 'corpus', 'word_index', 'doc_index', 'K', 'alpha',
        'beta', 'theta', 'phi' and 'doc_metadata'.

    Raises:
        AssertionError: if the shape of ``proj.df.values`` does not match the
            number of documents in ``proj.ms1`` and the length of
            ``proj.vocab``.
    """
    # build metadata
    metadata = {}
    docs = []
    parent_peak_ids = []
    for _, row in proj.ms1.iterrows():
        mz = row['mz']
        rt = row['rt']
        pid = row['peakID']

        # each document is keyed by "<mz>_<rt>" of its MS1 row
        # NOTE(review): rows sharing the same mz and rt would overwrite each
        # other in the dicts below -- confirm the pairs are unique.
        key = '%s_%s' % (mz, rt)
        docs.append(key)
        parent_peak_ids.append(pid)
        metadata[key] = {
            'parentmass': mz,
            'rt': rt,
            'intensity': row['intensity'],
            'id': pid,
        }

    # build corpus: per document, the words with a positive value
    vocab = proj.vocab
    mat = proj.df.values
    n_docs, n_words = mat.shape
    assert n_docs == len(docs)
    assert n_words == len(vocab)

    corpus = {}
    for d, doc_id in enumerate(docs):
        doc = {}
        for n in range(n_words):
            val = mat[d, n]
            if val > 0:
                doc[vocab[n]] = val
        corpus[doc_id] = doc

    K = proj.model.K
    alpha = proj.model.posterior_alpha.tolist()

    # build the doc index and the word index (label -> position)
    doc_index = {doc_id: d for d, doc_id in enumerate(docs)}
    word_index = {vocab[n]: n for n in range(n_words)}

    proj.do_thresholding(th_topic_word=min_prob_to_keep_beta,
                         th_doc_topic=min_prob_to_keep_theta)

    # One distinct name per topic. The '%' substitution must be applied here:
    # without it every topic gets the literal label 'motif_%d' and all topics
    # collapse onto a single dict key.
    topic_names = ['motif_%d' % k for k in range(K)]

    # create beta
    print('Beta')
    beta = copy_mat_to_dict(proj.topic_word, topic_names, vocab)

    # create theta
    print('Theta')
    theta = copy_mat_to_dict(proj.doc_topic, docs, topic_names)

    # create phi: same structure as the motif contributions, with the topic
    # keys renamed to 'motif_<topic>'
    print('Phi')
    phi = {}
    for doc_id, pid in zip(docs, parent_peak_ids):
        contrib = proj.get_motif_contributions(pid)
        new_contrib = {}
        for word in contrib:
            word_contrib = contrib[word]
            new_contrib[word] = {
                'motif_%d' % topic: word_contrib[topic]
                for topic in word_contrib
            }
        phi[doc_id] = new_contrib

    # create the final dict
    lda_dict = {
        'corpus': corpus,
        'word_index': word_index,
        'doc_index': doc_index,
        'K': K,
        'alpha': alpha,
        'beta': beta,
        'theta': theta,
        'phi': phi,
        'doc_metadata': metadata,
    }
    return lda_dict

In [5]:
def load_topic_metadata(filename):
    """Read topic annotations from a CSV file.

    Each non-empty row is read as ``<integer topic id>, <annotation>``;
    whitespace following a delimiter is ignored and columns beyond the second
    are not used.

    Args:
        filename: path of the CSV file.

    Returns:
        dict mapping 'motif_<id>' to the annotation string of that topic.

    Raises:
        ValueError: if the first column of a row is not an integer.
        IndexError: if a non-empty row has fewer than two columns.
    """
    topic_metadata = {}
    # the context manager closes the handle, which was previously leaked
    with open(filename) as f:
        for item in csv.reader(f, skipinitialspace=True):
            if not item:
                # csv.reader yields [] for blank lines; skip them
                continue
            topic_name = 'motif_%d' % int(item[0])
            topic_metadata[topic_name] = item[1]
    return topic_metadata

In [6]:
def save_dict(lda_dict, filename):
    """Pickle ``lda_dict`` to ``filename`` and print a confirmation.

    Args:
        lda_dict: the object to serialise.
        filename: destination path; an existing file is overwritten.
    """
    # Protocol -1 selects the highest (binary) pickle protocol, so the file
    # has to be opened in binary mode; text mode can corrupt the data on
    # platforms that translate newlines and fails outright on Python 3.
    with open(filename, 'wb') as f:
        pickle.dump(lda_dict, f, -1)
    print('Saved to %s' % filename)

## Start converting

In [7]:
def convert(proj_dir, proj_file, annot_file, out_file,
            min_prob_to_keep_beta=1e-3, min_prob_to_keep_theta=1e-2):
    """Convert one project file into the dict format and save it.

    Loads the project and its topic annotations from ``proj_dir``, converts
    the project, stores the annotations under the 'topic_metadata' key and
    pickles the resulting dict into ``proj_dir``.

    Args:
        proj_dir: directory containing the input files; the output is written
            here as well.
        proj_file: name of the project file inside ``proj_dir``.
        annot_file: name of the topic annotation CSV inside ``proj_dir``.
        out_file: name of the output file to create inside ``proj_dir``.
        min_prob_to_keep_beta: threshold forwarded to the conversion for the
            topic-word values (default 1e-3).
        min_prob_to_keep_theta: threshold forwarded to the conversion for the
            document-topic values (default 1e-2).
    """
    proj = load_proj(os.path.join(proj_dir, proj_file))
    topic_metadata = load_topic_metadata(os.path.join(proj_dir, annot_file))

    lda_dict = convert_proj_to_dict(proj, min_prob_to_keep_beta,
                                    min_prob_to_keep_theta)
    lda_dict['topic_metadata'] = topic_metadata

    save_dict(lda_dict, os.path.join(proj_dir, out_file))

In [8]:
# Directory holding the input project/annotation files; the converted dicts
# are written here too. The path is machine-specific -- adjust before running.
proj_dir = '/Users/joewandy/Dropbox/MS2LDA Manuscript Sections/Supporting Information/'

In [9]:
# Convert the Beer1 POS-mode project: (project file, annotation CSV, output)
proj_file, annot_file, out_file = (
    'Manuscript_Beer1POSmode_EFassigner_ALLextended.project',
    'beer1pos_annotation_Nov2015.csv',
    'beer1pos.dict',
)
convert(proj_dir, proj_file, annot_file, out_file)

Beta
9849 / 1271100
Theta
6483 / 384600
Phi
Saved to /Users/joewandy/Dropbox/MS2LDA Manuscript Sections/Supporting Information/beer1pos.dict


In [10]:
# Convert the Beer2 POS-mode project: (project file, annotation CSV, output)
proj_file, annot_file, out_file = (
    'Manuscript_Beer2POSmode_EFassigner_ALLextended.project',
    'beer2pos_annotation_Nov2015.csv',
    'beer2pos.dict',
)
convert(proj_dir, proj_file, annot_file, out_file)

Beta
11823 / 1492500
Theta
9086 / 470100
Phi
Saved to /Users/joewandy/Dropbox/MS2LDA Manuscript Sections/Supporting Information/beer2pos.dict


In [11]:
# Convert the Beer3 POS-mode project: (project file, annotation CSV, output)
proj_file, annot_file, out_file = (
    'Manuscript_Beer3POSmode_EFassigner_ALLextended.project',
    'beer3pos_annotation_Nov2015.csv',
    'beer3pos.dict',
)
convert(proj_dir, proj_file, annot_file, out_file)

Beta
10035 / 1348800
Theta
7904 / 426600
Phi
Saved to /Users/joewandy/Dropbox/MS2LDA Manuscript Sections/Supporting Information/beer3pos.dict


In [12]:
# Convert the BeerQC POS-mode project; its annotation file and output use
# the 'beer4pos' name.
proj_file, annot_file, out_file = (
    'Manuscript_BeerQCPOSmode_EFassigner_ALLextended.project',
    'beer4pos_annotation_Nov2015.csv',
    'beer4pos.dict',
)
convert(proj_dir, proj_file, annot_file, out_file)

Beta
10924 / 1275000
Theta
7263 / 408900
Phi
Saved to /Users/joewandy/Dropbox/MS2LDA Manuscript Sections/Supporting Information/beer4pos.dict
