Demo notebook: loading motifs from the motif database (motifdb) and using them as fixed topics in an MS2LDA analysis.

In [None]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell is executed, so edits to the locally checked-out
# source code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
import os
import sys

# Root directory containing the git checkouts and the Dropbox data used below.
# Defaults to the original author's home directory; set the MS2LDA_ROOT
# environment variable to run this notebook on another machine.
root_path = os.environ.get('MS2LDA_ROOT', '/Users/simon')

# Directory holding one sub-directory of .m2m motif files per motif set.
db_path = os.path.join(root_path, 'git/motifdb/motifs/')

# Make the motifdb utilities and the ms2lda code importable. Guard against
# duplicates so that re-running this cell does not keep growing sys.path.
for code_dir in ('git/motifdb/code/utilities/', 'git/lda/code/'):
    code_path = os.path.join(root_path, code_dir)
    if code_path not in sys.path:
        sys.path.append(code_path)

In [5]:
from motifdb.main import load_db
from ms2lda.loaders import LoadMGF
from ms2lda.feature_maker import MakeBinnedFeatures
from ms2lda.lda_variational import VariationalLDA
from ms2lda.reporting import alpha_report, write_csv
from ms2lda.lda_evaluation import compute_overlap_scores_from_model


Choose the databases to use

In [6]:
# Motif sets to load; each name is a sub-directory of db_path.
db_list = [
    'lf_005',
    'massbank_binned_005',
]

In [7]:
# Load the motif files of every selected set found under db_path.
# db_spectra is later handed to VariationalLDA as fixed_topics and
# db_metadata as fixed_topics_metadata; db_metadata is also indexed by motif
# name when the annotations are written out at the end of the notebook.
db_spectra, db_metadata = load_db(db_list, db_path)

Looking in /Users/simon/git/motifdb/motifs/lf_005/*.m2m
	 Found 64
Looking in /Users/simon/git/motifdb/motifs/massbank_binned_005/*.m2m
	 Found 46
Found total of 110 motif files


In [8]:
# Path of the MGF file with the spectra to analyse, located under root_path.
mgf_file = os.path.join(root_path,
                        'Dropbox/BioResearch/Meta_clustering/MS2LDA/mol_families/spectral_data_ms2_BioCorrected.mgf')

In [9]:
# Create an MGF loader configured with min_ms2_intensity=500.0 (the loader
# reports an MS2 intensity-filtering step when it runs) and load the file.
# Of the three returned objects, ms2 is used for feature extraction and
# metadata is indexed per document to look up its 'featid'.
l = LoadMGF(min_ms2_intensity=500.0)
ms1, ms2, metadata = l.load_spectra([mgf_file])

Filtering MS2 on intensity
708295 MS2 remaining


In [10]:
import csv  # not needed for reading here; a later cell uses it to write CSV

# File listing the feature ids of the molecular family of interest
# (assumed to hold one id per line -- TODO confirm against the data file).
family_file = os.path.join(root_path,
                           'Dropbox/BioResearch/Meta_clustering/MS2LDA/mol_families/Cluster9.txt')

# Each line is taken verbatim as one id, with surrounding whitespace removed.
# The previous version also built a csv.reader that was never consumed; the
# file has always been read line by line, so the unused reader is dropped.
with open(family_file, 'r') as f:
    f_id = [line.strip() for line in f]


In [11]:
# Select the documents whose feature id is listed in the family file.
# sub_ms1 maps document name -> feature id for the selected documents.
# A set is built once so each membership test is a hash lookup instead of a
# scan over the whole f_id list for every document.
family_ids = set(f_id)
sub_ms1 = {}
for doc in metadata:
    fid = metadata[doc]['featid']
    if fid in family_ids:
        sub_ms1[doc] = fid

In [12]:
# Build binned features from the MS2 data using bin_width=0.005
# (presumably the same binning as the '_005' motif sets selected above --
# confirm). Returns the corpus and the list/mapping of features.
mf = MakeBinnedFeatures(bin_width=0.005)
corpus, features = mf.make_features(ms2)

5629 documents
After removing empty words, 70006 words left


In [13]:
# make_features returned a nested mapping; keep only the entry stored under
# its first key (presumably one entry per input file, and a single file was
# loaded -- confirm).
# NOTE: this rebinds `corpus`, so re-running this cell without re-running the
# feature-extraction cell first would index one level deeper than intended.
# next(iter(...)) selects the same first key as keys()[0] did, without
# building the full key list, and also works on Python 3.
corpus = corpus[next(iter(corpus))]

In [14]:
# Restrict the corpus to the documents selected in sub_ms1.
sub_corpus = {doc: corpus[doc] for doc in sub_ms1}

In [15]:
# Create the LDA model on the selected documents only, supplying the database
# motifs and their metadata as fixed topics.
# K=4 is presumably the number of additional free topics learnt alongside the
# fixed ones, and normalise=1000.0 the intensity normalisation target --
# confirm both against VariationalLDA.
vlda = VariationalLDA(sub_corpus, K=4, normalise=1000.0, fixed_topics=db_spectra,
                      fixed_topics_metadata=db_metadata)

Found 6325 unique words
Object created with 69 documents
Normalising intensities


In [16]:
# Fit the model: initialise it, then run the variational updates with
# n_its=100. Progress (change, timing, alpha range) is printed per iteration.
vlda.run_vb(n_its=100, initialise=True)

Initialising
Starting iterations
Iteration 0 (change = 5.71663005709) (0.60673 seconds, I think I'll finish in 1.01121666667 minutes). Alpha: (0.242087480524,8.07709990178)
Iteration 1 (change = 0.98262603863) (0.637357 seconds, I think I'll finish in 1.05163905 minutes). Alpha: (0.116431938415,4.12794092915)
Iteration 2 (change = 1.04009419131) (0.656481 seconds, I think I'll finish in 1.0722523 minutes). Alpha: (0.0735787206882,2.60571529101)
Iteration 3 (change = 0.844097817357) (0.656336 seconds, I think I'll finish in 1.06107653333 minutes). Alpha: (0.0528539841196,1.86208197798)
Iteration 4 (change = 0.745643525454) (0.67038 seconds, I think I'll finish in 1.072608 minutes). Alpha: (0.0408348892997,1.43072454755)
Iteration 5 (change = 0.709392517216) (0.660452 seconds, I think I'll finish in 1.04571566667 minutes). Alpha: (0.0330515265337,1.26176096946)
Iteration 6 (change = 0.658062877183) (0.685611 seconds, I think I'll finish in 1.0741239 minutes). Alpha: (0.0276272172016,1.12

Iteration 57 (change = 0.0166558974189) (0.919944 seconds, I think I'll finish in 0.6592932 minutes). Alpha: (0.00236117047613,0.19517448415)
Iteration 58 (change = 0.0167775887436) (0.817026 seconds, I think I'll finish in 0.5719182 minutes). Alpha: (0.00231578887655,0.194061042151)
Iteration 59 (change = 0.0181920053157) (0.745151 seconds, I think I'll finish in 0.509186516667 minutes). Alpha: (0.00227205248688,0.192912599356)
Iteration 60 (change = 0.0197403478138) (0.755854 seconds, I think I'll finish in 0.503902666667 minutes). Alpha: (0.00222987110492,0.191709912244)
Iteration 61 (change = 0.0189630053101) (0.754232 seconds, I think I'll finish in 0.4902508 minutes). Alpha: (0.00218915807049,0.190416445301)
Iteration 62 (change = 0.0165703007965) (0.784406 seconds, I think I'll finish in 0.496790466667 minutes). Alpha: (0.00214984149736,0.189070878931)
Iteration 63 (change = 0.0161564071938) (0.793688 seconds, I think I'll finish in 0.489440933333 minutes). Alpha: (0.00211185603

In [17]:
# Compute overlap scores from the fitted model, then print a report that
# lists motifs followed by documents and their scores, using
# overlap_thresh=0.3.
overlap_scores = compute_overlap_scores_from_model(vlda)
alpha_report(vlda, overlap_scores=overlap_scores, overlap_thresh=0.3)

lf_motif_199.m2m 311 293 265 283 312 275 247 371 237 251 0.151556311274
	document_4177 0.885347032205
	document_4176 0.867388270015
	document_3996 0.827479343602
	document_3994 0.825391676347
	document_3808 0.821016948261
	document_3172 0.806010797725
	document_3003 0.795310354009
	document_2638 0.735063523151
	document_3995 0.733186433598
	document_3851 0.716059221541
	document_3780 0.672404058037
	document_3718 0.61556797497
	document_4088 0.614578945467
	document_2787 0.602181675632
	document_3861 0.599656305135
	document_4029 0.59287058424
	document_1586 0.568964961178
	document_3679 0.56715999931
	document_2785 0.543080932347
	document_3164 0.539710836112
	document_3247 0.519292113059
	document_4258 0.516413965002
	document_3712 0.500621242505
	document_4470 0.479681357507
	document_2559 0.478421875312
	document_4878 0.455368075441
	document_2818 0.372407452576
	document_2834 0.315345248741
	document_4471 0.312923806082
lf_motif_491.m2m 251 253 211 249 225 207 159 209 173 0.129166

In [19]:
# Write the results to cluster9_with_motifdb.csv with p_thresh=0 and
# o_thresh=0.3. The return value is an iterable of motif names, which is used
# afterwards to write the motif annotation file.
used_motifs = write_csv(vlda, overlap_scores, 'cluster9_with_motifdb.csv', metadata, p_thresh=0,
                        o_thresh=0.3)

In [21]:
# Write a summary file for the motifs returned by write_csv: one row per motif
# holding its name, short annotation and full annotation.
with open('cluster9_usedmotifs.csv', 'w') as f:
    writer = csv.writer(f, dialect='excel')
    heads = ['name', 'short_annotation', 'annotation']
    writer.writerow(heads)
    for motif in used_motifs:
        if motif not in db_metadata:
            # No entry in db_metadata for this motif: both annotations are 'None'.
            newrow = [motif, 'None', 'None']
        else:
            newrow = [motif]
            for field in ('SHORT_ANNOTATION', 'ANNOTATION'):
                # Falsy annotation values are written as the string 'None'.
                newrow.append(db_metadata[motif][field] or 'None')
        writer.writerow(newrow)