In [1]:
!which python

/home/rsilva/miniconda3/envs/chemwalker/bin/python


In [2]:
import pandas as pd
import collections
import time
from chemwalker.rwalker import *
from chemwalker.gnps import Proteosafe
from rdkit import Chem
import json
import os
import requests                                                                
import numpy as np
import networkx as nx
import subprocess

def val_known(G, p_t, otabgnps, tlid):
    """Rank each known structure among its candidates, by list order and by walk probability.

    Parameters
    ----------
    G : graph-like
        Object whose ``nodes()`` yields string labels of the form
        ``'<cluster index>_<Identifier>'``.
    p_t : array-like
        Object with ``tolist()`` giving one probability per node, in the same
        order as ``G.nodes()``.
    otabgnps : pandas.DataFrame
        Table with columns ``'cluster.index'`` and ``'InChIKey1'``; rows whose
        ``'InChIKey1'`` is the empty string are ignored.
    tlid : pandas.DataFrame
        Candidate table with columns ``'cluster.index'``, ``'InChIKey1'`` and
        ``'Identifier'``.

    Returns
    -------
    list of dict
        One ``{'cluster index', 'metfrag', 'rw'}`` entry per known structure
        found both in ``tlid`` and in the graph. ``'metfrag'`` is the 0-based
        position of the structure among the ``tlid`` rows of its cluster and
        ``'rw'`` its 0-based position after sorting the cluster's graph nodes
        by decreasing probability.
    """
    # One row per graph node; the label is split once into cluster and candidate id.
    dprob = pd.DataFrame(list(zip(G.nodes(), p_t.tolist())), columns=['node', 'prob'])
    label_parts = dprob['node'].str.split('_')
    dprob['cluster index'] = label_parts.str[0].astype(int)
    dprob['Identifier'] = label_parts.str[1]
    dprob = dprob.sort_values(['cluster index'])

    lrank = []
    for idx in otabgnps.index:
        known_key = otabgnps.loc[idx, 'InChIKey1']
        if known_key == '':
            continue
        cluster = otabgnps.loc[idx, 'cluster.index']

        # Candidates of this cluster, renumbered 0..n-1 so that positions found
        # below can be used directly as row labels. A new frame is built instead
        # of mutating the filtered slice in place.
        cands = tlid[tlid['cluster.index'] == cluster].reset_index(drop=True)
        if len(cands) == 0:
            continue

        # Same cluster's graph nodes, best probability first.
        walk = dprob[dprob['cluster index'] == cluster].sort_values(['prob'], ascending=False)

        mp = np.flatnonzero((cands['InChIKey1'] == known_key).to_numpy())
        if len(mp) == 0:
            continue
        known_id = cands.loc[mp[0], 'Identifier']

        rp = np.flatnonzero((walk['Identifier'] == known_id).to_numpy())
        if len(rp) == 0:
            continue
        lrank.append({'cluster index': cluster, 'metfrag': mp[0], 'rw': rp[0]})
    return lrank

In [3]:
# Task identifier handed to Proteosafe together with the 'nap' workflow name.
taskid = '29d517e67067476bae97a32f2d4977e0'

# Load the NAP results and keep the three attributes used by the cells below:
#   net     - edge table; its 'V1'/'V2' columns hold node ids and 'V7' is used to
#             select a connected component
#   tabgnps - per-node table with 'cluster.index', 'INCHI', ... columns
#   lid     - sequence of candidate lists, indexed with the row labels of tabgnps
# NOTE(review): what get_nap() downloads is not visible here - confirm in chemwalker.gnps.
nap_result = Proteosafe(taskid, 'nap')
nap_result.get_nap()
net = nap_result.net
tabgnps = nap_result.tabgnps
lid = nap_result.lid

In [4]:
net.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7
0,252,253,0.0,0,0.817927,0.827126,1
1,147,177,15.9772,0,0.88719,0.796447,1
2,151,166,1.94278,0,0.897161,0.852854,1
3,254,274,156.101,0,0.864915,0.799283,1
4,166,246,0.0,0,0.846493,0.731594,1


In [5]:
# Distinct values of net['V7'] (the column used later to select one component).
# NOTE(review): a set has no defined order, so nset[0] is not guaranteed to be
# the smallest component id - consider sorted(...) if that matters.
nset = list(set(net['V7']))
nset[0]

1

In [6]:
# Select connected component
ns = nset[0]
snet = net[net['V7']==ns]

# Obtain the nodes in the component
nds = list(set(snet['V1'].tolist()+snet['V2'].tolist()))

# Subselect fields of interest (explicit copy: a column is added below)
otabgnps = tabgnps.loc[tabgnps['cluster.index'].isin(nds),
                       ['cluster.index', 'parent.mass', 'RTMean',
                        'LibraryID', 'Smiles', 'INCHI' ]].copy()

# Obtain InChIKey for structures present in GNPS
inchikey = [Chem.InchiToInchiKey(x) if isinstance(x, str) else '' for x in otabgnps['INCHI']]

# Record the first block of InChIKey
otabgnps['InChIKey1'] = [x.split('-')[0] if isinstance(x, str) else '' for x in inchikey]

# copy table to modify the copy
stabgnps = otabgnps.copy()

# Define fixed number of seeds to start walk, mostly for validation
seed_frac = 0.1 # 10% of seeds in a given connected component
seed_ctr = int(np.ceil(sum(otabgnps['InChIKey1']!='')*seed_frac))

# Remove known structures for validation: only the first seed_ctr known
# structures keep their InChIKey1 and therefore act as seeds
exid = np.where(stabgnps['InChIKey1']!='')[0]
exid = stabgnps.index[list(exid[seed_ctr:])]
stabgnps.loc[exid, 'InChIKey1'] = ''

# Build one candidate frame per node, skipping empty entries of the
# NAP-MetFrag result list. For seed nodes where MetFrag recovered the correct
# structure among the candidates, keep only that candidate.
# The seed filter runs in the same loop that builds the frames: filtering
# afterwards by position would pair frames with the wrong node (or raise
# IndexError) as soon as one empty entry had been skipped.
nidx = stabgnps.index
tlid_frames = []
for x in nidx:
    if 'x' in lid[x]:
        continue
    tmp = pd.DataFrame(lid[x])
    tmp['cluster.index'] = stabgnps.loc[x, 'cluster.index']
    seed_key = stabgnps.loc[x, 'InChIKey1']
    if seed_key != '':
        is_seed_structure = tmp['InChIKey1'] == seed_key
        if is_seed_structure.any():
            tmp = tmp[is_seed_structure]
    tlid_frames.append(tmp)

tlid = pd.concat(tlid_frames)
tlid.head()

Unnamed: 0,Score,InChI,FragmenterScore_Values,MaximumTreeDepth,SmilesOfExplPeaks,MonoisotopicMass,Identifier,MolecularFormula,SMILES,FormulasOfExplPeaks,InChIKey2,InChIKey1,FragmenterScore,ExplPeaks,NoExplPeaks,NumberPeaksUsed,cluster.index
4,0.7499762,InChI=1S/C21H35N9O4/c22-14(12-13-6-2-1-3-7-13)...,405.0;1672.0;753.0;610.0;348.0;305.0;696.0;961...,2,60.060001:[N]=C([N])[N];70.07:[c]([c])[C][C][N...,477.2812,1092690,C21H35N9O4,C1=CC=C(C=C1)CC(C(=O)NC(CCCN=C(N)N)C(=O)NC(CCC...,60.060001:[CH4N3+H]+H+;70.07:[C4H6N+H]+H+;112....,UHFFFAOYSA,LZDIENNKWVXJMX,126.24255,60.060001_199.1;70.07_751.3;112.089996_256.7;1...,12,16,84
0,1.0,InChI=1S/C12H25N5O3/c1-3-7(2)9(13)10(18)17-8(1...,853.0,2,175.100006:[C]([C][C])[C](C(=O)[O])N=C([C][N])[O],287.1957,1078270,C12H25N5O3,CCC(C)C(C(=O)NC(CCCN=C(N)N)C(=O)O)N,175.100006:[C7H12N2O3+2H]+H+,CIUDSAMLSA,HYXQKVOADYPQEA,68.2337,175.100006_999.0,1,12,145
4,0.8605754,InChI=1S/C13H24N6O6/c1-6(14)10(22)19-8(5-9(20)...,1556.0;853.0;665.0;448.0,2,175.100006:[C]([C])(C(=N[C]([C][C][O])[C][O])[...,360.1757,1078968,C13H24N6O6,CC(C(=O)NC(CC(=O)O)C(=O)NC(CCCN=C(N)N)C(=O)O)N,175.100006:[C7H12N2O3+2H]+H+;212.100006:[C8H14...,UHFFFAOYSA,PBAMJJXWDQXOJA,89.36803,175.100006_999.0;212.100006_52.5;255.100006_45...,4,18,146
22,0.53489119,InChI=1S/C13H24N6O6S/c14-6(5-26)10(22)19-8(4-9...,305.0;620.0,2,158.100006:[C]([C][C][N]C(=[N])[N])[C]C(=O)[O]...,392.1478,1078983,C13H24N6O6S,C(CC(C(=O)O)NC(=O)C(CC(=O)O)NC(=O)C(CS)N)CN=C(N)N,158.100006:[C6H12N3O2]+;273.100006:[C10H15N3O6]+,UHFFFAOYSA,FWYBFUDWUUFLDN,59.085664,158.100006_446.4;273.100006_105.3,2,20,147
4,0.431212,InChI=1S/C15H31N9O4S/c16-8(7-29)11(25)23-9(3-1...,753.0;808.0;753.0,2,60.0:[C][C][S];115.099998:[C]([C][C][N]C(=[N])...,433.222,1079098,C15H31N9O4S,C(CC(C(=O)NC(CCCN=C(N)N)C(=O)O)NC(=O)C(CS)N)CN...,60.0:[C2H2S+H]+H+;115.099998:[C5H11N3+H]+H+;34...,UHFFFAOYSA,UKVGHFORADMBEN,22.376175,60.0_235.4;115.099998_571.1;340.200012_14.1,3,22,149


In [39]:
start = time.time()

# Define fingerprint method
method = 'RDKit7-linear'

# Obtain pairwise similarity
scandpair = cand_pair(snet, tlid, method)

# Transform the edge list in a graph
G = nx.Graph()
edge_list = scandpair.apply(lambda a: tuple(a), axis=1).tolist()
G.add_weighted_edges_from(edge_list)

# Create seed list for nodes present in the graph: every node whose label
# starts with '<cluster index>_' for a cluster that kept its InChIKey1.
# A plain prefix test replaces the regex, so no `re` import is needed.
glib = stabgnps.loc[stabgnps['InChIKey1']!='', 'cluster.index'].tolist()
source = []
for g in glib:
    prefix = '%s_' % g
    source.extend([x for x in G.nodes() if x.startswith(prefix)])

p_t = random_walk(G, source)

end = time.time()
print('Time for method %s for component %s with %s nodes and %s edges.' % (method, ns, len(nds), snet.shape[0] ))
print(end - start)

Time for method RDKit7-linear for component 1 with 87 nodes and 346 edges.
2637.3210248947144


Process ForkPoolWorker-2:
Process ForkPoolWorker-3:
Process ForkPoolWorker-4:
Process ForkPoolWorker-1:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):


In [47]:
lr1 = val_known(G, p_t, otabgnps, tlid)
lr1

[{'cluster index': 84, 'metfrag': 0, 'rw': 0},
 {'cluster index': 145, 'metfrag': 0, 'rw': 0},
 {'cluster index': 146, 'metfrag': 0, 'rw': 0},
 {'cluster index': 147, 'metfrag': 0, 'rw': 0},
 {'cluster index': 149, 'metfrag': 0, 'rw': 0},
 {'cluster index': 150, 'metfrag': 0, 'rw': 0},
 {'cluster index': 151, 'metfrag': 0, 'rw': 0},
 {'cluster index': 152, 'metfrag': 0, 'rw': 0},
 {'cluster index': 153, 'metfrag': 0, 'rw': 0},
 {'cluster index': 155, 'metfrag': 1, 'rw': 0},
 {'cluster index': 156, 'metfrag': 36, 'rw': 32},
 {'cluster index': 158, 'metfrag': 1, 'rw': 0},
 {'cluster index': 166, 'metfrag': 8, 'rw': 2},
 {'cluster index': 167, 'metfrag': 3, 'rw': 0},
 {'cluster index': 168, 'metfrag': 5, 'rw': 5},
 {'cluster index': 169, 'metfrag': 12, 'rw': 0},
 {'cluster index': 170, 'metfrag': 16, 'rw': 61},
 {'cluster index': 171, 'metfrag': 12, 'rw': 2},
 {'cluster index': 172, 'metfrag': 26, 'rw': 0},
 {'cluster index': 173, 'metfrag': 7, 'rw': 2},
 {'cluster index': 174, 'metfrag':

In [48]:
len(lr1)

87

In [59]:
tabgnps[tabgnps['cluster.index'].isin(nds)]

Unnamed: 0,cluster.index,number.of.spectra,parent.mass,precursor.charge,precursor.mass,sum.precursor.intensity.,G1,G2,G3,G4,...,RTStdErr,ProteoSAFeClusterLink,UniqueFileSourcesCount,EvenOdd,LibraryID,NumberOrganismIDs,AllOrganisms,SpectrumID,Smiles,INCHI
82,84,1,478.289,1,478.289,0,1,0,0,0,...,0,http://gnps.ucsd.edu//ProteoSAFe/result.jsp?ta...,1,1,Phe-Arg-Arg,1,PRIVATE-USER;,CCMSLIB00003156370,C1=CC=C(C=C1)CC(C(=O)NC(CCCN=C(N)N)C(=O)NC(CCC...,InChI=1S/C21H35N9O4/c22-14(12-13-6-2-1-3-7-13)...
143,145,1,288.203,1,288.203,0,1,0,0,0,...,0,http://gnps.ucsd.edu//ProteoSAFe/result.jsp?ta...,1,1,Ile-Arg,1,PRIVATE-USER;,CCMSLIB00003155371,CCC(C)C(C(=O)NC(CCCN=C(N)N)C(=O)O)N,InChI=1S/C12H25N5O3/c1-3-7(2)9(13)10(18)17-8(1...
144,146,1,361.183,1,361.183,0,1,0,0,0,...,0,http://gnps.ucsd.edu//ProteoSAFe/result.jsp?ta...,1,0,Ala-Asp-Arg,1,PRIVATE-USER;,CCMSLIB00003155402,CC(C(=O)NC(CC(=O)O)C(=O)NC(CCCN=C(N)N)C(=O)O)N,InChI=1S/C13H24N6O6/c1-6(14)10(22)19-8(5-9(20)...
145,147,1,393.155,1,393.155,0,1,0,0,0,...,0,http://gnps.ucsd.edu//ProteoSAFe/result.jsp?ta...,1,1,Cys-Asp-Arg,1,PRIVATE-USER;,CCMSLIB00003155403,C(CC(C(=O)O)NC(=O)C(CC(=O)O)NC(=O)C(CS)N)CN=C(N)N,InChI=1S/C13H24N6O6S/c14-6(5-26)10(22)19-8(4-9...
147,149,1,434.229,1,434.229,0,1,0,0,0,...,0,http://gnps.ucsd.edu//ProteoSAFe/result.jsp?ta...,1,1,Cys-Arg-Arg,1,PRIVATE-USER;,CCMSLIB00003155406,C(CC(C(=O)NC(CCCN=C(N)N)C(=O)O)NC(=O)C(CS)N)CN...,InChI=1S/C15H31N9O4S/c16-8(7-29)11(25)23-9(3-1...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273,275,1,276.167,1,276.167,0,1,0,0,0,...,0,http://gnps.ucsd.edu//ProteoSAFe/result.jsp?ta...,1,1,Thr-Arg,1,PRIVATE-USER;,CCMSLIB00003165008,CC(C(C(=O)NC(CCCN=C(N)N)C(=O)O)N)O,InChI=1S/C10H21N5O4/c1-5(16)7(11)8(17)15-6(9(1...
275,277,1,338.182,1,338.182,0,1,0,0,0,...,0,http://gnps.ucsd.edu//ProteoSAFe/result.jsp?ta...,1,1,Tyr-Arg,1,PRIVATE-USER;,CCMSLIB00003156310,C1=CC(=CC=C1CC(C(=O)NC(CCCN=C(N)N)C(=O)O)N)O,InChI=1S/C15H23N5O4/c16-11(8-9-3-5-10(21)6-4-9...
276,278,1,329.193,1,329.193,0,1,0,0,0,...,0,http://gnps.ucsd.edu//ProteoSAFe/result.jsp?ta...,1,0,Gly-Pro-Arg,1,PRIVATE-USER;,CCMSLIB00003164980,C1CC(N(C1)C(=O)CN)C(=O)NC(CCCN=C(N)N)C(=O)O,InChI=1S/C13H24N6O4/c14-7-10(20)19-6-2-4-9(19)...
277,279,1,274.187,1,274.187,0,1,0,0,0,...,0,http://gnps.ucsd.edu//ProteoSAFe/result.jsp?ta...,1,1,Val-Arg,1,PRIVATE-USER;,CCMSLIB00003165009,CC(C)C(C(=O)NC(CCCN=C(N)N)C(=O)O)N,InChI=1S/C11H23N5O3/c1-6(2)8(12)9(17)16-7(10(1...


In [7]:
# Adduct lookup tables: adduct label -> [integer code, mass].
# run_metfrag writes element [0] to the 'PrecursorIonMode' parameter and
# subtracts element [1] from the spectrum's pepmass to obtain the neutral
# precursor mass.
precursor_ion_mode_positive = {
                             '[M+H]+': [1, 1.007276],
                             '[M+NH4]+': [18, 18.033823],
                             '[M+Na]+': [23, 22.989218],
                             '[M+K]+': [39, 38.963158],
                             '[M+CH3OH+H]+': [33, 33.033489],
                             '[M+ACN+H]+': [42, 42.033823],
                             '[M+ACN+Na]+': [64, 64.015765],
                             '[M+2ACN+H]+': [83, 83.060370],
                             '[M]+': [0, 0.0]
}
# Same layout for negative ion mode.
precursor_ion_mode_negative = {
                              '[M-H]-': [-1, -1.007276], 
                              '[M+Cl]-': [35, 34.969402],
                              '[M+HCOO]-': [45, 44.998201],
                              '[M+CH3COO]-': [59, 59.013851],
                              '[M]-': [0, 0.0]
}

# Template of MetFrag command-line parameters. It is written out one
# 'key = value' line per entry; the empty / zero placeholders are filled in
# per spectrum before the file is written.
metfrag_param = {
                # data file containing mz intensity peak pairs (one per line)
                'PeakListPath': '',
                # database parameters -> how to retrieve candidates
                'MetFragDatabaseType': 'LocalPSV',
                'LocalDatabasePath': '',
                #NeutralPrecursorMolecularFormula = C9H11Cl3NO3PS
                #DatabaseSearchRelativeMassDeviation = PPM
                'NeutralPrecursorMass': 0,
                #IonizedPrecursorMass = 349.93356
                # peak matching parameters
                'FragmentPeakMatchAbsoluteMassDeviation': 0.01,
                'FragmentPeakMatchRelativeMassDeviation': 5,
                'PrecursorIonMode': 1,
                'IsPositiveIonMode': True,
                # scoring parameters
                'MetFragScoreTypes': 'FragmenterScore',
                'MetFragScoreWeights': 1.0,
                # output
                # SDF, XLS, CSV, ExtendedXLS, ExtendedFragmentsXLS
                'MetFragCandidateWriter': 'FragmentSmilesPSV',
                'SampleName': '',
                'ResultsPath': '.',
                # following parameters can be kept as they are
                'MaximumTreeDepth': 2,
                'MetFragPreProcessingCandidateFilter': 'UnconnectedCompoundFilter',
                'MetFragPostProcessingCandidateFilter': 'InChIKeyFilter',
                'NumberThreads': 6
}

In [8]:
precursor_ion_mode_positive['[M+H]+'][1]

1.007276

In [9]:
# https://stackoverflow.com/questions/11892623/stringio-and-compatibility-with-with-statement-context-manager
from pyteomics import mgf
base_url = 'http://dorresteinappshub.ucsd.edu:5001/NAPviewer/static/downloads/{0}/{1}'
url_to_spectra = base_url.format(*[taskid, 'allspectra.mgf'])

# Fail loudly on HTTP errors or a stalled connection instead of writing an
# error page to disk and parsing it as MGF.
response = requests.get(url_to_spectra, timeout=60)
response.raise_for_status()

# Stage the download in a local file and read every spectrum from it.
with open('tmp.mgf', 'w+') as f:
    f.write(response.text)

with mgf.read('tmp.mgf') as reader:
    spectra = list(reader)

In [10]:
len(spectra)

610

In [11]:
spectra[83]

{'params': {'pepmass': (478.28851, None),
  'charge': [1],
  'mslevel': '2',
  'filename': 'spec-00000.mgf',
  'instrument': 'ion trap',
  'title': 'Scan Number: 84',
  'scans': '84'},
 'm/z array': array([ 60.060001,  70.07    , 112.089996, 115.089996, 116.07    ,
        120.080002, 158.089996, 175.119995, 190.100006, 228.100006,
        245.130005, 254.160004, 287.149994, 297.170013, 305.160004,
        322.190002]),
 'intensity array': array([199.100006, 751.349976, 256.73999 , 328.769989, 199.899994,
        999.      , 185.309998, 440.459991,  96.5     ,  60.740002,
         90.510002,  74.529999, 189.910004,  49.150002,  47.549999,
        154.149994]),
 'charge array': masked_array(data=[--, --, --, --, --, --, --, --, --, --, --, --, --, --,
                    --, --],
              mask=[ True,  True,  True,  True,  True,  True,  True,  True,
                     True,  True,  True,  True,  True,  True,  True,  True],
        fill_value=0,
             dtype=int64)}

In [12]:
# Turn the peaks of the spectrum at position 83 into "m/z<TAB>intensity" lines
# and dump them to the peak-list file.
spec = [
    '%s\t%s\n' % (mz, intensity)
    for mz, intensity in zip(spectra[83]['m/z array'], spectra[83]['intensity array'])
]

with open('example_data.txt', '+w') as f:
    f.writelines(spec)

In [13]:
spectra[83]['params']['pepmass'][0]

478.28851

In [14]:
db = pd.read_csv('validation_db.psv', sep='|')
db.head()

Unnamed: 0,MonoisotopicMass,InChI,SMILES,Identifier,InChIKey2,InChIKey1,MolecularFormula,kingdom_name,superclass_name,class_name,subclass_name
0,194.05791,InChI=1/C10H10O4/c1-14-9-6-7(2-4-8(9)11)3-5-10...,COC1=C(C=CC(=C1)C=CC(=O)O)O,CCMSLIB00000220714,UHFFFAOYSA,KSEBMYQBYZTDHS,C10H10O4,Organic compounds,Phenylpropanoids and polyketides,Cinnamic acids and derivatives,Hydroxycinnamic acids and derivatives
1,176.09496,InChI=1/C10H12N2O/c11-4-3-7-6-12-10-2-1-8(13)5...,C1=CC2=C(C=C1O)C(=CN2)CCN,CCMSLIB00000220735,UHFFFAOYSA,QZAYGJVTTNCVMB,C10H12N2O,Organic compounds,Organoheterocyclic compounds,Indoles and derivatives,Tryptamines and derivatives
2,176.04734,InChI=1/C10H8O3/c1-6-4-10(12)13-9-5-7(11)2-3-8...,CC1=CC(=O)OC2=C1C=CC(=C2)O,CCMSLIB00000221773,UHFFFAOYSA,HSHNITRMYYLLCV,C10H8O3,,,,
3,345.07864,"InChI=1/C12H17N4O4PS/c1-8-11(3-4-20-21(17,18)1...",CC1=C(SC=[N+]1CC2=CN=C(N=C2N)C)CCOP(=O)(O)O,CCMSLIB00000221098,UHFFFAOYSA,HZSAJDVWZRBGIF,C12H18N4O4PS,Organic compounds,Organoheterocyclic compounds,Diazines,Pyrimidines and pyrimidine derivatives
4,342.11621,InChI=1/C12H22O11/c13-1-4-6(15)8(17)9(18)11(22...,C(C1C(C(C(C(O1)OCC2C(C(C(O2)(CO)O)O)O)O)O)O)O,CCMSLIB00000222747,UHFFFAOYSA,PVXPPJIGRGXGCY,C12H22O11,,,,


In [16]:
(db['MonoisotopicMass']-194.05).abs()[:5]

0      0.00791
1     17.95504
2     18.00266
3    151.02864
4    148.06621
Name: MonoisotopicMass, dtype: float64

In [36]:
def filter_db(db, prmass, ppm, inchifilt=True):
    """Select database candidates whose mass lies within ``ppm`` of ``prmass``.

    Parameters
    ----------
    db : pandas.DataFrame
        Structure database with at least the columns ``'MonoisotopicMass'``,
        ``'InChI'``, ``'Identifier'``, ``'InChIKey2'``, ``'InChIKey1'`` and
        ``'MolecularFormula'``.
    prmass : float
        Neutral precursor mass to search for.
    ppm : float
        Maximum allowed deviation in parts per million, computed relative to
        each candidate's own ``MonoisotopicMass``.
    inchifilt : bool, default True
        When true, keep only the first matching row per ``InChIKey1``.

    Returns
    -------
    pandas.DataFrame
        The matching rows, restricted to the six columns listed above, with a
        fresh 0..n-1 index.
    """
    cnames = ['MonoisotopicMass', 'InChI', 'Identifier', 'InChIKey2', 'InChIKey1', 'MolecularFormula']
    comp = 10**6 *((db.MonoisotopicMass-prmass).abs()/db.MonoisotopicMass)
    hits = db.loc[comp <= ppm, cnames]
    if inchifilt:
        # De-duplicate after the mass filter: doing it first would drop an
        # in-window candidate whenever an earlier out-of-window row shares its
        # InChIKey1, and would scan the whole database on every call.
        hits = hits[~hits.InChIKey1.duplicated()]
    return hits.reset_index(drop=True)

In [37]:
# Neutral precursor mass of the spectrum at position 83: observed pepmass minus
# the [M+H]+ adduct mass.
prmass = spectra[83]['params']['pepmass'][0]-precursor_ion_mode_positive['[M+H]+'][1]
# Mass tolerance (parts per million) for selecting database candidates.
ppm = 15

db_filt = filter_db(db, prmass, ppm)
db_filt.shape

(16, 6)

In [38]:
tabgnps.shape

(608, 26)

In [39]:
np.where(tabgnps['cluster.index']==84)

(array([82]),)

In [40]:
len(lid)

608

In [51]:
lid[82]

[{'Score': '1.0000000',
  'InChI': 'InChI=1S/C20H39N5O8/c1-20(29)12(7-26)31-19(14(28)17(20)25-2)33-16-11(24)5-10(23)15(13(16)27)32-18-9(22)4-3-8(6-21)30-18/h3,9-19,25-29H,4-7,21-24H2,1-2H3',
  'FragmenterScore_Values': '1256.0;708.0;1080.0;708.0;796.0;1256.0;808.0;820.0;360.0;1144.0;1001.0;765.0;1056.0;665.0;696.0',
  'MaximumTreeDepth': '2',
  'SmilesOfExplPeaks': '60.060001:[C]([C])([C])[O];70.07:[C][C]=[C][C][N];112.089996:[C]([C]=[C][C][N])[C]([C])[N];115.089996:[C]([C]=C([C][N])[O])[C][N];116.07:[C]([C][O])([C]([C])[O])[N][C];120.080002:[C]([C][O])([C]([C])[O])O[C];158.089996:[C]1([C]([C]O[C]C1([C])[O])[O])[N][C];175.119995:[C]1([C])C([C])([C]([C]([C]O1)[O])[N][C])[O];190.100006:[C]1([C][O])C([C])([C]([C]([C]O1)[O])[N][C])[O];228.100006:[C]([C])([C]O[C]1[C]([C](C([C])([C]O1)[O])[N][C])[O])[N];245.130005:[C]([C])[C]O[C]1[C]([C](C([C])([C]([C][O])O1)[O])[N][C])[O];254.160004:[C]1[C]=C([C])O[C]([C]1[N])O[C]2[C]([C][C]([C][C]2[O])[N])[N];287.149994:[C]1[C]=[C]O[C]([C]1[N])O[C]2[C]([C]

In [41]:
pd.DataFrame(lid[82]).shape

(18, 16)

In [42]:
pd.DataFrame(lid[82]).head()

Unnamed: 0,Score,InChI,FragmenterScore_Values,MaximumTreeDepth,SmilesOfExplPeaks,MonoisotopicMass,Identifier,MolecularFormula,SMILES,FormulasOfExplPeaks,InChIKey2,InChIKey1,FragmenterScore,ExplPeaks,NoExplPeaks,NumberPeaksUsed
0,1.0,InChI=1S/C20H39N5O8/c1-20(29)12(7-26)31-19(14(...,1256.0;708.0;1080.0;708.0;796.0;1256.0;808.0;8...,2,60.060001:[C]([C])([C])[O];70.07:[C][C]=[C][C]...,477.2799,CKC20,C20H39N5O8,CC1(C(CO)OC(C(C1NC)O)OC1C(CC(C(C1O)OC1C(CC=C(C...,60.060001:[C3H5O+2H]+H+;70.07:[C4H7N]+H+;112.0...,UHFFFAOYSA,DPRGTTRLFRFZIM,168.32874,60.060001_199.1;70.07_751.3;112.089996_256.7;1...,15,16
1,0.9073504,InChI=1S/C26H33N7O2/c1-31(2)21-7-5-18(6-8-21)1...,653.0;765.0;610.0;348.0;753.0;753.0;961.0;753....,2,70.07:[C]1[C][C][N][C]1;112.089996:[C]1[C][C](...,477.2852,SN00031196,C26H35N7O2,CN(C)c1ccc(cc1)C[N@@H+]1CCC[C@H]1c1nc(no1)c1cc...,70.07:[C4H7N]+H+;112.089996:[C6H10N2+H]+H+;115...,QFIPXVFZSA,WFEJVPBNJZYBGE,152.73315,70.07_751.3;112.089996_256.7;115.089996_328.8;...,13,16
2,0.8655592,InChI=1S/C28H36N4O3/c1-20-9-11-22(12-10-20)19-...,448.0;1396.0;710.0;613.0;961.0;1672.0;958.0;40...,2,70.07:[C]1[C][C][N+][C]1;112.089996:[C]1[C][C]...,477.2866,SN00079685,C28H37N4O3+,Cc1ccc(cc1)CNC(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H]...,70.07:[C4H9N-H]+;112.089996:[C6H11N2]+H+;115.0...,DQEYMECFSA,NUYXZQRZCPVUFY,145.69849,70.07_751.3;112.089996_256.7;115.089996_328.8;...,12,16
3,0.8006809,InChI=1S/C21H35N9O4/c22-14(8-4-10-27-20(23)24)...,405.0;753.0;653.0;610.0;610.0;1061.0;305.0;696...,2,60.060001:[N]=C([N])[N];70.07:[C]([C][C])[C][N...,477.2812,1089690,C21H35N9O4,C1=CC=C(C=C1)CC(C(=O)NC(CCCN=C(N)N)C(=O)O)NC(=...,60.060001:[CH4N3+H]+H+;70.07:[C4H9N-H]+;112.08...,UHFFFAOYSA,INXWADWANGLMPJ,134.77762,60.060001_199.1;70.07_751.3;112.089996_256.7;1...,13,16
4,0.7499762,InChI=1S/C21H35N9O4/c22-14(12-13-6-2-1-3-7-13)...,405.0;1672.0;753.0;610.0;348.0;305.0;696.0;961...,2,60.060001:[N]=C([N])[N];70.07:[c]([c])[C][C][N...,477.2812,1092690,C21H35N9O4,C1=CC=C(C=C1)CC(C(=O)NC(CCCN=C(N)N)C(=O)NC(CCC...,60.060001:[CH4N3+H]+H+;70.07:[C4H6N+H]+H+;112....,UHFFFAOYSA,LZDIENNKWVXJMX,126.24255,60.060001_199.1;70.07_751.3;112.089996_256.7;1...,12,16


In [43]:
# Write the mass-filtered candidates to a '|'-separated file named after the
# spectrum's scan number; it is used below as MetFrag's LocalDatabasePath.
ndb = 'validation_db_%s.psv' % spectra[83]['params']['scans']
db_filt.to_csv(ndb, sep='|', index=None)

In [44]:
# Fill the per-spectrum placeholders of the parameter template.
# Note: this mutates the module-level metfrag_param dict in place.
metfrag_param['SampleName'] = 'spec_%s' % spectra[83]['params']['scans']
metfrag_param['PeakListPath'] = 'example_data.txt'
metfrag_param['LocalDatabasePath'] = ndb
# NOTE(review): ppm is also the tolerance used to pre-filter the database by
# precursor mass - confirm the same value is intended for fragment peak matching.
metfrag_param['FragmentPeakMatchRelativeMassDeviation'] = ppm

# Assuming [M+H]+
metfrag_param['NeutralPrecursorMass'] = prmass

# One 'key = value' line per parameter.
with open('example_parameter_file.txt', '+w') as f:
    for k,v in metfrag_param.items():
        f.write(f'{k} = {v}\n')

In [45]:
subprocess.call(['java', '-jar', 'MetFrag2.3-CL.jar', 'example_parameter_file.txt'])

0

In [46]:
metres = pd.read_csv('spec_84.psv', sep='|')
metres

Unnamed: 0,Score,InChI,FragmenterScore_Values,MaximumTreeDepth,SmilesOfExplPeaks,MonoisotopicMass,Identifier,MolecularFormula,FormulasOfExplPeaks,InChIKey2,InChIKey1,FragmenterScore,ExplPeaks,NoExplPeaks,NumberPeaksUsed
0,1.0,InChI=1S/C20H39N5O8/c1-20(29)12(7-26)31-19(14(...,1256.0;708.0;1080.0;708.0;796.0;1256.0;808.0;8...,2,60.060001:[C]([C])([C])[O];70.07:[C][C]=[C][C]...,477.27986,CKC20,C20H39N5O8,60.060001:[C3H5O+2H]+H+;70.07:[C4H7N]+H+;112.0...,UHFFFAOYSA,DPRGTTRLFRFZIM,180.205031,60.060001_199.1;70.07_751.3;112.089996_256.7;1...,16,16
1,0.918846,InChI=1S/C26H33N7O2/c1-31(2)21-7-5-18(6-8-21)1...,653.0;765.0;610.0;348.0;753.0;753.0;961.0;653....,2,70.07:[C]1[C][C][N][C]1;112.089996:[C]1[C][C](...,477.28522,SN00031196,C26H35N7O2,70.07:[C4H7N]+H+;112.089996:[C6H10N2+H]+H+;115...,QFIPXVFZSA,WFEJVPBNJZYBGE,165.580601,70.07_751.3;112.089996_256.7;115.089996_328.8;...,13,16
2,0.837951,InChI=1S/C28H36N4O3/c1-20-9-11-22(12-10-20)19-...,448.0;1396.0;710.0;613.0;961.0;1672.0;958.0;40...,2,70.07:[C]1[C][C][N+][C]1;112.089996:[C]1[C][C]...,477.28657,SN00079685,C28H37N4O3+,70.07:[C4H9N-H]+;112.089996:[C6H11N2]+H+;115.0...,DQEYMECFSA,NUYXZQRZCPVUFY,151.002983,70.07_751.3;112.089996_256.7;115.089996_328.8;...,12,16
3,0.728663,InChI=1S/C30H39NO4/c1-19-10-9-13-24-16-20(2)21...,796.0;796.0;696.0;1713.0;1061.0;796.0;1756.0;1...,2,60.060001:[C]([C])([C])[O];70.07:[C]([C]=[C])[...,477.28791,DMO68,C30H39NO4,60.060001:[C3H6O+H]+H+;70.07:[C5H8+H]+H+;112.0...,BBXOWAOSSA,LYHWACCLUSDGME,131.308827,60.060001_199.1;70.07_751.3;112.089996_256.7;1...,13,16
4,0.68693,InChI=1S/C30H39NO4/c1-18-10-9-13-24-28(33)21(4...,696.0;1492.0;1713.0;1061.0;2021.0;2120.0;1144....,2,70.07:[C]([C])[C][C][C];112.089996:[C]([C])(C(...,477.28791,SN00380428,C30H39NO4,70.07:[C5H10]+;112.089996:[C7H10O+H]+H+;116.07...,ZPSJVCBQSA,JVHIPYJQMFNCEK,123.788181,70.07_751.3;112.089996_256.7;116.07_199.9;120....,12,16
5,0.68693,InChI=1S/C30H39NO4/c1-19-9-8-12-25-17-24(18-32...,696.0;1492.0;1713.0;1061.0;2021.0;2120.0;1144....,2,70.07:[C]([C])[C][C][C];112.089996:[C]([C])(C(...,477.28791,LKT34,C30H39NO4,70.07:[C5H10]+;112.089996:[C7H10O+H]+H+;116.07...,JUDANRDHSA,DCDYEIICKCUNNP,123.788181,70.07_751.3;112.089996_256.7;116.07_199.9;120....,12,16
6,0.67508,"InChI=1S/C30H39NO4/c1-21-12-15-26-29(2,17-9-18...",1044.0;1439.0;1148.0;1001.0;348.0;1920.0;1784....,2,70.07:[C][C][C][C][C];112.089996:[C][C][C][C](...,477.28791,SN00086822,C30H39NO4,70.07:[C5H9]+H+;112.089996:[C7H12O]+;120.08000...,XKBDTSMFSA,MZBVEIDYCNWRKM,121.652748,70.07_751.3;112.089996_256.7;120.080002_999.0;...,10,16
7,0.673344,InChI=1S/C29H36FN3O2/c1-31-25-15-23(35-2)9-10-...,610.0;753.0;1237.0;1380.0;796.0;1406.0;1044.0;...,2,70.07:[C]1[C][C]1[C][N];115.089996:[C]1[C][C]1...,477.279156,CHEBI:128406,C29H36FN3O2,70.07:[C4H7N]+H+;115.089996:[C6H11NO+H]+H+;120...,AREMUKBSSA,MUQXDRHJIFCTKS,121.339898,70.07_751.3;115.089996_328.8;120.080002_999.0;...,11,16
8,0.641716,InChI=1S/C26H35N7O2/c27-20-32-26(31-14-13-29-2...,710.0;610.0;865.0;1113.0;1213.0;918.0;710.0;65...,2,60.060001:[C]([C][N])[N];70.07:[C]([C][C][C])[...,477.285223,CHEBI:73303,C26H35N7O2,60.060001:[C2H6N2+H]+H+;70.07:[C5H10]+;112.089...,UHFFFAOYSA,CICDSYWWNDGAGD,115.640509,60.060001_199.1;70.07_751.3;112.089996_256.7;1...,10,16
9,0.61423,InChI=1S/C30H39NO4/c32-27-21-28(35-22-23-15-17...,1201.0;610.0;908.0;1492.0;1224.0;2020.0;2120.0...,2,60.060001:[C]([C])([C])[O];70.07:[C]([C][C][C]...,477.287909,CHEBI:92041,C30H39NO4,60.060001:[C3H5O+2H]+H+;70.07:[C5H10]+;112.089...,VNDOHOEKSA,GQGRDYWMOPRROR,110.6874,60.060001_199.1;70.07_751.3;112.089996_256.7;1...,12,16


In [54]:
metres.apply(lambda a: a.to_dict(), axis=1).tolist()

[{'Score': 1.0,
  'InChI': 'InChI=1S/C20H39N5O8/c1-20(29)12(7-26)31-19(14(28)17(20)25-2)33-16-11(24)5-10(23)15(13(16)27)32-18-9(22)4-3-8(6-21)30-18/h3,9-19,25-29H,4-7,21-24H2,1-2H3',
  'FragmenterScore_Values': '1256.0;708.0;1080.0;708.0;796.0;1256.0;808.0;820.0;360.0;1144.0;1001.0;720.0;1056.0;1068.0;665.0;696.0',
  'MaximumTreeDepth': 2,
  'SmilesOfExplPeaks': '60.060001:[C]([C])([C])[O];70.07:[C][C]=[C][C][N];112.089996:[C]([C]=[C][C][N])[C]([C])[N];115.089996:[C]([C]=C([C][N])[O])[C][N];116.07:[C]([C][O])([C]([C])[O])[N][C];120.080002:[C]([C][O])([C]([C])[O])O[C];158.089996:[C]1([C]([C]O[C]C1([C])[O])[O])[N][C];175.119995:[C]1([C])C([C])([C]([C]([C]O1)[O])[N][C])[O];190.100006:[C]1([C][O])C([C])([C]([C]([C]O1)[O])[N][C])[O];228.100006:[C]([C])([C]O[C]1[C]([C](C([C])([C]O1)[O])[N][C])[O])[N];245.130005:[C]([C])[C]O[C]1[C]([C](C([C])([C]([C][O])O1)[O])[N][C])[O];254.160004:[C]1[C]=C([C][N])O[C]([C]1[N])O[C]2[C]([C][C]([C][C]2)[N])[N];287.149994:[C]1[C]=[C]O[C]([C]1[N])O[C]2[C]([C][C]

In [47]:
metres.loc[7]

Score                                                              0.673344
InChI                     InChI=1S/C29H36FN3O2/c1-31-25-15-23(35-2)9-10-...
FragmenterScore_Values    610.0;753.0;1237.0;1380.0;796.0;1406.0;1044.0;...
MaximumTreeDepth                                                          2
SmilesOfExplPeaks         70.07:[C]1[C][C]1[C][N];115.089996:[C]1[C][C]1...
MonoisotopicMass                                                    477.279
Identifier                                                     CHEBI:128406
MolecularFormula                                                C29H36FN3O2
FormulasOfExplPeaks       70.07:[C4H7N]+H+;115.089996:[C6H11NO+H]+H+;120...
InChIKey2                                                        AREMUKBSSA
InChIKey1                                                    MUQXDRHJIFCTKS
FragmenterScore                                                      121.34
ExplPeaks                 70.07_751.3;115.089996_328.8;120.080002_999.0;...
NoExplPeaks 

In [56]:
def run_metfrag(spectrum, db, adduct='[M+H]+', ppm=15, abs_diff=0.01,
                ispositive = True, metpath='MetFrag2.3-CL.jar'):
    """Run the MetFrag command-line jar on one spectrum and return its candidate table.

    Parameters
    ----------
    spectrum : dict
        Spectrum with ``'m/z array'`` and ``'intensity array'`` entries and a
        ``'params'`` dict providing ``'pepmass'`` (first element is used) and
        ``'scans'`` (used to name the temporary files).
    db : str or pandas.DataFrame
        A string is written as the ``MetFragDatabaseType`` parameter. Anything
        else is passed to ``filter_db`` and the result is written to a
        temporary ``'|'``-separated file used as ``LocalDatabasePath``.
    adduct : str
        Key of ``precursor_ion_mode_positive`` (or ``..._negative`` when
        ``ispositive`` is false).
    ppm : float
        Tolerance used both for the database mass filter and for
        ``FragmentPeakMatchRelativeMassDeviation``.
    abs_diff : float
        Value of ``FragmentPeakMatchAbsoluteMassDeviation``.
    ispositive : bool
        Selects the adduct table and the ``IsPositiveIonMode`` parameter.
    metpath : str
        Path of the MetFrag jar handed to ``java -jar``.

    Returns
    -------
    pandas.DataFrame
        Contents of ``cand_<scans>.psv`` as written by MetFrag.

    Raises
    ------
    RuntimeError
        If the MetFrag process exits with a non-zero status.

    Notes
    -----
    All temporary files use paths relative to the current working directory
    and are removed even when MetFrag or the result parsing fails. The
    module-level ``metfrag_param`` template is copied, never modified, so
    settings from one call cannot leak into the next.
    """
    adducts = precursor_ion_mode_positive if ispositive else precursor_ion_mode_negative
    ion_mode, adduct_mass = adducts[adduct]
    scan = spectrum['params']['scans']
    prmass = spectrum['params']['pepmass'][0] - adduct_mass

    # Per-call copy of the shared template.
    params = dict(metfrag_param)
    params['IsPositiveIonMode'] = bool(ispositive)
    params['PrecursorIonMode'] = ion_mode

    sname = 'spec_%s_data.txt' % scan
    pname = 'parameter_file_%s.txt' % scan
    sample = 'cand_%s' % scan
    cname = 'cand_%s.psv' % scan
    # Every file that may exist after this call; extended below when a local
    # database file is written.
    tmp_files = [sname, pname, cname]

    try:
        # Peak list: one "m/z<TAB>intensity" line per peak.
        with open(sname, '+w') as f:
            for peak in zip(spectrum['m/z array'], spectrum['intensity array']):
                f.write('%s\t%s\n' % peak)

        if isinstance(db, str):
            params['MetFragDatabaseType'] = db
        else:
            ndb = 'validation_db_%s.psv' % scan
            tmp_files.append(ndb)
            filter_db(db, prmass, ppm).to_csv(ndb, sep='|', index=None)
            params['LocalDatabasePath'] = ndb

        params['SampleName'] = sample
        params['PeakListPath'] = sname
        params['FragmentPeakMatchRelativeMassDeviation'] = ppm
        params['FragmentPeakMatchAbsoluteMassDeviation'] = abs_diff
        params['NeutralPrecursorMass'] = prmass

        with open(pname, '+w') as f:
            for k, v in params.items():
                f.write(f'{k} = {v}\n')

        status = subprocess.call(['java', '-jar', metpath, pname])
        if status != 0:
            raise RuntimeError('MetFrag exited with status %s for %s' % (status, pname))
        metres = pd.read_csv(cname, sep='|')
    finally:
        # Only remove what was actually created (no local database file exists
        # when `db` is a database type name).
        for path in tmp_files:
            if os.path.exists(path):
                os.remove(path)

    return metres

In [57]:
db.shape

(694297, 11)

In [58]:
run_metfrag(spectra[83], db)

Unnamed: 0,Score,InChI,FragmenterScore_Values,MaximumTreeDepth,SmilesOfExplPeaks,MonoisotopicMass,Identifier,MolecularFormula,FormulasOfExplPeaks,InChIKey2,InChIKey1,FragmenterScore,ExplPeaks,NoExplPeaks,NumberPeaksUsed
0,1.0,InChI=1S/C20H39N5O8/c1-20(29)12(7-26)31-19(14(...,1256.0;708.0;1080.0;708.0;796.0;1256.0;808.0;8...,2,60.060001:[C]([C])([C])[O];70.07:[C][C]=[C][C]...,477.27986,CKC20,C20H39N5O8,60.060001:[C3H5O+2H]+H+;70.07:[C4H7N]+H+;112.0...,UHFFFAOYSA,DPRGTTRLFRFZIM,180.205031,60.060001_199.1;70.07_751.3;112.089996_256.7;1...,16,16
1,0.918846,InChI=1S/C26H33N7O2/c1-31(2)21-7-5-18(6-8-21)1...,653.0;765.0;610.0;348.0;753.0;753.0;961.0;653....,2,70.07:[C]1[C][C][N][C]1;112.089996:[C]1[C][C](...,477.28522,SN00031196,C26H35N7O2,70.07:[C4H7N]+H+;112.089996:[C6H10N2+H]+H+;115...,QFIPXVFZSA,WFEJVPBNJZYBGE,165.580601,70.07_751.3;112.089996_256.7;115.089996_328.8;...,13,16
2,0.837951,InChI=1S/C28H36N4O3/c1-20-9-11-22(12-10-20)19-...,448.0;1396.0;710.0;613.0;961.0;1672.0;958.0;40...,2,70.07:[C]1[C][C][N+][C]1;112.089996:[C]1[C][C]...,477.28657,SN00079685,C28H37N4O3+,70.07:[C4H9N-H]+;112.089996:[C6H11N2]+H+;115.0...,DQEYMECFSA,NUYXZQRZCPVUFY,151.002983,70.07_751.3;112.089996_256.7;115.089996_328.8;...,12,16
3,0.728663,InChI=1S/C30H39NO4/c1-19-10-9-13-24-16-20(2)21...,796.0;796.0;696.0;1713.0;1061.0;796.0;1756.0;1...,2,60.060001:[C]([C])([C])[O];70.07:[C]([C]=[C])[...,477.28791,DMO68,C30H39NO4,60.060001:[C3H6O+H]+H+;70.07:[C5H8+H]+H+;112.0...,BBXOWAOSSA,LYHWACCLUSDGME,131.308827,60.060001_199.1;70.07_751.3;112.089996_256.7;1...,13,16
4,0.68693,InChI=1S/C30H39NO4/c1-18-10-9-13-24-28(33)21(4...,696.0;1492.0;1713.0;1061.0;2021.0;2120.0;1144....,2,70.07:[C]([C])[C][C][C];112.089996:[C]([C])(C(...,477.28791,SN00380428,C30H39NO4,70.07:[C5H10]+;112.089996:[C7H10O+H]+H+;116.07...,ZPSJVCBQSA,JVHIPYJQMFNCEK,123.788181,70.07_751.3;112.089996_256.7;116.07_199.9;120....,12,16
5,0.68693,InChI=1S/C30H39NO4/c1-19-9-8-12-25-17-24(18-32...,696.0;1492.0;1713.0;1061.0;2021.0;2120.0;1144....,2,70.07:[C]([C])[C][C][C];112.089996:[C]([C])(C(...,477.28791,LKT34,C30H39NO4,70.07:[C5H10]+;112.089996:[C7H10O+H]+H+;116.07...,JUDANRDHSA,DCDYEIICKCUNNP,123.788181,70.07_751.3;112.089996_256.7;116.07_199.9;120....,12,16
6,0.67508,"InChI=1S/C30H39NO4/c1-21-12-15-26-29(2,17-9-18...",1044.0;1439.0;1148.0;1001.0;348.0;1920.0;1784....,2,70.07:[C][C][C][C][C];112.089996:[C][C][C][C](...,477.28791,SN00086822,C30H39NO4,70.07:[C5H9]+H+;112.089996:[C7H12O]+;120.08000...,XKBDTSMFSA,MZBVEIDYCNWRKM,121.652748,70.07_751.3;112.089996_256.7;120.080002_999.0;...,10,16
7,0.673344,InChI=1S/C29H36FN3O2/c1-31-25-15-23(35-2)9-10-...,610.0;753.0;1237.0;1380.0;796.0;1406.0;1044.0;...,2,70.07:[C]1[C][C]1[C][N];115.089996:[C]1[C][C]1...,477.279156,CHEBI:128406,C29H36FN3O2,70.07:[C4H7N]+H+;115.089996:[C6H11NO+H]+H+;120...,AREMUKBSSA,MUQXDRHJIFCTKS,121.339898,70.07_751.3;115.089996_328.8;120.080002_999.0;...,11,16
8,0.641716,InChI=1S/C26H35N7O2/c27-20-32-26(31-14-13-29-2...,710.0;610.0;865.0;1113.0;1213.0;918.0;710.0;65...,2,60.060001:[C]([C][N])[N];70.07:[C]([C][C][C])[...,477.285223,CHEBI:73303,C26H35N7O2,60.060001:[C2H6N2+H]+H+;70.07:[C5H10]+;112.089...,UHFFFAOYSA,CICDSYWWNDGAGD,115.640509,60.060001_199.1;70.07_751.3;112.089996_256.7;1...,10,16
9,0.61423,InChI=1S/C30H39NO4/c32-27-21-28(35-22-23-15-17...,1201.0;610.0;908.0;1492.0;1224.0;2020.0;2120.0...,2,60.060001:[C]([C])([C])[O];70.07:[C]([C][C][C]...,477.287909,CHEBI:92041,C30H39NO4,60.060001:[C3H5O+2H]+H+;70.07:[C5H10]+;112.089...,VNDOHOEKSA,GQGRDYWMOPRROR,110.6874,60.060001_199.1;70.07_751.3;112.089996_256.7;1...,12,16


In [None]:
# Command-line random walk. Each continuation backslash must be the very last
# character on its line; a trailing space after it ends the command early.
!python bin/network_walk.py random-walk \
        --taskid be5004d4dd394fe8b883633fd75ea732 \
        --workflow V2 \
        --comp 63 \
        --db data/validation_db.psv \
        --out test_run

In [None]:
# Alan
# https://gnps.ucsd.edu/ProteoSAFe/status.jsp?task=a0b4377f6e4540268602c30fedf69b0b
# Danielle
# https://gnps.ucsd.edu/ProteoSAFe/status.jsp?task=e3b521658618404a822b238992d70955