In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
import os
import json
import ijson
from decimal import Decimal

In [2]:
filepath = "/home/ctomlin/personal/mass-spec-prediction/datasets/MoNA-export-Experimental_Spectra-json/MoNA-export-Experimental_Spectra.json"
output_dir = '/home/ctomlin/personal/mass-spec-prediction/datasets/split_files'


In [None]:
def convert(var):
    if isinstance(var, Decimal):
        return float(var)
    elif isinstance(var, dict):
        return {key: convert(val) for key, val in var.items()}
    elif isinstance(var, list):
        return [convert(x) for x in var]
    else:
        return var

def json_splitting_streaming(filepath, output_dir, chunk_size):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)
    
    chunk = []
    chunk_count = 0
    
    with open(filepath, 'rb') as json_file:
        parser = ijson.items(json_file, 'item')
        
        for item in parser:
            item = convert(item)
            chunk.append(item)
            
            if len(chunk) >= chunk_size:
                output_filename = os.path.join(output_dir, f'MoNA_{chunk_count + 1}.json')
                with open(output_filename, 'w') as outfile:
                    json.dump(chunk, outfile, indent=4, ensure_ascii=False)
                
                print(f"Written chunk {chunk_count + 1} with {len(chunk)} items")
                chunk = []
                chunk_count += 1
    
    if chunk:
        output_filename = os.path.join(output_dir, f'MoNA_{chunk_count + 1}.json')
        with open(output_filename, 'w') as outfile:
            json.dump(chunk, outfile, indent=4, ensure_ascii=False)
        print(f"Written chunk {chunk_count + 1} with {len(chunk)} items")

# json_splitting_streaming(filepath, output_dir, 25000)

Written chunk 1 with 25000 items
Written chunk 2 with 25000 items
Written chunk 3 with 25000 items
Written chunk 4 with 25000 items
Written chunk 5 with 25000 items
Written chunk 6 with 25000 items
Written chunk 7 with 25000 items
Written chunk 8 with 25000 items
Written chunk 9 with 25000 items
Written chunk 10 with 10785 items


In [7]:
split_files = '/home/ctomlin/personal/mass-spec-prediction/datasets/split_files'

In [20]:
def spectrum_parsing(spectrum_string):
    split_spec = spectrum_string.split(' ')

    mz_intensity_dict = {}
    for peaks in split_spec:
        mz_intensity = peaks.split(':')

        mz_ratio = mz_intensity[0]
        if ',' in mz_ratio:
            mz_ratio_rectified = mz_ratio.replace(',', '.')
            mz_ratio = float(mz_ratio_rectified)
        else:
            try:
                mz_ratio = float(mz_ratio)
            except:
                print(f'Error was found in spectrum: {spectrum_string}')
                pass

        intensity = mz_intensity[1]
        if ',' in intensity:
            intensity_rectified = intensity.replace(',', '.')
            intensity = float(intensity_rectified)
        else:
            try:
                intensity = float(intensity)
            except:
                print(f'Error was found in spectrum: {spectrum_string}')
                pass


        mz_intensity_dict[mz_ratio] = intensity

    return mz_intensity_dict

In [21]:
data = {}
with os.scandir(split_files) as files:
    for file in files:
        with open(file, "rb") as f:
            for obj in ijson.items(f, "item"):
                compound = obj['compound'][0]
                metadata = compound['metaData']
                for item in metadata:
                    if item['name'] == 'SMILES':
                        smiles = item['value']
                spectrum = spectrum_parsing(obj['spectrum'])
                data[smiles] = [spectrum]

df = pd.DataFrame(list(data.items()), columns=['SMILES', 'Spectrum'])
df.head()

Error was found in spectrum: SPECTRUM:- MS:ba13-1.raw FTMS:- p:ESI Full:ms [100.0000-1500.0000]:Scan #::1883 RT::4.11 Mass:defect: 0.00:@ 1.00,:300.00 @:1000.00 Data:points: 590:Mass Intensity:99 0:101 1229659:102 81513:104 71267:107 94712:109 3087783:110 242574:111 189340:113 142744:115 577705:116 550378:117 1531032:118 87776:119 171969:120 180513:121 462764:122 71641:123 815916:124 335598:125 2991973:126 124602:127 537093:128 63892:129 514074:131 351990:132 181127:133 383677:134 310734:135 239189:136 73414:137 2453922:138 234161:141 219690:143 230226:145 4659240:146 376414:147 5820000:148 0:149 301396:150 60892:151 1482068:152 335612:153 5318583:154 332207:155 485446:157 789048:159 147450:160 205912:161 359841:162 326975:163 974428:164 3397089:165 1671564:166 319422:167 1992030:168 134525:169 351359:171 134740:173 1339579:174 200637:175 1893541:176 216420:177 2108097:178 130172:179 592521:181 616904:183 753427:184 155466:185 301337:186 102116:187 7199855:188 537177:191 284782:192 141

ValueError: could not convert string to float: '1.00.'

In [None]:
df