In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
import os
import json
import ijson

In [2]:
filepath = "/home/ctomlin/personal/mass-spec-prediction/datasets/MoNA-export-Experimental_Spectra-json/MoNA-export-Experimental_Spectra.json"

In [None]:


output_dir = '/home/ctomlin/personal/mass-spec-prediction/datasets/split_files'

from decimal import Decimal

def convert(var):
    if isinstance(var, Decimal):
        return float(var)
    elif isinstance(var, dict):
        return {key: convert(val) for key, val in var.items()}
    elif isinstance(var, list):
        return [convert(x) for x in var]
    else:
        return var

def json_splitting_streaming(filepath, output_dir, chunk_size):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)
    
    chunk = []
    chunk_count = 0
    
    with open(filepath, 'rb') as json_file:
        parser = ijson.items(json_file, 'item')
        
        for item in parser:
            item = convert(item)
            chunk.append(item)
            
            if len(chunk) >= chunk_size:
                output_filename = os.path.join(output_dir, f'MoNA_{chunk_count + 1}.json')
                with open(output_filename, 'w') as outfile:
                    json.dump(chunk, outfile, indent=4, ensure_ascii=False)
                
                print(f"Written chunk {chunk_count + 1} with {len(chunk)} items")
                chunk = []
                chunk_count += 1
    
    if chunk:
        output_filename = os.path.join(output_dir, f'MoNA_{chunk_count + 1}.json')
        with open(output_filename, 'w') as outfile:
            json.dump(chunk, outfile, indent=4, ensure_ascii=False)
        print(f"Written chunk {chunk_count + 1} with {len(chunk)} items")

json_splitting_streaming(filepath, output_dir, 25000)

In [3]:
mona1 = '/home/ctomlin/personal/mass-spec-prediction/datasets/split_files/MoNA_1.json'

In [7]:
data = []
row = {}
with open(mona1, "rb") as f:
    for obj in ijson.items(f, "item"):  # iterate over array elements
        compound = obj['compound'][0]
        metadata = compound['metaData']
        for item in metadata:
            if item['name'] == 'SMILES':
                smiles = item['value']
        spectrum = obj['spectrum']
        # print(f'SMILES: {smiles}')
        # print(f'Spectrum: {spectrum}')
        row[smiles] = [spectrum]
        data.append(row)

df = pd.DataFrame(data)

In [8]:
df.head()

Unnamed: 0,O=C(O)C1=C(COC(=N)O)CSC2N1C(=O)C2N=C(O)C(=NOC)C=3OC=CC3,O=C1N(C(=O)CC2(C1)CCCC2)CCCCN3CCN(C4=NC=CC=N4)CC3,O(C=1C(=CC=CC1C)C)CC(N)C,O=C(N=C1ON=N(=C1)N2CCOCC2)OCC,OC(C=1C=CN=C2C=CC(OC)=CC21)C3N4CCC(C3)C(C4)CC,O=S(=O)(CC(=N)O)C(C=1C=CC=CC1)C=2C=CC=CC2,O=S(CC(=N)O)C(C=1C=CC=CC1)C=2C=CC=CC2,O=C(O)CS(=O)C(C=1C=CC=CC1)C=2C=CC=CC2,O=N(=O)C=1C=CC=2N=C(O)CN=C(C=3C=CC=CC3Cl)C2C1,O=C1C=C2CCC3C4CCC(O)(C(=O)CO)C4(C)CC(O)C3C2(C)CC1,...,O=C1OC(C)C(C)CCC=C(C)C(OC)C=CC(O)(C)CC1,O=C1C=C(C)C2C(OC=CC12OC3OC(C)C(O)C(O)C3O)OC4OC(CO)C(O)C(O)C4O,O=CC1=CCCC(=CC2OC(=O)C(=C)C2C(OC(=O)C(C)CC)C1OC)CO,O=C1C(N=C(O)C2N1C3NC=4C=CC=CC4C3(C2)C(C=C)(C)C)=CC5=CN=CN5,O=C(O)C1(O)CCC2C(=C(C)CO)CCC32COC1C3,O=C1C(=C(C)C)CC2C(CCC2(O)C)C(O)(C)C1,O=C(O)CCC1OC(=O)C(O)=C1C=2C=CC=CC2,OC1=CC=C(C(O)=C1)C2COC=3C(=CC=C4OC(C(=C)C)CC43)C2,O=C(OC(C=C)C1=CC=C(OC(=O)C(C)C)C(OC)=C1)C,O=C(OC1OC(CO)C(O)C(O)C1OC2OC(C)C(OC3OCC(O)C(OC4OCC(O)C(O)C4O)C3O)C(O)C2O)C56CCC(C)(C)CC6C7=CCC8C9(C)CCC(OC%10OC(COC%11OCC(O)C(O)C%11OC%12OCC(O)C(O)C%12O)C(O)C(O)C%10O)C(C)(C)C9CCC8(C)C7(C)CC5
0,[336:5.105105 337:0.800801 338:0.800801 364:10...,[150:0.600601 219:3.503504 386:100.000000 387:...,[105:5.505506 107:3.903904 108:0.800801 121:22...,[243:100.000000 244:12.112112 245:2.002002 265...,[183.0062:3.286286 183.0158:1.731732 325.1835:...,[167:100.000000 168:12.112112 169:0.800801 290...,[167:100.000000 168:13.713714 169:0.800801 274...,[122:1.201201 167:100.000000 168:12.112112 275...,[316:100.000000 317:13.313313 318:35.335335 31...,[125.0562:1.687688 189.0877:1.576577 282.1186:...,...,[50.214204:0.011894 50.826057:0.005462 51.1631...,[50.075744:0.007557 50.163952:0.009073 50.4845...,[50.119712:0.006284 50.192394:0.009202 50.4028...,[51.029543:0.008438 51.747228:0.008139 51.9563...,[51.324130:0.009822 51.699313:0.008274 52.6450...,[50.303647:0.029634 51.260217:0.032945 51.9699...,[51.006199:0.050404 51.217243:0.037193 51.2759...,[50.462040:0.014254 50.472009:0.019278 50.7193...,[50.167186:0.038268 52.728797:0.033745 52.9668...,[101.023041:10.337535 102.026311:0.093301 109....
1,[336:5.105105 337:0.800801 338:0.800801 364:10...,[150:0.600601 219:3.503504 386:100.000000 387:...,[105:5.505506 107:3.903904 108:0.800801 121:22...,[243:100.000000 244:12.112112 245:2.002002 265...,[183.0062:3.286286 183.0158:1.731732 325.1835:...,[167:100.000000 168:12.112112 169:0.800801 290...,[167:100.000000 168:13.713714 169:0.800801 274...,[122:1.201201 167:100.000000 168:12.112112 275...,[316:100.000000 317:13.313313 318:35.335335 31...,[125.0562:1.687688 189.0877:1.576577 282.1186:...,...,[50.214204:0.011894 50.826057:0.005462 51.1631...,[50.075744:0.007557 50.163952:0.009073 50.4845...,[50.119712:0.006284 50.192394:0.009202 50.4028...,[51.029543:0.008438 51.747228:0.008139 51.9563...,[51.324130:0.009822 51.699313:0.008274 52.6450...,[50.303647:0.029634 51.260217:0.032945 51.9699...,[51.006199:0.050404 51.217243:0.037193 51.2759...,[50.462040:0.014254 50.472009:0.019278 50.7193...,[50.167186:0.038268 52.728797:0.033745 52.9668...,[101.023041:10.337535 102.026311:0.093301 109....
2,[336:5.105105 337:0.800801 338:0.800801 364:10...,[150:0.600601 219:3.503504 386:100.000000 387:...,[105:5.505506 107:3.903904 108:0.800801 121:22...,[243:100.000000 244:12.112112 245:2.002002 265...,[183.0062:3.286286 183.0158:1.731732 325.1835:...,[167:100.000000 168:12.112112 169:0.800801 290...,[167:100.000000 168:13.713714 169:0.800801 274...,[122:1.201201 167:100.000000 168:12.112112 275...,[316:100.000000 317:13.313313 318:35.335335 31...,[125.0562:1.687688 189.0877:1.576577 282.1186:...,...,[50.214204:0.011894 50.826057:0.005462 51.1631...,[50.075744:0.007557 50.163952:0.009073 50.4845...,[50.119712:0.006284 50.192394:0.009202 50.4028...,[51.029543:0.008438 51.747228:0.008139 51.9563...,[51.324130:0.009822 51.699313:0.008274 52.6450...,[50.303647:0.029634 51.260217:0.032945 51.9699...,[51.006199:0.050404 51.217243:0.037193 51.2759...,[50.462040:0.014254 50.472009:0.019278 50.7193...,[50.167186:0.038268 52.728797:0.033745 52.9668...,[101.023041:10.337535 102.026311:0.093301 109....
3,[336:5.105105 337:0.800801 338:0.800801 364:10...,[150:0.600601 219:3.503504 386:100.000000 387:...,[105:5.505506 107:3.903904 108:0.800801 121:22...,[243:100.000000 244:12.112112 245:2.002002 265...,[183.0062:3.286286 183.0158:1.731732 325.1835:...,[167:100.000000 168:12.112112 169:0.800801 290...,[167:100.000000 168:13.713714 169:0.800801 274...,[122:1.201201 167:100.000000 168:12.112112 275...,[316:100.000000 317:13.313313 318:35.335335 31...,[125.0562:1.687688 189.0877:1.576577 282.1186:...,...,[50.214204:0.011894 50.826057:0.005462 51.1631...,[50.075744:0.007557 50.163952:0.009073 50.4845...,[50.119712:0.006284 50.192394:0.009202 50.4028...,[51.029543:0.008438 51.747228:0.008139 51.9563...,[51.324130:0.009822 51.699313:0.008274 52.6450...,[50.303647:0.029634 51.260217:0.032945 51.9699...,[51.006199:0.050404 51.217243:0.037193 51.2759...,[50.462040:0.014254 50.472009:0.019278 50.7193...,[50.167186:0.038268 52.728797:0.033745 52.9668...,[101.023041:10.337535 102.026311:0.093301 109....
4,[336:5.105105 337:0.800801 338:0.800801 364:10...,[150:0.600601 219:3.503504 386:100.000000 387:...,[105:5.505506 107:3.903904 108:0.800801 121:22...,[243:100.000000 244:12.112112 245:2.002002 265...,[183.0062:3.286286 183.0158:1.731732 325.1835:...,[167:100.000000 168:12.112112 169:0.800801 290...,[167:100.000000 168:13.713714 169:0.800801 274...,[122:1.201201 167:100.000000 168:12.112112 275...,[316:100.000000 317:13.313313 318:35.335335 31...,[125.0562:1.687688 189.0877:1.576577 282.1186:...,...,[50.214204:0.011894 50.826057:0.005462 51.1631...,[50.075744:0.007557 50.163952:0.009073 50.4845...,[50.119712:0.006284 50.192394:0.009202 50.4028...,[51.029543:0.008438 51.747228:0.008139 51.9563...,[51.324130:0.009822 51.699313:0.008274 52.6450...,[50.303647:0.029634 51.260217:0.032945 51.9699...,[51.006199:0.050404 51.217243:0.037193 51.2759...,[50.462040:0.014254 50.472009:0.019278 50.7193...,[50.167186:0.038268 52.728797:0.033745 52.9668...,[101.023041:10.337535 102.026311:0.093301 109....
