In [1]:
import pandas as pd
from tqdm.auto import tqdm
import random
from rdkit import Chem
import numpy as np
from collections import defaultdict
import pyarrow.parquet as pq

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import numpy as np
from rdkit import Chem
import pandas as pd
from tqdm.auto import tqdm
import random
from rdkit import Chem
import numpy as np
from collections import defaultdict
import pyarrow.parquet as pq
from tqdm import tqdm
import multiprocessing
from simple_gnn.preprocess import create_atoms, create_ijbonddict, extract_fingerprints
from tqdm.auto import tqdm

# Atom vocabulary -> integer index. Plain strings are element symbols; a
# (symbol, 'aromatic') tuple is a separate entry for the aromatic form.
# The position in this list is the index assigned to the key.
atom_dict = {
    atom_key: atom_index
    for atom_index, atom_key in enumerate([
        'C',
        'N',
        ('C', 'aromatic'),
        ('N', 'aromatic'),
        'O',
        'H',
        ('S', 'aromatic'),
        'Cl',
        'S',
        ('O', 'aromatic'),
        'Br',
        'F',
        'Si',
        'B',
        'I',
    ])
}

# Bond-type name -> integer index.
bond_dict = dict(TRIPLE=0, SINGLE=1, AROMATIC=2, DOUBLE=3)

# Identity mapping over the indices 0..14, i.e. one entry per atom_dict index.
fingerprint_dict = {index: index for index in range(len(atom_dict))}

# Starts empty; it is handed to extract_fingerprints together with the
# dictionaries above.
edge_dict = dict()
# Function to process each chunk of data
def process_data(data_chunk):
    """Turn a chunk of ``"<SMILES> <property>"`` lines into per-molecule features.

    Each non-blank line is split on whitespace into a SMILES string and a
    property value. The SMILES is parsed with RDKit, explicit hydrogens are
    added, and the molecule is converted with the ``simple_gnn.preprocess``
    helpers using the module-level ``atom_dict``, ``bond_dict``,
    ``fingerprint_dict`` and ``edge_dict``.

    Args:
        data_chunk: Iterable of text lines. Blank / whitespace-only lines are
            skipped.

    Returns:
        A tuple of four lists, aligned index-by-index (one entry per molecule):
        ``(fingerprints, adjacency, properties, molecular_sizes)`` where
        fingerprints and adjacency entries are NumPy arrays, properties are
        the raw (unparsed) property strings, and molecular sizes are
        ``len(atoms)`` as returned by ``create_atoms``.

    Raises:
        ValueError: If a non-blank line does not contain exactly two
            whitespace-separated fields, or if RDKit cannot parse the SMILES.
    """
    fingerprints_storage = []
    adjacency_storage = []
    properties_storage = []
    molecular_size_storage = []

    for line in data_chunk:
        fields = line.split()
        if not fields:
            # Blank line: nothing to parse, and unpacking it would fail.
            continue
        if len(fields) != 2:
            raise ValueError(
                f"Expected '<SMILES> <property>' but got {len(fields)} field(s): {line!r}"
            )
        # `prop` rather than `property`, so the builtin is not shadowed.
        smiles, prop = fields

        # MolFromSmiles returns None for SMILES it cannot parse; fail here with
        # the offending string instead of an opaque error inside AddHs.
        parsed = Chem.MolFromSmiles(smiles)
        if parsed is None:
            raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")
        mol = Chem.AddHs(parsed)

        atoms = create_atoms(mol, atom_dict)
        molecular_size = len(atoms)
        i_jbond_dict = create_ijbonddict(mol, bond_dict)
        # First argument is 0 (presumably the fingerprint radius -- TODO confirm
        # against simple_gnn.preprocess).
        # NOTE(review): edge_dict is a module-level dict; if extract_fingerprints
        # mutates it, every worker process updates its own private copy --
        # confirm this is acceptable for the arguments used here.
        fingerprints = extract_fingerprints(0, atoms, i_jbond_dict, fingerprint_dict, edge_dict)
        adjacency = Chem.GetAdjacencyMatrix(mol)

        fingerprints_storage.append(np.array(fingerprints))
        adjacency_storage.append(np.array(adjacency))
        properties_storage.append(prop)
        molecular_size_storage.append(molecular_size)

    return fingerprints_storage, adjacency_storage, properties_storage, molecular_size_storage

# Read and preprocess the data
with open('BRD4_train.txt', 'r') as f:
    data_original = f.read().strip().split('\n')

# Keep only usable rows, then cap at the first 100000:
#   * blank lines are dropped (they have no first token to inspect);
#   * rows whose SMILES (first token) contains '.' are dropped -- in SMILES
#     notation '.' separates disconnected fragments.
data_original = [
    line for line in data_original
    if line.strip() and '.' not in line.split()[0]
][0:100000]

# Split data into (at most) 10000 chunks. Ceiling division spreads the rows
# evenly; max(1, ...) keeps the range() step valid when there is no data.
num_chunks = 10000
chunk_size = max(1, len(data_original) // num_chunks + (len(data_original) % num_chunks > 0))
data_chunks = [data_original[i:i + chunk_size] for i in range(0, len(data_original), chunk_size)]

# Process the chunks in a pool of worker processes.
# * The context manager tears the pool down even if a worker raises.
# * imap (not imap_unordered) yields results in input order, so the saved
#   arrays follow the order of the input file and are identical between runs.
# * total is the real number of chunks, which can be smaller than num_chunks.
# NOTE(review): process_data is defined in the notebook itself; this relies on
# a fork-based start method for the workers to see it -- confirm on non-Linux.
with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
    results = list(tqdm(pool.imap(process_data, data_chunks), total=len(data_chunks)))

# Combine the per-chunk results into flat, index-aligned lists
fingerprints_storage = [item for sublist in results for item in sublist[0]]
adjacency_storage = [item for sublist in results for item in sublist[1]]
properties_storage = [item for sublist in results for item in sublist[2]]
molecular_size_storage = [item for sublist in results for item in sublist[3]]


def _as_object_array(items):
    """Pack ``items`` into a 1-D object array holding one element per item.

    ``np.array(items, dtype=object)`` would instead build a multi-dimensional
    array whenever every item happens to have the same shape; filling a
    pre-allocated array keeps the layout independent of the data.
    """
    packed = np.empty(len(items), dtype=object)
    for index, item in enumerate(items):
        packed[index] = item
    return packed


# Persist everything next to the notebook. Fingerprints and adjacency matrices
# vary in size per molecule, so they are stored as object arrays.
arr_fingerprints = _as_object_array(fingerprints_storage)
np.save('BRD4_train_fingerprints.npy', arr_fingerprints)

arr_adjacency = _as_object_array(adjacency_storage)
np.save('BRD4_train_adjacency.npy', arr_adjacency)

# Properties are saved as the raw strings read from the file.
arr_properties = np.array(properties_storage)
np.save('BRD4_train_properties.npy', arr_properties)

arr_molecular_size = np.array(molecular_size_storage)
np.save('BRD4_train_molecular_size.npy', arr_molecular_size)


100%|██████████| 10000/10000 [00:14<00:00, 691.02it/s]


In [4]:
import numpy as np
# Sanity check: reload the property labels written by the preprocessing cell.
# The path must match the name used by np.save there
# ('BRD4_train_properties.npy'); the previous 'BRD4_train.txt_properties.npy'
# is not produced by any cell in this notebook, so a fresh run would fail.
# The array holds plain strings, so it loads without allow_pickle.
np.load('BRD4_train_properties.npy')

array(['0', '0', '0', ..., '0', '0', '0'], dtype='<U1')