In [2]:
from multiprocessing import Pool
import pyarrow.parquet as pq
import pymongo
from pymongo import MongoClient
from rdkit import Chem
import multiprocessing
from tqdm.auto import tqdm
from simple_gnn.preprocess import create_atoms, create_ijbonddict, extract_fingerprints
import numpy as np
from bson.binary import Binary
import pickle
import pyarrow.compute as pc

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
parquet_file = pq.ParquetFile('train.parquet')
total_records = parquet_file.metadata.num_rows  # row count as reported by the file metadata
# Print the schema so the field names and types can be checked by eye.
print(parquet_file.schema)

# Gather the distinct values of each building-block column, reading one
# column per pass over the file.
train_uniques = {}
for smiles_column in ('buildingblock1_smiles', 'buildingblock2_smiles', 'buildingblock3_smiles'):
    column_data = parquet_file.read(columns=[smiles_column])
    unique_values = pc.unique(column_data.column(0))
    train_uniques[smiles_column] = unique_values.to_pylist()

unique_list1_train = train_uniques['buildingblock1_smiles']
unique_list2_train = train_uniques['buildingblock2_smiles']
unique_list3_train = train_uniques['buildingblock3_smiles']

<pyarrow._parquet.ParquetSchema object at 0x7d01c52ed240>
required group field_id=-1 schema {
  optional int64 field_id=-1 id;
  optional binary field_id=-1 buildingblock1_smiles (String);
  optional binary field_id=-1 buildingblock2_smiles (String);
  optional binary field_id=-1 buildingblock3_smiles (String);
  optional binary field_id=-1 molecule_smiles (String);
  optional binary field_id=-1 protein_name (String);
  optional int64 field_id=-1 binds;
}



In [3]:
parquet_file = pq.ParquetFile('test.parquet')
total_records = parquet_file.metadata.num_rows  # row count as reported by the file metadata
# Print the schema so the field names and types can be checked by eye.
print(parquet_file.schema)

# Gather the distinct values of each building-block column, reading one
# column per pass over the file.
test_uniques = {}
for smiles_column in ('buildingblock1_smiles', 'buildingblock2_smiles', 'buildingblock3_smiles'):
    column_data = parquet_file.read(columns=[smiles_column])
    unique_values = pc.unique(column_data.column(0))
    test_uniques[smiles_column] = unique_values.to_pylist()

unique_list1_test = test_uniques['buildingblock1_smiles']
unique_list2_test = test_uniques['buildingblock2_smiles']
unique_list3_test = test_uniques['buildingblock3_smiles']

<pyarrow._parquet.ParquetSchema object at 0x7cc31160b980>
required group field_id=-1 schema {
  optional int64 field_id=-1 id;
  optional binary field_id=-1 buildingblock1_smiles (String);
  optional binary field_id=-1 buildingblock2_smiles (String);
  optional binary field_id=-1 buildingblock3_smiles (String);
  optional binary field_id=-1 molecule_smiles (String);
  optional binary field_id=-1 protein_name (String);
}



In [4]:
# Pool the distinct building blocks of all three positions into one set,
# separately for the training data and the test data.
train_unique = set().union(unique_list1_train, unique_list2_train, unique_list3_train)
test_unique = set().union(unique_list1_test, unique_list2_test, unique_list3_test)

In [8]:
# Every building block that occurs in either the training or the test pool.
u = {*train_unique, *test_unique}

In [9]:
import os
# Convert the set to a list. Sorting pins the order: iterating a set of strings
# can differ between interpreter sessions (string hashing is randomized), which
# would otherwise put different entries into each chunk file on every run.
elements_list = sorted(u)

# Function to chunk the list
def chunk_list(lst, n):
    """Yield successive n-sized chunks from lst.

    Args:
        lst: A sliceable sequence that supports ``len()`` (e.g. a list).
        n: Maximum chunk length; must be a positive integer.

    Yields:
        Consecutive slices ``lst[i:i + n]``. The final slice may be shorter
        than ``n``. Nothing is yielded when ``lst`` is empty.

    Raises:
        ValueError: If ``n`` is less than 1. Because this is a generator, the
            error surfaces when iteration starts, not at call time.
    """
    if n < 1:
        # A negative step makes range() empty, which would silently drop every
        # element; a zero step fails with an unrelated range() message.
        raise ValueError(f"chunk size n must be a positive integer, got {n!r}")
    for start in range(0, len(lst), n):
        yield lst[start:start + n]

# Make sure the output folder is present; an already existing folder is fine.
folder_name = 'u_files'
os.makedirs(folder_name, exist_ok=True)

# Write the list out 200 entries at a time, one entry per line, into files
# numbered from 1 (u1.txt, u2.txt, ...).
file_number = 0
for file_number, smiles_chunk in enumerate(chunk_list(elements_list, 200), start=1):
    chunk_path = os.path.join(folder_name, f'u{file_number}.txt')
    with open(chunk_path, 'w') as chunk_file:
        chunk_file.write('\n'.join(smiles_chunk))
# Leave the counter one past the last file written.
file_number += 1

print(f'Files saved in directory: {folder_name}')

Files saved in directory: u_files


In [8]:
from pymongo import MongoClient

# Connect to the MongoDB server on localhost and select the 'train_metadata'
# collection of the 'belka' database.
client = MongoClient('mongodb://localhost:27017/')
db = client['belka']
collection = db['train_metadata']

# Match documents whose 'binds' field equals 1 and fetch only their
# 'molecule_smiles' field (the '_id' field is excluded).
query = {"binds": 1}
projection = {"molecule_smiles": 1, "_id": 0}

# Distinct 'molecule_smiles' values across the matching documents; documents
# that lack the field are skipped.
unique_values = {
    document['molecule_smiles']
    for document in collection.find(query, projection)
    if 'molecule_smiles' in document
}


In [12]:
import pickle

# Serialize unique_values to unique_binds.pkl with pickle's default protocol.
with open('unique_binds.pkl', 'wb') as pickle_out:
    pickle.dump(unique_values, pickle_out)

In [5]:
# Number of distinct building blocks in the test pool (all three positions combined).
len(test_unique)

2110

In [16]:
# Number of test building blocks that never occur in the training pool.
len(test_unique.difference(train_unique))

965

In [19]:
# Number of distinct values in the test set's buildingblock1_smiles column.
len(set(unique_list1_test))

341

In [18]:
# Number of test buildingblock1 values that are absent from the training buildingblock1 values.
len(set(unique_list1_test).difference(unique_list1_train))

70

In [24]:
import random

# Fixed seed so the sampled building blocks are the same on every run;
# change it deliberately to draw a different split.
SPLIT_SEED = 42
# One out of every HOLDOUT_DIVISOR unique building blocks (floor division)
# is sampled per position.
HOLDOUT_DIVISOR = 25

# A private generator keeps the split independent of any other use of the
# global `random` state elsewhere in the session.
split_rng = random.Random(SPLIT_SEED)
l1_test = split_rng.sample(unique_list1_train, len(unique_list1_train) // HOLDOUT_DIVISOR)
l2_test = split_rng.sample(unique_list2_train, len(unique_list2_train) // HOLDOUT_DIVISOR)
l3_test = split_rng.sample(unique_list3_train, len(unique_list3_train) // HOLDOUT_DIVISOR)

In [25]:
import pickle

# Store the three sampled lists together as one tuple in split.pickle,
# using the highest pickle protocol available.
sampled_lists = (l1_test, l2_test, l3_test)
with open('split.pickle', 'wb') as split_file:
    pickle.dump(sampled_lists, split_file, protocol=pickle.HIGHEST_PROTOCOL)

In [26]:
from rdkit import Chem
from rdkit.Chem import AllChem, MACCSkeys

In [29]:
# Parse one example molecule from its SMILES string, then add explicit hydrogens.
example_smiles = 'C#CCOc1ccc(CNc2nc(NCC3CCCN3c3cccnn3)nc(N[C@@H](CC#C)CC(=O)N[Dy])n2)cc1'
parsed_mol = Chem.MolFromSmiles(example_smiles)
a = Chem.AddHs(parsed_mol)

In [31]:
# Morgan fingerprint of `a` (radius 2, 2048 bits) as a bit vector.
# Built with the fingerprint-generator API because
# AllChem.GetMorganFingerprintAsBitVect is deprecated in recent RDKit releases.
# NOTE(review): expected to give the same bits as the old call with default
# options - confirm against the installed RDKit version.
from rdkit.Chem import rdFingerprintGenerator

morgan_generator = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
b = morgan_generator.GetFingerprint(a)

In [40]:
# Show the fingerprint as a list of booleans, one entry per bit position.
[bool(bit) for bit in b.ToList()]

[False,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False