In [1]:
import numpy as np
import pyarrow.parquet as pq
import pandas as pd
import pyarrow.compute as pc
from tqdm.auto import tqdm
import cv2
from multiprocessing import Pool

  from .autonotebook import tqdm as notebook_tqdm


In [39]:
parquet_file = pq.ParquetFile('../../train.parquet')
total_records = parquet_file.metadata.num_rows
# Optionally, you can inspect the schema to confirm field names and types
print(parquet_file.schema)

column_data = parquet_file.read(columns=['buildingblock1_smiles'])
unique_values = pc.unique(column_data.column(0))
unique_list1_train = unique_values.to_pylist()

column_data = parquet_file.read(columns=['buildingblock2_smiles'])
unique_values = pc.unique(column_data.column(0))
unique_list2_train = unique_values.to_pylist()

column_data = parquet_file.read(columns=['buildingblock3_smiles'])
unique_values = pc.unique(column_data.column(0))
unique_list3_train = unique_values.to_pylist()

<pyarrow._parquet.ParquetSchema object at 0x72ae81e65b80>
required group field_id=-1 schema {
  optional int64 field_id=-1 id;
  optional binary field_id=-1 buildingblock1_smiles (String);
  optional binary field_id=-1 buildingblock2_smiles (String);
  optional binary field_id=-1 buildingblock3_smiles (String);
  optional binary field_id=-1 molecule_smiles (String);
  optional binary field_id=-1 protein_name (String);
  optional int64 field_id=-1 binds;
}



KeyboardInterrupt: 

In [31]:
import pickle
with open('../spl1.pkl','wb') as f:
    pickle.dump(unique_list1_train,f)
with open('../spl2.pkl','wb') as f:
    pickle.dump(unique_list2_train,f)
with open('../spl3.pkl','wb') as f:
    pickle.dump(unique_list3_train,f)

In [40]:
blocks = list(set(unique_list2_train).union(set(unique_list3_train).union(set(unique_list1_train))))

In [4]:
len(blocks)

2110

In [5]:
blocks_to_i = {x: i for i, x in enumerate(blocks)}

In [6]:
parquet_file = pq.ParquetFile('../../test.parquet')

In [7]:
del column_data

In [9]:
def process_split(x):
    x = x.to_pandas()
    graph = np.zeros((2110,2110), dtype=bool)
    
    for k, v in x.iterrows():       
        graph[blocks_to_i[v['buildingblock1_smiles']],blocks_to_i[v['buildingblock1_smiles']]] = True
        graph[blocks_to_i[v['buildingblock2_smiles']],blocks_to_i[v['buildingblock2_smiles']]] = True
        graph[blocks_to_i[v['buildingblock3_smiles']],blocks_to_i[v['buildingblock3_smiles']]] = True

        graph[blocks_to_i[v['buildingblock1_smiles']],blocks_to_i[v['buildingblock2_smiles']]] = True
        graph[blocks_to_i[v['buildingblock2_smiles']],blocks_to_i[v['buildingblock1_smiles']]] = True

        graph[blocks_to_i[v['buildingblock1_smiles']],blocks_to_i[v['buildingblock3_smiles']]] = True
        graph[blocks_to_i[v['buildingblock3_smiles']],blocks_to_i[v['buildingblock1_smiles']]] = True

        graph[blocks_to_i[v['buildingblock2_smiles']],blocks_to_i[v['buildingblock3_smiles']]] = True
        graph[blocks_to_i[v['buildingblock3_smiles']],blocks_to_i[v['buildingblock2_smiles']]] = True
    return graph

In [10]:
BATCH = 100_000
total_records = parquet_file.metadata.num_rows
total_batches = (total_records + BATCH - 1) // BATCH  # Ceiling division  
with Pool(processes=30) as pool:  # Adjust the number of processes according to your system
        # Use imap_unordered to process data as it is read
        results = list(tqdm(pool.imap_unordered(process_split, parquet_file.iter_batches(batch_size=BATCH)),
                            total=total_batches, desc="Processing batches"))

Processing batches: 100%|██████████| 17/17 [00:10<00:00,  1.64it/s]


In [11]:
results_stack = np.stack(results)

In [12]:
results_stack.shape

(17, 2110, 2110)

In [13]:
final = results_stack.any(axis=0)

In [14]:
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import connected_components

In [15]:
sparse_graph = csr_matrix(final.astype(int))

In [16]:
n_components, labels = connected_components(csgraph=sparse_graph, directed=False, return_labels=True)

print(f"Number of connected components: {n_components}")

Number of connected components: 4


In [17]:
np.unique(labels, return_counts=True)

(array([0, 1, 2, 3], dtype=int32), array([ 859, 1145,   53,   53]))

In [22]:
len(labels)

2110

In [19]:
TEST_MAIN = '/home/anonymous/belka/test.parquet'
test = pd.read_parquet(TEST_MAIN)

In [29]:
def get_score(row):
    return labels[blocks_to_i[row['buildingblock1_smiles']]]
test['island'] = test.apply(get_score, axis=1)


In [56]:
test.to_parquet('/home/anonymous/belka/test_with_islands.parquet',index=False)

In [55]:
sum((test['island']==) == ((test['buildingblock3_smiles'].isin(set(blocks))) | (test['buildingblock2_smiles'].isin(set(blocks))) | (test['buildingblock1_smiles'].isin(set(blocks)))))

533813

0           True
1           True
2           True
3           True
4           True
           ...  
1674891    False
1674892    False
1674893    False
1674894    False
1674895    False
Length: 1674896, dtype: bool

In [None]:
# import pickle
# with open('test_connected.pickle','wb') as f:
#     pickle.dump((labels, np.unique(labels, return_counts=True), blocks_to_i), f)

In [None]:
import networkx as nx

In [None]:
graph = nx.Graph(sparse_graph)
cut_value, (set1, set2) = nx.stoer_wagner(graph)

stats