# 02 Basic classifier


## Local dependencies

In [1]:
import os
import sys

# Locate the project directory relative to the notebook's working directory
# and make it importable for the cells below.
# NOTE(review): this appends the `leash_bio_kaggle` directory itself; the later
# `from leash_bio_kaggle.mol import ...` needs the directory that *contains*
# the package on sys.path — confirm this matches the repository layout.
module_path = os.path.abspath(os.path.join('../../../', 'leash_bio_kaggle'))
if os.path.exists(module_path):
    # Avoid piling up duplicate entries when the cell is re-run.
    if module_path not in sys.path:
        sys.path.append(module_path)
else:
    raise RuntimeError("Cannot find the Python module `leash_bio_kaggle`")


## Data and processing

TODO: describe the dataset and the preprocessing applied in this section.

In [2]:
import pyarrow.dataset as ds
import pyarrow.compute as pc

# Parquet inputs, given relative to the notebook's working directory.
PATH_TRAIN_DATA = "../../../data/train.parquet"
PATH_TEST_DATA = "../../../data/test.parquet"
# Arrow dataset handles for the train and test files; the cells below build
# filtered scanners on top of DATA.
DATA = ds.dataset(source=PATH_TRAIN_DATA, format="parquet")
DATA_TEST = ds.dataset(source=PATH_TEST_DATA, format="parquet")

### Protein selection

In [3]:
# Value matched against the `protein_name` column when filtering the training data.
protein_selection = "sEH"

In [4]:
# Shared predicate: rows whose `protein_name` equals the selected protein.
_protein_rows = pc.field("protein_name") == protein_selection

# Two scanners over the training dataset, split on the `binds` column
# (1 for the first scanner, 0 for the second).
scanner_protein_bind = DATA.scanner(filter=_protein_rows & (pc.field("binds") == 1))
scanner_protein_no_bind = DATA.scanner(filter=_protein_rows & (pc.field("binds") == 0))

### Subsampling dataset

In [5]:
# Number of rows matched by each filtered scanner. These counts bound the
# subsample sizes checked and drawn in the following cells.
n_rows_bind = scanner_protein_bind.count_rows()
n_rows_no_bind = scanner_protein_no_bind.count_rows()

In [6]:
# Subsample configuration.
n_bind = 10_000  # number of `binds == 1` rows to sample
ratio_no_bind = 1.0  # `binds == 0` rows to sample per `binds == 1` row
n_no_bind = int(ratio_no_bind * n_bind)  # int() truncates any fractional part

In [7]:
# Guard: the subsample is drawn without replacement, so the requested sizes
# must not exceed the number of available rows. The messages name the actual
# variables (`n_rows_bind` / `n_rows_no_bind`) and include their values.
if n_bind > n_rows_bind:
    raise ValueError(
        f"`n_bind` ({n_bind}) is greater than `n_rows_bind` ({n_rows_bind})"
    )
if n_no_bind > n_rows_no_bind:
    raise ValueError(
        f"`n_no_bind` ({n_no_bind}) is greater than `n_rows_no_bind` ({n_rows_no_bind})"
    )


In [8]:
import numpy as np
import pyarrow as pa

In [9]:
# Draw the subsample from a seeded Generator so that Restart & Run All selects
# the same rows every time (the unseeded global `np.random.choice` did not).
SAMPLING_SEED = 42
sampling_rng = np.random.default_rng(SAMPLING_SEED)

# Row indices are sampled without replacement from each filtered scanner.
bind_table = scanner_protein_bind.take(
    indices=sampling_rng.choice(n_rows_bind, size=n_bind, replace=False)
)
no_bind_table = scanner_protein_no_bind.take(
    indices=sampling_rng.choice(n_rows_no_bind, size=n_no_bind, replace=False)
)

In [10]:
# Combine both samples into one table: the `binds == 1` rows come first,
# followed by the `binds == 0` rows.
table = pa.concat_tables([bind_table, no_bind_table])

## Features

TODO: describe the features computed in this section and why they were chosen.

In [11]:
from rdkit import Chem
from rdkit.Chem import AllChem

from leash_bio_kaggle.mol import clean_mol_str

In [12]:
def get_features(smiles: str, radius: int = 3, nBits: int = 2048):
    """Compute a Morgan fingerprint for a molecule given as a SMILES string.

    Args:
        smiles: SMILES representation of the molecule.
        radius: Radius forwarded to RDKit's Morgan fingerprint routine.
        nBits: Number of bits requested for the fingerprint bit vector.

    Returns:
        A NumPy array built from the RDKit bit vector (presumably 1-D with
        `nBits` entries — confirm against the installed RDKit version).

    Raises:
        ValueError: If `smiles` cannot be parsed into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail here
        # with the offending input instead of deep inside the fingerprint call.
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")
    features = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nBits)
    return np.array(features)

In [13]:
def process_row(row):
    """Return the feature vector for one table row.

    The row's `molecule_smiles` entry is passed through `clean_mol_str` and
    the result is handed to `get_features` with its default settings.
    """
    return get_features(clean_mol_str(row['molecule_smiles']))

In [14]:
from concurrent.futures import ThreadPoolExecutor

In [15]:

def split_table_into_batches(table, batch_size):
    """Yield consecutive slices of `table` holding at most `batch_size` rows.

    Slices are produced in order via `table.slice(offset, length)`; the last
    one is shorter when the row count is not a multiple of `batch_size`.
    """
    total = len(table)
    for start in range(0, total, batch_size):
        remaining = total - start
        # Clamp the final slice to the rows that are actually left.
        length = batch_size if batch_size < remaining else remaining
        yield table.slice(start, length)


def generate_features_parallel(table, num_workers=4, batch_size=1000):
    """Compute features for every row of `table` using a thread pool.

    The table is walked in batches of `batch_size` rows so that only one batch
    at a time is converted to Python objects. Each row is handed to
    `process_row` as a ``{column_name: value}`` dict and results keep the
    table's row order.

    Args:
        table: Table whose slices provide ``to_pylist()`` (e.g. a pyarrow Table).
        num_workers: Maximum number of worker threads.
        batch_size: Number of rows per batch.

    Returns:
        A NumPy array with the per-row outputs of `process_row` stacked
        vertically, one row per table row. An empty table yields an empty
        array of shape ``(0, 0)``.
    """
    all_features = []

    # One pool for the whole run: building and tearing down an executor for
    # every batch only adds overhead.
    # NOTE(review): threads share the GIL; whether this is faster than a plain
    # loop depends on the work done inside `process_row` — confirm with %%time.
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        for batch in split_table_into_batches(table, batch_size):
            # to_pylist() gives one {column: value} dict per row, which is
            # exactly the shape `process_row` expects.
            rows = batch.to_pylist()
            # executor.map returns results in input order and re-raises any
            # worker exception while the results are consumed.
            all_features.append(np.array(list(executor.map(process_row, rows))))

    if not all_features:
        # np.vstack raises on an empty sequence; return an empty matrix instead.
        return np.empty((0, 0))
    return np.vstack(all_features)

In [16]:
# Build the feature matrix for the sampled table using the function's
# defaults (4 workers, batches of 1000 rows).
features_array = generate_features_parallel(table)

## Training