In [13]:
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from drfp import DrfpEncoder
from functools import partial
from typing import Iterable
from sklearn.utils import shuffle
from sklearn import preprocessing
import multiprocessing

In [None]:
data = pd.read_csv("amidation_carboxylicAcid_primaryAmine_ECFP_2048.csv")

In [6]:
data_1 = data[data['yield']<100]

In [7]:
# Add the reagents to the reactants and rejoin with >> 
data_1['Reactants'] = data_1['smiles(IREACTION)(SMILES)'].str.split('>', expand=True)[0]
data_1['Agents'] = data_1['smiles(IREACTION)(SMILES)'].str.split('>', expand=True)[1]
data_1['Products'] = data_1['smiles(IREACTION)(SMILES)'].str.split('>', expand=True)[2]
data_1['Reactants+Agents'] = data_1['Reactants'] + "." + data_1["Agents"]
data_1['Reaction'] = data_1['Reactants+Agents'] + ">>" + data_1["Products"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1['Reactants'] = data_1['smiles(IREACTION)(SMILES)'].str.split('>', expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1['Agents'] = data_1['smiles(IREACTION)(SMILES)'].str.split('>', expand=True)[1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_1['Products'] = data_1['s

In [8]:
# shuffle the dataframe
data_2 = shuffle(data_1)

In [9]:
def encode(smiles: Iterable, length: int = 2048, radius: int = 3) -> np.ndarray: 
    return DrfpEncoder.encode(
        smiles,
        n_folded_length=length,
        radius=radius,
        rings=True,
    )


def encode_dataset(smiles: Iterable, length: int, radius: int) -> np.ndarray:
    """Encode the reaction SMILES to drfp"""

    cpu_count = (
        multiprocessing.cpu_count()
    )  # Data gets too big for piping when splitting less in python < 2.8

    # Split reaction SMILES for multiprocessing
    k, m = divmod(len(smiles), cpu_count)
    smiles_chunks = (
        smiles[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)]
        for i in range(cpu_count)
    )

    # Run the fingerprint generation in parallel
    results = []
    with multiprocessing.Pool(cpu_count) as p:
        results = p.map(partial(encode, length=length, radius=radius), smiles_chunks)

    return np.array([item for s in results for item in s])


In [None]:
drfps = encode_dataset(data_2['Reaction'], length=2048, radius=3)

In [None]:
kf = KFold(n_splits=2)