In [2]:
import pandas as pd
import numpy as np
from descriptastorus.descriptors.DescriptorGenerator import MakeGenerator
import logging

In [28]:
def shift_zeros_right(row):
    """Pack the non-missing entries of ``row`` to the left.

    Despite the name, it is missing values (as judged by ``pd.isna``), not
    zeros, that get moved: every present entry keeps its relative order and
    all missing entries are appended after them.

    Args:
        row: An iterable of scalar values, e.g. one DataFrame row.

    Returns:
        A new ``pd.Series`` with a default integer index, holding the present
        values first and the missing values last.
    """
    present, missing = [], []
    for value in row:
        # Route each entry to the bucket matching its missing-ness.
        (missing if pd.isna(value) else present).append(value)
    return pd.Series(present + missing)

def parse_status(generator, smiles):
    """Run ``generator`` on one SMILES string and return its feature values.

    The first element of ``generator.process(smiles)`` is treated as a status
    flag and everything after it as the features.

    Args:
        generator: Object exposing ``process(smiles)``.
        smiles: The SMILES string to featurise.

    Returns:
        ``results[1:]`` (the features) when the result can be indexed. If the
        status flag is ``None`` a warning is logged and the features are still
        returned; per the note carried over from the original code, they are
        then default values for their type. Returns ``None`` (after logging a
        warning) when indexing the result raises ``TypeError``, e.g. because
        ``process`` returned ``None``.
    """
    results = generator.process(smiles)
    try:
        processed = results[0]
        features = results[1:]
    except TypeError:
        # The result was not subscriptable, so there is nothing to return.
        logging.warning("RDKit Error on smiles %s", smiles)
        return None
    if processed is None:
        logging.warning("Descriptastorus cannot process smiles %s", smiles)
    return features

def get_aggregate(mixture):
    """Summarise a stack of feature vectors with four statistics.

    Mean, variance, maximum and minimum are each taken along axis 0 of
    ``mixture`` and the four results are laid out one after another in that
    order.

    Args:
        mixture: Array-like whose axis 0 enumerates the members to aggregate.

    Returns:
        A flattened ``np.ndarray`` containing the mean block, then the
        variance block, then the max block, then the min block.
    """
    reducers = (np.mean, np.var, np.max, np.min)
    per_statistic = [reduce_fn(mixture, axis=0) for reduce_fn in reducers]
    return np.array(per_statistic).flatten()

# Descriptor set requested from descriptastorus' MakeGenerator.
_RDKIT_DESCRIPTOR_SET = "rdkit2dhistogramnormalized"
# Lazily filled cache so the generator is built once instead of once per row.
_rdkit_generator_cache = {}


def _get_rdkit_generator():
    """Return the shared descriptor generator, building it on first use."""
    if _RDKIT_DESCRIPTOR_SET not in _rdkit_generator_cache:
        _rdkit_generator_cache[_RDKIT_DESCRIPTOR_SET] = MakeGenerator((_RDKIT_DESCRIPTOR_SET,))
    return _rdkit_generator_cache[_RDKIT_DESCRIPTOR_SET]


def return_rdkit_aggregates(row, generator=None):
    """Aggregate the descriptor vectors of every SMILES in one mixture row.

    The first two positions of ``row`` are skipped (presumably the identifier
    columns, e.g. 'Dataset' and 'Mixture Label' -- confirm against the input
    file); every remaining non-null entry is treated as a SMILES string,
    featurised with ``parse_status`` and the per-molecule features are
    combined with ``get_aggregate``.

    Args:
        row: One row of the mixture table (a ``pd.Series`` or any 1-D
            array-like accepted by ``pd.Series``).
        generator: Optional descriptor generator exposing ``process(smiles)``.
            When omitted, a cached ``rdkit2dhistogramnormalized`` generator is
            used so it is not rebuilt for every row.

    Returns:
        ``pd.Series`` holding the flattened mean / var / max / min blocks
        produced by ``get_aggregate``. If the row contains no SMILES that
        could be featurised, a warning is logged and an empty float Series is
        returned instead of failing inside numpy.
    """
    if generator is None:
        generator = _get_rdkit_generator()

    # Slice off the two leading columns *before* dropping nulls: filtering
    # first would shift positions whenever one of those two entries is null,
    # and the slice would then discard a real SMILES.
    smiles_values = pd.Series(row).iloc[2:].dropna()

    per_molecule = [parse_status(generator, smiles) for smiles in smiles_values]
    # parse_status returns None when the generator result could not be
    # unpacked; such entries cannot be aggregated, so leave them out.
    usable = [features for features in per_molecule if features is not None]

    if not usable:
        logging.warning("No parsable SMILES found in mixture row; returning empty aggregate")
        return pd.Series(dtype=float)

    return pd.Series(get_aggregate(usable))




In [25]:
# Load the raw mixture definitions.
mixture_df = pd.read_csv("Mixture_Definitions_smi_Training_set.csv")

# Left-pack every row so that missing entries trail the present ones. The
# row-wise apply yields positionally labelled columns, so the original header
# is restored afterwards.
cols = mixture_df.columns
packed_rows = mixture_df.apply(shift_zeros_right, axis=1)
packed_rows.columns = cols
mixture_df = packed_rows

# Persist the cleaned table without writing the index.
mixture_df.to_csv("../competition_train/mixture_smi_definitions_clean.csv", index=False)

In [6]:
# Build one aggregated descriptor Series per row of mixture_df, in row order.
smi_series_list = [
    return_rdkit_aggregates(mixture_row) for _, mixture_row in mixture_df.iterrows()
]



In [27]:
# Stack the per-row aggregate Series into a frame (one row per row of
# mixture_df, positional integer column labels).
smi_df = pd.DataFrame(smi_series_list)

# get_aggregate emits four equally sized blocks in the order mean, var, max,
# min. Derive the per-statistic width from the data rather than hard-coding
# it, so a different descriptor count is labelled correctly.
stat_names = ('mean', 'var', 'max', 'min')
n_descriptors, leftover = divmod(smi_df.shape[1], len(stat_names))
if leftover:
    raise ValueError(
        f"Expected a multiple of {len(stat_names)} aggregate columns, got {smi_df.shape[1]}"
    )

# Attach the identifier columns; the join aligns on the row index. join()
# already returns a new frame, so no explicit copy is needed.
new_smi_df = mixture_df[['Dataset', 'Mixture Label']].join(smi_df, how='inner')
new_smi_df.columns = ['Dataset', 'Mixture Label'] + [
    f'{stat}_{i}' for stat in stat_names for i in range(n_descriptors)
]

new_smi_df.to_csv("../competition_train/mixture_rdkit_definitions_clean.csv", index=False)