In [1]:
import pickle
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 1000)

import sys
sys.path.append('../scripts/')
from utils import *
from config import *
import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2

## Load raw data

In [2]:
DATA_PATH = '../data/Mordred_RDKit_pybel.csv'

df = pd.read_csv(DATA_PATH, sep=';', index_col=0, low_memory=False)

metadata = df.columns[2:9].to_list()
features = df.columns[10:].to_list()
features = sorted(features) # necessary to have reproducible results with LightGBM
target = 'Taste'
rows = df[df[target].isin(['Sweet', 'Bitter'])].index.to_list()

print(f'Metadata columns number: {len(metadata)}')
print(f'Features columns number: {len(features)}')
print(f'DataFrame dimensions: {df.loc[rows, features+[target]].shape}')

Metadata columns number: 7
Features columns number: 2059
DataFrame dimensions: (2686, 2060)


## Data cleaning

### Compute baseline AUROC

In [3]:
train = df.loc[rows,features+[target]].copy()
train.reset_index(drop=True, inplace=True)
train[target].replace({'Bitter': 0, 'Sweet': 1}, inplace=True)
custom_cv = create_folds(train=train, features=features, target=target, num_folds=NUM_FOLDS, shuffle=True, seed=SEED)
evaluate(params=PARAMS, train=train, features=features, target=target, folds=custom_cv)

Training until validation scores don't improve for 20 rounds
Fold 1 AUC: 0.95235
Fold 2 AUC: 0.94892
Fold 3 AUC: 0.94635
Fold 4 AUC: 0.93690
Fold 5 AUC: 0.96570
Folds AUC: 0.95004+-0.00936
Total AUC: 0.95066


### Removing duplicated columns from feature extraction

In [4]:
dup_cols2 = [c for c in features if '.1' in c]
dup_cols1 = [c[:-2] for c in dup_cols2]

for c1,c2 in zip(dup_cols1, dup_cols2):
    if df[c1].equals(df[c2]):
        features.remove(c2)

In [5]:
print(f'Metadata columns number: {len(metadata)}')
print(f'Features columns number: {len(features)}')
print(f'DataFrame dimensions: {df.loc[rows, features+[target]].shape}')

Metadata columns number: 7
Features columns number: 2004
DataFrame dimensions: (2686, 2005)


The result shouldn't change having only removed redundant columns.

In [6]:
train = df.loc[rows,features+[target]].copy()
train.reset_index(drop=True, inplace=True)
train[target].replace({'Bitter': 0, 'Sweet': 1}, inplace=True)
custom_cv = create_folds(train=train, features=features, target=target, num_folds=NUM_FOLDS, shuffle=True, seed=SEED)
evaluate(params=PARAMS, train=train, features=features, target=target, folds=custom_cv)

Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[60]	cv_agg's train auc: 0.999781 + 2.89423e-05	cv_agg's valid auc: 0.950044 + 0.00935955
Fold 1 AUC: 0.95235
Fold 2 AUC: 0.94892
Fold 3 AUC: 0.94635
Fold 4 AUC: 0.93690
Fold 5 AUC: 0.96570
Folds AUC: 0.95004+-0.00936
Total AUC: 0.95066


### Removing duplicated rows with same target

In [7]:
df_tmp1 = df[features+[target]].copy()
df_tmp2 = df[metadata].copy()

df_tmp1 = df_tmp1[df_tmp1.duplicated(keep=False)].sort_values(features).copy()
print(f'Number of duplicated rows with same target: {df_tmp1.shape[0]}\n')

Number of duplicated rows with same target: 647



In [8]:
df_tmp = pd.merge(df_tmp1, df_tmp2[['Name','Reference']], left_index=True, right_index=True)
df_tmp.head(6)

Unnamed: 0,AATS0Z,AATS0are,AATS0d,AATS0dv,AATS0i,AATS0m,AATS0p,AATS0pe,AATS0s,AATS0se,...,qed,rotors,s,sbonds,smarts,tbonds,title,Taste,Name,Reference
2733,14.72,5.5876,1.84,2.56,166.339543,57.05352,1.202728,5.651744,3.01,7.246851,...,0.443884,5,,7,,0,,Sweet,Oct-2-en-1-ol,The Good Scents Company Database
2744,14.72,5.5876,1.84,2.56,166.339543,57.05352,1.202728,5.651744,3.01,7.246851,...,0.443884,5,,7,,0,,Sweet,trans-2-Octen-1-Ol,The Good Scents Company Database
1509,15.368421,5.675263,1.789474,2.947368,166.591013,59.670816,1.195366,5.733084,3.513158,7.327178,...,0.530402,3,,5,,0,,Sweet,trans-2-Hexen-1-Ol,Fenaroli Handbook of Flavor Ingredient
2632,15.368421,5.675263,1.789474,2.947368,166.591013,59.670816,1.195366,5.733084,3.513158,7.327178,...,0.530402,3,,5,,0,,Sweet,"2-Hexen-1-ol, (2Z)-",The Good Scents Company Database
466,15.44186,5.545112,2.883721,3.023256,165.866382,60.064455,1.297987,5.624667,1.531654,7.167234,...,0.63463,0,,20,,0,,Bitter,72325,BitterDB
662,15.44186,5.545112,2.883721,3.023256,165.866382,60.064455,1.297987,5.624667,1.531654,7.167234,...,0.63463,0,,20,,0,,Bitter,644020,BitterDB


In [9]:
to_drop = df_tmp1[df_tmp1.duplicated(keep='first')].index
df.drop(index=to_drop, inplace=True)
rows = list(set(rows) - set(to_drop))

In [10]:
print(f'Metadata columns number: {len(metadata)}')
print(f'Features columns number: {len(features)}')
print(f'DataFrame dimensions: {df.loc[rows, features+[target]].shape}')

Metadata columns number: 7
Features columns number: 2004
DataFrame dimensions: (2265, 2005)


I expect a performance degradation having removed rows with the same features and the same target as if one ended up in the train fold and the other in the validation fold, the model would easily predict the latter correctly.

In [11]:
train = df.loc[rows,features+[target]].copy()
train.reset_index(drop=True, inplace=True)
train[target].replace({'Bitter': 0, 'Sweet': 1}, inplace=True)
custom_cv = create_folds(train=train, features=features, target=target, num_folds=NUM_FOLDS, shuffle=True, seed=SEED)
evaluate(params=PARAMS, train=train, features=features, target=target, folds=custom_cv)

Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[66]	cv_agg's train auc: 0.999795 + 3.92946e-05	cv_agg's valid auc: 0.928939 + 0.00613945
Fold 1 AUC: 0.93118
Fold 2 AUC: 0.93935
Fold 3 AUC: 0.92275
Fold 4 AUC: 0.92852
Fold 5 AUC: 0.92289
Folds AUC: 0.92894+-0.00614
Total AUC: 0.92851


### Removing duplicated rows with same target

In [12]:
df_tmp1 = df[features].copy()
df_tmp2 = df[metadata+[target]].copy()

In [13]:
df_tmp1 = df_tmp1[df_tmp1.duplicated(keep=False)].sort_values(features).copy()
print(f'Number of duplicated rows with different target: {df_tmp1.shape[0]}\n')

Number of duplicated rows with different target: 95



In [14]:
df_tmp = pd.merge(df_tmp1, df_tmp2[['Name','Reference']+[target]], left_index=True, right_index=True)
df_tmp.head(6)

Unnamed: 0,AATS0Z,AATS0are,AATS0d,AATS0dv,AATS0i,AATS0m,AATS0p,AATS0pe,AATS0s,AATS0se,...,qed,rotors,s,sbonds,smarts,tbonds,title,Name,Reference,Taste
661,15.241379,5.581724,2.068966,3.103448,164.893537,59.203392,1.259836,5.654434,2.976054,7.230663,...,0.617131,4,,8,,0,,643820,BitterDB,Bitter
1372,15.241379,5.581724,2.068966,3.103448,164.893537,59.203392,1.259836,5.654434,2.976054,7.230663,...,0.617131,4,,8,,0,,GERANIOL,Fenaroli Handbook of Flavor Ingredient,Sweet
350,16.296296,5.636667,3.037037,4.0,163.41027,63.513564,1.320223,5.714763,3.386317,7.268604,...,0.520631,0,,11,,0,,14525,BitterDB,Bitter
2595,16.296296,5.636667,3.037037,4.0,163.41027,63.513564,1.320223,5.714763,3.386317,7.268604,...,0.520631,0,,11,,0,,(+)-Fenchone,The Good Scents Company Database,Sweet
784,16.702703,5.735946,2.216216,4.27027,164.524799,65.127648,1.279016,5.802154,3.367117,7.365847,...,0.494023,7,,11,,0,,5355853,BitterDB,Bitter
1470,16.702703,5.735946,2.216216,4.27027,164.524799,65.127648,1.279016,5.802154,3.367117,7.365847,...,0.494023,7,,11,,0,,Neryl propionate,Fenaroli Handbook of Flavor Ingredient,Sweet


In [15]:
to_drop = df_tmp1.index
df.drop(index=to_drop, inplace=True)
rows = list(set(rows) - set(to_drop))

In [16]:
print(f'Metadata columns number: {len(metadata)}')
print(f'Features columns number: {len(features)}')
print(f'DataFrame dimensions: {df.loc[rows, features+[target]].shape}')

Metadata columns number: 7
Features columns number: 2004
DataFrame dimensions: (2195, 2005)


Now I expect an improvement in performance having removed lines with the same features but different target.

In [17]:
train = df.loc[rows,features+[target]].copy()
train.reset_index(drop=True, inplace=True)
train[target].replace({'Bitter': 0, 'Sweet': 1}, inplace=True)
custom_cv = create_folds(train=train, features=features, target=target, num_folds=NUM_FOLDS, shuffle=True, seed=SEED)
evaluate(params=PARAMS, train=train, features=features, target=target, folds=custom_cv)

Training until validation scores don't improve for 20 rounds
Fold 1 AUC: 0.94884
Fold 2 AUC: 0.95228
Fold 3 AUC: 0.94589
Fold 4 AUC: 0.94860
Fold 5 AUC: 0.94749
Folds AUC: 0.94862+-0.00210
Total AUC: 0.94813


### Removing nan columns

In [18]:
df_tmp = df.loc[rows,features].copy()

In [19]:
tmp = df_tmp.isnull().mean()
to_drop = tmp[tmp==1].index.to_list()

In [20]:
features = sorted(list(set(features) - set(to_drop)))
print(f'Metadata columns number: {len(metadata)}')
print(f'Features columns number: {len(features)}')
print(f'DataFrame dimensions: {df.loc[rows, features+[target]].shape}')

Metadata columns number: 7
Features columns number: 1690
DataFrame dimensions: (2195, 1691)


The result shouldn't change having only removed not useful columns.

In [21]:
train = df.loc[rows,features+[target]].copy()
train.reset_index(drop=True, inplace=True)
train[target].replace({'Bitter': 0, 'Sweet': 1}, inplace=True)
custom_cv = create_folds(train=train, features=features, target=target, num_folds=NUM_FOLDS, shuffle=True, seed=SEED)
evaluate(params=PARAMS, train=train, features=features, target=target, folds=custom_cv)

Training until validation scores don't improve for 20 rounds
Fold 1 AUC: 0.94884
Fold 2 AUC: 0.95228
Fold 3 AUC: 0.94589
Fold 4 AUC: 0.94860
Fold 5 AUC: 0.94749
Folds AUC: 0.94862+-0.00210
Total AUC: 0.94813


### Removing columns with no variance (constant features)

In [22]:
df_tmp = df.loc[rows, features].copy()
tmp = df_tmp.var()
to_drop = tmp[tmp == 0].index.to_list()

In [23]:
features = sorted(list(set(features) - set(to_drop)))
print(f'Metadata columns number: {len(metadata)}')
print(f'Features columns number: {len(features)}')
print(f'DataFrame dimensions: {df.loc[rows, features+[target]].shape}')

Metadata columns number: 7
Features columns number: 1555
DataFrame dimensions: (2195, 1556)


The result shouldn't change having only removed not useful columns.

In [24]:
train = df.loc[rows,features+[target]].copy()
train.reset_index(drop=True, inplace=True)
train[target].replace({'Bitter': 0, 'Sweet': 1}, inplace=True)
custom_cv = create_folds(train=train, features=features, target=target, num_folds=NUM_FOLDS, shuffle=True, seed=SEED)
evaluate(params=PARAMS, train=train, features=features, target=target, folds=custom_cv)

Training until validation scores don't improve for 20 rounds
Fold 1 AUC: 0.94884
Fold 2 AUC: 0.95228
Fold 3 AUC: 0.94589
Fold 4 AUC: 0.94860
Fold 5 AUC: 0.94749
Folds AUC: 0.94862+-0.00210
Total AUC: 0.94813


### Removing columns with very low variance (quasi-constant features)

In [25]:
df_tmp = df.loc[rows, features].copy()

th = 0.99
to_drop = []

for col in features:
    top_value = (df[col].value_counts(dropna=False)/len(df)).sort_values(ascending=False).iloc[0]
    if top_value > th:
        to_drop.append(col)

In [26]:
features = sorted(list(set(features) - set(to_drop)))
print(f'Metadata columns number: {len(metadata)}')
print(f'Features columns number: {len(features)}')
print(f'DataFrame dimensions: {df.loc[rows, features+[target]].shape}')

Metadata columns number: 7
Features columns number: 1432
DataFrame dimensions: (2195, 1433)


The result shouldn't change having only removed not useful columns.

In [27]:
train = df.loc[rows,features+[target]].copy()
train.reset_index(drop=True, inplace=True)
train[target].replace({'Bitter': 0, 'Sweet': 1}, inplace=True)
custom_cv = create_folds(train=train, features=features, target=target, num_folds=NUM_FOLDS, shuffle=True, seed=SEED)
evaluate(params=PARAMS, train=train, features=features, target=target, folds=custom_cv)

Training until validation scores don't improve for 20 rounds
Fold 1 AUC: 0.94884
Fold 2 AUC: 0.95228
Fold 3 AUC: 0.94589
Fold 4 AUC: 0.94860
Fold 5 AUC: 0.94749
Folds AUC: 0.94862+-0.00210
Total AUC: 0.94813


### Collapsing duplicate columns

In [28]:
df_tmp = df.loc[rows, features].T.copy()

In [29]:
tmp = df_tmp[df_tmp.duplicated(keep=False)]

dup_columns = tmp.groupby(list(tmp), dropna=False).apply(lambda x: tuple(x.index)).to_list()
print('Duplicated columns:\n', dup_columns)

Duplicated columns:
 [('NssssN', 'fr_quatN'), ('n11FARing', 'n11FRing'), ('NsF', 'nF'), ('n9FaHRing', 'n9FaRing'), ('nBondsT', 'tbonds'), ('fr_halogen', 'nX'), ('NddsN', 'fr_nitro'), ('NaaNH', 'fr_Ar_NH', 'fr_Nhpyrrole'), ('n5aHRing', 'n5aRing'), ('NsNH2', 'fr_NH2'), ('fr_epoxide', 'n3AHRing', 'n3HRing'), ('n3ARing', 'n3Ring'), ('NumHDonors', 'nHBDon'), ('NumRotatableBonds', 'nRot'), ('abonds', 'nAromBond', 'nBondsA'), ('FCSP3', 'FractionCSP3'), ('MolLogP', 'SLogP'), ('RingCount', 'nRing'), ('NumHAcceptors', 'nHBAcc'), ('NumHeteroatoms', 'nHetero'), ('dbonds', 'nBondsD'), ('Chi1', 'Xp-1d'), ('MaxAbsEStateIndex', 'MaxEStateIndex'), ('HeavyAtomCount', 'nHeavyAtom'), ('MWC01', 'nBondsO'), ('SpAD_A', 'SpAbs_A'), ('TPSA.1', 'TopoPSA(NO)'), ('MolMR', 'SMR'), ('SpAD_D', 'SpAbs_D'), ('ExactMolWt', 'MW'), ('MAXdS', 'MINdS')]


In [30]:
combine_columns = False

for cols in dup_columns:
    cols = list(cols)
    assert(df.loc[rows, cols].var(axis=1).sum() == 0)
    
    if combine_columns:
        new_col_name = '_'.join(cols)
        df[new_col_name] = df[cols[0]].values
        #df.drop(columns=cols, inplace=True)
        features = list(set(features) - set(cols))
        features.append(new_col_name)
    else:
        features = sorted(list(set(features) - set(cols[1:])))

In [31]:
print(f'Metadata columns number: {len(metadata)}')
print(f'Features columns number: {len(features)}')
print(f'DataFrame dimensions: {df.loc[rows, features+[target]].shape}')

Metadata columns number: 7
Features columns number: 1398
DataFrame dimensions: (2195, 1399)


The result shouldn't change having only removed redundant columns.

In [32]:
train = df.loc[rows,features+[target]].copy()
train.reset_index(drop=True, inplace=True)
train[target].replace({'Bitter': 0, 'Sweet': 1}, inplace=True)
custom_cv = create_folds(train=train, features=features, target=target, num_folds=NUM_FOLDS, shuffle=True, seed=SEED)
evaluate(params=PARAMS, train=train, features=features, target=target, folds=custom_cv)

Training until validation scores don't improve for 20 rounds
Fold 1 AUC: 0.94884
Fold 2 AUC: 0.95228
Fold 3 AUC: 0.94589
Fold 4 AUC: 0.94860
Fold 5 AUC: 0.94749
Folds AUC: 0.94862+-0.00210
Total AUC: 0.94813


## Save cleaned dataset

In [33]:
d = {'DataFrame': df, 'metadata': metadata, 'features': features, 'target': target}

with open('../data/comb.pickle', 'wb') as handle:
    pickle.dump(d, handle, protocol=pickle.HIGHEST_PROTOCOL)