In [1]:
# Preliminaries to work with the data.
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Execute the local __init__.py inside this notebook's namespace.
# NOTE(review): presumably this configures sys.path for the project imports
# (utils, gerkin) and may also define names used later -- confirm.
%run __init__.py
# Reload edited modules automatically before each cell executes.
%load_ext autoreload
%autoreload 2
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor

from gerkin import dream,params
from utils import loading, scoring
# Data directory (relative path); joined with file names when the feature
# importances are pickled or re-loaded further down.
DATA = '../../data'

In [2]:
# Load the data
# Pass the boolean True (not the string 'True') so this call matches the one
# made inside compute_importance_ranks; the string only worked because any
# non-empty string is truthy.
descriptors = loading.get_descriptors(format=True)
# Names of the data sets whose molecules are used throughout the notebook.
sets = ['training','leaderboard']
all_CIDs = loading.get_CIDs(sets)
all_CID_dilutions = loading.get_CID_dilutions(sets)

In [3]:
# Load the molecular features
# Names of the feature sets requested from the loader; the recorded output
# below reports how many features each set contributes.
feature_sets = ['dragon','episuite','morgan','nspdk']
# Molecular data for every CID collected in `all_CIDs`.
mdx = loading.get_molecular_data(feature_sets,all_CIDs)

Dragon has 4869 features for 407 molecules.
Episuite has 62 features for 407 molecules.
Morgan has 2437 features for 407 molecules.
Nspdk has 5392 features for 407 molecules.
There are now 12760 total features.


In [4]:
# Create the feature and descriptor arrays
# dream.make_X returns six values. X_forest is the feature matrix handed to
# compute_importance_ranks later on; the remaining five values are not used in
# the cells visible here.
# NOTE(review): good1/good2/means/stds/imputer look like preprocessing
# artifacts (column filters, scaling statistics, fitted imputer) -- confirm
# against dream.make_X.
X_forest,good1,good2,means,stds,imputer = \
    dream.make_X(mdx, all_CID_dilutions)

The X matrix now has shape (814x5554) molecules by non-NaN good molecular descriptors


In [5]:
# Load raw perceptual data
pdx = loading.load_perceptual_data(sets)
# Create a dataframe with the selected perceptual data
Y_subjects = dream.make_Y(pdx, concentration='all', imputer='mask')
# Average across subjects: group the columns by level 1 of the column index and
# take the mean within each group (missing values are skipped).
# DataFrame.mean(axis=1, level=1) was deprecated in pandas 1.3 and removed in
# 2.0, so the same reduction is expressed as a groupby on the transposed frame.
# sort=False keeps the groups in their original order, as the level= form did.
Y_all = Y_subjects.T.groupby(level=1, sort=False).mean().T

In [6]:
# Load some hyper parameters from a file.  These were obtained independently via cross-validation
# NOTE(review): in the cells visible here none of these seven values is passed
# on -- compute_importance_ranks is called with only n_estimators, so it runs
# with its own defaults. Confirm whether that is intentional.
use_et, max_features, max_depth, min_samples_leaf, trans_weight, regularize, use_mask = params.get_other_params()

In [7]:
# Compute the random forest models
def compute_importance_ranks(X,Y,n_estimators=50,
                  max_features='auto',
                  max_depth=None,min_samples_leaf=1,
                  random_state=0,n_jobs=8):
    """Fit one tree ensemble per descriptor and collect its feature importances.

    Despite the name, the returned values are the raw ``feature_importances_``
    of each fitted model, not ranks.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. A 'mean_dilution' column, if present, is dropped
        before fitting.
    Y : pandas.DataFrame
        Perceptual data with one column per descriptor. Rows where a
        descriptor is null are excluded from that descriptor's fit.
    n_estimators, max_depth, min_samples_leaf, random_state
        Passed unchanged to the scikit-learn estimators.
    max_features
        Passed to the estimators. The legacy value 'auto' is translated to
        None: for regressors both mean "consider all features", but 'auto' is
        rejected by scikit-learn >= 1.3.
    n_jobs : int
        Number of parallel jobs used by each estimator (previously hard-coded
        to 8).

    Returns
    -------
    pandas.DataFrame
        Float frame indexed by descriptor, with one column per column of X
        (after the optional 'mean_dilution' drop).
    """
    # isinstance guard: max_features may be a number, so only compare strings.
    if isinstance(max_features, str) and max_features == 'auto':
        max_features = None
    if 'mean_dilution' in X: # Drop the mean dilution feature if it is present, to avoid leak.
        X = X.drop('mean_dilution',axis=1)
    descriptors = loading.get_descriptors(format=True)
    # Preallocate as float so the result is numeric rather than object-dtype.
    importances = pd.DataFrame(index=descriptors,columns=X.columns,dtype=float)
    # Keyword arguments common to both estimator types.
    shared_kwargs = dict(n_estimators=n_estimators,max_features=max_features,
                         max_depth=max_depth,min_samples_leaf=min_samples_leaf,
                         n_jobs=n_jobs,random_state=random_state)
    for desc in descriptors: # For each descriptor.
        print(desc) # Progress indicator.
        y = Y[desc] # Perceptual data for this descriptor.
        y = y[y.notnull()] # Remove missing values (e.g. non 1/1000 intensities for leaderboard)
        x = X.loc[y.index] # Keep only the rows that have a target value.
        if desc=='Intensity':
            # Intensity is modelled with extremely randomized trees.
            est = ExtraTreesRegressor(**shared_kwargs)
        else:
            est = RandomForestRegressor(oob_score=False,**shared_kwargs)

        est.fit(x,y) # Fit the model on the training data.
        importances.loc[desc,:] = est.feature_importances_

    return importances

In [8]:
# Load or compute the feature importances
n_estimators = 20 # Increase to 100 for the full run
# Single definition of the cache location, used for both saving and loading.
importances_path = os.path.join(DATA,'importances_forest.pkl')
# Set to False to reuse the pickled result instead of refitting the models.
# The cache is not keyed on n_estimators, so recompute after changing it.
recompute_importances = True
if recompute_importances:
    importances = compute_importance_ranks(X_forest,Y_all,n_estimators=n_estimators)
    importances.to_pickle(importances_path)
else:
    # Only load pickles produced by this notebook: unpickling untrusted files
    # can execute arbitrary code.
    importances = pd.read_pickle(importances_path)

Intensity
Pleasantness
Bakery
Sweet
Fruit
Fish
Garlic
Spices
Cold
Sour
Burnt
Acid
Warm
Musky
Sweaty
Ammonia
Decayed
Wood
Grass
Flower
Chemical


In [9]:
# Get names of molecules corresponding to Morgan fingerprint CIDs
# The file is tab-separated with no header row: CID, then molecule name.
# Build the path from DATA, as is done for the importances pickle, instead of
# repeating the directory literal.
morgan_names = (
    pd.read_csv(os.path.join(DATA,'CID_names_morgan.txt'),
                delimiter='\t',header=None,names=['CID','Name'])
    .set_index('CID')
    .groupby('CID')
    .first()['Name']  # One name per CID: the first non-null entry in each group.
)

In [10]:
# Rename features, e.g. from CID to molecule name for Morgan fingerprints
def rename_feature(feature, names=None):
    """Return a display label for a (feature_set, feature_id) pair.

    Parameters
    ----------
    feature : sequence
        Pair whose first element is the feature-set name and whose second is
        the feature identifier. For 'morgan' features the identifier must be
        convertible to an int CID.
    names : pandas.Series, optional
        Mapping from integer CID to molecule name. Defaults to the
        notebook-level ``morgan_names``.

    Returns
    -------
    tuple or the original ``feature``
        ``('morgan', <name>)`` when a name is known for the CID. Non-morgan
        features are returned unchanged, and so are morgan features whose CID
        is absent from ``names`` or whose name is missing (previously these
        raised KeyError or produced a ``('morgan', nan)`` label).
    """
    if feature[0] != 'morgan':
        return feature
    if names is None:
        names = morgan_names
    CID = int(feature[1])
    name = names.get(CID) # None when the CID is not in the table.
    if name is None or pd.isnull(name):
        return feature
    return ('morgan', name)

# Print importance ranks for a descriptor
def print_ranks(importances, desc, n=10):
    """Display the ``n`` most important features for one descriptor.

    Parameters
    ----------
    importances : pandas.DataFrame
        Frame indexed by descriptor with one column per feature.
    desc : str
        Descriptor (row label of ``importances``) to rank features for.
    n : int
        Number of top-ranked features to display (default 10).

    The table is rendered with IPython's ``display``; nothing is returned.
    Floats are shown with four decimals for this table only -- the pandas
    display option is restored afterwards rather than changed globally.
    """
    # Ignore dilution when ranking the features; tolerate frames without it.
    importances = importances.drop('dilution',axis=1,errors='ignore')
    ranked = importances.T.sort_values(desc,ascending=False)
    ranked['Rank'] = range(len(ranked)) # 0 = most important.
    ranked = ranked[['Rank',desc]].rename(columns={desc:'Importance'})
    # Build a concrete list of labels instead of assigning a lazy map object.
    ranked.index = [rename_feature(feature) for feature in ranked.index]
    ranked.index.name = desc
    with pd.option_context('display.float_format', lambda value: '%.4f' % value):
        display(ranked.head(n))

In [11]:
# Print the top 10 feature importances for each descriptor
# One table is displayed per descriptor, in the order given by `descriptors`.
for desc in descriptors:
    print_ranks(importances, desc)

Unnamed: 0_level_0,Rank,Importance
Intensity,Unnamed: 1_level_1,Unnamed: 2_level_1
"(dragon, B03[C-S])",0,0.0329
"(dragon, F03[C-S])",1,0.0129
"(dragon, LLS_01)",2,0.0109
"(dragon, SpAbs_B(s))",3,0.0067
"(dragon, SpMax8_Bh(s))",4,0.0067
"(dragon, O-057)",5,0.0066
"(episuite, EXPaws Score (Log Kow))",6,0.0054
"(dragon, SP04)",7,0.0047
"(morgan, Cyclopentene, 1-hexyl-)",8,0.0045
"(dragon, ATS2s)",9,0.0044


Unnamed: 0_level_0,Rank,Importance
Pleasantness,Unnamed: 1_level_1,Unnamed: 2_level_1
"(dragon, SssO)",0,0.0503
"(dragon, RDF015s)",1,0.0422
"(dragon, HGM)",2,0.0222
"(morgan, 3-Ethoxy-4-hydroxybenzaldehyde)",3,0.0161
"(dragon, P_VSA_MR_8)",4,0.0137
"(morgan, Decahydro-2-naphthyl formate)",5,0.0119
"(dragon, MATS7s)",6,0.0112
"(dragon, nHM)",7,0.01
"(dragon, ATS1e)",8,0.0097
"(dragon, GATS2s)",9,0.0096


Unnamed: 0_level_0,Rank,Importance
Bakery,Unnamed: 1_level_1,Unnamed: 2_level_1
"(morgan, 3-Hydroxy-4-methoxybenzaldehyde)",0,0.2629
"(morgan, Vanillin isobutyrate)",1,0.0528
"(morgan, 3-Ethoxy-4-hydroxybenzaldehyde)",2,0.0481
"(morgan, 2-ethoxy-4-formylphenyl acetate)",3,0.0258
"(morgan, 3,4-Dihydroxybenzaldehyde)",4,0.0236
"(morgan, 4-Formyl-2-methoxyphenyl acetate)",5,0.0232
"(morgan, Imidazole-2-carboxaldehyde)",6,0.0186
"(dragon, R7e+)",7,0.0179
"(morgan, ETHYL ISOVALERATE)",8,0.0123
"(dragon, SM05_AEA(ri))",9,0.0099


Unnamed: 0_level_0,Rank,Importance
Sweet,Unnamed: 1_level_1,Unnamed: 2_level_1
"(morgan, 3-Ethoxy-4-hydroxybenzaldehyde)",0,0.0673
"(morgan, Cyclopentenyl propionate musk)",1,0.0296
"(morgan, DIETHYL MALATE)",2,0.0276
"(morgan, ETHYL 3-HEXENOATE)",3,0.0224
"(morgan, Ethyl pentanoate)",4,0.0183
"(dragon, CATS2D_04_AL)",5,0.0179
"(morgan, 3-Hydroxy-4-methoxybenzaldehyde)",6,0.0179
"(dragon, SssO)",7,0.0163
"(morgan, 2-ethoxy-4-formylphenyl acetate)",8,0.0128
"(morgan, 24851-98-7)",9,0.0101


Unnamed: 0_level_0,Rank,Importance
Fruit,Unnamed: 1_level_1,Unnamed: 2_level_1
"(morgan, ETHYL 3-HEXENOATE)",0,0.0766
"(morgan, 24851-98-7)",1,0.0754
"(morgan, 3,7-dimethylocta-2,6-dienyl propanoate)",2,0.0348
"(morgan, Triethyl orthoformate)",3,0.0284
"(morgan, 2,6-Octadiene, 1-ethoxy-3,7-dimethyl-, (2Z)-)",4,0.0243
"(morgan, Ethyl caproate)",5,0.016
"(morgan, ETHYL LEVULINATE)",6,0.0142
"(morgan, Methyl jasmonate)",7,0.0139
"(dragon, Eig08_EA(bo))",8,0.0135
"(dragon, Mor10s)",9,0.0134


Unnamed: 0_level_0,Rank,Importance
Fish,Unnamed: 1_level_1,Unnamed: 2_level_1
"(dragon, P_VSA_m_4)",0,0.101
"(dragon, X4Av)",1,0.0376
"(morgan, tryptamine)",2,0.0358
"(dragon, R3p+)",3,0.0236
"(dragon, SssS)",4,0.0232
"(morgan, bis(1-mercaptopropyl) sulfide)",5,0.0197
"(morgan, 2-Phenylethyl isothiocyanate)",6,0.0154
"(dragon, G(O..S))",7,0.0144
"(morgan, 3-PENTANOL)",8,0.0131
"(morgan, PIPERIDINE)",9,0.013


Unnamed: 0_level_0,Rank,Importance
Garlic,Unnamed: 1_level_1,Unnamed: 2_level_1
"(dragon, HATS3p)",0,0.1846
"(dragon, R3p+)",1,0.101
"(dragon, P_VSA_m_4)",2,0.0398
"(dragon, Mor05m)",3,0.0383
"(dragon, S-107)",4,0.0246
"(dragon, X3Av)",5,0.0197
"(dragon, R1p+)",6,0.0177
"(dragon, Eig05_AEA(ri))",7,0.0139
"(dragon, Psi_e_0d)",8,0.0122
"(dragon, VE1_Dz(v))",9,0.0117


Unnamed: 0_level_0,Rank,Importance
Spices,Unnamed: 1_level_1,Unnamed: 2_level_1
"(morgan, Verdoracine)",0,0.0431
"(morgan, GAMMA-TERPINENE)",1,0.0408
"(morgan, safrole)",2,0.0183
"(morgan, Xanthorrhizol)",3,0.0178
"(morgan, Nootkatin)",4,0.011
"(dragon, Eig11_AEA(ri))",5,0.0099
"(dragon, HATS3p)",6,0.0087
"(morgan, M-CYMENE)",7,0.0085
"(morgan, Bis(methylthio)methane)",8,0.0081
"(morgan, Thymol acetate)",9,0.0075


Unnamed: 0_level_0,Rank,Importance
Cold,Unnamed: 1_level_1,Unnamed: 2_level_1
"(morgan, BETA-TERPINEOL)",0,0.0168
"(dragon, Mor14s)",1,0.0133
"(morgan, Ledol)",2,0.0125
"(morgan, 1-Phenylethyl propionate)",3,0.0115
"(dragon, R5i)",4,0.01
"(dragon, Eig11_EA(ed))",5,0.0088
"(morgan, 9-Decenyl acetate)",6,0.0083
"(morgan, Verbanol)",7,0.008
"(morgan, Globulol)",8,0.0068
"(morgan, (2R)-2-(3-Methylbut-2-enyl)-2,3-dihydronaphthalene-1,4-dione)",9,0.0065


Unnamed: 0_level_0,Rank,Importance
Sour,Unnamed: 1_level_1,Unnamed: 2_level_1
"(dragon, SpMAD_EA(dm))",0,0.0422
"(morgan, butyric acid)",1,0.0343
"(morgan, sulfur dioxide)",2,0.0259
"(dragon, Mor13m)",3,0.0251
"(dragon, GATS2e)",4,0.0234
"(morgan, citric acid)",5,0.0132
"(morgan, Citral)",6,0.0103
"(dragon, H0m)",7,0.0094
"(dragon, G3m)",8,0.0093
"(morgan, 4-PENTENOIC ACID)",9,0.0084


Unnamed: 0_level_0,Rank,Importance
Burnt,Unnamed: 1_level_1,Unnamed: 2_level_1
"(morgan, Difurfuryl sulfide)",0,0.0999
"(dragon, F04[C-S])",1,0.0796
"(dragon, HATS3v)",2,0.0324
"(dragon, B03[O-S])",3,0.0291
"(morgan, 2-Methyl-1,3-dithiolane)",4,0.0172
"(morgan, 2,3-Lutidine)",5,0.0157
"(dragon, R4p+)",6,0.0137
"(morgan, Ethyl 3-(furfurylthio)propionate)",7,0.0134
"(dragon, Mor08s)",8,0.0129
"(morgan, Pyrazineethanethiol)",9,0.0126


Unnamed: 0_level_0,Rank,Importance
Acid,Unnamed: 1_level_1,Unnamed: 2_level_1
"(dragon, ATSC2s)",0,0.0091
"(dragon, Mor07m)",1,0.009
"(morgan, 2-(1-mercaptoethyl)furan)",2,0.0088
"(dragon, P1p)",3,0.0086
"(morgan, 2-Pentanoylfuran)",4,0.0076
"(dragon, CATS2D_04_AL)",5,0.0065
"(dragon, AVS_B(p))",6,0.0062
"(dragon, SM4_B(s))",7,0.0061
"(morgan, 2-ethyl-3-methylpyrrole)",8,0.0058
"(dragon, Mor15p)",9,0.0056


Unnamed: 0_level_0,Rank,Importance
Warm,Unnamed: 1_level_1,Unnamed: 2_level_1
"(dragon, Mor17s)",0,0.0647
"(morgan, 3-Ethoxy-4-hydroxybenzaldehyde)",1,0.0431
"(morgan, curcumin)",2,0.0113
"(morgan, ETHYL ISOVALERATE)",3,0.009
"(dragon, R6e+)",4,0.0082
"(morgan, Ethyl 3-hydroxybutyrate)",5,0.0069
"(dragon, SpMax1_Bh(m))",6,0.0061
"(dragon, Mor15p)",7,0.0049
"(dragon, R1m)",8,0.0044
"(morgan, PIPERONAL)",9,0.0042


Unnamed: 0_level_0,Rank,Importance
Musky,Unnamed: 1_level_1,Unnamed: 2_level_1
"(dragon, GATS2e)",0,0.0519
"(dragon, Mor08m)",1,0.0251
"(dragon, GATS5s)",2,0.0138
"(morgan, beta-alanine)",3,0.0117
"(dragon, SpMax5_Bh(s))",4,0.0093
"(dragon, GATS2s)",5,0.0086
"(dragon, SssO)",6,0.0084
"(dragon, Mor15p)",7,0.0069
"(dragon, Mor21m)",8,0.0069
"(dragon, Mor08s)",9,0.0066


Unnamed: 0_level_0,Rank,Importance
Sweaty,Unnamed: 1_level_1,Unnamed: 2_level_1
"(dragon, GATS2e)",0,0.076
"(morgan, butyric acid)",1,0.0209
"(morgan, Ipsenol)",2,0.0199
"(dragon, CATS2D_01_AN)",3,0.0153
"(morgan, 2-Methyl-4-pentenoic acid)",4,0.0139
"(morgan, HOTRIENOL)",5,0.0138
"(dragon, GATS2s)",6,0.0134
"(dragon, GATS5s)",7,0.0127
"(morgan, ISOVALERIC ACID)",8,0.0121
"(dragon, GATS5m)",9,0.0105


Unnamed: 0_level_0,Rank,Importance
Ammonia,Unnamed: 1_level_1,Unnamed: 2_level_1
"(dragon, SssO)",0,0.0226
"(dragon, R3u)",1,0.0117
"(morgan, 31704-80-0)",2,0.0116
"(morgan, 4-Methylnonanoic acid)",3,0.0115
"(dragon, F02[C-O])",4,0.0101
"(morgan, p-Tolyl phenylacetate)",5,0.0096
"(dragon, IC3)",6,0.0076
"(morgan, (+)-Cuparene)",7,0.0076
"(morgan, nan)",8,0.0075
"(morgan, HEXANOIC ACID)",9,0.0071


Unnamed: 0_level_0,Rank,Importance
Decayed,Unnamed: 1_level_1,Unnamed: 2_level_1
"(dragon, P_VSA_m_4)",0,0.0726
"(dragon, Mor07p)",1,0.0369
"(morgan, Bis(methylthio)methane)",2,0.0321
"(morgan, BDBM136314)",3,0.0292
"(dragon, SM09_EA(dm))",4,0.0238
"(dragon, SM05_EA(dm))",5,0.0222
"(morgan, 1-HEXEN-3-ONE)",6,0.017
"(dragon, SM07_EA(dm))",7,0.0162
"(morgan, 4-PENTENOIC ACID)",8,0.0139
"(dragon, Mor13m)",9,0.0132


Unnamed: 0_level_0,Rank,Importance
Wood,Unnamed: 1_level_1,Unnamed: 2_level_1
"(morgan, 2,6-Dimethyl-4-ethylpyridine)",0,0.0245
"(morgan, 2-Ethyl-3,5-dimethylpyridine)",1,0.0239
"(morgan, 2,4,6-Trimethylpyridine)",2,0.0107
"(morgan, 2,3,5-Trimethylpyrazine)",3,0.0093
"(dragon, Mor28s)",4,0.0092
"(morgan, 10-UNDECENOIC ACID)",5,0.009
"(morgan, 2,5-Dimethyl-3-isobutylpyrazine)",6,0.0086
"(episuite, Estimated MP (oC))",7,0.0076
"(morgan, linoleic acid)",8,0.0065
"(morgan, Triethylpyrazine)",9,0.0061


Unnamed: 0_level_0,Rank,Importance
Grass,Unnamed: 1_level_1,Unnamed: 2_level_1
"(morgan, cis-3-Hexenyl isovalerate)",0,0.0899
"(morgan, cis-3-Hexenyl isobutyrate)",1,0.0443
"(morgan, 1,1-Dimethoxynon-2-yne)",2,0.0231
"(morgan, cis-3-Hexenyl butyrate)",3,0.0189
"(morgan, 24168-70-5)",4,0.0173
"(morgan, 3-Hexenyl 2-methylbutyrate)",5,0.0168
"(morgan, cis-3-Hexenyl angelate)",6,0.0102
"(morgan, 1-(2,2-Dimethoxyethoxy)hexane)",7,0.0093
"(morgan, Methyl jasmonate)",8,0.0079
"(dragon, MEcc)",9,0.0076


Unnamed: 0_level_0,Rank,Importance
Flower,Unnamed: 1_level_1,Unnamed: 2_level_1
"(morgan, Phenethyl pivalate)",0,0.0292
"(dragon, H_D/Dt)",1,0.0227
"(dragon, SpMax4_Bh(m))",2,0.0156
"(dragon, JGI6)",3,0.0149
"(dragon, GATS4e)",4,0.014
"(morgan, SCHEMBL77189)",5,0.0117
"(dragon, Mor21u)",6,0.0112
"(morgan, 2-ETHOXYNAPHTHALENE)",7,0.0108
"(dragon, piPC07)",8,0.0093
"(dragon, R7p)",9,0.0092


Unnamed: 0_level_0,Rank,Importance
Chemical,Unnamed: 1_level_1,Unnamed: 2_level_1
"(dragon, TPSA(Tot))",0,0.0495
"(dragon, ATSC2s)",1,0.0421
"(dragon, RDF020e)",2,0.016
"(dragon, P1p)",3,0.0145
"(dragon, RDF020i)",4,0.0139
"(dragon, SM1_Dz(m))",5,0.0125
"(dragon, SM1_Dz(Z))",6,0.0122
"(dragon, GATS4s)",7,0.011
"(morgan, 1,2,3,4-Tetrahydronaphthalene)",8,0.009
"(morgan, Decatone)",9,0.0076
