In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from keras.models import model_from_json

from sklearn import metrics

from dataUtils import DataUtils
from model_utils import ModelUtils
from scipy.stats import boxcox
from scipy.special import inv_boxcox

In [18]:
# Load the cleaned emission dataset from a tab-separated file.
# NOTE(review): readData's return type is not visible here; later cells index it as dset[row, :] — confirm in dataUtils.
dset = DataUtils.readData('cleaned_data_emission.tsv', sep='\t')

   #      Name  Epsilon      Solvent  Quantum Yield    Solvent.1  \
0  1   Benzene      210  cyclohexane          0.053       hexane   
1  2   Toluene     2860  cyclohexane          0.170  cyclohexane   
2  3  o-Xylene      254  cyclohexane          0.170       hexane   
3  4  m-Xylene      284  cyclohexane          0.130       hexane   
4  5  p-Xylene      770  cyclohexane          0.220       hexane   

                            File                         File.1  Absorption  \
0    A01_71-43-2_Benzene.abs.txt    A01_71-43-2_Benzene.ems.txt      254.75   
1   A02_108-88-3_Toluene.abs.txt   A02_108-88-3_Toluene.ems.txt      261.75   
2   A03_95-47-6_o-Xylene.abs.txt   A03_95-47-6_o-Xylene.ems.txt      263.00   
3  A04_108-38-3_m-Xylene.abs.txt  A04_108-38-3_m-Xylene.ems.txt      265.00   
4  A05_106-42-3_p-Xylene.abs.txt  A05_106-42-3_p-Xylene.ems.txt      275.00   

            SMILES  Emission  
0      C1=CC=CC=C1     287.0  
1     CC1=CC=CC=C1     289.5  
2    CC1=CC=CC=C1C     

In [2]:
# Length handed to DataUtils.numeric_encoding for every prediction below.
# NOTE(review): presumably the fixed input length the models were trained with — confirm.
uniform_length = 279 # GLOBAL

In [13]:
# Load the word map that is passed to DataUtils.numeric_encoding together with each SMILES string.
word_map=DataUtils.load_wordmap_from_json('smiles_wordmap.json')

In [57]:
# Sanity check: number of entries in the loaded word map.
len(word_map)

42

#### Predict wavelengths (absorption, emission)

In [9]:
def load_model_and_weight(json_fn, model_weight):
    """Rebuild a model from its JSON description and load its weights.

    Parameters
    ----------
    json_fn : str
        Path to the JSON file holding the serialized model architecture.
    model_weight : str
        Path to the weights file passed to ``load_weights``.

    Returns
    -------
    The object returned by ``model_from_json`` after ``load_weights`` has
    been called on it.
    """
    # A context manager guarantees the handle is released even if read() fails.
    with open(json_fn, 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = model_from_json(loaded_model_json)
    loaded_model.load_weights(model_weight)
    return loaded_model

In [22]:
# Model used below as the absorption-wavelength predictor (architecture JSON + weights file).
loaded_model_abs = load_model_and_weight('model_smiles_cnn.json','weights.best.hdf5')

In [32]:
def predict_wavelength(smiles_string = 'C1=CC=CC=C1', uniform_length_of_model = 279, word_map = None, model = None):
    """Predict a wavelength for one SMILES string.

    The string is wrapped in a one-element array, encoded with
    ``DataUtils.numeric_encoding`` and fed to ``model.predict``.

    Parameters
    ----------
    smiles_string : str
        SMILES of the molecule to predict for.
    uniform_length_of_model : int
        Length argument forwarded to ``DataUtils.numeric_encoding``.
    word_map
        Word map forwarded to ``DataUtils.numeric_encoding``.
    model
        Object exposing ``predict``; must be supplied by the caller.

    Returns
    -------
    Whatever ``model.predict`` returns for the encoded input.
    """
    single_molecule = np.array([smiles_string])
    encoded_input = DataUtils.numeric_encoding(
        single_molecule, uniform_length_of_model, word_map
    )
    return model.predict(encoded_input)
    
# Example: absorption-model prediction for 'CC1=CC=CC=C1C' (the o-Xylene SMILES in the dataset preview).
predict_wavelength(smiles_string='CC1=CC=CC=C1C',word_map=word_map, model=loaded_model_abs)

array([[226.49776]], dtype=float32)

In [29]:
# Model used below as the emission-wavelength predictor (architecture JSON + weights file).
loaded_model_ems = load_model_and_weight('model_smiles_ems_dropna.json','ems_dropna.best1.hdf5')

In [30]:
# Same example molecule as the absorption example, now run through the emission model.
predict_wavelength(smiles_string='CC1=CC=CC=C1C',word_map=word_map, model=loaded_model_ems)

array([[254.44116]], dtype=float32)

### predict epsilon

In [31]:
# Lambda handed to inv_boxcox when converting the epsilon model's output back.
# NOTE(review): presumably must equal the lambda used to transform the training targets — confirm.
boxcox_lambda = 0.2

In [33]:
def predict_epsilon(smiles_string = 'C1=CC=CC=C1', uniform_length_of_model = 279, word_map = None, model = None, boxcox_lambda = 0.2):
    """Predict epsilon for one SMILES string.

    The string is encoded with ``DataUtils.numeric_encoding``, passed to
    ``model.predict``, and the raw model output is mapped back through
    ``inv_boxcox`` with ``boxcox_lambda``.

    Parameters
    ----------
    smiles_string : str
        SMILES of the molecule to predict for.
    uniform_length_of_model : int
        Length argument forwarded to ``DataUtils.numeric_encoding``.
    word_map
        Word map forwarded to ``DataUtils.numeric_encoding``.
    model
        Object exposing ``predict``; must be supplied by the caller.
    boxcox_lambda : float
        Lambda used for the inverse Box-Cox transform.

    Returns
    -------
    The inverse-Box-Cox-transformed model output.
    """
    encoded_input = DataUtils.numeric_encoding(
        np.array([smiles_string]), uniform_length_of_model, word_map
    )
    transformed_prediction = model.predict(encoded_input)
    return inv_boxcox(transformed_prediction, boxcox_lambda)

In [38]:
# Inspect dataset row 1 for reference.
dset[1,:]

array([2, 'Toluene', 2860, 'cyclohexane', 0.17, 'cyclohexane',
       'A02_108-88-3_Toluene.abs.txt', 'A02_108-88-3_Toluene.ems.txt',
       261.75, 'CC1=CC=CC=C1', 289.5], dtype=object)

In [35]:
# Model used below as the epsilon predictor (architecture JSON + weights file).
loaded_model_epsilon = load_model_and_weight('model_smiles_epsilon_lstm.json','epsilon.best_-1.hdf5')

In [52]:
# smiles_string and boxcox_lambda are omitted, so predict_epsilon's signature defaults
# ('C1=CC=CC=C1' and 0.2) are what this call actually predicts with.
predict_epsilon(word_map=word_map,model=loaded_model_epsilon)

array([[4175.531]], dtype=float32)

#### Predict quantum yield (QY)

In [40]:
# Model used below as the quantum-yield predictor (architecture JSON + weights file).
loaded_model_qy = load_model_and_weight('model_lstm_qy.json', 'weights.qy_best.hdf5')

In [48]:
def predict_qy(smiles_string = 'C1=CC=CC=C1', uniform_length_of_model = 279, word_map = None, model = None):
    """Predict the quantum yield for one SMILES string.

    The string is wrapped in a one-element array, encoded with
    ``DataUtils.numeric_encoding`` and fed to ``model.predict``; the model
    output is returned unchanged.

    Parameters
    ----------
    smiles_string : str
        SMILES of the molecule to predict for.
    uniform_length_of_model : int
        Length argument forwarded to ``DataUtils.numeric_encoding``.
    word_map
        Word map forwarded to ``DataUtils.numeric_encoding``.
    model
        Object exposing ``predict``; must be supplied by the caller.

    Returns
    -------
    Whatever ``model.predict`` returns for the encoded input.
    """
    single_molecule = np.array([smiles_string])
    encoded_input = DataUtils.numeric_encoding(
        single_molecule, uniform_length_of_model, word_map
    )
    return model.predict(encoded_input)

### Wrap everything into a single prediction call

In [124]:
def predict_wl_and_others(smiles_string, wordmap, uni_length, model_abs, model_ems, model_epsilon, model_qy, boxcox_lambda):
    """Run all four predictors on one SMILES string and bundle the results.

    Parameters
    ----------
    smiles_string : str
        SMILES of the molecule to predict for.
    wordmap
        Word map forwarded to each predictor.
    uni_length : int
        Encoding length forwarded to each predictor.
    model_abs, model_ems, model_epsilon, model_qy
        Models for absorption wavelength, emission wavelength, epsilon and
        quantum yield respectively.
    boxcox_lambda : float
        Lambda forwarded to ``predict_epsilon``.

    Returns
    -------
    table : ndarray
        Column-stacked predictions in the order
        (absorption, emission, epsilon, quantum yield).
    data_for_plot : ndarray
        Column-stacked (SMILES, absorption, 1.0, emission, 1.0). Because the
        SMILES string is stacked with numbers, NumPy stores every entry of
        this array as a string.
    """
    absorption = predict_wavelength(smiles_string, uni_length, wordmap, model_abs)
    emission = predict_wavelength(smiles_string, uni_length, wordmap, model_ems)
    extinction = predict_epsilon(smiles_string, uni_length, wordmap, model_epsilon, boxcox_lambda)
    quantum_yield = predict_qy(smiles_string, uni_length, wordmap, model_qy)

    summary_columns = [absorption, emission, extinction, quantum_yield]
    table = np.column_stack(summary_columns)

    # Both peaks get a fixed intensity of 1.0 in the plot data.
    unit_intensity = [1.0]
    plot_columns = [
        np.array(smiles_string),
        absorption[0],
        unit_intensity,
        emission[0],
        unit_intensity,
    ]
    data_for_plot = np.column_stack(plot_columns)
    return table, data_for_plot


### Save out file

In [125]:
# Run all four predictors for 'C1=CC=CC=C1'.
# The Box-Cox lambda comes from the notebook-level `boxcox_lambda` constant instead of a
# repeated literal 0.2, so the value is defined in exactly one place.
table, data_for_plot = predict_wl_and_others(
    'C1=CC=CC=C1',
    word_map,
    uniform_length,
    loaded_model_abs,
    loaded_model_ems,
    loaded_model_epsilon,
    loaded_model_qy,
    boxcox_lambda,
)

In [131]:
# Write the prediction summary to a tab-separated text file (no index column).
pd.DataFrame(
    table,
    columns=['lambda_abs', 'lambda_ems','epsilon', 'quantum_yield'],
).to_csv(
    'example_table.txt',
    sep = '\t',
    index=False,
)

In [132]:
# Write the plot data (SMILES plus wavelength/intensity pairs) to a tab-separated text file.
pd.DataFrame(
    data_for_plot,
    columns=['SMILES', 'abs_wl_max','abs_intensity','ems_wl_max','ems_intensity'],
).to_csv(
    'example_plot_data.txt',
    sep = '\t',
    index=False,
)