In [4]:
import json 
import pandas as pd 
import numpy as np 
import pickle 

In [16]:
from atools_ml.descriptors import rdkit_descriptors

In [3]:
COF_paths = dict()
F0_paths = dict()
COF_paths['pickle'] = 'random-forest/models/everything/nbins-10/set_0/COF_all.pickle'
COF_paths['ptxt'] = 'random-forest/models/everything/nbins-10/set_0/COF_all.ptxt'
F0_paths['pickle'] = 'random-forest/models/everything/nbins-10/set_0/intercept_all.pickle'
F0_paths['ptxt'] = 'random-forest/models/everything/nbins-10/set_0/intercept_all.ptxt'


In [6]:
with open(COF_paths['pickle'] ,'rb') as m: 
    COF_model = pickle.load(m)
with open(COF_paths['ptxt'], 'rb') as f:
    COF_features = pickle.load(f)
with open(F0_paths['pickle'], 'rb') as m:
    F0_model = pickle.load(m)
with open(F0_paths['ptxt'], 'rb') as f:
    F0_features = pickle.load(f)


In [10]:
ind_descriptors = pd.read_csv('data/raw-data/descriptors-ind.csv', index_col=0)

In [56]:
def calculate_raw_descriptors(h_smiles, ch3_smiles, ind_descriptors):
    '''Retrieve h_smiles and ch3_smiles descriptors from ind_descriptor.
        
    If parameters do not exist in ind_descriptors, calculate the parameters
    from scratch (averagin over 1000 trials)
        
    Parameters
    ----------
    h_smiles : str
        h_smiles string of the interested chemistry
    ch3_smiles : str 
        ch3_smiles string of the interested chemistry
    ind_descriptors : pd.Dataframe
        Dataframe which contains the reference descriptors

    Returns
    -------
    tg_descriptors : dict
    '''
    import pandas as pd 

    final_desc_h_tg = dict()
    final_desc_ch3_tg = dict()

    tg_descriptors = dict()

    if (h_smiles not in ind_descriptors) or ch3_smiles not in ind_descriptors:
        print(f'Calculating chemical descriptors for {h_smiles} and {ch3_smiles}')
        for i in range(1000):    
            tmp_desc_h_tg = rdkit_descriptors(h_smiles)
            tmp_desc_ch3_tg = rdkit_descriptors(ch3_smiles,
                include_h_bond=True, ch3_smiles=ch3_smiles)

            for key in tmp_desc_h_tg:
                if key in final_desc_h_tg:
                    final_desc_h_tg[key].append(tmp_desc_h_tg[key])
                else:
                    final_desc_h_tg[key] = [tmp_desc_h_tg[key]]
            for key in tmp_desc_ch3_tg:
                if key in final_desc_ch3_tg:
                    final_desc_ch3_tg[key].append(tmp_desc_ch3_tg[key])
                else:
                    final_desc_ch3_tg[key] = [tmp_desc_ch3_tg[key]]
        for key in final_desc_h_tg:
            final_desc_h_tg[key] = np.mean(final_desc_h_tg[key])
        for key in final_desc_ch3_tg:
            final_desc_ch3_tg[key] = np.mean(final_desc_ch3_tg[key])

        tg_descriptors[h_smiles] = final_desc_h_tg
        tg_descriptors[ch3_smiles] = final_desc_ch3_tg
        result = pd.DataFrame.from_dict(tg_descriptors)
    else:
        print(f'{h_smiles} and {ch3_smiles} already exist in ind_descriptors')
        result = ind_descriptors[[h_smiles, ch3_smiles]]
    return result

In [78]:
def consolidate_descriptors(top_smiles, top_frac,
                            bot_smiles, bot_frac,
                            desc_df):
    '''Consolidate h_smiles and ch3_smiles descriptors
    
    Parameters
    ----------
    top_smiles : list
        List of smiles strings of the top monolayer, each chemistry 
        need to be represented by 2 elements in a tuple, i.e. (h_smiles, ch3_smiles)
    top_frac : list
        List of fraction corresponding to the list of smiles in the top
        monolyaer
    bot_smiles : list
        List of smiles strings of the bottom monolayer, each chemistry 
        need to be represented by 2 elements in a tuple, i.e. (h_smiles, ch3_smiles)
    bot_frac : list
        List of fraction corresponding to the list of smiles in the bottom
        monolayer
    desc_df : pd.DataFrame
        DataFrame which contains individual descriptors of SMILES string 
    
    Returns
    -------
    '''
    
    assert len(top_smiles) == len(top_frac)
    assert len(bot_smiles) == len(bot_frac)
    assert sum(top_frac) == 1 
    assert sum(bot_frac) == 1 
    to_drop = ['pc+-mean', 'pc+-min', 'pc--mean', 'pc--min']
    with open('data/raw-data/feature-clusters.json', 'r') as f:
        clusters = json.load(f) # this is a dict
    shape_features = clusters['shape'] # a list from the clusters dict
    top_desc = {'h': dict(), 'ch3': dict()}
    bot_desc = {'h': dict(), 'ch3': dict()}
    for key in desc_df.index:
        top_desc['h'][key] = top_desc['ch3'][key] = 0
        bot_desc['h'][key] = bot_desc['ch3'][key] = 0 
        for i in range(len(top_smiles)):
            top_desc['h'][key] += desc_df[top_smiles[i][0]][key] * top_frac[i]
            top_desc['ch3'][key] += desc_df[top_smiles[i][1]][key] * top_frac[i]
        for j in range(len(bot_smiles)):
            bot_desc['h'][key] += desc_df[bot_smiles[j][0]][key] * bot_frac[j]
            bot_desc['ch3'][key] += desc_df[bot_smiles[j][1]][key] * bot_frac[j]
    
    desc_h_df = pd.DataFrame([top_desc['h'], bot_desc['h']])
    desc_ch3_df = pd.DataFrame([top_desc['ch3'], bot_desc['ch3']])
                
    desc_df = []
    for i, df in enumerate([desc_h_df, desc_ch3_df]):
        if i == 1:
            hbond_tb = max(df['hdonors'][0], df['hacceptors'][1]) \
                       if all((df['hdonors'][0], df['hacceptors'][1])) \
                       else 0
            hbond_bt = max(df['hdonors'][1], df['hacceptors'][0]) \
                       if all((df['hdonors'][1], df['hacceptors'][0])) \
                       else 0
            hbonds = hbond_tb + hbond_bt
            df.drop(['hdonors', 'hacceptors'], 'columns', inplace=True)
        else:
            hbonds = 0
        means = df.mean()
        mins = df.min()
        means = means.rename({label: '{}-mean'.format(label)
                              for label in means.index})
        mins = mins.rename({label: '{}-min'.format(label)
                            for label in mins.index})
        desc_tmp = pd.concat([means, mins])
        desc_tmp['hbonds'] = hbonds
        desc_tmp.drop(labels=to_drop, inplace=True)
        desc_df.append(desc_tmp)

    df_h_predict = desc_df[0]
    df_ch3_predict = desc_df[1]
    df_h_predict = pd.concat([
        df_h_predict.filter(like=feature) for feature in shape_features], axis=0)
    df_ch3_predict.drop(labels=df_h_predict.keys(), inplace=True)

    df_h_predict_mean = df_h_predict.filter(like='-mean')
    df_h_predict_min = df_h_predict.filter(like='-min')
    df_ch3_predict_mean = df_ch3_predict.filter(like='-mean')
    df_ch3_predict_min = df_ch3_predict.filter(like='-min')
    
    df_predict = pd.concat([df_h_predict_mean, df_h_predict_min, 
                            df_ch3_predict_mean, df_ch3_predict_min,
                            df_ch3_predict[['hbonds']]])
   
    return df_predict

In [110]:
output = consolidate_descriptors(top_smiles=[('C', 'CC'), ('N', 'CN')],
                                 top_frac=[0.5, 0.5], 
                                 bot_smiles=[('C', 'CC')],
                                 bot_frac=[1],
                                 desc_df=ind_descriptors)

In [111]:
filtered_output = output.filter(COF_features)

In [112]:
COF_model.predict(np.asarray(filtered_output).reshape(1, -1))

array([0.15381959])

In [113]:
F0_model.predict(np.asarray(filtered_output).reshape(1, -1))

array([0.75839098])