In [1]:
import os
import json
from pathlib import Path
from natsort import natsorted
from datetime import datetime

import re
import pickle
import json
from copy import deepcopy
from pathlib import Path
from natsort import natsorted
from itertools import chain,repeat
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import accuracy_score,confusion_matrix,ConfusionMatrixDisplay

from metrics_gen import modelGroup
import rdkit
from rdkit import Chem

### Define params & defaults

In [2]:
data_path = Path.cwd().parent/"data"
csv_path = data_path/"csvs"
pkl_path = data_path/"pkls"
mod_nums = '2345'


metrics_name_list = [
	['accuracy', metrics.accuracy_score],
	['precision', metrics.precision_score],
	['recall', metrics.recall_score],
	['f1', metrics.f1_score], #for binary target],
	['jaccard', metrics.jaccard_score], #https://en.wikipedia.org/wiki/Jaccard_index],
	['roc_auc', metrics.roc_auc_score],
]

In [3]:
poly_abbrev_to_mono_name = {'PVC':'vinyl chloride',
										'PUR':"hydroxymethyl (4'-formamido-[1,1'-biphenyl]-4-yl)carbamate",
										'PE films':'ethylene',
										'EVA':'ethyl vinyl alcohol',
										'PE':'ethylene',
										'polyolefins':'ethylene',
										'PS':'styrene',
										'PMMA':'methyl methacrylate',
										'PA':'azanediyladipoylazanediylhexane-1,6-diyl', 
										'PP':'propylene', 
										}



### Load homopolymer database and raw additives data

In [30]:
df_homopoly = pd.read_pickle(pkl_path/"df_atactic_NOPE_nr_norad.pkl")
df_homopoly.columns


additives_raw = pd.read_csv(csv_path/"additives_ugduler_raw.csv", sep="|", encoding="cp1252")
addits = additives_raw[['type','name','example_plastics']]
addits.head()

Unnamed: 0,type,name,example_plastics
0,Primary Antioxidants,Igranox 1010,PE;PP;PS;ABS
1,Primary Antioxidants,"4,4’-bis(alpha,alpha-dimethylbenzyl) diphenyla...",PE;PP;PS;ABS
2,Secondary Antioxidants,Igrafos 1678,PE;PP;PS;ABS
3,Stabilizers,"2,4-dihydroxybenzophenone",PVC;PE;PA
4,Stabilizers,"Bis(2,2,6,6-tetramethyl-4-piperidyl-1-oxyl) Se...",PVC;PE;PA


### Convert additive names to SMILES (Cactus)

In [31]:
# Lookup SMILES for replacement molecules
import time
from urllib.request import urlopen
from urllib.parse import quote

def CIRconvert(ids):
	try:
		url = 'https://cactus.nci.nih.gov/chemical/structure/' + quote(ids) + '/smiles'
		ans = urlopen(url).read().decode('utf8')
		return ans
	except:
		return "Did not work"

In [32]:
generate_addit_smiles = False
if generate_addit_smiles == True:
	addit_smiles_dict = {}
	for name in addits.name.unique():
		print(name)

		addit_smiles_dict[name] = CIRconvert(name)
		print(f"{name} --> {addit_smiles_dict[name]}")
		time.sleep(1.0)
else:
	addit_smiles_dict = {"Igranox 1010": np.nan,
							"4,4'-bis(alpha,alpha-dimethylbenzyl) diphenylamine": np.nan,
							"Igrafos 1678": np.nan,
							"2,4-dihydroxybenzophenone": "Oc1ccc(c(O)c1)C(=O)c2ccccc2",
							"Bis(2,2,6,6-tetramethyl-4-piperidyl-1-oxyl) Sebacate": np.nan,
							"Methyl 3-(3-tert-butly-5-(2H-benzotriazol-2-YL)-4-hydroxynphenyl)propionate": np.nan,
							"Methyl salicylate": "COC(=O)c1ccccc1O",
							"ethyl-2-cyano-3,3-diphenylacrylate": "CCOC(=O)C(C#N)=C(c1ccccc1)c2ccccc2",
							"Tetraethyl 2,2'-(1,4-phenylenedimethylylidene)dimalonate": "CCOC(=O)C(=Cc1ccc(cc1)C=C(C(=O)OCC)C(=O)OCC)C(=O)OCC",
							"dioctyl phtalate": np.nan,
							"dioctyl adipate": "CCCCCCCCOC(=O)CCCCC(=O)OCCCCCCCC",
							"tricresyl phosphate": "Cc1ccc(O[P](=O)(Oc2ccc(C)cc2)Oc3ccc(C)cc3)cc1",
							"decabromodiphenyl ether": "Brc1c(Br)c(Br)c(Oc2c(Br)c(Br)c(Br)c(Br)c2Br)c(Br)c1Br",
							"tris(2-chloroethyl) phosphate": "ClCCO[P](=O)(OCCCl)OCCCl",
							"melamine": "Nc1nc(N)nc(N)n1",
							"antimony trioxide": "[O--].[O--].[O--].[Sb+3].[Sb+3]",
							"alumina trihydrate": "O|[Al](|O)|O",
							"aluminum hydroxide": "[OH-].[OH-].[OH-].[Al+3]",
							"magnesium hydroxide": "[OH-].[OH-].[Mg++]",
							"zinc borate": "[Zn++].[Zn++].[Zn++].[O-]B([O-])[O-].[O-]B([O-])[O-]",
							"stearic acid": "CCCCCCCCCCCCCCCCCC(O)=O",
							"butyl stearate": "CCCCCCCCCCCCCCCCCC(=O)OCCCC",
							"stearyl alcohol": "CCCCCCCCCCCCCCCCCCO",
							"oleamide": "CCCCCCCC\\C=C/CCCCCCCC(N)=O",
							"calcium stearate": "[Ca++].CCCCCCCCCCCCCCCCCC([O-])=O.CCCCCCCCCCCCCCCCCC([O-])=O",
							"glycerol monostearate": "CCCCCCCCCCCCCCCCCC(O)=O.OCC(O)CO",
							"lauric diethanolamide": "CCCCCCCCCCCC(=O)N(CCO)CCO",
							"behentrimonium chloride": "[Cl-].CCCCCCCCCCCCCCCCCCCCCC[N+](C)(C)C",
							"erucamide": "CCCCCCCC\\C=C/CCCCCCCCCCCC(N)=O",
							"glyceryl monooleate": "CCCCCCCC\\C=C/CCCCCCCC(=O)OCC(O)CO",
							"ammonium lauryl sulfate": "[NH4+].CCCCCCCCCCCCO[S]([O-])(=O)=O",
							"biuret": "NC(=O)NC(N)=O",
							"1,3,5-Tris[3-(dimethylamino)propyl]hexahydro-1,3,5-triazine": "CN(C)CCCN1CN(CCCN(C)C)CN(CCCN(C)C)C1",
							"cynamide": np.nan,
							"azodicarbonamide": "NC(=O)N=NC(N)=O",
							"tributyltin oxide": "O(|[Sn](CCCC)(CCCC)CCCC)|[Sn](CCCC)(CCCC)CCCC",
							"methylarsonic acid": "C[As](O)(O)=O",
							"zinc oleate": "[Zn++].CCCCCCCC\\C=C/CCCCCCCC([O-])=O.CCCCCCCC\\C=C/CCCCCCCC([O-])=O",
							"calcium carbonate": "[Ca++].[O-]C([O-])=O",
							"kaolin": np.nan,
							"talc": "O.O=[Mg].O=[Mg].O=[Mg].O=[Si]=O.O=[Si]=O.O=[Si]=O.O=[Si]=O",
							"aramid fiber": np.nan,
							"muscovite mica": np.nan,
							"sudan I": "O=C\\1C=Cc2ccccc2C1=N/Nc3ccccc3",
							"titanium dioxide": "O=[Ti]=O",
							"cadmium sulfide": np.nan}


smi_vals = pd.DataFrame(addit_smiles_dict.items(), columns=["name","smiles"]).smiles.values
if 'addit_smi' not in addits.columns:
	addits.insert(loc=3, column="addit_smi", value=smi_vals)
addits.head()

Unnamed: 0,type,name,example_plastics,addit_smi
0,Primary Antioxidants,Igranox 1010,PE;PP;PS;ABS,
1,Primary Antioxidants,"4,4’-bis(alpha,alpha-dimethylbenzyl) diphenyla...",PE;PP;PS;ABS,
2,Secondary Antioxidants,Igrafos 1678,PE;PP;PS;ABS,
3,Stabilizers,"2,4-dihydroxybenzophenone",PVC;PE;PA,Oc1ccc(c(O)c1)C(=O)c2ccccc2
4,Stabilizers,"Bis(2,2,6,6-tetramethyl-4-piperidyl-1-oxyl) Se...",PVC;PE;PA,


Drop additives with >1 molecule in SMILES

In [33]:

addits_multiple_mols = [(i,x) for i,x in enumerate(list(addits['addit_smi'].unique())) if (isinstance(x,str)) and x.count(".") > 0]

if len(addits_multiple_mols) > 0:
	print("~~~~~~~~~~~~~~~~~~~~~~WARNING~~~~~~~~~~~~~~~~~~~~~~")
	print("~~~~~~~~~~~~~~~~~~~~~~WARNING~~~~~~~~~~~~~~~~~~~~~~")
	print("ONE OR MORE ADDITIVES HAVE TWO MOLECULES IN SMILES, MUST FIX MANUALLY")
	print(*addits_multiple_mols,sep='\n')
else:
	print("No additives have extra molecules")

addits_multiple_mols_df = addits[addits['addit_smi'].isin([x[1] for x in addits_multiple_mols])]

ONE OR MORE ADDITIVES HAVE TWO MOLECULES IN SMILES, MUST FIX MANUALLY
(10, '[O--].[O--].[O--].[Sb+3].[Sb+3]')
(12, '[OH-].[OH-].[OH-].[Al+3]')
(13, '[OH-].[OH-].[Mg++]')
(14, '[Zn++].[Zn++].[Zn++].[O-]B([O-])[O-].[O-]B([O-])[O-]')
(19, '[Ca++].CCCCCCCCCCCCCCCCCC([O-])=O.CCCCCCCCCCCCCCCCCC([O-])=O')
(20, 'CCCCCCCCCCCCCCCCCC(O)=O.OCC(O)CO')
(22, '[Cl-].CCCCCCCCCCCCCCCCCCCCCC[N+](C)(C)C')
(25, '[NH4+].CCCCCCCCCCCCO[S]([O-])(=O)=O')
(31, '[Zn++].CCCCCCCC\\C=C/CCCCCCCC([O-])=O.CCCCCCCC\\C=C/CCCCCCCC([O-])=O')
(32, '[Ca++].[O-]C([O-])=O')
(33, 'O.O=[Mg].O=[Mg].O=[Mg].O=[Si]=O.O=[Si]=O.O=[Si]=O.O=[Si]=O')


In [34]:
print("Bad Additive Smiles:\n",*list(addits_multiple_mols_df.name.unique()), sep='\n\t- ')
print("\nDropping Multiple Mols From Additives Df")
print("\t Before Drop Na",addits.shape)
addits = addits.dropna()
print("\t\t After Drop Na",addits.shape)
print("\t Before Drop Multiple Mols",addits.shape)
addits = addits.drop(index=addits_multiple_mols_df.index.values)
print("\t\t After Drop Multiple Mols",addits.shape)


Bad Additive Smiles:

	- antimony trioxide
	- aluminum hydroxide
	- magnesium hydroxide
	- zinc borate
	- calcium stearate
	- glycerol monostearate
	- behentrimonium chloride
	- ammonium lauryl sulfate
	- zinc oleate
	- calcium carbonate
	- talc

Dropping Multiple Mols From Additives Df
	 Before Drop Na (46, 4)
		 After Drop Na (35, 4)
	 Before Drop Multiple Mols (35, 4)
		 After Drop Multiple Mols (24, 4)


Drop additives with atypical elements (P, Al, As, Ti, Sn)

In [35]:
from itertools import chain

# Define disallowed elements
disallowed_elmts = ['P', 'Al', 'As', 'Ti', 'Sn']
idx_bad_elmt = []

# Get indices of disallowed elements
for elmt_str in disallowed_elmts:
	idx_bad_elmt.append(addits[addits.addit_smi.str.contains(elmt_str)].index)
 
# Flatten list of lists
idx_bad_elmt = natsorted(chain.from_iterable(idx_bad_elmt))
# Drop bad elements
addits = addits.drop(index=idx_bad_elmt)

In [36]:

if 'ex_mono' not in addits.columns:
	example_plasts = addits['example_plastics'].copy()
	addits.insert(loc=4, column='ex_mono', value=example_plasts.values)
	addits.insert(loc=4, column='ex_plastics', value=example_plasts.values)
	addits = addits.drop(columns=['example_plastics'])
    # Update placeholder entries for example mono
	for key,val in poly_abbrev_to_mono_name.items():
		addits['ex_mono'] = addits['ex_mono'].str.replace(key, val)
	# Lookup smiles for mono
	addit_mono_to_smi = {name:CIRconvert(name) for (_, name) in poly_abbrev_to_mono_name.items()}
	#* Failed to get smiles
	addit_mono_to_smi.pop('azanediyladipoylazanediylhexane-1,6-diyl') # Failed to get smiles



	addits.ex_mono = addits.ex_mono.str.split(';')
	addits = addits.explode(column='ex_mono')
	addits = addits.drop(columns=['ex_plastics'])


	addits['addit_smi'] = addits['addit_smi'].map(Chem.MolFromSmiles).map(Chem.MolToSmiles)
	addits['mono_smi'] = addits['ex_mono'].map(addit_mono_to_smi)
	addits = addits.dropna()
	addits = addits.reset_index(drop=True)

In [37]:
addits

Unnamed: 0,type,name,addit_smi,ex_mono,mono_smi
0,Stabilizers,"2,4-dihydroxybenzophenone",O=C(c1ccccc1)c1ccc(O)cc1O,vinyl chloride,ClC=C
1,Stabilizers,"2,4-dihydroxybenzophenone",O=C(c1ccccc1)c1ccc(O)cc1O,ethylene,C=C
2,Stabilizers,Methyl salicylate,COC(=O)c1ccccc1O,vinyl chloride,ClC=C
3,Stabilizers,Methyl salicylate,COC(=O)c1ccccc1O,ethylene,C=C
4,Stabilizers,"ethyl-2-cyano-3,3-diphenylacrylate",CCOC(=O)C(C#N)=C(c1ccccc1)c1ccccc1,vinyl chloride,ClC=C
5,Stabilizers,"ethyl-2-cyano-3,3-diphenylacrylate",CCOC(=O)C(C#N)=C(c1ccccc1)c1ccccc1,ethylene,C=C
6,Stabilizers,"Tetraethyl 2,2’-(1,4-phenylenedimethylylidene)...",CCOC(=O)C(=Cc1ccc(C=C(C(=O)OCC)C(=O)OCC)cc1)C(...,vinyl chloride,ClC=C
7,Stabilizers,"Tetraethyl 2,2’-(1,4-phenylenedimethylylidene)...",CCOC(=O)C(=Cc1ccc(C=C(C(=O)OCC)C(=O)OCC)cc1)C(...,ethylene,C=C
8,Plasticizers,dioctyl adipate,CCCCCCCCOC(=O)CCCCC(=O)OCCCCCCCC,vinyl chloride,ClC=C
9,Flame retardants,decabromodiphenyl ether,Brc1c(Br)c(Br)c(Oc2c(Br)c(Br)c(Br)c(Br)c2Br)c(...,styrene,C=Cc1ccccc1


## Generate Descriptors

Select solvents to be used (default: all)

In [38]:
solvents_to_use = df_homopoly.set_index('solvent').solvent_smiles.to_dict()
# Delay creation of solv_smiles as we want to explode 'solv_name' 
# We explode to get all possible additive-mono-solv combos
addits['solv_name'] = None
for row in addits.itertuples():
	addits.at[row.Index, 'solv_name'] = list(solvents_to_use.keys())
addits = addits.explode(column='solv_name').reset_index(drop=True)
addits['solv_smi'] = addits.solv_name.map(solvents_to_use)
addits

Unnamed: 0,type,name,addit_smi,ex_mono,mono_smi,solv_name,solv_smi
0,Stabilizers,"2,4-dihydroxybenzophenone",O=C(c1ccccc1)c1ccc(O)cc1O,vinyl chloride,ClC=C,aniline,Nc1ccccc1
1,Stabilizers,"2,4-dihydroxybenzophenone",O=C(c1ccccc1)c1ccc(O)cc1O,vinyl chloride,ClC=C,acetone,CC(C)=O
2,Stabilizers,"2,4-dihydroxybenzophenone",O=C(c1ccccc1)c1ccc(O)cc1O,vinyl chloride,ClC=C,Benzene,c1ccccc1
3,Stabilizers,"2,4-dihydroxybenzophenone",O=C(c1ccccc1)c1ccc(O)cc1O,vinyl chloride,ClC=C,carbon tetrachloride,ClC(Cl)(Cl)Cl
4,Stabilizers,"2,4-dihydroxybenzophenone",O=C(c1ccccc1)c1ccc(O)cc1O,vinyl chloride,ClC=C,cyclohexane,C1CCCCC1
...,...,...,...,...,...,...,...
9111,Colourants,sudan I,O=C1C=Cc2ccccc2/C1=N/Nc1ccccc1,ethylene,C=C,bromoform,BrC(Br)Br
9112,Colourants,sudan I,O=C1C=Cc2ccccc2/C1=N/Nc1ccccc1,ethylene,C=C,"2-Methyl-2,4-pentanediol",CC(O)CC(C)(C)O
9113,Colourants,sudan I,O=C1C=Cc2ccccc2/C1=N/Nc1ccccc1,ethylene,C=C,sulfolane,O=S1(=O)CCCC1
9114,Colourants,sudan I,O=C1C=Cc2ccccc2/C1=N/Nc1ccccc1,ethylene,C=C,"3-pentanol,",CCC(O)CC


##### Generate RDKit Descriptors: Polymer-Solvent System

In [13]:

from rdkit.Chem import rdMolDescriptors

def generate_rdmol_descriptors(descriptor_list, smiles_lst):
	"""From a descriptor list and a smiles list, returns a list and dataframe of rdMolDescriptors (with tuples handled appropriately)."""
	mol_list = [Chem.MolFromSmiles(x) for x in smiles_lst]

	rd_descriptors = []
	properties = rdMolDescriptors.Properties(descriptor_list)
	for i,entry in enumerate(mol_list):
		if entry is None or entry is np.nan:
			print("SMILES COULD NOT BE PARSED:",smiles_lst[i],entry,i)
			rd_descriptors.append('')
			continue
		else:
			properties_tuple = list(zip(properties.GetPropertyNames(), properties.ComputeProperties(entry)))
			rd_descriptors.append(properties_tuple)
	rd_descriptors = [list(x) for x in rd_descriptors]
	descriptor_df = pd.DataFrame(rd_descriptors, columns=descriptor_list)

	def get_second_elmt_tuple(x):
		if x is None:
			return np.nan
		else:
			return x[1]
	descriptor_df = descriptor_df.applymap(get_second_elmt_tuple)
	descriptor_df = descriptor_df.dropna()
	return descriptor_df #,rd_descriptors previously returned




mono_properties_subset = ['lipinskiHBA', 
		'NumHBA', 
		'lipinskiHBD', 
		'NumHBD', 
		'NumRotatableBonds',
		'NumHeteroatoms', 
		'NumAmideBonds', 
		'FractionCSP3', 
		'NumRings', 
		'NumAromaticRings', 
		'NumAliphaticRings', 
		'NumSaturatedRings', 
		'NumHeterocycles', 
		'NumSaturatedHeterocycles',
		'NumAliphaticHeterocycles',
		'NumAtomStereoCenters', 
		'tpsa',
		'chi0v',
		'chi1v',
		'chi2v',
		'chi3v',
		'chi4v',
		'kappa1',
		'kappa2',
		'kappa3']


solvent_properties_subset = ['lipinskiHBA', 
		'NumHBA', 
		'lipinskiHBD', 
		'NumHBD', 
		'NumRotatableBonds', 
		'NumHeteroatoms', 
		'NumAmideBonds', 
		'FractionCSP3', 
		'NumRings', 
		'NumAromaticRings', 
		'NumAliphaticRings', 
		'NumSaturatedRings', 
		'NumHeterocycles', 
		'NumSaturatedHeterocycles',
		'NumAliphaticHeterocycles',
		'tpsa',
		'chi0v',
		'chi1v',
		'chi2v',
		'chi3v',
		'chi4v',
		'kappa1',
		'kappa2',
		'kappa3']






#* Generate Descriptors: MONOMERS
descrips_rdk_mono = generate_rdmol_descriptors(mono_properties_subset, \
														list(addits.mono_smi))
#* Generate Descriptors: SOLVENTS
descrips_rdk_solv = generate_rdmol_descriptors(solvent_properties_subset, \
														list(addits.solv_smi))

#* Join Descriptors: Mono + Solvent
descrips_rdk_mono_solv = descrips_rdk_mono.join(descrips_rdk_solv, how='left', lsuffix='_mono', rsuffix='solvent')

descrips_rdk_mono_solv.shape
print('Finished generating Monomer-Solvent RF descriptors.')

Finished generating Monomer-Solvent RF descriptors.


##### Generate RDKit Descriptors: Additive-Solvent System

In [14]:
addit_properties_subset = ['lipinskiHBA', 
		'NumHBA', 
		'lipinskiHBD', 
		'NumHBD', 
		'NumRotatableBonds', 
		'NumHeteroatoms', 
		'NumAmideBonds', 
		'FractionCSP3', 
		'NumRings', 
		'NumAromaticRings', 
		'NumAliphaticRings', 
		'NumSaturatedRings', 
		'NumHeterocycles', 
		'NumSaturatedHeterocycles', 
		'NumAliphaticHeterocycles', 
		'NumAtomStereoCenters', 
		'tpsa',
		'chi0v',
		'chi1v',
		'chi2v',
		'chi3v',
		'chi4v',
		'kappa1',
		'kappa2',
		'kappa3']



#* Generate Descriptors: ADDITIVES
descrips_rdk_addit = generate_rdmol_descriptors(addit_properties_subset, \
														list(addits.addit_smi))
#* Generate Descriptors: SOLVENTS
descrips_rdk_solv = generate_rdmol_descriptors(solvent_properties_subset, \
														list(addits.solv_smi))

#* Join Descriptors: Addit + Solvent
descrips_rdk_addit_solv = descrips_rdk_addit.join(descrips_rdk_solv, how='left', lsuffix='_addit', rsuffix='solvent')



descrips_rdk_addit_solv.shape
print('Finished generating Additive-Solvent RF descriptors.')

Finished generating Additive-Solvent RF descriptors.


## Prediction

### Load Polymer Model

In [39]:
atnr_nope_dir = model_pkl_dir + r"2D_atactic_NOPE_nr_fm\\"
atnr_nope_short_names = ['at_NOPE_nr_atom_bd', 'at_NOPE_nr_mordred', 
					   'at_NOPE_nr_mfp', 'at_NOPE_nr_rdfp', 'at_NOPE_nr_atom_mordred',
					   'at_NOPE_nr_atom_morganfp', 'at_NOPE_nr_atom_rdfp'] # BY DESIRED ORDER
atnr_nope_long_names = ['Atom and Bond (AtNR_NOPE_FM)', 
								 'Mordred (AtNR_NOPE_FM)', 
								 'Morgan FP (AtNR_NOPE_FM)', 
								 'RDKit FP (AtNR_NOPE_FM)',
								 'Atom + Mordred (AtNR_NOPE_FM)', 
								 'Atom + Morgan FP (AtNR_NOPE_FM)', 
								 'Atom + RDKit FP (AtNR_NOPE_FM)'] 

atnr_nope_mout = modelGroup(group_id='at_NOPE_nr', group_dir=atnr_nope_dir, model_num_str=mod_nums, 
							model_names_short=atnr_nope_short_names, descriptor_names_long=atnr_nope_long_names,
							 pca_used=False)
atnr_nope_mout.load_model_data(hush=False)
atnr_nope_mout.set_y_data(f"{atnr_nope_dir + mod_nums}_atactic_NOPE_nr_y_train_data.pkl",
    					f"{atnr_nope_dir + mod_nums}_atactic_NOPE_nr_y_test_data.pkl")
atnr_nope_mout.gen_sklearn_metrics(metrics_list=metrics_name_list)
atnr_nope_mout.gen_dff(hush=False)
dff_atnr_nope = atnr_nope_mout.dff


Loading data for at_NOPE_nr...
	Loading the 0th model...
		at_NOPE_nr_atom_bd model data loaded sucessfully
		at_NOPE_nr_atom_bd kfold data loaded sucessfully
		at_NOPE_nr_atom_bd X train loaded sucessfully
		at_NOPE_nr_atom_bd X test data loaded sucessfully
	Loading the 1th model...
		at_NOPE_nr_mordred model data loaded sucessfully
		at_NOPE_nr_mordred kfold data loaded sucessfully
		at_NOPE_nr_mordred X train loaded sucessfully
		at_NOPE_nr_mordred X test data loaded sucessfully
	Loading the 2th model...
		at_NOPE_nr_mfp model data loaded sucessfully
		at_NOPE_nr_mfp kfold data loaded sucessfully
		at_NOPE_nr_mfp X train loaded sucessfully
		at_NOPE_nr_mfp X test data loaded sucessfully
	Loading the 3th model...
		at_NOPE_nr_rdfp model data loaded sucessfully
		at_NOPE_nr_rdfp kfold data loaded sucessfully
		at_NOPE_nr_rdfp X train loaded sucessfully
		at_NOPE_nr_rdfp X test data loaded sucessfully
	Loading the 4th model...
		at_NOPE_nr_atom_mordred model data loaded sucessfully
		a

#### Load polymer classifier object

In [40]:
clf_rdk_RF = dff_atnr_nope.query('model_label == "at_NOPE_nr_atom_bd_RF"').model_function.values[0]
pred_solub_mono = clf_rdk_RF.predict(descrips_rdk_mono_solv)

addits['pred_solub_mono'] = pred_solub_mono

### Subselect additives

Remove additive-poly-solv datapoints which have 
- Monomers outside of df_homopoly (because I need mono with experimental solub.)
- Datapoints with solvents not in homopolymer DB
- Other additives without experimental solubility on PubChem

In [41]:
df_homopoly_selec = df_homopoly[df_homopoly.mono_name.isin(addits.ex_mono.values)]
df_homopoly_selec = df_homopoly_selec[['polymer', 'mono_name', 'mono_smiles', 
								'solvent', 'solvent_smiles', 'solvent_characteristic']]
#valid_solvs = df_homopoly_selec.solvent.unique()
addits_selec = addits[addits.solv_name.isin(df_homopoly_selec.solvent.unique())]
addits_to_investigate = addits_selec.name.unique()
# Below is from manual investigation of pubchem
addits_to_exclude = ['ethyl-2-cyano-3,3-diphenylacrylate', "Tetraethyl 2,2’-(1,4-phenylenedimethylylidene)dimalonate", 'oleamide', 'lauric diethanolamide', 'glyceryl monooleate', 'biuret']
addits_to_investigate = [x for x in list(addits_to_investigate) if x not in addits_to_exclude]
addits_to_investigate

['2,4-dihydroxybenzophenone',
 'Methyl salicylate',
 'dioctyl adipate',
 'decabromodiphenyl ether',
 'melamine',
 'stearic acid',
 'butyl stearate',
 'stearyl alcohol',
 'erucamide',
 '1,3,5-Tris[3-(dimethylamino)propyl]hexahydro-1,3,5-triazine',
 'azodicarbonamide',
 'sudan I']

### Define additive solvents from pubchem (manual)
- Below values were manually retrieved from PubChem searches.

In [42]:
solub_addit_pc = {"2,4-dihydroxybenzophenone": {"INSOL": ["water"],
						"SOL": ["methanol", "ethanol", "diethyl ether", "methyl ethyl ketone", "ethyl acetate"],
						"PARTIAL":["benzene"]},
					"methyl salicylate": {"INSOL": [],
											"SOL": ["chloroform", "diethyl ether", "ethanol", "diethyl ether"],
											"PARTIAL":["water"]},
					"dioctyl adipate": {"INSOL": ["water", "glycerine"],
											"SOL": ["ethanol", "chloroform", "diethyl ether"],
											"PARTIAL":[]},
					"decabromodiphenyl ether": {"INSOL": ["acetone", "toluene", "benzene", "methylene bromide", "xylene", "water"],
											"SOL": [],
											"PARTIAL": [],},
					"melamine": {"INSOL": ["water", "diethyl ether", "benzene", "carbon tetrachloride"],
											"SOL": [],
											"PARTIAL": ["glycol", "glycerol", "pyridine", "ethanol"]},
					"stearic acid": {"INSOL": ["water"],
											"SOL": ["acetone", "chloroform", "carbon disulfide", "ethanol", "diethyl ether"],
											"PARTIAL":["ethanol", "benzene"]},
					"butyl stearate": {"INSOL": ["water"],
											"SOL": ["ethanol", "acetone"],
											"PARTIAL":[],},
					"stearyl alcohol": {"INSOL": ["water"],
											"SOL": ["chloroform", "ethanol", "diethyl ether"],
											"PARTIAL":["acetone", "benzene"]},
					"erucamide": {"INSOL": [],
											"SOL": ["isopropanol"],
											"PARTIAL":["ethanol", "acetone"],},
					"azodicarbonamide": {"INSOL": ["water", "ethanol", "acetone"],
											"SOL": [],
											"PARTIAL": [],},
					"sudan I": {"INSOL": ["water"],
											"SOL": ["ethanol", "acetone", "petroleum ether", "carbon disulfide"],
											"PARTIAL":[]}}
										

In [43]:
if "expt_solub_addit" not in addits_selec.columns:
	addits_selec.insert(loc=8, column="expt_solub_addit", value=None)

Add pubchem (Experimental) solubities for additives

In [44]:
for name, vals in solub_addit_pc.items():
	for category, solv_lst in vals.items():
		if category == "INSOL":
			sol_code = 0
		elif category == "SOL":
			sol_code = 1
		elif category == "PARTIAL":
			
			sol_code = None
		for solv in solv_lst:
			
			cond_1 = addits_selec.name == name
			cond_2 = addits_selec.solv_name == solv
			relevant_df = addits_selec.loc[cond_1 & cond_2, 'expt_solub_addit']
			# if couldnt find
			if relevant_df.shape[0] < 1:
				print(f"WARNING: Could not find '{solv}' as a solvent for additive '{name}'")
			addits_selec.loc[cond_1 & cond_2, 'expt_solub_addit'] = sol_code
		



Add experimental poly

In [45]:
vals_lst = []
for row in addits_selec.itertuples():
	mono,solv = row.ex_mono,row.solv_name

	mono_is_same = df_homopoly.mono_name == mono
	solv_is_same = df_homopoly.solvent == solv
	expt_solv_poly = df_homopoly.loc[mono_is_same & solv_is_same, :]
	if expt_solv_poly.shape[0] < 1:
		print(f"WARNING: Could not find '{solv}' as a solvent for monomer '{mono}'")
		vals_lst.append(None)
		continue
	elif expt_solv_poly.shape[0] > 1:
		print(f"WARNING: Found multiple rows for '{solv}' as a solvent for monomer '{mono}'")
		vals_lst.append(None)
		continue
	else:
		expt_sol_val = expt_solv_poly.solvent_characteristic.values[0]
		print("EXPT SOL VAL IS",expt_sol_val)
		vals_lst.append(expt_sol_val)
	
if 'expt_solub_mono' not in addits_selec.columns:
	addits_selec.insert(loc=8, column='expt_solub_mono', value=vals_lst)

EXPT SOL VAL IS 1
EXPT SOL VAL IS 0
EXPT SOL VAL IS 1
EXPT SOL VAL IS 1
EXPT SOL VAL IS 1
EXPT SOL VAL IS 1
EXPT SOL VAL IS 1
EXPT SOL VAL IS 1
EXPT SOL VAL IS 1
EXPT SOL VAL IS 0
EXPT SOL VAL IS 1
EXPT SOL VAL IS 0
EXPT SOL VAL IS 0
EXPT SOL VAL IS 1
EXPT SOL VAL IS 1
EXPT SOL VAL IS 1
EXPT SOL VAL IS 1
EXPT SOL VAL IS 1
EXPT SOL VAL IS 1
EXPT SOL VAL IS 0
EXPT SOL VAL IS 1
EXPT SOL VAL IS 1
EXPT SOL VAL IS 1
EXPT SOL VAL IS 1
EXPT SOL VAL IS 1
EXPT SOL VAL IS 0
EXPT SOL VAL IS 0
EXPT SOL VAL IS 0
EXPT SOL VAL IS 0
EXPT SOL VAL IS 0
EXPT SOL VAL IS 0
EXPT SOL VAL IS 1
EXPT SOL VAL IS 0
EXPT SOL VAL IS 1
EXPT SOL VAL IS 1
EXPT SOL VAL IS 1
EXPT SOL VAL IS 1
EXPT SOL VAL IS 1
EXPT SOL VAL IS 1
EXPT SOL VAL IS 1
EXPT SOL VAL IS 0
EXPT SOL VAL IS 1
EXPT SOL VAL IS 0
EXPT SOL VAL IS 0
EXPT SOL VAL IS 1
EXPT SOL VAL IS 1
EXPT SOL VAL IS 1
EXPT SOL VAL IS 1
EXPT SOL VAL IS 1
EXPT SOL VAL IS 1
EXPT SOL VAL IS 0
EXPT SOL VAL IS 1
EXPT SOL VAL IS 1
EXPT SOL VAL IS 1
EXPT SOL VAL IS 1
EXPT SOL V

In [46]:
addits_selected = ['stearic acid', 'Sudan I', '2,4-dihydroxybenzophenone']
addits_selec[addits_selec.name.isin(addits_selected)]

Unnamed: 0,type,name,addit_smi,ex_mono,mono_smi,solv_name,solv_smi,pred_solub_mono,expt_solub_mono,expt_solub_addit
1,Stabilizers,"2,4-dihydroxybenzophenone",O=C(c1ccccc1)c1ccc(O)cc1O,vinyl chloride,ClC=C,acetone,CC(C)=O,1,1.0,
3,Stabilizers,"2,4-dihydroxybenzophenone",O=C(c1ccccc1)c1ccc(O)cc1O,vinyl chloride,ClC=C,carbon tetrachloride,ClC(Cl)(Cl)Cl,1,0.0,
5,Stabilizers,"2,4-dihydroxybenzophenone",O=C(c1ccccc1)c1ccc(O)cc1O,vinyl chloride,ClC=C,methanol,CO,0,,1
6,Stabilizers,"2,4-dihydroxybenzophenone",O=C(c1ccccc1)c1ccc(O)cc1O,vinyl chloride,ClC=C,methylene chloride,ClCCl,1,,
8,Stabilizers,"2,4-dihydroxybenzophenone",O=C(c1ccccc1)c1ccc(O)cc1O,vinyl chloride,ClC=C,"n,n-dimethylformamide",CN(C)C=O,1,1.0,
...,...,...,...,...,...,...,...,...,...,...
3740,Lubricants,stearic acid,CCCCCCCCCCCCCCCCCC(=O)O,ethylene,C=C,ethylene chloride,ClCCCl,0,,
3741,Lubricants,stearic acid,CCCCCCCCCCCCCCCCCC(=O)O,ethylene,C=C,isophorone,CC1=CC(=O)CC(C)(C)C1,1,,
3742,Lubricants,stearic acid,CCCCCCCCCCCCCCCCCC(=O)O,ethylene,C=C,hexamethylphosphoric triamide,CN(C)P(=O)(N(C)C)N(C)C,1,,
3743,Lubricants,stearic acid,CCCCCCCCCCCCCCCCCC(=O)O,ethylene,C=C,mesityl oxide,CC(=O)C=C(C)C,0,,


In [47]:
addits_selec.dropna().solv_name.unique()
addits_selec.dropna().name.unique()

array(['2,4-dihydroxybenzophenone', 'dioctyl adipate',
       'decabromodiphenyl ether', 'melamine', 'stearic acid',
       'butyl stearate', 'stearyl alcohol', 'azodicarbonamide', 'sudan I'],
      dtype=object)

### Load predictions

In [None]:
print("\n\n\t!! TIP: Warning below about feature names can be safely ignored.")
print("\t!!\tWe get this error because our features are suffixed with _solute instead of _addit when training.\n\n")
with open(pkl_path/"fit_RF_model_smallmol.pkl", "rb") as f:
	regr_rdkit_rf_smallmol = pickle.load(f)
pred_dgsolv_smallmol_rdkit_rf = regr_rdkit_rf_smallmol.predict(descrips_rdk_addit_solv)

Convert 

In [49]:
pred_solub_small_all = pd.DataFrame(pred_dgsolv_smallmol_rdkit_rf, columns=["pred_dgsolv"])

good_idx = addits_selec.dropna().index
pred_solub_small_selec = pred_solub_small_all[pred_solub_small_all.index.isin(good_idx)]

In [50]:

def conv_binary_solub(entry):
	# Soluble
	if entry < 0:
		return 1
	else:
		return 0


if "pred_solub_addit" not in addits_selec.columns:
	addits_selec = addits_selec.dropna()
	print("ADD")
	addits_selec.insert(loc=9, 
				column="pred_solub_addit", 
				#value=list(pred_solub_small_selec.pred_dgsolv.map(conv_binary_solub)))
				value=list(pred_solub_small_selec.pred_dgsolv.map(conv_binary_solub)))
	addits_selec.insert(loc=9,
				column="monopred_matches_expt", 
				value=None)
	addits_selec.insert(loc=12, 
				column="additpred_matches_expt",
				value=None)

				
for row in addits_selec.itertuples():
	if row.pred_solub_mono == row.expt_solub_mono:
		addits_selec.at[row.Index, "monopred_matches_expt"] = True
		print("Mono TRUE (pred/expt)",row.pred_solub_mono,row.expt_solub_mono)
	else:
		print("Mono FALSE (pred/expt)",row.pred_solub_mono,row.expt_solub_mono)
  
  
	if row.pred_solub_addit == row.expt_solub_addit:
		addits_selec.at[row.Index, "additpred_matches_expt"] = True
		print("Addit TRUE (pred/expt)",row.pred_solub_addit,row.expt_solub_addit)
	else:
		addits_selec.at[row.Index, "additpred_matches_expt"] = False
		print("Addit FALSE (pred/expt)",row.pred_solub_addit,row.expt_solub_addit)
	
dataset_A_additives = addits_selec.copy(deep=True)
print("\n\n Additives: Dataset A")
dataset_A_additives


ADD
Mono TRUE (pred/expt) 1 1.0
Addit TRUE (pred/expt) 1 1
Mono TRUE (pred/expt) 0 0.0
Addit TRUE (pred/expt) 1 1
Mono TRUE (pred/expt) 0 0.0
Addit TRUE (pred/expt) 1 1
Mono TRUE (pred/expt) 1 1.0
Addit TRUE (pred/expt) 1 1
Mono TRUE (pred/expt) 0 0.0
Addit FALSE (pred/expt) 1 0
Mono TRUE (pred/expt) 1 1.0
Addit FALSE (pred/expt) 1 0
Mono TRUE (pred/expt) 1 1.0
Addit FALSE (pred/expt) 1 0
Mono TRUE (pred/expt) 0 0.0
Addit FALSE (pred/expt) 1 0
Mono TRUE (pred/expt) 1 1.0
Addit FALSE (pred/expt) 1 0
Mono TRUE (pred/expt) 0 0.0
Addit FALSE (pred/expt) 1 0
Mono TRUE (pred/expt) 1 1.0
Addit TRUE (pred/expt) 1 1
Mono TRUE (pred/expt) 1 1.0
Addit TRUE (pred/expt) 1 1
Mono TRUE (pred/expt) 0 0.0
Addit TRUE (pred/expt) 1 1
Mono TRUE (pred/expt) 1 1.0
Addit TRUE (pred/expt) 1 1
Mono TRUE (pred/expt) 0 0.0
Addit TRUE (pred/expt) 1 1
Mono TRUE (pred/expt) 1 1.0
Addit TRUE (pred/expt) 1 1
Mono TRUE (pred/expt) 0 0.0
Addit TRUE (pred/expt) 1 1
Mono TRUE (pred/expt) 1 1.0
Addit TRUE (pred/expt) 1 1


Unnamed: 0,type,name,addit_smi,ex_mono,mono_smi,solv_name,solv_smi,pred_solub_mono,expt_solub_mono,monopred_matches_expt,pred_solub_addit,expt_solub_addit,additpred_matches_expt
52,Stabilizers,"2,4-dihydroxybenzophenone",O=C(c1ccccc1)c1ccc(O)cc1O,vinyl chloride,ClC=C,methyl ethyl ketone,CCC(C)=O,1,1.0,True,1,1,True
217,Stabilizers,"2,4-dihydroxybenzophenone",O=C(c1ccccc1)c1ccc(O)cc1O,ethylene,C=C,methanol,CO,0,0.0,True,1,1,True
232,Stabilizers,"2,4-dihydroxybenzophenone",O=C(c1ccccc1)c1ccc(O)cc1O,ethylene,C=C,ethanol,CCO,0,0.0,True,1,1,True
1710,Plasticizers,dioctyl adipate,CCCCCCCCOC(=O)CCCCC(=O)OCCCCCCCC,vinyl chloride,ClC=C,chloroform,ClC(Cl)Cl,1,1.0,True,1,1,True
1909,Flame retardants,decabromodiphenyl ether,Brc1c(Br)c(Br)c(Oc2c(Br)c(Br)c(Br)c(Br)c2Br)c(...,styrene,C=Cc1ccccc1,acetone,CC(C)=O,0,0.0,True,1,0,False
1918,Flame retardants,decabromodiphenyl ether,Brc1c(Br)c(Br)c(Oc2c(Br)c(Br)c(Br)c(Br)c2Br)c(...,styrene,C=Cc1ccccc1,benzene,c1ccccc1,1,1.0,True,1,0,False
1919,Flame retardants,decabromodiphenyl ether,Brc1c(Br)c(Br)c(Oc2c(Br)c(Br)c(Br)c(Br)c2Br)c(...,styrene,C=Cc1ccccc1,toluene,Cc1ccccc1,1,1.0,True,1,0,False
2121,Flame retardants,decabromodiphenyl ether,Brc1c(Br)c(Br)c(Oc2c(Br)c(Br)c(Br)c(Br)c2Br)c(...,ethylene,C=C,acetone,CC(C)=O,0,0.0,True,1,0,False
2554,Flame retardants,melamine,Nc1nc(N)nc(N)n1,styrene,C=Cc1ccccc1,benzene,c1ccccc1,1,1.0,True,1,0,False
2568,Flame retardants,melamine,Nc1nc(N)nc(N)n1,styrene,C=Cc1ccccc1,diethyl ether,CCOCC,0,0.0,True,1,0,False


In [63]:

addits_selec.additpred_matches_expt.value_counts()
addits_selec.monopred_matches_expt.value_counts()

True    33
Name: monopred_matches_expt, dtype: int64

In [64]:
dataset_A_additives.expt_solub_addit.value_counts()

1    24
0     9
Name: expt_solub_addit, dtype: int64

In [65]:
dataset_B_additives = addits_selec.loc[(addits_selec.expt_solub_mono == 0) & (addits_selec.expt_solub_addit == 1), :]
dataset_B_additives.name.unique()

array(['2,4-dihydroxybenzophenone', 'stearic acid', 'butyl stearate',
       'stearyl alcohol', 'sudan I'], dtype=object)

In [67]:
add_view = addits_selec.sort_values(by=["pred_solub_addit"]).set_index(['type','name','solv_name']).sort_values(by="type")
poly_view = (addits_selec.sort_values(by=["pred_solub_addit"])
						.set_index(['ex_mono','solv_name','name'])
						.sort_values(by=["ex_mono","solv_name","name"]))
# Paper_shortlist is equivalent to dataset_B_additives (additives Dataset B in paper)
paper_shortlist = poly_view.loc[(poly_view.expt_solub_mono == 0) & (poly_view.expt_solub_addit == 1), :] 
paper_shortlist

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,type,addit_smi,mono_smi,solv_smi,pred_solub_mono,expt_solub_mono,monopred_matches_expt,pred_solub_addit,expt_solub_addit,additpred_matches_expt
ex_mono,solv_name,name,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ethylene,acetone,butyl stearate,Lubricants,CCCCCCCCCCCCCCCCCC(=O)OCCCC,C=C,CC(C)=O,0,0.0,True,1,1,True
ethylene,acetone,stearic acid,Lubricants,CCCCCCCCCCCCCCCCCC(=O)O,C=C,CC(C)=O,0,0.0,True,1,1,True
ethylene,acetone,sudan I,Colourants,O=C1C=Cc2ccccc2/C1=N/Nc1ccccc1,C=C,CC(C)=O,0,0.0,True,1,1,True
ethylene,ethanol,"2,4-dihydroxybenzophenone",Stabilizers,O=C(c1ccccc1)c1ccc(O)cc1O,C=C,CCO,0,0.0,True,1,1,True
ethylene,ethanol,butyl stearate,Lubricants,CCCCCCCCCCCCCCCCCC(=O)OCCCC,C=C,CCO,0,0.0,True,1,1,True
ethylene,ethanol,stearyl alcohol,Lubricants,CCCCCCCCCCCCCCCCCCO,C=C,CCO,0,0.0,True,1,1,True
ethylene,ethanol,sudan I,Colourants,O=C1C=Cc2ccccc2/C1=N/Nc1ccccc1,C=C,CCO,0,0.0,True,1,1,True
ethylene,methanol,"2,4-dihydroxybenzophenone",Stabilizers,O=C(c1ccccc1)c1ccc(O)cc1O,C=C,CO,0,0.0,True,1,1,True
styrene,acetone,butyl stearate,Lubricants,CCCCCCCCCCCCCCCCCC(=O)OCCCC,C=Cc1ccccc1,CC(C)=O,0,0.0,True,1,1,True
styrene,acetone,stearic acid,Lubricants,CCCCCCCCCCCCCCCCCC(=O)O,C=Cc1ccccc1,CC(C)=O,0,0.0,True,1,1,True


In [54]:
add_view

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,addit_smi,ex_mono,mono_smi,solv_smi,pred_solub_mono,expt_solub_mono,monopred_matches_expt,pred_solub_addit,expt_solub_addit,additpred_matches_expt
type,name,solv_name,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Colourants,sudan I,ethanol,O=C1C=Cc2ccccc2/C1=N/Nc1ccccc1,ethylene,C=C,CCO,0,0.0,True,1,1,True
Colourants,sudan I,acetone,O=C1C=Cc2ccccc2/C1=N/Nc1ccccc1,methyl methacrylate,COC(=O)C(C)=C,CC(C)=O,1,1.0,True,1,1,True
Colourants,sudan I,carbon disulfide,O=C1C=Cc2ccccc2/C1=N/Nc1ccccc1,styrene,C=Cc1ccccc1,S=C=S,1,1.0,True,1,1,True
Colourants,sudan I,acetone,O=C1C=Cc2ccccc2/C1=N/Nc1ccccc1,styrene,C=Cc1ccccc1,CC(C)=O,0,0.0,True,1,1,True
Colourants,sudan I,acetone,O=C1C=Cc2ccccc2/C1=N/Nc1ccccc1,ethylene,C=C,CC(C)=O,0,0.0,True,1,1,True
Flame retardants,melamine,diethyl ether,Nc1nc(N)nc(N)n1,styrene,C=Cc1ccccc1,CCOCC,0,0.0,True,1,0,False
Flame retardants,melamine,benzene,Nc1nc(N)nc(N)n1,styrene,C=Cc1ccccc1,c1ccccc1,1,1.0,True,1,0,False
Flame retardants,decabromodiphenyl ether,acetone,Brc1c(Br)c(Br)c(Oc2c(Br)c(Br)c(Br)c(Br)c2Br)c(...,ethylene,C=C,CC(C)=O,0,0.0,True,1,0,False
Flame retardants,decabromodiphenyl ether,toluene,Brc1c(Br)c(Br)c(Oc2c(Br)c(Br)c(Br)c(Br)c2Br)c(...,styrene,C=Cc1ccccc1,Cc1ccccc1,1,1.0,True,1,0,False
Flame retardants,decabromodiphenyl ether,benzene,Brc1c(Br)c(Br)c(Oc2c(Br)c(Br)c(Br)c(Br)c2Br)c(...,styrene,C=Cc1ccccc1,c1ccccc1,1,1.0,True,1,0,False
