In [2]:
import re
import pickle
import blosc
import copy

import numpy as np
import pandas as pd
import rdkit
from rdkit import Chem
from rdkit import RDConfig
from rdkit.Chem import rdMolDescriptors, Descriptors, rdmolfiles, PandasTools, AllChem, Draw, rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D, MolDrawing, DrawingOptions
import mordred
from mordred import Calculator, descriptors
from itertools import combinations, permutations, pairwise, product, starmap
#from rdkit.Chem.Draw import IPythonConsole
import pickle
from scipy import stats
from pathlib import Path
from natsort import natsorted

from descriptor_gen_utils import _gen_morgan_fp, morgan_fp_from_smi_superlist, rdkit_fp_from_smi_superlist

## Load Atactic DF, Do mild cleanup

In [3]:
# Dataset variant to load; looked up later to obtain the input file suffix.
replacement_level = "none"
# Output format selector used by the writer cells: None, "_gzip" or "_blosc".
compression_type = None

# Data directories, resolved relative to the parent of the working directory.
csv_path = Path.cwd().parent.joinpath("data", "csvs")
json_path = Path.cwd().parent.joinpath("data", "jsons")
pkl_path = Path.cwd().parent.joinpath("data", "pkls")

In [4]:
def write_compressed_pkl(py_object, file_dir, file_name: str):
	"""Pickle ``py_object``, BLOSC-compress the bytes and write them to a file.

	Args:
		py_object: Any picklable Python object.
		file_dir: Target directory. A ``str`` is concatenated with ``file_name``
			verbatim (so it must already end with a path separator); a ``Path``
			is joined with ``file_name`` using ``/``.
		file_name: Name of the file to create.

	Raises:
		TypeError: If ``file_dir`` is neither a ``str`` nor a ``Path``.
	"""
	# Resolve the destination first so a bad directory argument fails before
	# any pickling/compression work is done.
	if isinstance(file_dir, str):
		file_to_write = file_dir + file_name
	elif isinstance(file_dir, Path):
		file_to_write = file_dir/file_name
	else:
		raise TypeError(f"file_dir must be a str or Path, got {type(file_dir).__name__}")

	pickled_obj = pickle.dumps(py_object)
	compressed_pickled_obj = blosc.compress(pickled_obj)

	with open(file_to_write, "wb") as f:
		f.write(compressed_pickled_obj)
		print("Successfully wrote BLOSC-compressed pickle:\t",file_name)

def read_compressed_pkl(file_dir, file_name: str):
	"""Read a BLOSC-compressed pickle from disk and return the unpickled object.

	Args:
		file_dir: Directory holding the file. A ``str`` is concatenated with
			``file_name`` verbatim (so it must already end with a path
			separator); a ``Path`` is joined with ``file_name`` using ``/``.
		file_name: Name of the file to read.

	Returns:
		The object stored in the file.

	Raises:
		TypeError: If ``file_dir`` is neither a ``str`` nor a ``Path``.
	"""
	if isinstance(file_dir, str):
		file_to_read = file_dir + file_name
	elif isinstance(file_dir, Path):
		file_to_read = file_dir/file_name
	else:
		raise TypeError(f"file_dir must be a str or Path, got {type(file_dir).__name__}")

	# Read compressed bytes
	with open(file_to_read, "rb") as f:
		compressed_obj = f.read()
	# Decompress bytes
	decompressed_obj = blosc.decompress(compressed_obj)
	# Convert from bytes.
	# NOTE: unpickling can execute arbitrary code; only load files you trust.
	obj_out = pickle.loads(decompressed_obj)
	print("Succesfully read BLOSC-compressed pickle:\t",file_name)
	return obj_out


In [5]:


# Maps each replacement level to the short tag used in the input file name.
replacement_level_dict = dict(none="nr", partial="wpr", full="wr")
file_suffix = replacement_level_dict[replacement_level]

# Semicolon-separated solvent table for the selected replacement level.
db_path = Path.cwd().parent.joinpath("data/csvs", f"combined_solv_df_atactic_{file_suffix}.csv")
df = pd.read_csv(db_path, sep=';')

# Keep the polymer -> section lookup before the section column is discarded.
section_dict = df.set_index('polymer').section.to_dict()

# Discard the bookkeeping columns, give the remaining six columns their working
# names, and keep only rows without missing values.
df = (
	df.drop(columns=['section', 'references', 'polymer_comments', 'solv_comments'])
	.set_axis(["polymer", "mono_name", "mono_smiles", "solvent", "solvent_smiles", "solvent_characteristic"], axis=1)
	.dropna()
)
df

Unnamed: 0,polymer,mono_name,mono_smiles,solvent,solvent_smiles,solvent_characteristic
0,Poly(allene),allene,C=C=C,benzene,C1=CC=CC=C1,1
1,"Poly(1,2-butadiene)","1,2-butadiene",C=C=CC,toluene,CC1=CC=CC=C1,1
2,"Poly(1,2-butadiene)","1,2-butadiene",C=C=CC,chloroform,ClC(Cl)Cl,1
3,"Poly(1,3-butadiene)","1,3-butadiene",C=CC=C,tetrahydrofuran,O1CCCC1,1
4,Poly(isoprene),isoprene,C=CC(C)=C,tetrahydrofuran,O1CCCC1,1
...,...,...,...,...,...,...
1850,"Poly(dihydrosilylene-1,4-phenylene)","dihydrosilylene-1,4-phenylene",[SiH2]=C1[C]=CC=[C]C1,ethanol,CCO,0
1851,"Poly(hydrophenylsilyleneethynylene-1,3-phenyle...","hydrophenylsilyleneethynylene-1,3-phenyleneeth...",[C]#CC1C(C(C#[C])=CC=C1)=[SiH]C2C=CC=CC2,hexane,CCCCCC,0
1852,"Poly(hydrophenylsilyleneethynylene-1,3-phenyle...","hydrophenylsilyleneethynylene-1,3-phenyleneeth...",[C]#CC1C(C(C#[C])=CC=C1)=[SiH]C2C=CC=CC2,methanol,CO,0
1853,Poly(hexamethylcyclotrisilazane),hexamethylcyclotrisilazane,C[Si]1(C)N(C)[Si](C)(C)N(C)[SiH2]N1,acetone,CC(C)=O,0


### Remove Polymers/Datapoints with 'At Elevated Temp'

Some polymers (e.g. those of ethylene, propylene, etc.) had solvents listed that only apply under special conditions, introduced by statements like "at elevated temperature:" or "Above 80C:". We drop these data points (~50) and reset the index.

In [8]:
# Monomers whose 'good solvent' rows are to be removed from the table.
elevated_temp_mono_goodsolv = [
	'oxymethylene', 'formaldehyde', 'propylene', 'ethylene',
	"oxyisophthaloyloxy-1,4-phenylene", "phenylene isophthalate",
	"oxyisophthyloyloxy-4,4'-biphenylylene",
	"iminoadipoyliminohexamethylene", "hexamethylene adipamide",
]  # Poly(1-butene), isotactic
# Monomers whose 'bad solvent' rows are to be removed from the table.
elevated_temp_mono_badsolv = ["thio-1,4-phenylene", 'phenylene sulfide']

# Good solvents carry solvent_characteristic 1, bad solvents 0.
df_elev_gsolv = df.loc[df.mono_name.isin(elevated_temp_mono_goodsolv) & df.solvent_characteristic.eq(1)]
df_elev_bsolv = df.loc[df.mono_name.isin(elevated_temp_mono_badsolv) & df.solvent_characteristic.eq(0)]

print("Df shape before:",df.shape)
df = df.drop(index=df_elev_gsolv.index).drop(index=df_elev_bsolv.index)
print("Df shape after:",df.shape)



Df shape before: (1818, 6)
Df shape after: (1818, 6)


### Create Mol Objects

In [9]:
# Order rows by monomer SMILES and renumber them 0..n-1.
df = df.sort_values(by='mono_smiles').reset_index(drop=True)

In [10]:

def insert_new_col(df, pos, col_name, values):
	"""Insert ``values`` as column ``col_name`` at position ``pos``, in place.

	If ``df`` already has a column called ``col_name`` nothing is changed.
	The same ``df`` object is returned either way.
	"""
	already_present = col_name in df.columns
	if already_present:
		return df
	df.insert(pos, col_name, values)
	return df
# Parse every monomer SMILES and store the molecules as column 3.
mono_mols = list(map(Chem.MolFromSmiles, df.mono_smiles))
df = insert_new_col(df, 3, 'mono_mol', mono_mols)

In [11]:
# Parse every solvent SMILES and store the molecules as column 6.
solv_smiles = df.solvent_smiles.tolist()
solv_mols = list(map(Chem.MolFromSmiles, solv_smiles))
df = insert_new_col(df, 6, 'solvent_mol', solv_mols)


In [12]:

# A "." in a SMILES string separates disconnected fragments, i.e. the entry
# describes more than one molecule. Collect (position among unique values, SMILES).
mono_multiple_mols = [
	(idx, smi) for idx, smi in enumerate(df['mono_smiles'].unique()) if "." in smi
]
solvent_multiple_mols = [
	(idx, smi) for idx, smi in enumerate(df['solvent_smiles'].unique()) if "." in smi
]

if not mono_multiple_mols:
	print("No monomers have extra molecules")
else:
	print("~~~~~~~~~~~~~~~~~~~~~~WARNING~~~~~~~~~~~~~~~~~~~~~~")
	print("~~~~~~~~~~~~~~~~~~~~~~WARNING~~~~~~~~~~~~~~~~~~~~~~")
	print("ONE OR MORE MONOMERS HAVE TWO MOLECULES IN SMILES, MUST FIX MANUALLY")
	print(*mono_multiple_mols,sep='\n')

if not solvent_multiple_mols:
	print("No solvents have extra molecules")
else:
	print("~~~~~~~~~~~~~~~~~~~~~~WARNING~~~~~~~~~~~~~~~~~~~~~~")
	print("~~~~~~~~~~~~~~~~~~~~~~WARNING~~~~~~~~~~~~~~~~~~~~~~")
	print("ONE OR MORE SOLVENTS HAVE TWO MOLECULES IN SMILES, MUST FIX MANUALLY")
	print(*solvent_multiple_mols,sep='\n')

# Rows affected by the offending SMILES, and the names they belong to.
mono_multiple_mols_df = df.loc[df['mono_smiles'].isin([smi for _, smi in mono_multiple_mols])]
solvent_multiple_mols_df = df.loc[df['solvent_smiles'].isin([smi for _, smi in solvent_multiple_mols])]
print("Bad Monomer Smiles\n",mono_multiple_mols_df.mono_name.unique().tolist())
print("Bad Solvent Smiles\n",solvent_multiple_mols_df.solvent.unique().tolist())



No monomers have extra molecules
No solvents have extra molecules
Bad Monomer Smiles
 []
Bad Solvent Smiles
 []


### Generate NoRad DF

In [13]:
import os
from pathlib import Path
from copy import copy,deepcopy

def clear_radical_electrons(mol_list: list, print_output: bool=False):
	"""Return new molecules in which single-radical atoms are capped with hydrogen.

	For every entry of ``mol_list``, in order:

	* ``None`` (what ``Chem.MolFromSmiles`` returns for an unparseable SMILES)
	  and ``float`` placeholders such as ``np.nan`` become ``np.nan``.
	* A molecule with radical electrons gets, for each atom carrying exactly one
	  radical electron, that electron cleared and a hydrogen atom bonded to the
	  atom; the result is passed through ``Chem.RemoveHs``. Atoms carrying two
	  radical electrons are left untouched.
	* Any other molecule is passed through ``Chem.RemoveHs``.

	The input molecules are not modified: edits happen on an ``RWMol`` copy and
	``Chem.RemoveHs`` returns a new molecule.

	Args:
		mol_list: RDKit molecules, possibly mixed with ``None``/float placeholders.
		print_output: If True, display the molecule before/after and print
			atom counts while hydrogens are added.

	Returns:
		A new list with the same length and order as ``mol_list``.
	"""
	new_mol_list = []
	for mol_pos, m in enumerate(mol_list):
		if m is None:
			# Failed SMILES parse: treat like a missing value instead of crashing
			# inside NumRadicalElectrons.
			print(f"Mol {mol_pos} is None, appending np.nan")
			new_mol_list.append(np.nan)
			continue
		if isinstance(m, float):
			# isinstance (not type() ==) so numpy float scalars are caught as well.
			print(m,"Is float, appending np.nan")
			new_mol_list.append(np.nan)
			continue
		if not Chem.Descriptors.NumRadicalElectrons(m) > 0:
			new_mol_list.append(Chem.RemoveHs(m))
			continue

		if print_output:
			display("BEFORE ADDITION",m)
		rw_mol = Chem.RWMol(m)
		# Work with atom indices rather than atom objects so nothing is held
		# across the AddAtom calls below.
		rad_pos = [atom.GetIdx() for atom in rw_mol.GetAtoms() if atom.GetNumRadicalElectrons() == 1]
		has_double_rads = any(atom.GetNumRadicalElectrons() == 2 for atom in rw_mol.GetAtoms())
		if has_double_rads and print_output:
			print("~~Warning - Double Radicals present, H not added for them~~")

		#* Set Num radicals to zero, add bond to H for each radical. Does not touch double radicals
		for i, rad_idx in enumerate(rad_pos):
			if print_output:
				print(f"NUM ATOMS BEFORE H ADDITION {i}: {rw_mol.GetNumAtoms()}")
			rw_mol.GetAtomWithIdx(rad_idx).SetNumRadicalElectrons(0)

			# AddAtom returns the index of the atom it just appended.
			h_idx = rw_mol.AddAtom(Chem.Atom('H'))
			if print_output:
				print(f"NUM ATOMS AFTER H ADDITION {i}: {rw_mol.GetNumAtoms()}")

			rw_mol.AddBond(rad_idx, h_idx, Chem.BondType.SINGLE)

		mol_out = Chem.RemoveHs(rw_mol.GetMol())
		if print_output:
			display("AFTER ADDITION",mol_out)
		new_mol_list.append(mol_out)

		print(f"~~RADICALS REPLACED WITH H FOR MOL {mol_pos}~~\n")
	return new_mol_list


In [14]:
def get_norad_smiles(mol_lst: list, do_print: bool=False):
	"""Clear radicals from ``mol_lst`` and return the molecules with their SMILES.

	Returns a ``(molecules, smiles)`` pair of equally long lists. Entries that
	``clear_radical_electrons`` returned as ``np.nan`` stay ``np.nan`` in the
	SMILES list; every other entry is converted with ``Chem.MolToSmiles``.
	"""
	mol_norad = clear_radical_electrons(mol_lst, print_output=do_print)
	smiles_norad = [
		np.nan if m is np.nan else Chem.MolToSmiles(m)
		for m in mol_norad
	]
	return mol_norad, smiles_norad


# Position 0 holds the monomer results, position 1 the solvent results.
mol_superlist_norad, smiles_superlist_norad = [], []
for i, lst in enumerate((mono_mols, solv_mols)):
	print("ON LIST NO",i)
	mol_norad, smiles_norad = get_norad_smiles(lst)
	mol_superlist_norad += [mol_norad]
	smiles_superlist_norad += [smiles_norad]

ON LIST NO 0
~~RADICALS REPLACED WITH H FOR MOL 16~~

~~RADICALS REPLACED WITH H FOR MOL 17~~

~~RADICALS REPLACED WITH H FOR MOL 18~~

~~RADICALS REPLACED WITH H FOR MOL 19~~

~~RADICALS REPLACED WITH H FOR MOL 20~~

~~RADICALS REPLACED WITH H FOR MOL 29~~

~~RADICALS REPLACED WITH H FOR MOL 844~~

~~RADICALS REPLACED WITH H FOR MOL 845~~

~~RADICALS REPLACED WITH H FOR MOL 846~~

~~RADICALS REPLACED WITH H FOR MOL 847~~

~~RADICALS REPLACED WITH H FOR MOL 848~~

~~RADICALS REPLACED WITH H FOR MOL 849~~

~~RADICALS REPLACED WITH H FOR MOL 850~~

~~RADICALS REPLACED WITH H FOR MOL 851~~

~~RADICALS REPLACED WITH H FOR MOL 852~~

~~RADICALS REPLACED WITH H FOR MOL 853~~

~~RADICALS REPLACED WITH H FOR MOL 854~~

~~RADICALS REPLACED WITH H FOR MOL 855~~

~~RADICALS REPLACED WITH H FOR MOL 856~~

~~RADICALS REPLACED WITH H FOR MOL 857~~

~~RADICALS REPLACED WITH H FOR MOL 858~~

~~RADICALS REPLACED WITH H FOR MOL 859~~

~~RADICALS REPLACED WITH H FOR MOL 881~~

~~RADICALS REPLACED WITH H 

In [15]:
# Split the superlists back into their monomer / solvent halves.
[mono_mol_norad, solv_mol_norad] = mol_superlist_norad
[mono_smiles_norad, solv_smiles_norad] = smiles_superlist_norad
# One line per row: (monomer name, original SMILES, radical-free SMILES).
print(*zip(df.mono_name, df.mono_smiles, mono_smiles_norad), sep='\n')

('acetylene', 'C#C', 'C#C')
('acetylene', 'C#C', 'C#C')
('acetylene', 'C#C', 'C#C')
('acetylene', 'C#C', 'C#C')
('acetylene', 'C#C', 'C#C')
('acetylene', 'C#C', 'C#C')
('acetylene', 'C#C', 'C#C')
('acetylene', 'C#C', 'C#C')
('acetylene', 'C#C', 'C#C')
('acetylene', 'C#C', 'C#C')
('phenylacetylene', 'C#CC1=CC=CC=C1', 'C#Cc1ccccc1')
('phenylacetylene', 'C#CC1=CC=CC=C1', 'C#Cc1ccccc1')
('phenylacetylene', 'C#CC1=CC=CC=C1', 'C#Cc1ccccc1')
('phenylacetylene', 'C#CC1=CC=CC=C1', 'C#Cc1ccccc1')
('phenylacetylene', 'C#CC1=CC=CC=C1', 'C#Cc1ccccc1')
('9,9-dipropargylfluorene', 'C#CCC1(CC#C)C2=C(C3=C1C=CC=C3)C=CC=C2', 'C#CCC1(CC#C)c2ccccc2-c2ccccc21')
('1,4-phenylene-1,2-di(4-phenoxyphenyl)ethenylene', 'C(C1=CC=C(OC2=CC=CC=C2)C=C1)(C3=CC=[C]C=C3)=[C]C4=CC=C(OC5=CC=CC=C5)C=C4', 'C(=C(c1ccccc1)c1ccc(Oc2ccccc2)cc1)c1ccc(Oc2ccccc2)cc1')
('1,4-phenylene-1,2-di(4-phenoxyphenyl)ethenylene', 'C(C1=CC=C(OC2=CC=CC=C2)C=C1)(C3=CC=[C]C=C3)=[C]C4=CC=C(OC5=CC=CC=C5)C=C4', 'C(=C(c1ccccc1)c1ccc(Oc2ccccc2)cc1)c1cc

In [16]:
# Radical-free copy of df: the mol and SMILES columns are replaced by their
# norad counterparts (index 0 = monomer, index 1 = solvent).
dfnr = df.copy(deep=True)
mol_colnames = ['mono_mol', 'solvent_mol']
smiles_colnames = ['mono_smiles', 'solvent_smiles']
for colname, norad_values in zip(
	mol_colnames + smiles_colnames,
	mol_superlist_norad + smiles_superlist_norad,
):
	dfnr[colname] = norad_values
dfnr

Unnamed: 0,polymer,mono_name,mono_smiles,mono_mol,solvent,solvent_smiles,solvent_mol,solvent_characteristic
0,Poly(acetylene),acetylene,C#C,"<img data-content=""rdkit/molecule"" src=""data:i...",aniline,Nc1ccccc1,"<img data-content=""rdkit/molecule"" src=""data:i...",1
1,Poly(acetylene),acetylene,C#C,"<img data-content=""rdkit/molecule"" src=""data:i...",acetone,CC(C)=O,"<img data-content=""rdkit/molecule"" src=""data:i...",0
2,Poly(acetylene),acetylene,C#C,"<img data-content=""rdkit/molecule"" src=""data:i...",Benzene,c1ccccc1,"<img data-content=""rdkit/molecule"" src=""data:i...",0
3,Poly(acetylene),acetylene,C#C,"<img data-content=""rdkit/molecule"" src=""data:i...",carbon tetrachloride,ClC(Cl)(Cl)Cl,"<img data-content=""rdkit/molecule"" src=""data:i...",0
4,Poly(acetylene),acetylene,C#C,"<img data-content=""rdkit/molecule"" src=""data:i...",cyclohexane,C1CCCCC1,"<img data-content=""rdkit/molecule"" src=""data:i...",0
...,...,...,...,...,...,...,...,...
1813,Poly(methylphenylsiloxane),methylphenylsiloxane,C[SiH](O[SiH3])c1ccccc1,"<img data-content=""rdkit/molecule"" src=""data:i...",diethyl ether,CCOCC,"<img data-content=""rdkit/molecule"" src=""data:i...",1
1814,Poly(methylphenylsiloxane),methylphenylsiloxane,C[SiH](O[SiH3])c1ccccc1,"<img data-content=""rdkit/molecule"" src=""data:i...",ethylene glycol,OCCO,"<img data-content=""rdkit/molecule"" src=""data:i...",0
1815,Poly(methylphenylsiloxane),methylphenylsiloxane,C[SiH](O[SiH3])c1ccccc1,"<img data-content=""rdkit/molecule"" src=""data:i...",chloroform,ClC(Cl)Cl,"<img data-content=""rdkit/molecule"" src=""data:i...",1
1816,Poly(methylphenylsiloxane),methylphenylsiloxane,C[SiH](O[SiH3])c1ccccc1,"<img data-content=""rdkit/molecule"" src=""data:i...",butanol,CCCCO,"<img data-content=""rdkit/molecule"" src=""data:i...",1


In [17]:
# Two-column table: each row's polymer name and the section ("ptype") recorded
# for that polymer in section_dict.
dfnr_type = dfnr[['polymer']].copy()
dfnr_type['ptype'] = dfnr_type.polymer.map(section_dict)
# NOTE(review): the output saved with this notebook shows
# Poly(methylphenylsiloxane) mapped to "Poly(sulfonates)". section_dict is built
# with to_dict() on a polymer-indexed Series, so for a polymer name that occurs
# under several sections only the last one survives - confirm the mapping.

# Show the ten most frequent polymer types; a bare expression in the middle of
# a cell is evaluated but never rendered, so display() is required here.
display(dfnr_type.ptype.value_counts().nlargest(10))
dfnr_type

Unnamed: 0,polymer,ptype
0,Poly(acetylene),Poly(acetylenes)
1,Poly(acetylene),Poly(acetylenes)
2,Poly(acetylene),Poly(acetylenes)
3,Poly(acetylene),Poly(acetylenes)
4,Poly(acetylene),Poly(acetylenes)
...,...,...
1813,Poly(methylphenylsiloxane),Poly(sulfonates)
1814,Poly(methylphenylsiloxane),Poly(sulfonates)
1815,Poly(methylphenylsiloxane),Poly(sulfonates)
1816,Poly(methylphenylsiloxane),Poly(sulfonates)


In [18]:
# Persist the polymer -> type table as a semicolon-separated CSV without the index.
dfnr_type.to_csv(path_or_buf=csv_path / "df_TYPES_atactic_NOPE.csv", index=False, sep=';')

In [19]:

# Write the radical-free dataframe in the format selected by compression_type.
if compression_type == '_blosc':
	# The directory is passed as a str ending in "/" because
	# write_compressed_pkl builds the target as file_dir + file_name.
	write_compressed_pkl(dfnr, f"{pkl_path}/", f"df_atactic_NOPE_{file_suffix}_norad{compression_type}.pkl")
elif compression_type == '_gzip':
	dfnr.to_pickle(pkl_path/f"df_atactic_NOPE_{file_suffix}_norad{compression_type}.pkl",
							compression={'method': 'gzip'})
elif compression_type is None:
	dfnr.to_pickle(pkl_path/f"df_atactic_NOPE_{file_suffix}_norad.pkl")
else:
	# Fail loudly instead of silently writing nothing for a mistyped setting.
	raise ValueError(f"Unsupported compression_type: {compression_type!r}")


### Generate Descriptors

#### Descriptor Cleanup

In [20]:
#* Check if column is all zero, Get Column Statistics
def get_col_non_zero_percent(df, print_extra: bool=False, percent_cutoff: float=1.0):
	"""Drop sparse columns of ``df`` IN PLACE and report non-zero percentages.

	For each column the percentage of non-zero values among its non-missing
	values is computed (missing values are ignored because ``value_counts``
	skips them). Columns whose percentage is below ``percent_cutoff`` are
	removed from ``df`` itself.

	Columns whose first value is a ``numpy.ndarray`` are reported and skipped:
	they are kept in ``df`` and do not appear in the returned list, since their
	values cannot be counted. A column with no non-missing values gets a
	percentage of NaN and is kept.

	Args:
		df: DataFrame to inspect; modified in place.
		print_extra: If True, display the per-column counts.
		percent_cutoff: Minimum percentage (0-100) of non-zero values a column
			needs in order to be kept.

	Returns:
		List of ``(column name, percent non-zero)`` for the columns that were
		analysed and kept.
	"""
	print("BEFORE NON ZERO DROPS",df.shape)
	non_zero_lst = []
	cols_to_drop = []
	for col_name in list(df.columns):
		col = df[col_name]
		# Positional access: the index need not contain the label 0.
		if len(col) > 0 and isinstance(col.iloc[0], np.ndarray):
			print(col_name,"is array")
			continue
		if (col == 0).all():
			print("ALL ZERO",col_name)
		val_counts = col.value_counts()

		zero_counts = val_counts.index.isin([0.0])
		non_zero_counts = val_counts[~zero_counts]
		sum_val_counts = val_counts.sum()
		sum_nonzero_counts = non_zero_counts.sum()
		num_zero_counts = sum_val_counts - sum_nonzero_counts
		if sum_val_counts:
			percent_non_zero = (sum_nonzero_counts/sum_val_counts)*100
		else:
			# No countable values at all: avoid 0/0.
			percent_non_zero = float("nan")
		#! Added: Remove col if < cutoff
		if percent_non_zero < percent_cutoff:
			print(f"Dropped {col_name} as it was less than cutoff of {percent_cutoff}: {percent_non_zero}")
			cols_to_drop.append(col_name)
			continue

		non_zero_lst.append((col_name,percent_non_zero))
		if print_extra:
			print(col_name)
			display(val_counts)
			display(zero_counts)
			display(non_zero_counts)
			print("TOTAL SUM",sum_val_counts)
			print("NON-ZERO COUNTS",sum_nonzero_counts)
			print("ZERO COUNTS",num_zero_counts)
			print("\n{} is {:2.2f} % non-zero \n\n".format(col_name,percent_non_zero))
	# One drop for all sparse columns instead of one frame rewrite per column.
	if cols_to_drop:
		df.drop(columns=cols_to_drop, inplace=True)
	print("AFTER NON ZERO DROPS",df.shape)
	return non_zero_lst




#### Gen MorganFP

Generate Morgan fingerprints (MFP) for the atactic, radical-free ("norad") dataset.

In [40]:
# Switches for the fingerprint cells: compute them / write them to disk.
gen_fingerprints, write_fps_to_file = True, True
# Fingerprint length in bits (32768).
n_fp_bits = 2 ** 15

In [36]:
# Compute Morgan fingerprints for the radical-free monomer and solvent SMILES.
if gen_fingerprints:
	atnr_label_list = ['mono', 'solvent']
	df_morgan_fp_fxn = morgan_fp_from_smi_superlist(smi_superlist=smiles_superlist_norad,
													label_list=atnr_label_list,
													fp_rad=3, n_fp_bits=n_fp_bits,
													hush=False)
# Write them in the format selected by compression_type (CSV when uncompressed).
if write_fps_to_file:
	mfp_stem = f"atactic_NOPE_{file_suffix}_morgan_fp_{n_fp_bits}_norad"
	# `or ''` keeps the literal text "None" out of the name when no compression is set.
	mfp_fname = f"{mfp_stem}{compression_type or ''}"

	if compression_type == '_blosc':
		# The directory is passed as a str ending in "/" because
		# write_compressed_pkl builds the target as file_dir + file_name.
		write_compressed_pkl(df_morgan_fp_fxn, f"{pkl_path}/", mfp_fname + ".pkl")
	elif compression_type == '_gzip':
		df_morgan_fp_fxn.to_pickle(pkl_path/(f"{mfp_fname}.pkl"), compression={'method': 'gzip'})
	else:
		df_morgan_fp_fxn.to_csv(csv_path/(f"{mfp_stem}.csv"), index=False)

Processing Morgan FP for mono...
	bits in use: 2153
	Stacked shape: (1818, 32768)
	No NaNs found
Processing Morgan FP for solvent...
	bits in use: 537
	Stacked shape: (1818, 32768)
	No NaNs found
	Joining dataframes for mono and solvent


#### Gen RDKit FP

Clean

In [38]:
# Compute RDKit fingerprints for the radical-free monomer and solvent SMILES.
if gen_fingerprints:
	atnr_label_list = ['mono', 'solvent']
	df_rdkit_fp_fxn = rdkit_fp_from_smi_superlist(smi_superlist=smiles_superlist_norad,
													label_list=atnr_label_list,
													fp_rad=3, n_fp_bits=n_fp_bits,
													hush=False)
# Write them in the format selected by compression_type (CSV when uncompressed).
if write_fps_to_file:
	rdfp_stem = f"atactic_NOPE_{file_suffix}_rdkit_fp_{n_fp_bits}_norad"
	# `or ''` keeps the literal text "None" out of the name when no compression is set.
	rdfp_fname = f"{rdfp_stem}{compression_type or ''}"

	if compression_type == '_blosc':
		# The directory is passed as a str ending in "/" because
		# write_compressed_pkl builds the target as file_dir + file_name.
		write_compressed_pkl(df_rdkit_fp_fxn, f"{pkl_path}/", rdfp_fname + ".pkl")
	elif compression_type == '_gzip':
		df_rdkit_fp_fxn.to_pickle(pkl_path/(f"{rdfp_fname}.pkl"), compression={'method': 'gzip'})
	else:
		df_rdkit_fp_fxn.to_csv(csv_path/(f"{rdfp_stem}.csv"), index=False)

#### Gen Mordred Descriptors

In [41]:
# Mordred calculators: `calc` skips 3D descriptors, `calc3D` includes them.
calc = Calculator(descriptors, ignore_3D=True)
calc3D = Calculator(descriptors, ignore_3D=False)
desc_list_norad = []

# 2D descriptors for the radical-free monomer molecules, then the solvents.
df_mordred_2d_norad_mono, df_mordred_2d_norad_solv = (
	calc.pandas(dfnr[mol_col]) for mol_col in ('mono_mol', 'solvent_mol')
)
print(df_mordred_2d_norad_mono.shape, df_mordred_2d_norad_solv.shape)

 83%|████████▎ | 1513/1818 [01:00<00:16, 18.64it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 1818/1818 [01:10<00:00, 25.63it/s]
100%|██████████| 1818/1818 [00:55<00:00, 32.64it/s]


(1818, 1613) (1818, 1613)


In [25]:
# Put monomer and solvent descriptors side by side, matching rows on the index;
# column names present in both frames get a "_mono" / "_solvent" suffix.
df_mordred_2d_norad = df_mordred_2d_norad_mono.join(
	df_mordred_2d_norad_solv,
	lsuffix='_mono',
	rsuffix='_solvent',
	how='left',
)
display(df_mordred_2d_norad.dropna().shape, df_mordred_2d_norad.shape, df_mordred_2d_norad)

(1818, 3226)

(1818, 3226)

Unnamed: 0,ABC_mono,ABCGG_mono,nAcid_mono,nBase_mono,SpAbs_A_mono,SpMax_A_mono,SpDiam_A_mono,SpAD_A_mono,SpMAD_A_mono,LogEE_A_mono,...,SRW10_solvent,TSRW10_solvent,MW_solvent,AMW_solvent,WPath_solvent,WPol_solvent,Zagreb1_solvent,Zagreb2_solvent,mZagreb1_solvent,mZagreb2_solvent
0,0.000000,0.000000,0,0,2.000000,1.00000,2.000000,2.000000,1.000000,1.407606,...,8.124151,33.544698,93.057849,6.646989,42,5,30.0,31.0,2.361111,1.666667
1,0.000000,0.000000,0,0,2.000000,1.00000,2.000000,2.000000,1.000000,1.407606,...,6.188264,24.179697,58.041865,5.804186,9,0,12.0,9.0,3.111111,1.000000
2,0.000000,0.000000,0,0,2.000000,1.00000,2.000000,2.000000,1.000000,1.407606,...,7.627057,30.941317,78.046950,6.503913,27,3,24.0,24.0,1.5,1.500000
3,0.000000,0.000000,0,0,2.000000,1.00000,2.000000,2.000000,1.000000,1.407606,...,7.625107,29.418928,151.875411,30.375082,16,0,20.0,16.0,4.0625,1.000000
4,0.000000,0.000000,0,0,2.000000,1.00000,2.000000,2.000000,1.000000,1.407606,...,7.627057,30.941317,84.093900,4.671883,27,3,24.0,24.0,1.5,1.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1813,7.140017,6.873356,0,0,12.857279,2.21432,4.428639,12.857279,1.285728,3.188884,...,6.192362,25.583106,74.073165,4.938211,20,2,14.0,12.0,2.75,1.500000
1814,7.140017,6.873356,0,0,12.857279,2.21432,4.428639,12.857279,1.285728,3.188884,...,5.509388,22.328143,62.036779,6.203678,10,1,10.0,8.0,2.5,1.250000
1815,7.140017,6.873356,0,0,12.857279,2.21432,4.428639,12.857279,1.285728,3.188884,...,6.188264,24.179697,117.914383,23.582877,9,0,12.0,9.0,3.111111,1.000000
1816,7.140017,6.873356,0,0,12.857279,2.21432,4.428639,12.857279,1.285728,3.188884,...,6.192362,25.583106,74.073165,4.938211,20,2,14.0,12.0,2.75,1.500000


In [26]:

def return_nan_on_str(x):
	"""Return ``np.nan`` when ``x`` is a string, otherwise return ``x`` unchanged.

	Intended for single values; a container such as a pandas Series is not a
	``str`` and is therefore returned as-is.
	"""
	# isinstance is required here: comparing type(x) with the text 'str' is
	# never true, so no string would ever be replaced.
	if isinstance(x, str):
		print("STRING REPLACED",x)
		return np.nan
	return x


# Columns stored with object dtype are not plain numbers and are dropped.
# (presumably these hold Mordred's non-numeric placeholders for values it
# could not compute - TODO confirm)
object_cols = df_mordred_2d_norad.columns[df_mordred_2d_norad.dtypes == object].tolist()
print(f"DATATYPE IS OBJECT, DROPPING {len(object_cols)} COLS")
numeric_descriptors = df_mordred_2d_norad.drop(columns=object_cols)

# Columns that are zero in every row carry no information and are dropped too.
all_zero_mask = (numeric_descriptors == 0).all(axis=0)
all_zero_cols = numeric_descriptors.columns[all_zero_mask].tolist()
print(f"ALL EQUALS ZERO, DROPPING {len(all_zero_cols)} COLS")
df_mordred_2d_norad = numeric_descriptors.drop(columns=all_zero_cols)

# No per-cell string scrubbing is needed afterwards: with strings stored as
# object dtype, none can remain once the object columns are gone. The dropped
# names stay available in object_cols / all_zero_cols for inspection.

display(df_mordred_2d_norad.dropna().shape, df_mordred_2d_norad)


ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS ZERO: False
ALL EQUALS 

(1818, 1808)

Unnamed: 0,ABC_mono,ABCGG_mono,nAcid_mono,nBase_mono,SpAbs_A_mono,SpMax_A_mono,SpDiam_A_mono,SpAD_A_mono,SpMAD_A_mono,LogEE_A_mono,...,SRW09_solvent,SRW10_solvent,TSRW10_solvent,MW_solvent,AMW_solvent,WPath_solvent,WPol_solvent,Zagreb1_solvent,Zagreb2_solvent,mZagreb2_solvent
0,0.000000,0.000000,0,0,2.000000,1.00000,2.000000,2.000000,1.000000,1.407606,...,0.0,8.124151,33.544698,93.057849,6.646989,42,5,30.0,31.0,1.666667
1,0.000000,0.000000,0,0,2.000000,1.00000,2.000000,2.000000,1.000000,1.407606,...,0.0,6.188264,24.179697,58.041865,5.804186,9,0,12.0,9.0,1.000000
2,0.000000,0.000000,0,0,2.000000,1.00000,2.000000,2.000000,1.000000,1.407606,...,0.0,7.627057,30.941317,78.046950,6.503913,27,3,24.0,24.0,1.500000
3,0.000000,0.000000,0,0,2.000000,1.00000,2.000000,2.000000,1.000000,1.407606,...,0.0,7.625107,29.418928,151.875411,30.375082,16,0,20.0,16.0,1.000000
4,0.000000,0.000000,0,0,2.000000,1.00000,2.000000,2.000000,1.000000,1.407606,...,0.0,7.627057,30.941317,84.093900,4.671883,27,3,24.0,24.0,1.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1813,7.140017,6.873356,0,0,12.857279,2.21432,4.428639,12.857279,1.285728,3.188884,...,0.0,6.192362,25.583106,74.073165,4.938211,20,2,14.0,12.0,1.500000
1814,7.140017,6.873356,0,0,12.857279,2.21432,4.428639,12.857279,1.285728,3.188884,...,0.0,5.509388,22.328143,62.036779,6.203678,10,1,10.0,8.0,1.250000
1815,7.140017,6.873356,0,0,12.857279,2.21432,4.428639,12.857279,1.285728,3.188884,...,0.0,6.188264,24.179697,117.914383,23.582877,9,0,12.0,9.0,1.000000
1816,7.140017,6.873356,0,0,12.857279,2.21432,4.428639,12.857279,1.285728,3.188884,...,0.0,6.192362,25.583106,74.073165,4.938211,20,2,14.0,12.0,1.500000


In [27]:
# Remove descriptor columns that are non-zero in fewer than 2 % of their values;
# the call edits df_mordred_2d_norad itself and returns the surviving columns'
# percentages.
non_zero_percents_mordred_2d = get_col_non_zero_percent(
	df_mordred_2d_norad, print_extra=False, percent_cutoff=2.0
)
# Surviving columns, least populated first.
print(*sorted(non_zero_percents_mordred_2d, key=lambda name_pct: name_pct[1]), sep='\n')
# Each remaining column label together with the Python type of the label.
print(*zip(df_mordred_2d_norad.columns, map(type, df_mordred_2d_norad.columns)), sep='\n')
display(df_mordred_2d_norad.corr().head())

BEFORE NON ZERO DROPS (1818, 1808)
Dropped nBase_mono as it was less than cutoff of 2.0: 1.2651265126512652
Dropped nSpiro_mono as it was less than cutoff of 2.0: 0.16501650165016502
Dropped nB_mono as it was less than cutoff of 2.0: 0.11001100110011
Dropped nBr_mono as it was less than cutoff of 2.0: 0.825082508250825
Dropped nI_mono as it was less than cutoff of 2.0: 0.16501650165016502
Dropped Xch-3d_mono as it was less than cutoff of 2.0: 1.76017601760176
Dropped Xch-3dv_mono as it was less than cutoff of 2.0: 1.76017601760176
Dropped NssBH_mono as it was less than cutoff of 2.0: 0.11001100110011
Dropped NtCH_mono as it was less than cutoff of 2.0: 1.1001100110011002
Dropped NdNH_mono as it was less than cutoff of 2.0: 0.44004400440044
Dropped NaaNH_mono as it was less than cutoff of 2.0: 0.605060506050605
Dropped NddsN_mono as it was less than cutoff of 2.0: 0.44004400440044
Dropped NaasN_mono as it was less than cutoff of 2.0: 1.3751375137513753
Dropped NaaO_mono as it was less t

Unnamed: 0,ABC_mono,ABCGG_mono,nAcid_mono,SpAbs_A_mono,SpMax_A_mono,SpDiam_A_mono,SpAD_A_mono,SpMAD_A_mono,LogEE_A_mono,VE1_A_mono,...,SRW09_solvent,SRW10_solvent,TSRW10_solvent,MW_solvent,AMW_solvent,WPath_solvent,WPol_solvent,Zagreb1_solvent,Zagreb2_solvent,mZagreb2_solvent
ABC_mono,1.0,0.983633,0.159541,0.996055,0.69069,0.708152,0.996055,0.545833,0.919648,0.922286,...,0.07994,-0.007934,0.017456,0.007311,0.028026,-0.032807,-0.012913,-0.005757,-0.001202,-0.026923
ABCGG_mono,0.983633,1.0,0.182558,0.974163,0.751393,0.773113,0.974163,0.544116,0.948323,0.939121,...,0.07276,-0.006647,0.016685,0.010711,0.031115,-0.029307,-0.007386,-0.002402,0.002304,-0.025221
nAcid_mono,0.159541,0.182558,1.0,0.141636,0.098055,0.108927,0.141636,-0.081603,0.168181,0.161569,...,0.009947,-0.055528,-0.051682,-0.036402,0.035019,-0.034969,-0.044232,-0.050108,-0.044894,-0.078967
SpAbs_A_mono,0.996055,0.974163,0.141636,1.0,0.664083,0.682496,1.0,0.575326,0.913324,0.919424,...,0.077995,-0.008642,0.015515,0.004386,0.028206,-0.037098,-0.019696,-0.010115,-0.006248,-0.028771
SpMax_A_mono,0.69069,0.751393,0.098055,0.664083,1.0,0.985247,0.664083,0.685188,0.858996,0.779217,...,0.029263,0.00223,0.013501,0.031239,0.045985,-0.000671,0.020267,0.018973,0.024251,-0.004013


In [28]:
# Full shape, shape after removing rows with any missing value, and the frame itself.
display(
	df_mordred_2d_norad.shape,
	df_mordred_2d_norad.dropna().shape,
	df_mordred_2d_norad,
)

(1818, 1611)

(1818, 1611)

Unnamed: 0,ABC_mono,ABCGG_mono,nAcid_mono,SpAbs_A_mono,SpMax_A_mono,SpDiam_A_mono,SpAD_A_mono,SpMAD_A_mono,LogEE_A_mono,VE1_A_mono,...,SRW09_solvent,SRW10_solvent,TSRW10_solvent,MW_solvent,AMW_solvent,WPath_solvent,WPol_solvent,Zagreb1_solvent,Zagreb2_solvent,mZagreb2_solvent
0,0.000000,0.000000,0,2.000000,1.00000,2.000000,2.000000,1.000000,1.407606,1.414214,...,0.0,8.124151,33.544698,93.057849,6.646989,42,5,30.0,31.0,1.666667
1,0.000000,0.000000,0,2.000000,1.00000,2.000000,2.000000,1.000000,1.407606,1.414214,...,0.0,6.188264,24.179697,58.041865,5.804186,9,0,12.0,9.0,1.000000
2,0.000000,0.000000,0,2.000000,1.00000,2.000000,2.000000,1.000000,1.407606,1.414214,...,0.0,7.627057,30.941317,78.046950,6.503913,27,3,24.0,24.0,1.500000
3,0.000000,0.000000,0,2.000000,1.00000,2.000000,2.000000,1.000000,1.407606,1.414214,...,0.0,7.625107,29.418928,151.875411,30.375082,16,0,20.0,16.0,1.000000
4,0.000000,0.000000,0,2.000000,1.00000,2.000000,2.000000,1.000000,1.407606,1.414214,...,0.0,7.627057,30.941317,84.093900,4.671883,27,3,24.0,24.0,1.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1813,7.140017,6.873356,0,12.857279,2.21432,4.428639,12.857279,1.285728,3.188884,2.963010,...,0.0,6.192362,25.583106,74.073165,4.938211,20,2,14.0,12.0,1.500000
1814,7.140017,6.873356,0,12.857279,2.21432,4.428639,12.857279,1.285728,3.188884,2.963010,...,0.0,5.509388,22.328143,62.036779,6.203678,10,1,10.0,8.0,1.250000
1815,7.140017,6.873356,0,12.857279,2.21432,4.428639,12.857279,1.285728,3.188884,2.963010,...,0.0,6.188264,24.179697,117.914383,23.582877,9,0,12.0,9.0,1.000000
1816,7.140017,6.873356,0,12.857279,2.21432,4.428639,12.857279,1.285728,3.188884,2.963010,...,0.0,6.192362,25.583106,74.073165,4.938211,20,2,14.0,12.0,1.500000


In [34]:
display(df_mordred_2d_norad.dropna().shape, df_mordred_2d_norad)

# Write the cleaned Mordred descriptors in the format selected by
# compression_type (CSV when uncompressed).
mordred_stem = f"atactic_NOPE_{file_suffix}_mordred_descriptors_norad_noempty"
# `or ''` keeps the literal text "None" out of the name when no compression is set.
mordred_fname = f"{mordred_stem}{compression_type or ''}"

if compression_type == '_blosc':
	# The directory is passed as a str ending in "/" because
	# write_compressed_pkl builds the target as file_dir + file_name.
	write_compressed_pkl(df_mordred_2d_norad, f"{pkl_path}/", mordred_fname + ".pkl")
elif compression_type == '_gzip':
	df_mordred_2d_norad.to_pickle(pkl_path/(f"{mordred_fname}.pkl"), compression={'method': 'gzip'})
else:
	df_mordred_2d_norad.to_csv(csv_path/(f"{mordred_stem}.csv"), index=False)


(1818, 1611)

Unnamed: 0,ABC_mono,ABCGG_mono,nAcid_mono,SpAbs_A_mono,SpMax_A_mono,SpDiam_A_mono,SpAD_A_mono,SpMAD_A_mono,LogEE_A_mono,VE1_A_mono,...,SRW09_solvent,SRW10_solvent,TSRW10_solvent,MW_solvent,AMW_solvent,WPath_solvent,WPol_solvent,Zagreb1_solvent,Zagreb2_solvent,mZagreb2_solvent
0,0.000000,0.000000,0,2.000000,1.00000,2.000000,2.000000,1.000000,1.407606,1.414214,...,0.0,8.124151,33.544698,93.057849,6.646989,42,5,30.0,31.0,1.666667
1,0.000000,0.000000,0,2.000000,1.00000,2.000000,2.000000,1.000000,1.407606,1.414214,...,0.0,6.188264,24.179697,58.041865,5.804186,9,0,12.0,9.0,1.000000
2,0.000000,0.000000,0,2.000000,1.00000,2.000000,2.000000,1.000000,1.407606,1.414214,...,0.0,7.627057,30.941317,78.046950,6.503913,27,3,24.0,24.0,1.500000
3,0.000000,0.000000,0,2.000000,1.00000,2.000000,2.000000,1.000000,1.407606,1.414214,...,0.0,7.625107,29.418928,151.875411,30.375082,16,0,20.0,16.0,1.000000
4,0.000000,0.000000,0,2.000000,1.00000,2.000000,2.000000,1.000000,1.407606,1.414214,...,0.0,7.627057,30.941317,84.093900,4.671883,27,3,24.0,24.0,1.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1813,7.140017,6.873356,0,12.857279,2.21432,4.428639,12.857279,1.285728,3.188884,2.963010,...,0.0,6.192362,25.583106,74.073165,4.938211,20,2,14.0,12.0,1.500000
1814,7.140017,6.873356,0,12.857279,2.21432,4.428639,12.857279,1.285728,3.188884,2.963010,...,0.0,5.509388,22.328143,62.036779,6.203678,10,1,10.0,8.0,1.250000
1815,7.140017,6.873356,0,12.857279,2.21432,4.428639,12.857279,1.285728,3.188884,2.963010,...,0.0,6.188264,24.179697,117.914383,23.582877,9,0,12.0,9.0,1.000000
1816,7.140017,6.873356,0,12.857279,2.21432,4.428639,12.857279,1.285728,3.188884,2.963010,...,0.0,6.192362,25.583106,74.073165,4.938211,20,2,14.0,12.0,1.500000


#### Gen RDKit Descriptors (2D)
- Generate for each smiles, use dict to add to df

In [42]:

import re


# Property names handed to rdMolDescriptors.Properties for every monomer.
mono_properties_subset = [
	'lipinskiHBA', 'NumHBA', 'lipinskiHBD', 'NumHBD',
	'NumRotatableBonds', 'NumHeteroatoms', 'NumAmideBonds', 'FractionCSP3',
	'NumRings', 'NumAromaticRings', 'NumAliphaticRings', 'NumSaturatedRings',
	'NumHeterocycles', 'NumSaturatedHeterocycles', 'NumAliphaticHeterocycles',
	'NumAtomStereoCenters',
	'tpsa',
	'chi0v', 'chi1v', 'chi2v', 'chi3v', 'chi4v',
	'kappa1', 'kappa2', 'kappa3',
]

# Solvents use the same properties, in the same order, except 'NumAtomStereoCenters'.
solvent_properties_subset = [
	prop_name
	for prop_name in mono_properties_subset
	if prop_name != 'NumAtomStereoCenters'
]


def generate_rdmol_descriptors(descriptor_list, smiles_lst):
	"""Compute ``rdMolDescriptors.Properties`` values for a list of SMILES strings.

	Args:
		descriptor_list: Property names passed to ``rdMolDescriptors.Properties``.
			They are also used, in this order, as the column labels of the result.
		smiles_lst: SMILES strings; each one yields (at most) one output row.

	Returns:
		pd.DataFrame with one column per entry of ``descriptor_list``. Rows are
		indexed by position in ``smiles_lst``. Any row containing NaN is dropped,
		which includes every SMILES that ``Chem.MolFromSmiles`` could not parse,
		so the returned index may have gaps.
	"""
	column_names = list(descriptor_list)
	properties = rdMolDescriptors.Properties(descriptor_list)

	# Parse everything up front so parse failures are reported in input order.
	mol_list = [Chem.MolFromSmiles(smi) for smi in smiles_lst]

	rows = []
	for i, entry in enumerate(mol_list):
		if entry is None:
			print("SMILES COULD NOT BE PARSED:",smiles_lst[i],entry,i)
			# A full-width NaN placeholder keeps the table rectangular even when
			# every SMILES fails; the row is removed by dropna() below.
			rows.append([np.nan] * len(column_names))
		else:
			# Store the values directly instead of (name, value) tuples, so no
			# per-cell unpacking (the deprecated DataFrame.applymap) is needed.
			rows.append(list(properties.ComputeProperties(entry)))

	descriptor_df = pd.DataFrame(rows, columns=column_names)
	return descriptor_df.dropna()

#* Monomer descriptors, computed from dfnr.mono_smiles
df_mono_descriptors_2d = generate_rdmol_descriptors(
	mono_properties_subset,
	list(dfnr.mono_smiles),
)

#* Solvent descriptors, computed from dfnr.solvent_smiles
df_solvent_descriptors_2d = generate_rdmol_descriptors(
	solvent_properties_subset,
	list(dfnr.solvent_smiles),
)

#* Index-aligned left join of monomer and solvent descriptors.
# Only column names present on both sides receive a suffix.
# NOTE(review): rsuffix lacks a leading underscore, so solvent columns are named
# e.g. 'tpsasolvent' while monomer columns are 'tpsa_mono'. Left unchanged here
# because saved files carry these column names -- confirm before renaming.
df_descriptors_2d = df_mono_descriptors_2d.join(
	df_solvent_descriptors_2d,
	how='left',
	lsuffix='_mono',
	rsuffix='solvent',
)
display(df_descriptors_2d)


Unnamed: 0,lipinskiHBA_mono,NumHBA_mono,lipinskiHBD_mono,NumHBD_mono,NumRotatableBonds_mono,NumHeteroatoms_mono,NumAmideBonds_mono,FractionCSP3_mono,NumRings_mono,NumAromaticRings_mono,...,NumAliphaticHeterocyclessolvent,tpsasolvent,chi0vsolvent,chi1vsolvent,chi2vsolvent,chi3vsolvent,chi4vsolvent,kappa1solvent,kappa2solvent,kappa3solvent
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,26.02,3.964102,2.199359,0.799572,0.799572,0.453039,4.186113,1.646196,0.929059
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,17.07,2.908248,1.204124,0.000000,0.000000,0.000000,3.670000,1.044532,6.883958
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.00,3.464102,2.000000,0.666667,0.666667,0.384900,3.411571,1.605769,0.582399
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.00,5.035574,2.267787,0.000000,0.000000,0.000000,6.160000,1.741848,38.291986
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.00,4.242641,3.000000,1.500000,1.500000,1.060660,4.166667,2.222222,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1813,1.0,1.0,0.0,0.0,2.0,3.0,0.0,0.142857,1.0,1.0,...,0.0,9.23,3.822462,1.991564,0.408248,0.408248,0.204124,4.960000,3.960000,3.960000
1814,1.0,1.0,0.0,0.0,2.0,3.0,0.0,0.142857,1.0,1.0,...,0.0,40.46,2.308641,1.132456,0.100000,0.100000,0.000000,3.920000,2.920000,1.920000
1815,1.0,1.0,0.0,0.0,2.0,3.0,0.0,0.142857,1.0,1.0,...,0.0,0.00,3.979031,1.963961,0.000000,0.000000,0.000000,4.870000,2.128398,13.259483
1816,1.0,1.0,0.0,0.0,2.0,3.0,0.0,0.142857,1.0,1.0,...,0.0,20.23,3.568534,2.023335,0.511667,0.511667,0.158114,4.960000,3.960000,3.960000


In [43]:
# (column, value) pairs returned by get_col_non_zero_percent, printed in ascending order of value.
non_zero_percents_atombd_2d = get_col_non_zero_percent(df_descriptors_2d, False, 1.0)
print(
	*sorted(non_zero_percents_atombd_2d, key=lambda name_and_pct: name_and_pct[1]),
	sep='\n',
)

# Each column label alongside the Python type of that label.
print(
	*zip(df_descriptors_2d.columns, map(type, df_descriptors_2d.columns)),
	sep='\n',
)

# First rows of the pairwise correlation matrix.
display(df_descriptors_2d.corr().head())

BEFORE NON ZERO DROPS (1818, 49)
AFTER NON ZERO DROPS (1818, 49)
('NumSaturatedHeterocycles_mono', 6.325632563256326)
('NumAmideBondssolvent', 7.5357535753575355)
('NumAliphaticHeterocycles_mono', 9.02090209020902)
('NumSaturatedRings_mono', 10.176017601760176)
('NumSaturatedHeterocyclessolvent', 11.276127612761275)
('NumAliphaticHeterocyclessolvent', 11.276127612761275)
('NumHeterocyclessolvent', 12.706270627062707)
('NumSaturatedRingssolvent', 13.586358635863585)
('NumHeterocycles_mono', 13.641364136413642)
('NumAtomStereoCenters', 13.641364136413642)
('NumAliphaticRingssolvent', 13.806380638063805)
('NumAmideBonds_mono', 14.466446644664465)
('NumAromaticRingssolvent', 18.316831683168317)
('NumAliphaticRings_mono', 19.03190319031903)
('NumRotatableBondssolvent', 24.03740374037404)
('NumHBDsolvent', 25.90759075907591)
('lipinskiHBD_mono', 28.492849284928496)
('lipinskiHBDsolvent', 29.977997799779978)
('NumHBD_mono', 30.033003300330037)
('NumRingssolvent', 31.95819581958196)
('NumAroma

Unnamed: 0,lipinskiHBA_mono,NumHBA_mono,lipinskiHBD_mono,NumHBD_mono,NumRotatableBonds_mono,NumHeteroatoms_mono,NumAmideBonds_mono,FractionCSP3_mono,NumRings_mono,NumAromaticRings_mono,...,NumAliphaticHeterocyclessolvent,tpsasolvent,chi0vsolvent,chi1vsolvent,chi2vsolvent,chi3vsolvent,chi4vsolvent,kappa1solvent,kappa2solvent,kappa3solvent
lipinskiHBA_mono,1.0,0.912366,0.53432,0.53085,0.437185,0.808442,0.393589,0.148881,0.230562,0.225185,...,0.022663,0.07346,-0.07754,-0.076116,-0.073777,-0.073777,-0.082747,-0.059983,-0.076329,0.021644
NumHBA_mono,0.912366,1.0,0.299596,0.337206,0.472438,0.783957,0.140713,0.125558,0.248605,0.223125,...,0.047182,0.001483,-0.070233,-0.068195,-0.067823,-0.067823,-0.070977,-0.074935,-0.055112,-0.00692
lipinskiHBD_mono,0.53432,0.299596,1.0,0.913211,0.264235,0.405831,0.633376,0.118535,0.066437,0.091518,...,-0.052364,0.203877,-0.064036,-0.061108,-0.050518,-0.050518,-0.063797,-0.006183,-0.106302,0.06641
NumHBD_mono,0.53085,0.337206,0.913211,1.0,0.259124,0.437405,0.574008,0.098503,0.096002,0.12102,...,-0.05184,0.145466,-0.071643,-0.074184,-0.079739,-0.079739,-0.086685,-0.024535,-0.099023,0.05634
NumRotatableBonds_mono,0.437185,0.472438,0.264235,0.259124,1.0,0.42019,0.144652,0.340455,0.03995,0.133793,...,0.046222,-0.068741,-0.027934,-0.02268,-0.036553,-0.036553,-0.043935,-0.034197,-0.029501,-0.023844


In [44]:
# Save the RDKit descriptor table in the format selected by compression_type.
# compression_type can be None (plain JSON output); fall back to an empty suffix so
# the file stem never contains the literal text "None".
compression_suffix = compression_type or ""
rdkit_fname = f"atactic_NOPE_{file_suffix}_atom_bd_2d_rdkit_norad{compression_suffix}"

if compression_type == '_blosc':
	# write_compressed_pkl opens `file_dir + file_name`, which raises TypeError for a
	# Path; hand it a string directory that already ends with a separator.
	write_compressed_pkl(df_descriptors_2d, f"{pkl_path}/", rdkit_fname + ".pkl")
elif compression_type == '_gzip':
	df_descriptors_2d.to_pickle(pkl_path/(f"{rdkit_fname}.pkl"), compression={'method': 'gzip'})
else:
	df_descriptors_2d.to_json(json_path/f"atactic_NOPE_{file_suffix}_atom_bd_2d_rdkit_norad.json")


### Combine Descriptors

##### Read Morgan/RDKit FP and Mordred From File

In [46]:
# Load the Mordred, Morgan-FP and RDKit-FP tables in the format selected by compression_type.
# compression_type can be None (plain CSV input); fall back to an empty suffix so
# the file stems never contain the literal text "None".
compression_suffix = compression_type or ""
mordred_fname = f"atactic_NOPE_{file_suffix}_mordred_descriptors_norad_noempty{compression_suffix}"
mfp_fname = f"atactic_NOPE_{file_suffix}_morgan_fp_{n_fp_bits}_norad{compression_suffix}"
rdfp_fname = f"atactic_NOPE_{file_suffix}_rdkit_fp_{n_fp_bits}_norad{compression_suffix}"

# NOTE: both pickle branches unpickle their input, which can execute arbitrary
# code -- only load files written by this project.
if compression_type == '_blosc':
	# read_compressed_pkl opens `file_dir + file_name`, which raises TypeError for a
	# Path; hand it a string directory that already ends with a separator.
	pkl_dir = f"{pkl_path}/"
	mordred_2d_norad = read_compressed_pkl(pkl_dir, mordred_fname + ".pkl")
	morgan_fp_df = read_compressed_pkl(pkl_dir, mfp_fname + ".pkl")
	rdkit_fp_df = read_compressed_pkl(pkl_dir, rdfp_fname + ".pkl")
elif compression_type == '_gzip':
	mordred_2d_norad = pd.read_pickle(pkl_path/(f"{mordred_fname}.pkl"), compression={'method': 'gzip'})
	morgan_fp_df = pd.read_pickle(pkl_path/(f"{mfp_fname}.pkl"), compression={'method': 'gzip'})
	rdkit_fp_df = pd.read_pickle(pkl_path/(f"{rdfp_fname}.pkl"), compression={'method': 'gzip'})
else:
	mordred_2d_norad = pd.read_csv(csv_path/(f"atactic_NOPE_{file_suffix}_mordred_descriptors_norad_noempty.csv"))
	morgan_fp_df = pd.read_csv(csv_path/(f"atactic_NOPE_{file_suffix}_morgan_fp_{n_fp_bits}_norad.csv"))
	rdkit_fp_df = pd.read_csv(csv_path/(f"atactic_NOPE_{file_suffix}_rdkit_fp_{n_fp_bits}_norad.csv"))
display(morgan_fp_df.shape,rdkit_fp_df.shape,mordred_2d_norad)

(1818, 65536)

(1818, 65536)

Unnamed: 0,ABC_mono,ABCGG_mono,nAcid_mono,SpAbs_A_mono,SpMax_A_mono,SpDiam_A_mono,SpAD_A_mono,SpMAD_A_mono,LogEE_A_mono,VE1_A_mono,...,SRW09_solvent,SRW10_solvent,TSRW10_solvent,MW_solvent,AMW_solvent,WPath_solvent,WPol_solvent,Zagreb1_solvent,Zagreb2_solvent,mZagreb2_solvent
0,0.000000,0.000000,0,2.000000,1.00000,2.000000,2.000000,1.000000,1.407606,1.414214,...,0.0,8.124151,33.544698,93.057849,6.646989,42,5,30.0,31.0,1.666667
1,0.000000,0.000000,0,2.000000,1.00000,2.000000,2.000000,1.000000,1.407606,1.414214,...,0.0,6.188264,24.179697,58.041865,5.804186,9,0,12.0,9.0,1.000000
2,0.000000,0.000000,0,2.000000,1.00000,2.000000,2.000000,1.000000,1.407606,1.414214,...,0.0,7.627057,30.941317,78.046950,6.503913,27,3,24.0,24.0,1.500000
3,0.000000,0.000000,0,2.000000,1.00000,2.000000,2.000000,1.000000,1.407606,1.414214,...,0.0,7.625107,29.418928,151.875411,30.375082,16,0,20.0,16.0,1.000000
4,0.000000,0.000000,0,2.000000,1.00000,2.000000,2.000000,1.000000,1.407606,1.414214,...,0.0,7.627057,30.941317,84.093900,4.671883,27,3,24.0,24.0,1.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1813,7.140017,6.873356,0,12.857279,2.21432,4.428639,12.857279,1.285728,3.188884,2.963010,...,0.0,6.192362,25.583106,74.073165,4.938211,20,2,14.0,12.0,1.500000
1814,7.140017,6.873356,0,12.857279,2.21432,4.428639,12.857279,1.285728,3.188884,2.963010,...,0.0,5.509388,22.328143,62.036779,6.203678,10,1,10.0,8.0,1.250000
1815,7.140017,6.873356,0,12.857279,2.21432,4.428639,12.857279,1.285728,3.188884,2.963010,...,0.0,6.188264,24.179697,117.914383,23.582877,9,0,12.0,9.0,1.000000
1816,7.140017,6.873356,0,12.857279,2.21432,4.428639,12.857279,1.285728,3.188884,2.963010,...,0.0,6.192362,25.583106,74.073165,4.938211,20,2,14.0,12.0,1.500000


##### Concatenate RD Descriptors with FPs and/or Mordred and/or MORFEUS
- Note that some MORFEUS/Mordred descriptors are redundant with rdkit descriptors.

In [47]:
# Append each feature table to the RDKit descriptor table (index-aligned left joins).
atom_bond_mordred_2d = df_descriptors_2d.join(mordred_2d_norad, how="left")
atom_bond_mfp = df_descriptors_2d.join(morgan_fp_df, how="left")
atom_bond_rdfp = df_descriptors_2d.join(rdkit_fp_df, how="left")

display(
	atom_bond_mordred_2d,
	atom_bond_mfp,
	atom_bond_rdfp,
)


Unnamed: 0,lipinskiHBA_mono,NumHBA_mono,lipinskiHBD_mono,NumHBD_mono,NumRotatableBonds_mono,NumHeteroatoms_mono,NumAmideBonds_mono,FractionCSP3_mono,NumRings_mono,NumAromaticRings_mono,...,SRW09_solvent,SRW10_solvent,TSRW10_solvent,MW_solvent,AMW_solvent,WPath_solvent,WPol_solvent,Zagreb1_solvent,Zagreb2_solvent,mZagreb2_solvent
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,8.124151,33.544698,93.057849,6.646989,42,5,30.0,31.0,1.666667
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,6.188264,24.179697,58.041865,5.804186,9,0,12.0,9.0,1.000000
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,7.627057,30.941317,78.046950,6.503913,27,3,24.0,24.0,1.500000
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,7.625107,29.418928,151.875411,30.375082,16,0,20.0,16.0,1.000000
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,7.627057,30.941317,84.093900,4.671883,27,3,24.0,24.0,1.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1813,1.0,1.0,0.0,0.0,2.0,3.0,0.0,0.142857,1.0,1.0,...,0.0,6.192362,25.583106,74.073165,4.938211,20,2,14.0,12.0,1.500000
1814,1.0,1.0,0.0,0.0,2.0,3.0,0.0,0.142857,1.0,1.0,...,0.0,5.509388,22.328143,62.036779,6.203678,10,1,10.0,8.0,1.250000
1815,1.0,1.0,0.0,0.0,2.0,3.0,0.0,0.142857,1.0,1.0,...,0.0,6.188264,24.179697,117.914383,23.582877,9,0,12.0,9.0,1.000000
1816,1.0,1.0,0.0,0.0,2.0,3.0,0.0,0.142857,1.0,1.0,...,0.0,6.192362,25.583106,74.073165,4.938211,20,2,14.0,12.0,1.500000


Unnamed: 0,lipinskiHBA_mono,NumHBA_mono,lipinskiHBD_mono,NumHBD_mono,NumRotatableBonds_mono,NumHeteroatoms_mono,NumAmideBonds_mono,FractionCSP3_mono,NumRings_mono,NumAromaticRings_mono,...,morgan_solvent_32758,morgan_solvent_32759,morgan_solvent_32760,morgan_solvent_32761,morgan_solvent_32762,morgan_solvent_32763,morgan_solvent_32764,morgan_solvent_32765,morgan_solvent_32766,morgan_solvent_32767
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0,0,1,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1813,1.0,1.0,0.0,0.0,2.0,3.0,0.0,0.142857,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
1814,1.0,1.0,0.0,0.0,2.0,3.0,0.0,0.142857,1.0,1.0,...,0,0,0,1,0,0,0,0,0,0
1815,1.0,1.0,0.0,0.0,2.0,3.0,0.0,0.142857,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
1816,1.0,1.0,0.0,0.0,2.0,3.0,0.0,0.142857,1.0,1.0,...,0,0,0,1,0,0,0,0,0,0


Unnamed: 0,lipinskiHBA_mono,NumHBA_mono,lipinskiHBD_mono,NumHBD_mono,NumRotatableBonds_mono,NumHeteroatoms_mono,NumAmideBonds_mono,FractionCSP3_mono,NumRings_mono,NumAromaticRings_mono,...,rdkit_solvent_32758,rdkit_solvent_32759,rdkit_solvent_32760,rdkit_solvent_32761,rdkit_solvent_32762,rdkit_solvent_32763,rdkit_solvent_32764,rdkit_solvent_32765,rdkit_solvent_32766,rdkit_solvent_32767
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1813,1.0,1.0,0.0,0.0,2.0,3.0,0.0,0.142857,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
1814,1.0,1.0,0.0,0.0,2.0,3.0,0.0,0.142857,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
1815,1.0,1.0,0.0,0.0,2.0,3.0,0.0,0.142857,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
1816,1.0,1.0,0.0,0.0,2.0,3.0,0.0,0.142857,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0


##### Checking for Null Values

In [48]:

# Every feature table produced so far, keyed by a short label used in the report below.
df_dict = {
	"atom_bd_2d":  df_descriptors_2d,
	"mordred_2d":  df_mordred_2d_norad,
	"mfp_2d":  morgan_fp_df,
	"rdfp_2d":  rdkit_fp_df,
	"rdkit_mordred_2d":  atom_bond_mordred_2d,
	"rdkit_mfp_2d":  atom_bond_mfp,
	"rdkit_rdfp_2d":  atom_bond_rdfp,
}

for df_name,df in df_dict.items():
	print(f"\n{df_name} with shape {df.shape}")

	# One boolean per column, in column order: True where the column holds any null.
	column_has_null = df.isna().any(axis=0).to_numpy()
	null_columns = [
		(j, colname)
		for j, (colname, is_null) in enumerate(zip(df.columns, column_has_null))
		if is_null
	]

	for j, colname in null_columns:
		print(j,colname)

	# Summarise over ALL columns; a flag overwritten per column would only
	# reflect the last column inspected.
	has_null = len(null_columns) > 0
	if not has_null:
		print(f"\tNo null values found for {df_name}")
	


atom_bd_2d with shape (1818, 49)
	No null values found for atom_bd_2d

mordred_2d with shape (1818, 1611)
	No null values found for mordred_2d

mfp_2d with shape (1818, 65536)
	No null values found for mfp_2d

rdfp_2d with shape (1818, 65536)
	No null values found for rdfp_2d

rdkit_mordred_2d with shape (1818, 1660)
	No null values found for rdkit_mordred_2d

rdkit_mfp_2d with shape (1818, 65585)
	No null values found for rdkit_mfp_2d

rdkit_rdfp_2d with shape (1818, 65585)
	No null values found for rdkit_rdfp_2d


### Remove Mordred Overlap with RDKit
Remove the Mordred descriptors that overlap with the RDKit descriptors (identified by a manual check of all ~1600 Mordred descriptors).

In [49]:
# Base names (without the '_mono' / '_solvent' suffix) of Mordred descriptors
# selected for removal because they overlap with RDKit descriptors.
mordred_removal_candidates_manual = [
	'nAromAtom', 'nAromBond', 'nHetero', 'nX', 'nBondsA',
	'C1SP3', 'C2SP3', 'C3SP3', 'C4SP3', 'FCSP3',
	'nHBAcc', 'nHBDon',
	'Kier1', 'LabuteASA',
	'nRing', 'n5Ring', 'n6Ring',
	'nHRing', 'n5HRing', 'n6HRing',
	'naRing', 'n6aRing',
	'naHRing', 'n6aHRing',
	'nARing', 'n5ARing', 'n6ARing',
	'nAHRing', 'n5AHRing',
	'nRot',
	'TopoPSA(NO)', 'TopoPSA',
]

In [50]:
# Toggle: remove the manually selected Mordred columns from the combined RDKit + Mordred table.
drop_overlap_mordred = True


if drop_overlap_mordred:
	orig_cols = list(atom_bond_mordred_2d.columns)
	drop_cols_mono = [x + '_mono' for x in mordred_removal_candidates_manual]
	# Candidates excluded from the solvent drop list.
	# NOTE(review): presumably these have no '_solvent' column in the Mordred table -- confirm.
	not_solv_descriptors = ['C3SP3', 'C4SP3', 'Kier1', 'naHRing', 'n6aHRing']
	drop_cols_solv = [x + '_solvent' for x in mordred_removal_candidates_manual if x not in not_solv_descriptors]

	# This cell overwrites atom_bond_mordred_2d, so on a re-run the columns are already
	# gone. Drop only what is present (and report the rest) instead of raising KeyError.
	requested_drops = drop_cols_mono + drop_cols_solv
	present_cols = set(atom_bond_mordred_2d.columns)
	cols_to_drop = [col for col in requested_drops if col in present_cols]
	cols_absent = [col for col in requested_drops if col not in present_cols]

	print('Before: ',atom_bond_mordred_2d.shape)
	atom_bond_mordred_2d = atom_bond_mordred_2d.drop(columns=cols_to_drop)
	print('After: ',atom_bond_mordred_2d.shape)
	if cols_absent:
		print(f"{len(cols_absent)} requested columns were not present (already dropped?):", cols_absent)
	print("DROPPED OVERLAPPING MORDRED DESCRIPTORS!")
else:
	print("Did NOT modify descriptors")
atom_bond_mordred_2d

Before:  (1818, 1660)
After:  (1818, 1601)
DROPPED OVERLAPPING MORDRED DESCRIPTORS!


Unnamed: 0,lipinskiHBA_mono,NumHBA_mono,lipinskiHBD_mono,NumHBD_mono,NumRotatableBonds_mono,NumHeteroatoms_mono,NumAmideBonds_mono,FractionCSP3_mono,NumRings_mono,NumAromaticRings_mono,...,SRW09_solvent,SRW10_solvent,TSRW10_solvent,MW_solvent,AMW_solvent,WPath_solvent,WPol_solvent,Zagreb1_solvent,Zagreb2_solvent,mZagreb2_solvent
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,8.124151,33.544698,93.057849,6.646989,42,5,30.0,31.0,1.666667
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,6.188264,24.179697,58.041865,5.804186,9,0,12.0,9.0,1.000000
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,7.627057,30.941317,78.046950,6.503913,27,3,24.0,24.0,1.500000
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,7.625107,29.418928,151.875411,30.375082,16,0,20.0,16.0,1.000000
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,7.627057,30.941317,84.093900,4.671883,27,3,24.0,24.0,1.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1813,1.0,1.0,0.0,0.0,2.0,3.0,0.0,0.142857,1.0,1.0,...,0.0,6.192362,25.583106,74.073165,4.938211,20,2,14.0,12.0,1.500000
1814,1.0,1.0,0.0,0.0,2.0,3.0,0.0,0.142857,1.0,1.0,...,0.0,5.509388,22.328143,62.036779,6.203678,10,1,10.0,8.0,1.250000
1815,1.0,1.0,0.0,0.0,2.0,3.0,0.0,0.142857,1.0,1.0,...,0.0,6.188264,24.179697,117.914383,23.582877,9,0,12.0,9.0,1.000000
1816,1.0,1.0,0.0,0.0,2.0,3.0,0.0,0.142857,1.0,1.0,...,0.0,6.192362,25.583106,74.073165,4.938211,20,2,14.0,12.0,1.500000


In [51]:
# Save the three combined tables in the format selected by compression_type.
# compression_type can be None (plain CSV output); fall back to an empty suffix so
# the file stems never contain the literal text "None".
# NOTE(review): the fingerprint size is hard-coded as 32768 here, while the cell that
# reads the fingerprint tables builds its names from n_fp_bits -- confirm they agree.
compression_suffix = compression_type or ""
atom_bond_mordred_2d_fname = f"atactic_NOPE_{file_suffix}_atom_bond_mordred_2d{compression_suffix}"
atom_bond_mfp_fname = f"atactic_NOPE_{file_suffix}_atom_bond_morganfp_32768_2d{compression_suffix}"
atom_bond_rdfp_fname = f"atactic_NOPE_{file_suffix}_atom_bond_rdkitfp_32768_2d{compression_suffix}"


if compression_type == '_blosc':
	# write_compressed_pkl opens `file_dir + file_name`, which raises TypeError for a
	# Path; hand it a string directory that already ends with a separator.
	pkl_dir = f"{pkl_path}/"
	write_compressed_pkl(atom_bond_mordred_2d, pkl_dir, atom_bond_mordred_2d_fname + ".pkl")
	write_compressed_pkl(atom_bond_mfp, pkl_dir, atom_bond_mfp_fname + ".pkl")
	write_compressed_pkl(atom_bond_rdfp, pkl_dir, atom_bond_rdfp_fname + ".pkl")
elif compression_type == '_gzip':
	atom_bond_mordred_2d.to_pickle(pkl_path/(f"{atom_bond_mordred_2d_fname}.pkl"), compression={'method': 'gzip'})
	atom_bond_mfp.to_pickle(pkl_path/(f"{atom_bond_mfp_fname}.pkl"), compression={'method': 'gzip'})
	atom_bond_rdfp.to_pickle(pkl_path/(f"{atom_bond_rdfp_fname}.pkl"), compression={'method': 'gzip'})
else:
	# NOTE(review): to_csv is called without index=False, so the row index is written as
	# an extra first column -- confirm that downstream readers expect it.
	atom_bond_mordred_2d.to_csv(csv_path/f"atactic_NOPE_{file_suffix}_atom_bond_mordred_2d.csv")
	atom_bond_mfp.to_csv(csv_path/f"atactic_NOPE_{file_suffix}_atom_bond_morganfp_32768_2d.csv")
	atom_bond_rdfp.to_csv(csv_path/f"atactic_NOPE_{file_suffix}_atom_bond_rdkitfp_32768_2d.csv")