In [1]:
import VASPsol as vs
import pandas as pd
import os
import numpy as np
import COSMO_TL as ctl
from dask.distributed import Client, LocalCluster, progress
import dask

In [2]:
# Start a local Dask cluster: 16 single-threaded workers, each limited to 2GB.
cluster = LocalCluster(
    n_workers=16,
    threads_per_worker=1,
    memory_limit='2GB',
)
# Client used by the following cells to submit tasks and gather their results.
client = Client(cluster)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 46315 instead


In [3]:
# Root of the benchmark tree that is scanned for run directories.
# Set the VASPSOL_BENCHMARK_DIR environment variable to point the notebook at
# another location without editing this cell; the default is the original path.
directory = os.environ.get(
    'VASPSOL_BENCHMARK_DIR',
    '/blue/hennig/ericfonseca/NASA/VASPsol/Truhlar_Benchmarks/',
)

In [17]:
# Find every run directory: any directory under `directory` (including
# `directory` itself) that directly contains a VAC sub-folder.

def get_run_dirs(subdir):
    """Return `subdir` if it contains a VAC sub-directory, otherwise None."""
    if os.path.isdir(os.path.join(subdir, 'VAC')):
        return subdir
    return None

# os.walk already reports the immediate sub-directories of each directory it
# visits, so the VAC check is done during the single walk instead of creating
# one dask task (and one extra filesystem lookup) per directory.
subdirs = []
run_dirs = []
for root, dirnames, _filenames in os.walk(directory):
    subdirs.append(root)
    if 'VAC' in dirnames:
        run_dirs.append(root)

# Show a small sample and the count instead of dumping every path.
run_dirs[:5], len(run_dirs)

KeyboardInterrupt: 

In [18]:
# Discard the None placeholders so only real run directories remain.
run_dirs = list(filter(lambda path: path is not None, run_dirs))
len(run_dirs)

4270

In [19]:
# Queue one vs.data task per run directory and keep the resulting futures.
vasp_data = list(
    client.submit(vs.data, path)
    for path in run_dirs
)
# Display a progress bar for the outstanding futures.
progress(vasp_data)

VBox()

In [20]:
# Collect the results of the submitted futures; `vasp_data` is rebound from
# the list of futures to the list of objects returned by vs.data.
vasp_data = client.gather(vasp_data) 

In [21]:
# Output file is named vaspsol_data_mm_dd_yyyy.csv using today's date.
date_tag = pd.Timestamp.now().strftime('%m_%d_%Y')
fname = 'vaspsol_data_' + date_tag + '.csv'

# Stack the per-run `ml_df` frames into one table and save it without the index.
frames = [result.ml_df for result in vasp_data]
df = pd.concat(frames)
df.to_csv(fname, index=False)

In [22]:

# Submit one ctl.gl.tr_sig lookup per distinct solute name and track progress.
sig_jobs = list(
    client.submit(ctl.gl.tr_sig, solute)
    for solute in df['SoluteName'].unique()
)
progress(sig_jobs)

fluorobenzene , not found in tr set
FLUOROBENZENE 1.0
FOUND :  FLUOROBENZENE


VBox()

In [23]:
# Collect the finished solute lookups; `sig_jobs` is rebound from futures to results.
sig_jobs = client.gather(sig_jobs)

# One entry per result, in the same order as `sig_jobs`. A usable result is
# read as result[0] (sigma profile), result[1][0] (area), result[1][1] (volume).
sigma_profiles = []
areas = []
volumes = []
for result in sig_jobs:
    # Read all three values BEFORE appending anything: if a later lookup fails,
    # no list has been extended yet, so the three lists always stay aligned.
    try:
        profile = result[0]
        area = result[1][0]
        volume = result[1][1]
    except (TypeError, LookupError):
        # The result is not indexable (e.g. None) or is too short / missing a
        # key: record NaN placeholders so the row can be dropped later.
        # Unrelated errors (and KeyboardInterrupt) are no longer swallowed.
        profile = area = volume = np.nan
    sigma_profiles.append(profile)
    areas.append(area)
    volumes.append(volume)
    

In [24]:
# One row per distinct solute; rows with a NaN placeholder are dropped.
solute_table = pd.DataFrame({
    'SoluteName': df['SoluteName'].unique(),
    'sigma_solute': sigma_profiles,
    'area_solute': areas,
    'volume_solute': volumes,
}).dropna()

# Spread each sigma profile over numbered columns: sigma_solute_0, sigma_solute_1, ...
profile_columns = pd.DataFrame(
    solute_table['sigma_solute'].tolist(),
    index=solute_table.index,
).add_prefix('sigma_solute_')

# Replace the packed profile column with the expanded ones.
sig_df = solute_table.join(profile_columns)
sig_df = sig_df.drop(columns=['sigma_solute'])
sig_df


Unnamed: 0,SoluteName,area_solute,volume_solute,sigma_solute_0,sigma_solute_1,sigma_solute_2,sigma_solute_3,sigma_solute_4,sigma_solute_5,sigma_solute_6,...,sigma_solute_41,sigma_solute_42,sigma_solute_43,sigma_solute_44,sigma_solute_45,sigma_solute_46,sigma_solute_47,sigma_solute_48,sigma_solute_49,sigma_solute_50
0,fluorobenzene,129.11164,118.58796,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
1,chloroethene,130.86805,120.08422,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
2,fluoromethane,73.04786,54.03435,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
3,p-hydroxybenzaldehyde,154.86425,148.18570,0.0,0.0,0.0,0.0,0.0,0.077145,0.817811,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
4,chlorofluoromethane,85.84896,68.19157,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
565,1-pentene,132.80415,119.25272,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
567,3-methylphenol,167.77043,163.60916,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
570,pentylacetate,193.44069,183.75686,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
571,"1,2-ethanediamine",124.74549,113.38439,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.28798,1.197013,1.403753,0.929919,0.527809,0.691103,0.278165,0.0,0.0,0.0


In [25]:
# Same lookup as for the solutes, this time for every distinct solvent name.
sol_jobs = list(
    client.submit(ctl.gl.tr_sig, solvent)
    for solvent in df['Solvent'].unique()
)
progress(sol_jobs)


VBox()

In [26]:
# Collect the finished solvent lookups; `sol_jobs` is rebound from the list of
# futures to the list of values returned by ctl.gl.tr_sig.
sol_jobs = client.gather(sol_jobs)


octane , not found in tr set
N-OCTANE 0.8571428571428571
dichloroethane , not found in tr set
FOUND :  N-OCTANE
carbontet , not found in tr set
ethylacetate , not found in tr set
perfluorobenzene , not found in tr set
nonanol , not found in tr set
CARBON-TETRAFLUORIDE 0.6206896551724138
isopropyltoluene , not found in tr set
pentanol , not found in tr set
ETHYL-ACETATE 0.96
2-NONANOL 0.875
tributylphosphate , not found in tr set
trimethylbenzene , not found in tr set
PERFLUOROBENZENE 1.0
hexanol , not found in tr set
DICHLOROMETHANE 0.9655172413793104
tetrachloroethene , not found in tr set
3-PENTANOL 0.8888888888888888
butanol , not found in tr set
DIISOPROPYL-KETONE 0.7647058823529411
2-HEXANOL 0.875
TRIETHYL-PHOSPHATE 0.8571428571428571
undecane , not found in tr set
FOUND :  ETHYL-ACETATE
FOUND :  2-NONANOL
N-BUTANOL 0.875
FOUND :  PERFLUOROBENZENE
ETHYLBENZENE 0.8571428571428571
FOUND :  DICHLOROMETHANE
FOUND :  3-PENTANOL
N-UNDECANE 0.8888888888888888
TETRACHLOROETHYLENE 0.944444

In [27]:
# One entry per solvent result, in the same order as `sol_jobs`. A usable
# result is read as result[0] (sigma profile), result[1][0] (area) and
# result[1][1] (volume).
sigma_profiles = []
areas = []
volumes = []
for result in sol_jobs:
    # Read all three values BEFORE appending anything: if a later lookup fails,
    # no list has been extended yet, so the three lists always stay aligned.
    try:
        profile = result[0]
        area = result[1][0]
        volume = result[1][1]
    except (TypeError, LookupError):
        # The result is not indexable (e.g. None) or is too short / missing a
        # key: record NaN placeholders so the row is removed by dropna() below.
        # Unrelated errors (and KeyboardInterrupt) are no longer swallowed.
        profile = area = volume = np.nan
    sigma_profiles.append(profile)
    areas.append(area)
    volumes.append(volume)

# One row per distinct solvent; rows with a NaN placeholder are dropped.
sol_df = pd.DataFrame({'Solvent': df['Solvent'].unique(),
                            'sigma_solvent': sigma_profiles,
                            'area_solvent': areas,
                            'volume_solvent': volumes})
sol_df = sol_df.dropna()
# Expand sigma_solvent into numbered columns: sigma_solvent_0, sigma_solvent_1, ...
sol_df = sol_df.join(pd.DataFrame(sol_df['sigma_solvent'].tolist(), index=sol_df.index).add_prefix('sigma_solvent_'))
sol_df = sol_df.drop(columns=['sigma_solvent'])
sol_df

Unnamed: 0,Solvent,area_solvent,volume_solvent,sigma_solvent_0,sigma_solvent_1,sigma_solvent_2,sigma_solvent_3,sigma_solvent_4,sigma_solvent_5,sigma_solvent_6,...,sigma_solvent_41,sigma_solvent_42,sigma_solvent_43,sigma_solvent_44,sigma_solvent_45,sigma_solvent_46,sigma_solvent_47,sigma_solvent_48,sigma_solvent_49,sigma_solvent_50
0,water,43.26923,25.73454,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.053392,0.080147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,hexadecane,356.86514,364.78139,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,octanol,205.69824,201.77709,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.193256,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,dimethylsulfoxide,111.55473,98.66647,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.680617,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,methanol,67.90163,48.77104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.048332,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,isopropanol,108.38848,92.10607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.381600,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
92,mcresol,150.16772,142.60401,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
93,bromoethane,108.94110,95.72774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
94,butylbenzene,197.69481,196.40914,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
# Attach the solute and solvent descriptors to every run, then filter.
# Built as a single chain so each step returns a new frame: the original
# assigned a column on a boolean-filtered frame, which is the pattern pandas
# flags with SettingWithCopyWarning.
df2 = (
    pd.merge(df, sig_df, on='SoluteName')            # inner join: solutes with descriptors
    .merge(sol_df, on='Solvent')                     # inner join: solvents with descriptors
    .loc[lambda frame: frame['Charge'] == 0]         # keep rows whose Charge is 0
    .assign(error=lambda frame: frame['error'].abs())  # store the error as a magnitude
    .loc[lambda frame: frame['error'] < 10]          # drop rows with |error| >= 10
    .dropna()
    .reset_index(drop=True)
)

# Make sure the output folder exists before writing, so the cell does not fail
# at the very last step.
os.makedirs('../data', exist_ok=True)
df2.to_csv('../data/'+fname)
df2

Unnamed: 0,Solvent,FileHandle,directory,NC_K,SIGMA_K,TAU,Solvation_Energy,Total_Energy,No.,SoluteName,...,sigma_solvent_41,sigma_solvent_42,sigma_solvent_43,sigma_solvent_44,sigma_solvent_45,sigma_solvent_46,sigma_solvent_47,sigma_solvent_48,sigma_solvent_49,sigma_solvent_50
0,water,0157flu,/blue/hennig/ericfonseca/NASA/VASPsol/Truhlar_...,0.0010,0.200000,0.000670,0.073867,-76.193434,2240,fluorobenzene,...,2.053392,0.080147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,water,0157flu,/blue/hennig/ericfonseca/NASA/VASPsol/Truhlar_...,0.0020,0.866667,0.000010,-0.200151,-76.467452,2240,fluorobenzene,...,2.053392,0.080147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,water,0157flu,/blue/hennig/ericfonseca/NASA/VASPsol/Truhlar_...,0.0010,1.200000,0.001000,-0.042744,-76.310045,2240,fluorobenzene,...,2.053392,0.080147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,water,0157flu,/blue/hennig/ericfonseca/NASA/VASPsol/Truhlar_...,0.0010,0.200000,0.001000,0.152467,-76.114834,2240,fluorobenzene,...,2.053392,0.080147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,water,0157flu,/blue/hennig/ericfonseca/NASA/VASPsol/Truhlar_...,0.0030,0.533333,0.000340,-0.097007,-76.364308,2240,fluorobenzene,...,2.053392,0.080147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7906,mcresol,0036tol,/blue/hennig/ericfonseca/NASA/VASPsol/Truhlar_...,0.0025,0.600000,0.000525,0.006012,-92.653357,1438,toluene,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7907,bromoethane,0036tol,/blue/hennig/ericfonseca/NASA/VASPsol/Truhlar_...,0.0025,0.600000,0.000525,0.015738,-92.643631,181,toluene,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7908,butylbenzene,0078pen,/blue/hennig/ericfonseca/NASA/VASPsol/Truhlar_...,0.0025,0.600000,0.000525,0.020910,-89.013866,261,2-pentanone,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7909,secbutanol,0069met,/blue/hennig/ericfonseca/NASA/VASPsol/Truhlar_...,0.0025,0.600000,0.000525,-0.114669,-22.252249,1934,formaldehyde,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
def _feature_matrix(df, kind):
    """Build the feature matrix for one species (`kind` is 'solute' or 'solvent').

    Columns, in order: volume_<kind>, area_<kind>, NC_K, SIGMA_K, TAU, followed
    by every column of `df` whose name contains 'sigma_<kind>' (in the frame's
    own column order).

    Returns a 2-D numpy array with one row per row of `df`. An empty frame
    yields an array with zero rows (the previous `reshape(len(df), -1)` could
    not infer the column count in that case and raised).
    """
    scalar_cols = ['volume_' + kind, 'area_' + kind, 'NC_K', 'SIGMA_K', 'TAU']
    marker = 'sigma_' + kind
    sig_cols = [col for col in df.columns if marker in col]
    scalars = df[scalar_cols].to_numpy()
    sigs = df[sig_cols].to_numpy()
    return np.column_stack((scalars, sigs))


def get_X_solute(df):
    """Return the solute features of `df`: volume, area, NC_K, SIGMA_K, TAU, sigma_solute_* columns."""
    return _feature_matrix(df, 'solute')


def get_X_solvent(df):
    """Return the solvent features of `df`: volume, area, NC_K, SIGMA_K, TAU, sigma_solvent_* columns."""
    return _feature_matrix(df, 'solvent')


def get_X(df):
    """Return the solute features followed by the solvent features, side by side.

    NC_K, SIGMA_K and TAU appear in both halves, exactly as in the two
    per-species matrices.
    """
    X_solute = get_X_solute(df)
    X_solvent = get_X_solvent(df)
    return np.column_stack((X_solute, X_solvent))
# Build the solute-only, solvent-only and combined feature matrices from the
# filtered frame, then display their shapes.
X_solute, X_solvent = get_X_solute(df2), get_X_solvent(df2)
X = get_X(df2)
tuple(matrix.shape for matrix in (X_solute, X_solvent, X))

((7911, 56), (7911, 56), (7911, 112))

In [None]:
# Shut down in the reverse order of creation: disconnect the client first,
# then stop the cluster it was connected to, so the client is never left
# talking to a scheduler that has already gone away.
client.close()
cluster.close()
