In [1]:
import VASPsol as vs
import pandas as pd
import os
import numpy as np
import COSMO_TL as ctl
from dask.distributed import Client, LocalCluster, progress
import dask

In [2]:

def get_X_solute(df):
    X = df[['volume_solute', 'area_solute', 'NC_K', 'SIGMA_K','TAU', 'default_error']]
    sig_cols = [col for col in df.columns if 'sigma_solute' in col]
    sigs = df[sig_cols].to_numpy()
    X = X.to_numpy().reshape(len(df), -1)
    X = np.column_stack((X, sigs))
    return X

def get_X_solvent(df):
    X = df[['volume_solvent', 'area_solvent']]
    solvent_props_names = ['eps', 'n', 'alpha', 'beta', 'gamma', 'phi**2', 'psi**2', 'beta**2', 'NC_K','SIGMA_K','TAU', 'default_error']
    solvent_props = df[solvent_props_names].to_numpy()
    X = np.column_stack((X, solvent_props)) 
    sig_cols = [col for col in df.columns if 'sigma_solvent' in col]
    sigs = df[sig_cols].to_numpy()
    X = X.to_numpy().reshape(len(df), -1)
    X = np.column_stack((X, sigs))
    return X

def get_X(df):
    X_solute = get_X_solute(df)
    # use everything but the last 4 columns
    X_solvent = get_X_solvent(df)[:,:-4]
    X = np.column_stack((X_solute, X_solvent))
    return X
# given a dataframe return a dataframe with the mean value for all columns with error in the name
# grouped by SoluteName
def get_mean_df(df):
    df2 = df.copy()
    original_columns = list(df2.columns)
    print(original_columns)
    cols = [col for col in df2.columns if 'error' in col]
    original_columns = [col for col in original_columns if col not in cols]
    df3 = df2.groupby(['SoluteName', 'NC_K','SIGMA_K','TAU'])[cols].mean()
    df3 = df3.reset_index()
    # return a dataframe with the unique values in SoluteName and the mean values for all columns with error in the name
    # get all the other colums  from the original dataframe
    df4 = df2[original_columns]
    df4 = df4.reset_index(drop=True)
    df5 = pd.merge(df4, df3, on=['SoluteName', 'NC_K','SIGMA_K','TAU'])
    df5 = df5.drop_duplicates()
    return df5

def rejection_sampling(vector, samples=100):
    indicies = np.arange(len(vector))
    # fit a gaussian kernel to the data
    from sklearn.neighbors import KernelDensity
    kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(vector.reshape(-1, 1))
    # create a vector of each points probability
    logprob = kde.score_samples(vector.reshape(-1, 1))
    p = np.exp(logprob)
    # normalize the probabilities
    p = p / p.sum()

    output = []
    for sample in range(samples):
        # use the probability to sample the data
        sample = np.random.choice(indicies, p=p)
        # make sure the sample is not already in the vector
        while sample in output:
            sample = np.random.choice(indicies, p=p)
        # add the sample to the vector
        output.append(sample)
    return output

In [3]:

df = pd.read_csv('../data/vaspsol_data_3_2_2023.csv')
print(len(df))
df['error'] = df['error'].abs()
df = df[df['error'] < 10]
#df = df[df['Solvent'] == 'water']
df = df[df['Charge'] == 0]
NC_K_default = 0.0025
SIGMA_K_default = 0.6
TAU_default = 0.000525
default_df = df[(df['NC_K'] == NC_K_default) & (df['SIGMA_K'] == SIGMA_K_default) & (df['TAU'] == TAU_default)]
default_df


df_to_append = default_df[['SoluteName','error']]
# rename error to default_error
df_to_append = df_to_append.rename(columns={'error': 'default_error'})
df_to_append


# match up the default error back to the original dataframe
df = pd.merge(df, df_to_append, on=['SoluteName'])
# this expanded the number of rows in the dataframe. This is not what we want
df = df.drop_duplicates('Unnamed: 0')

groups = df[df['Solvent']=='water'].groupby(['NC_K', 'SIGMA_K', 'TAU'])
# get the number of unique groups
# using the groups split of the dataframe so that unique combos of NC_K, SIGMA_K, and TAU are in each group
split = 0.99
# get the unique groups
groups = df.groupby(['NC_K', 'SIGMA_K', 'TAU'])
# get the indicies of the groups
indicies = [np.array(i) for i in groups.indices.values()]
# get the number of groups
num_groups = len(indicies)
# get the number of groups to use for training
num_train_groups = int(num_groups*split)
# get the indicies of the groups to use for training
train_indicies = np.random.choice(indicies, size=num_train_groups, replace=False)
train_indicies = np.concatenate(train_indicies.flatten())
# get the indicies of the groups to use for testing
test_indicies = np.array([i for i in np.concatenate(indicies) if i not in train_indicies])
train_df = df.iloc[train_indicies]
test_df = df.iloc[test_indicies]

X_train = get_X_solute(train_df)
X_test = get_X_solute(test_df)
y_train = train_df['error'].to_numpy()
y_test = test_df['error'].to_numpy()


# print out the shape of the training data and the training labels. Nice retro looking print statment
print(f'X_train shape: {X_train.shape}, y_train shape: {y_train.shape}')
print(f'X_test shape: {X_test.shape}, y_test shape: {y_test.shape}')
n_observations_train = X_train.shape[0]
n_features_train = X_train.shape[1]
n_observations_test = X_test.shape[0]
n_features_test = X_test.shape[1]

print('TRAINING SET DETAILS')
print(f'Number of observations: {n_observations_train}')
print(f'Number of features: {n_features_train}')

print('TESTING SET DETAILS')
print(f'Number of observations: {n_observations_test}')
print(f'Number of features: {n_features_test}')

6539
X_train shape: (6500, 57), y_train shape: (6500,)
X_test shape: (39, 57), y_test shape: (39,)
TRAINING SET DETAILS
Number of observations: 6500
Number of features: 57
TESTING SET DETAILS
Number of observations: 39
Number of features: 57


  train_indicies = np.random.choice(indicies, size=num_train_groups, replace=False)


In [4]:
# lets use oversampling to balance the data
# https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.SMOTE.html
n = 10
# get the counts for each NC_K, SIGMA_K, and TAU combo
# this is the number of observations for each group
counts = df.groupby(['NC_K', 'SIGMA_K', 'TAU']).size()
# get the max number of observations for any group
max_count = counts.max()
# reset df so that only groups with n members are included
df = df.groupby(['NC_K', 'SIGMA_K', 'TAU']).filter(lambda x: len(x) >= n)
# copy samples of the groups with less than the max number of observations
# until the number of observations for each group is equal to the max number of observations
# this will balance the data
df2 = df.groupby(['NC_K', 'SIGMA_K', 'TAU']).apply(lambda x: x.sample(max_count, replace=True))
# relabel the NC_K, SIGMA_K, and TAU columns to index_0, index_1, and index_2
# this is because the groupby function adds a new column for the groupby columns
df2 = df2.rename(columns={'NC_K': 'index_0', 'SIGMA_K': 'index_1', 'TAU': 'index_2'})
df2, df2['Solvent'].unique().shape

(                                  Unnamed: 0 Solvent FileHandle   index_0  \
 NC_K     SIGMA_K  TAU                                                       
 0.000875 1.000000 0.000794 10483        1488   water    0044met  0.000875   
                            74750        4942   water    0086eth  0.000875   
                            68952        4021   water    0053phe  0.000875   
                            10483        1488   water    0044met  0.000875   
                            10483        1488   water    0044met  0.000875   
 ...                                      ...     ...        ...       ...   
 0.004000 0.822222 0.000525 71192        4301   water    0053phe  0.004000   
                            13459        1860   water    0044met  0.004000   
                            57595        3639   water    0036tol  0.004000   
                            57595        3639   water    0036tol  0.004000   
                            41300        3094   water    0506nit

In [5]:
# now grab 100 samples from each group and remake the dataframe
# this will make the data more balanced
df3 = df2.groupby(['NC_K', 'SIGMA_K', 'TAU']).apply(lambda x: x.sample(25, replace=False))
# make a normal dataframe from the groupby object
df3 = df3.reset_index(drop=True)
df3['NC_K'] = df3['index_0']
df3['SIGMA_K'] = df3['index_1']
df3['TAU'] = df3['index_2']
df3

# lets make df4 that uses rejection sampling to balance the data
# this will be used to compare the results of the oversampling
df_temp = df2
# make NC_K, SIGMA_K, and TAU columns that correspond
#df2 = df2.rename(columns={'index_0': 'NC_K', 'index_1': 'SIGMA_K', 'index_2': 'TAU'})
# extract a dataframe for each group
# break up the dataframe into multiple dataframes each of which contains only one group
# this is done by using the groupby function
# use the index of the groupby object to get the indicies of the rows in the original dataframe
# that correspond to the group

dfs = []
for i in df_temp.groupby(['NC_K', 'SIGMA_K', 'TAU']).indices.values():
    dfs.append(df_temp.iloc[i])
# for each group use rejection sampling to select 100 samples
for i in range(len(dfs)):
    indicies = rejection_sampling(dfs[i]['error'].to_numpy(), 200)
dfs = [dfs[i].iloc[indicies] for i in range(len(dfs))]
df3 = pd.concat(dfs)
df3

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 0,Solvent,FileHandle,index_0,index_1,index_2,Solvation_Energy,Total_Energy,No.,SoluteName,...,sigma_solvent_42,sigma_solvent_43,sigma_solvent_44,sigma_solvent_45,sigma_solvent_46,sigma_solvent_47,sigma_solvent_48,sigma_solvent_49,sigma_solvent_50,default_error
NC_K,SIGMA_K,TAU,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
0.000875,1.000000,0.000794,68952,4021,water,0053phe,0.000875,1.000000,0.000794,-0.143641,-82.828741,2142,phenol,...,0.080147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.902688
0.000875,1.000000,0.000794,77467,5416,water,0217wat,0.000875,1.000000,0.000794,-0.249512,-14.477122,2285,water,...,0.080147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.896919
0.000875,1.000000,0.000794,17626,2216,water,0018cyc,0.000875,1.000000,0.000794,0.131978,-98.980835,2108,cyclohexane,...,0.080147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.040865
0.000875,1.000000,0.000794,68952,4021,water,0053phe,0.000875,1.000000,0.000794,-0.143641,-82.828741,2142,phenol,...,0.080147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.902688
0.000875,1.000000,0.000794,77467,5416,water,0217wat,0.000875,1.000000,0.000794,-0.249512,-14.477122,2285,water,...,0.080147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.896919
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0.004000,0.822222,0.000525,74179,4834,water,0062dio,0.004000,0.822222,0.000525,-0.443380,-78.639905,2150,"1,4-dioxane",...,0.080147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.469889
0.004000,0.822222,0.000525,57595,3639,water,0036tol,0.004000,0.822222,0.000525,-0.271411,-92.930785,2126,toluene,...,0.080147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.119705
0.004000,0.822222,0.000525,71192,4301,water,0053phe,0.004000,0.822222,0.000525,-0.631490,-83.316686,2142,phenol,...,0.080147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.902688
0.004000,0.822222,0.000525,57595,3639,water,0036tol,0.004000,0.822222,0.000525,-0.271411,-92.930785,2126,toluene,...,0.080147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.119705


In [7]:
# make NC_K, SIGMA_K and TAU columns instead of index. Right now NC_K, SIGMA_K, and TAU are a multi-level index 
# this is because the groupby function adds a new column for the groupby columns

NC_K = df3.index.get_level_values('NC_K')
SIGMA_K = df3.index.get_level_values('SIGMA_K')
TAU = df3.index.get_level_values('TAU')

# remove all the multi-level index columns
df4 = df3.reset_index(drop=True)
df4['NC_K'] = NC_K
df4['SIGMA_K'] = SIGMA_K
df4['TAU'] = TAU
df4

Unnamed: 0.1,Unnamed: 0,Solvent,FileHandle,index_0,index_1,index_2,Solvation_Energy,Total_Energy,No.,SoluteName,...,sigma_solvent_45,sigma_solvent_46,sigma_solvent_47,sigma_solvent_48,sigma_solvent_49,sigma_solvent_50,default_error,NC_K,SIGMA_K,TAU
0,714,water,0008noc,0.000875,1.000000,0.000794,0.187200,-139.507280,2100,n-octane,...,0.0,0.0,0.0,0.0,0.0,0.0,1.255968,0.000875,1.000000,0.000794
1,4021,water,0053phe,0.000875,1.000000,0.000794,-0.143641,-82.828741,2142,phenol,...,0.0,0.0,0.0,0.0,0.0,0.0,0.902688,0.000875,1.000000,0.000794
2,4021,water,0053phe,0.000875,1.000000,0.000794,-0.143641,-82.828741,2142,phenol,...,0.0,0.0,0.0,0.0,0.0,0.0,0.902688,0.000875,1.000000,0.000794
3,2216,water,0018cyc,0.000875,1.000000,0.000794,0.131978,-98.980835,2108,cyclohexane,...,0.0,0.0,0.0,0.0,0.0,0.0,0.040865,0.000875,1.000000,0.000794
4,5416,water,0217wat,0.000875,1.000000,0.000794,-0.249512,-14.477122,2285,water,...,0.0,0.0,0.0,0.0,0.0,0.0,0.896919,0.000875,1.000000,0.000794
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36195,4834,water,0062dio,0.004000,0.822222,0.000525,-0.443380,-78.639905,2150,"1,4-dioxane",...,0.0,0.0,0.0,0.0,0.0,0.0,1.469889,0.004000,0.822222,0.000525
36196,865,water,0008noc,0.004000,0.822222,0.000525,-0.188190,-139.882710,2100,n-octane,...,0.0,0.0,0.0,0.0,0.0,0.0,1.255968,0.004000,0.822222,0.000525
36197,3639,water,0036tol,0.004000,0.822222,0.000525,-0.271411,-92.930785,2126,toluene,...,0.0,0.0,0.0,0.0,0.0,0.0,0.119705,0.004000,0.822222,0.000525
36198,865,water,0008noc,0.004000,0.822222,0.000525,-0.188190,-139.882710,2100,n-octane,...,0.0,0.0,0.0,0.0,0.0,0.0,1.255968,0.004000,0.822222,0.000525


In [8]:
import dask.dataframe as dd
ddf = dd.from_pandas(df4, npartitions=100)
ddf.to_parquet('../data/vaspsol_data_3_2_2023_balanced/')