In [1]:
import os
import sys
import pickle
import pandas as pd
import numpy as np
import math
import json

# Move the working directory to the repository root so that the relative paths
# used below ('data/...', 'prec_rec/data/...') and the `data` package resolve.
# The root is taken as everything before the first occurrence of 'prec_rec' in
# the current working directory; if 'prec_rec' does not occur, the cwd is kept.
parent_path = str(os.getcwd()).split('prec_rec')[0] # e.g. .../zeosyn_gen/ (see the printed path below)
os.chdir(parent_path)
print('Switched directory to:', os.getcwd())

# Must run after the chdir above and before the dataset pickle is loaded.
import data.utils as utils
# Register `data.utils` under the top-level name 'utils' as well.
# NOTE(review): presumably the pickled ZeoSynGen dataset refers to its module as
# 'utils', so unpickling needs this alias — confirm against data/utils.py.
# Background: https://stackoverflow.com/questions/2121874/python-pickling-after-changing-a-modules-directory
sys.modules['utils'] = utils

# Mapping from raw precursor names to cleaned metadata; get_precursor_dataset
# reads the 'common name' and 'class(es)' keys of each entry.
from precursors_clean import prec_dict_clean

Switched directory to: /home/jupyter/Elton/Zeolites/zeosyn_gen


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pickled dataset object (path is relative to the repository root set above).
with open(f'data/ZeoSynGen_dataset.pkl', 'rb') as dataset_file:
    dataset = pickle.load(dataset_file)

# Split into train / val / test with a fixed random_state so the split is repeatable.
# Each split is later indexed positionally by get_precursor_dataset.
train_dataset, val_dataset, test_dataset = dataset.train_val_test_split(
    mode='system',
    both_graph_feat_present=True,
    random_state=0,
    return_dataframe=True,
)

SYSTEMS:
train+val: 1856 test: 464

n_datapoints:
train: 14749 val: 2107 test: 5168


In [3]:
def get_precursor_dataset(split, save=False):
    '''
    Returns a dataframe of precursors for a given split of the dataset.

    For every datapoint the raw precursor string (precursors delimited by ', ')
    is mapped to cleaned common names via `prec_dict_clean`, de-duplicated, and
    re-ordered by element class (heteroatoms first, then cations, then anions).
    Precursors whose 'class(es)' match none of the ordered classes are left out
    of the resulting string. Rows whose zeolite is 'Dense/Amorphous' and rows
    without a usable precursor string ('nan' or '0') are dropped.

    The module-level split datasets are only read, never modified, so the
    function can be called repeatedly for the same split with identical results.

    Args:
        split: str, 'train', 'val', or 'test'
        save: bool, if True the dataframe is also written to
            'prec_rec/data/prec_dataset_{split}.csv' (directory is created if missing)

    Returns:
        prec_df: pd.DataFrame, columns: ['zeo', 'osda', 'precs',  <zeolite features> ..., <syn_ features> ..., <osda_ features> ...]
    '''

    assert split in ['train', 'val', 'test']

    # Pick the requested split lazily so only that global needs to exist.
    if split == 'train':
        split_data = train_dataset
    elif split == 'val':
        split_data = val_dataset
    else:
        split_data = test_dataset

    # Positional fields of a split used here.
    syn_ratio, precs, zeo_code, zeo, osda_smiles, osda = (split_data[i] for i in (1, 2, 3, 5, 13, 15))

    # Output ordering of precursors: heteroatoms first, then cations, then anions.
    elem_order = ['Si', 'Al', 'P', 'Ge', 'B', 'Na', 'K', 'OH', 'F', 'H2O']

    precs_list = [] # final target strings for each datapoint (np.nan where invalid)
    for p_set in precs: # p_set = precursor set delimited by ','
        p_set = str(p_set)

        # Missing precursor information is encoded as NaN or 0 in the raw data.
        if p_set in ('nan', '0'):
            precs_list.append(np.nan)
            continue

        ps = [p.strip() for p in p_set.split(', ')]

        # dict of common names as keys, and metadata as values (collapses aliases of the same precursor)
        ps_clean = {prec_dict_clean[p]['common name']: prec_dict_clean[p] for p in ps if p != ''}

        ps_clean_ordered = []
        for e in elem_order:
            # Materialise the matches first: ps_clean is mutated inside the inner loop.
            ps_w_elem = [p for (p, meta) in ps_clean.items() if e in meta['class(es)']]
            for p in ps_w_elem:
                ps_clean_ordered.append(p)
                ps_clean.pop(p) # remove from dictionary to avoid duplicates in same string

        precs_list.append(", ".join(ps_clean_ordered)) # one delimited string per datapoint

    assert len(zeo_code) == len(zeo) == len(osda_smiles) == len(osda) == len(precs_list)

    # add_prefix returns new frames; assigning to `.columns` would rename the
    # shared split frames in place and double the prefix on every later call.
    osda_feats = osda.add_prefix('osda_')
    syn_feats = syn_ratio.add_prefix('syn_')

    prec_df = pd.concat([pd.DataFrame({'zeo': zeo_code, 'osda': osda_smiles, 'precs': precs_list}), zeo, syn_feats, osda_feats], axis=1)
    prec_df = prec_df[prec_df['zeo'] != 'Dense/Amorphous']
    prec_df = prec_df.dropna(subset=['precs'])

    if save:
        os.makedirs('prec_rec/data', exist_ok=True) # to_csv does not create missing directories
        prec_df.to_csv(f'prec_rec/data/prec_dataset_{split}.csv', index=False)

    return prec_df

In [4]:
# Build the train-split precursor table; save=True also writes prec_rec/data/prec_dataset_train.csv.
# The returned DataFrame is displayed as the cell output.
get_precursor_dataset('train', save=True)

Unnamed: 0,zeo,osda,precs,zeo_num_atoms,zeo_a,zeo_b,zeo_c,zeo_alpha,zeo_beta,zeo_gamma,...,osda_free_sasa_mean_0,osda_mol_weight,osda_npr1_mean_0,osda_npr2_mean_0,osda_num_rot_bonds_mean_0,osda_pmi1_mean_0,osda_pmi2_mean_0,osda_pmi3_mean_0,osda_spherocity_index_mean_0,osda_volume_mean_0
0,CHA,Cn1ccnc1,"aluminum isobutoxide, phosphoric acid, hydrofl...",-0.148220,0.247695,0.296209,0.394198,0.623268,0.585498,1.086546,...,-0.476287,-0.614002,-0.192417,0.024587,-0.738041,-0.516072,-0.283353,-0.291016,-1.021658,-0.734189
1,GME,OSDA-free,"sodium silicate, sodium aluminate",-0.365307,0.250264,0.298850,-0.136598,0.623269,0.585497,1.086546,...,-1.761791,-1.460152,-1.298974,-2.106378,-0.738041,-0.610035,-0.313704,-0.331597,-1.428735,-1.493623
2,BEA,CC[N+](CC)(CC)CC,"silica sol, aluminum nitrate, lithium hydroxide",0.358314,0.120121,0.165025,1.658267,0.623112,0.585670,0.426950,...,0.060213,-0.117798,1.267251,0.292085,0.290843,-0.063275,-0.229249,-0.229172,1.115007,0.006052
4,MFI,CCCC[N+](CC)(CC)CC,"sodium silicate, aluminum sulfate",0.937211,1.014372,1.055163,0.260727,0.643353,0.585497,0.426949,...,0.359907,0.171315,0.244542,0.505503,0.805285,0.029761,-0.131605,-0.132424,0.443561,0.320299
5,LTA,OSDA-free,"silica, sodium aluminate",-0.365307,0.044403,0.087190,0.099114,0.623268,0.585497,0.426949,...,-1.761791,-1.460152,-1.298974,-2.106378,-0.738041,-0.610035,-0.313704,-0.331597,-1.428735,-1.493623
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14740,GIS,OSDA-free,"silica sol, aluminum sulfate, sodium hydroxide",-0.510031,-0.220418,-0.185091,-0.084890,0.623270,0.585499,0.426951,...,-1.761791,-1.460152,-1.298974,-2.106378,-0.738041,-0.610035,-0.313704,-0.331597,-1.428735,-1.493623
14745,MFI,CCC[N+](CCC)(CCC)CCC,"silica sol, aluminum nitrate, sodium hydroxide",0.937211,1.014372,1.055163,0.260727,0.643353,0.585497,0.426949,...,0.687540,0.460428,1.227826,0.276644,1.319728,0.781630,-0.097029,-0.066753,1.017037,0.631831
14746,NES,OSDA-free,"sodium aluminate, sodium hydroxide",1.660832,1.737815,0.338720,1.293826,0.623269,0.585497,0.426949,...,-1.761791,-1.460152,-1.298974,-2.106378,-0.738041,-0.610035,-0.313704,-0.331597,-1.428735,-1.493623
14747,BEA,C[N+]1(C)CC2C3C=CC(CC3)C2C1,"silica sol, potassium hydroxide, ammonium fluo...",0.358314,0.120121,0.165025,1.658267,0.623112,0.585670,0.426950,...,0.250881,0.377323,0.056111,0.822282,-0.738041,-0.092779,-0.126195,-0.149179,0.733952,0.279312


In [5]:
# Build the val-split precursor table; save=True also writes prec_rec/data/prec_dataset_val.csv.
# The returned DataFrame is displayed as the cell output.
get_precursor_dataset('val', save=True)

Unnamed: 0,zeo,osda,precs,zeo_num_atoms,zeo_a,zeo_b,zeo_c,zeo_alpha,zeo_beta,zeo_gamma,...,osda_free_sasa_mean_0,osda_mol_weight,osda_npr1_mean_0,osda_npr2_mean_0,osda_num_rot_bonds_mean_0,osda_pmi1_mean_0,osda_pmi2_mean_0,osda_pmi3_mean_0,osda_spherocity_index_mean_0,osda_volume_mean_0
1,MFI,CCC[N+](CCC)(CCC)CCC,"tetraethyl orthosilicate, sodium hydroxide",0.937211,1.014372,1.055163,0.260727,0.643353,0.585497,0.426949,...,0.687540,0.460428,1.227826,0.276644,1.319728,0.781630,-0.097029,-0.066753,1.017037,0.631831
2,GME,CC1CC(C)C[N+](C)(C)C1,"sodium silicate, zeolite FAU, sodium hydroxide",-0.365307,0.250264,0.298850,-0.136598,0.623269,0.585497,1.086546,...,0.117435,0.005982,0.867486,-0.235012,-0.738041,0.008844,-0.224038,-0.195081,0.246275,0.062326
3,LTA,OSDA-free,"tetraethyl orthosilicate, sodium aluminate, so...",-0.365307,0.044403,0.087190,0.099114,0.623268,0.585497,0.426949,...,-1.761791,-1.460152,-1.298974,-2.106378,-0.738041,-0.610035,-0.313704,-0.331597,-1.428735,-1.493623
6,MFI,CCC[N+](CCC)(CCC)CCC,"silica sol, aluminum isopropoxide",0.937211,1.014372,1.055163,0.260727,0.643353,0.585497,0.426949,...,0.687540,0.460428,1.227826,0.276644,1.319728,0.781630,-0.097029,-0.066753,1.017037,0.631831
7,PAU,CC[N+](CC)(CC)CC,"aluminum hydroxide, sodium hydroxide, sodium s...",11.357353,2.730725,2.849184,2.544064,0.623269,0.585497,0.426949,...,0.060213,-0.117798,1.267251,0.292085,0.290843,-0.063275,-0.229249,-0.229172,1.115007,0.006052
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2100,ITT,C[n+]1ccn(Cc2ccc3ccccc3c2)c1,"tetraethyl orthosilicate, aluminum isopropoxid...",0.032685,0.888104,0.954658,0.056079,0.623269,0.585497,1.081457,...,0.681896,0.841075,-0.475288,0.691833,-0.223599,0.207169,0.153564,0.143936,-0.033494,0.520512
2101,ANA,C[N+](C)(C)C,"tetraethyl orthosilicate, boric acid, potassiu...",0.068866,0.232792,0.280886,0.270576,0.623269,0.585497,0.426949,...,-0.414065,-0.696024,2.493364,0.831382,-0.738041,-0.421515,-0.289209,-0.307840,2.546754,-0.614445
2103,CFI,C[N+]12CCCC[C@@H]1[C@H]1C[C@@H](C2)[C@@H]2CCCC...,"silica sol, aluminum nitrate, boric acid, sodi...",-0.220582,0.256367,-0.742720,1.598924,0.621341,0.589439,0.421946,...,0.611318,1.110287,0.043465,0.603514,-0.738041,0.423534,0.040146,0.039612,0.411721,0.953963
2105,MFI,CCC[N+](CCC)(CCC)CCC,"silatrane, sodium hydroxide",0.937211,1.014372,1.055163,0.260727,0.643353,0.585497,0.426949,...,0.687540,0.460428,1.227826,0.276644,1.319728,0.781630,-0.097029,-0.066753,1.017037,0.631831


In [6]:
# Build the test-split precursor table; save=True also writes prec_rec/data/prec_dataset_test.csv.
# The returned DataFrame is displayed as the cell output.
get_precursor_dataset('test', save=True)

Unnamed: 0,zeo,osda,precs,zeo_num_atoms,zeo_a,zeo_b,zeo_c,zeo_alpha,zeo_beta,zeo_gamma,...,osda_free_sasa_mean_0,osda_mol_weight,osda_npr1_mean_0,osda_npr2_mean_0,osda_num_rot_bonds_mean_0,osda_pmi1_mean_0,osda_pmi2_mean_0,osda_pmi3_mean_0,osda_spherocity_index_mean_0,osda_volume_mean_0
0,CHA,CN1CCCCC1,"aluminum oxide, phosphoric acid",-0.148220,0.247695,0.296209,0.394198,0.623268,0.585498,1.086546,...,-0.246689,-0.438075,0.427628,0.058682,-0.738041,-0.397005,-0.268250,-0.271666,0.237711,-0.437479
1,NON,CC1CCCN1,tetramethyl orthosilicate,0.792487,1.266288,0.474885,0.292412,0.623297,0.585873,0.426917,...,-0.356675,-0.582632,0.408337,0.184213,-0.738041,-0.462407,-0.279898,-0.289400,0.486566,-0.589265
2,NON,CC1CCCN1,tetramethyl orthosilicate,0.792487,1.266288,0.474885,0.292412,0.623297,0.585873,0.426917,...,-0.356675,-0.582632,0.408337,0.184213,-0.738041,-0.462407,-0.279898,-0.289400,0.486566,-0.589265
3,NON,CC1CCCN1,tetramethyl orthosilicate,0.792487,1.266288,0.474885,0.292412,0.623297,0.585873,0.426917,...,-0.356675,-0.582632,0.408337,0.184213,-0.738041,-0.462407,-0.279898,-0.289400,0.486566,-0.589265
4,NON,CC1CCCN1,tetramethyl orthosilicate,0.792487,1.266288,0.474885,0.292412,0.623297,0.585873,0.426917,...,-0.356675,-0.582632,0.408337,0.184213,-0.738041,-0.462407,-0.279898,-0.289400,0.486566,-0.589265
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5162,ITE,NCCNCCNCCNCCNCCN,"zeolite FAU, silica gel, sodium hydroxide",0.358314,1.081856,-0.172076,0.935389,0.623269,0.585503,0.426949,...,1.193088,0.934618,-0.884686,0.660981,2.605833,0.236933,0.711256,0.717630,-0.927512,0.868605
5163,ITE,NCCNCCNCCNCCNCCN,"zeolite FAU, silica gel, sodium hydroxide",0.358314,1.081856,-0.172076,0.935389,0.623269,0.585503,0.426949,...,1.193088,0.934618,-0.884686,0.660981,2.605833,0.236933,0.711256,0.717630,-0.927512,0.868605
5164,ITE,NCCNCCNCCNCCNCCN,"zeolite FAU, silica gel, sodium hydroxide",0.358314,1.081856,-0.172076,0.935389,0.623269,0.585503,0.426949,...,1.193088,0.934618,-0.884686,0.660981,2.605833,0.236933,0.711256,0.717630,-0.927512,0.868605
5165,ITE,NCCNCCNCCNCCNCCN,"zeolite FAU, silica gel, sodium hydroxide",0.358314,1.081856,-0.172076,0.935389,0.623269,0.585503,0.426949,...,1.193088,0.934618,-0.884686,0.660981,2.605833,0.236933,0.711256,0.717630,-0.927512,0.868605
