In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

from sklearn.preprocessing import QuantileTransformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer



In [2]:
# Heteroatom columns in the dataset; they are summed per row to form the 'T' column.
het_cols = ['Si', 'Al', 'P', 'Ge', 'B', 'V', 'Be', 'Ga', 'Ti']

# Gel-composition ratios to build. Each key is "numerator/denominator" (both are dataframe
# column names); 'quantile' is the quantile at which the ratio is capped and 'name' is a
# display label. Insertion order matters: it fixes the column order fed to the imputer.
ratios = {
    'Si/Al':  {'quantile': 0.98,  'name': 'Si/Al'},
    'Al/P':   {'quantile': 0.99,  'name': 'Al/P'},
    'Si/Ge':  {'quantile': 0.995, 'name': 'Si/Ge'},
    'Si/B':   {'quantile': 0.99,  'name': 'Si/B'},
    'Na/T':   {'quantile': 0.95,  'name': 'Na/T'},
    'K/T':    {'quantile': 0.99,  'name': 'K/T'},
    'OH/T':   {'quantile': 0.97,  'name': 'OH/T'},
    'F/T':    {'quantile': 0.95,  'name': 'F/T'},
    'H2O/T':  {'quantile': 0.99,  'name': 'H$_2$O/T'},
    'sda1/T': {'quantile': 0.98,  'name': 'SDA/T'},
}

# Synthesis conditions and their display labels.
# Raw strings are used so that the LaTeX "\circ" is not parsed as an (invalid) escape sequence.
# NOTE(review): the 'cryst_time' label carries a degree-Celsius unit, which looks like a
# copy-paste slip from 'cryst_temp' -- confirm the intended time unit before using this label.
conds = {
    'cryst_temp': {'name': r'Cryst. temp. ($^\circ$C)'},
    'cryst_time': {'name': r'Cryst. time ($^\circ$C)'},
}

def check_nans(df):
    '''Builds a short report of how many missing values df contains.

    Args:
        df (pd.Series or pd.DataFrame): data to inspect
    Returns:
        str: "Number of NaNs: " followed by the result of df.isna().sum()
    '''
    nan_count = df.isna().sum()
    return 'Number of NaNs: {}'.format(nan_count)

def check_infs(df):
    '''Builds a short report of how many +inf / -inf values df contains.

    Args:
        df (pd.Series or pd.DataFrame): data to inspect
    Returns:
        str: "Number of Infs: " followed by the count of entries equal to +inf or -inf
    '''
    infinite_values = [np.inf, -np.inf]
    inf_count = df.isin(infinite_values).sum()
    return 'Number of Infs: ' + str(inf_count)

def preprocess_gel(df, x, y, quantile, plot = False):
    '''Adds the capped ratio x/y and its quantile-transformed version to the dataframe.

    Steps:
        1. Store df[x]/df[y] in a new column named "x/y".
        2. Replace NaNs in the ratio (e.g. from 0.0/0.0) with 0.0.
        3. Compute the cap ("high val") as the given quantile of the ratios that are
           neither +inf nor 0.0.
        4. Replace +inf (division by zero) and every value at or above the cap with the cap.
        5. Fit a QuantileTransformer on the capped ratio, store the transformed values in
           "x/y_qt" and pickle the fitted transformer to "{x}{y}_qt.pkl" in the working directory.

    The ratio name, the cap and the NaN/inf counts are printed along the way.
    Note that df is modified in place (two columns are added) and is also returned.

    Args:
        df (pd.DataFrame): dataframe containing columns x and y
        x (str): numerator column name
        y (str): denominator column name
        quantile (float): quantile to cap the ratio x/y at
        plot (bool): if True, show histograms of the capped ratio and of its quantile transform
    Returns:
        df (pd.DataFrame): the same dataframe with the new columns "x/y" and "x/y_qt"
    '''
    ratio = f"{x}/{y}"

    # NaNs (e.g. from 0.0/0.0) are set to 0.0
    df[ratio] = (df[x] / df[y]).fillna(0.)

    # The cap is computed from the ratios that are neither +inf (division by zero) nor 0.0.
    # NOTE(review): -inf is not special-cased here; presumably the inputs are non-negative.
    is_inf = df[ratio] == np.inf
    high_val = np.quantile(df.loc[~is_inf & (df[ratio] != 0.), ratio], quantile)
    print(ratio)
    print('High val:', high_val)

    # Infs and every value at or above the cap are set to the cap.
    # Boolean masks replace the per-row loops; they touch exactly the matching rows.
    df.loc[is_inf | (df[ratio] >= high_val), ratio] = high_val

    print(check_nans(df[ratio]))
    print(check_infs(df[ratio]))

    # Quantile transform, fitted on the capped ratio as a single-feature 2D array
    qt = QuantileTransformer(n_quantiles=1000, random_state=0)
    df[f'{ratio}_qt'] = qt.fit_transform(df[ratio].to_numpy().reshape(-1, 1)).reshape(-1)

    # Persist the fitted transformer so the same mapping can be reused later
    with open(f'{x}{y}_qt.pkl', 'wb') as f:
        pickle.dump(qt, f, protocol=pickle.HIGHEST_PROTOCOL)

    if plot:
        # Axes-level histplot draws into the figure created here; the figure-level
        # displot would open its own figures and leave this one empty.
        fig, (ax_raw, ax_qt) = plt.subplots(1, 2, figsize=(15, 7))
        sns.histplot(df[ratio], bins=30, ax=ax_raw)
        sns.histplot(df[f'{ratio}_qt'], bins=30, ax=ax_qt)
        fig.tight_layout()
        plt.show()

    return df

def preprocess_conditions(df, plot = False):
    '''Imputes conditions based on gel compositions, then quantile-transforms each condition.

    The columns listed in the module-level `ratios` and `conds` dicts are passed together to an
    IterativeImputer; only the condition columns (keys of `conds`) are written back to df.
    Each condition is then quantile-transformed into "<cond>_qt" and the fitted transformer is
    pickled to "<cond>_qt.pkl" in the working directory. NaN/inf counts are printed per condition.

    Note that df is modified in place and is also returned. The ratio columns must already
    exist in df when this function is called.

    Args:
        df (pd.DataFrame): dataframe containing the ratio and condition columns
        plot (bool): if True, show histograms of each condition and of its quantile transform
    Returns:
        df (pd.DataFrame): the same dataframe with imputed conditions and new "<cond>_qt" columns
    '''

    # Temporary frame holding only the synthesis columns (gel ratios + conditions)
    syn_cols = list(ratios.keys()) + list(conds.keys())

    # Impute missing values in conditions.
    # Sample posterior to get a distribution, skip_complete to speed up, min value is 0. since time and temp are +ve
    imp = IterativeImputer(min_value = 0., sample_posterior = True, skip_complete = True, random_state=0)
    df_syn = pd.DataFrame(imp.fit_transform(df[syn_cols]), columns = syn_cols, index = df.index)

    for cond in conds.keys():
        # Replace the original column with the imputed column (driven by `conds`
        # rather than hard-coded names, so the two cannot drift apart)
        df[cond] = df_syn[cond]

        print(cond)
        print(check_nans(df[cond]))
        print(check_infs(df[cond]))

        # Quantile transform, fitted on the imputed condition as a single-feature 2D array
        qt = QuantileTransformer(n_quantiles=1000, random_state=0)
        df[f'{cond}_qt'] = qt.fit_transform(df[cond].to_numpy().reshape(-1, 1)).reshape(-1)

        # Persist the fitted transformer so the same mapping can be reused later
        with open(f'{cond}_qt.pkl', 'wb') as f:
            pickle.dump(qt, f, protocol=pickle.HIGHEST_PROTOCOL)

        if plot:
            # Axes-level histplot draws into the figure created here; the figure-level
            # displot would open its own figures and leave this one empty.
            fig, (ax_raw, ax_qt) = plt.subplots(1, 2, figsize=(15, 7))
            sns.histplot(df[cond], bins=30, ax=ax_raw)
            sns.histplot(df[f'{cond}_qt'], bins=30, ax=ax_qt)
            fig.tight_layout()
            plt.show()

    return df

In [3]:
# Load the raw spreadsheet and drop the 'Unnamed: 0' column
df = pd.read_excel('ZEOSYN.xlsx').drop(columns = ['Unnamed: 0'])
# Keep only rows that have a DOI. The explicit copy makes df an independent frame, so the
# columns added below (and in the preprocessing functions) are not written to a slice of the
# raw frame, which would trigger pandas' SettingWithCopyWarning.
df = df[df['doi'].notna()].copy()
# 'T' is the per-row sum over the heteroatom columns
df['T'] = df[het_cols].sum(axis=1)

In [4]:
# Preprocess gel compositions
# For every configured ratio: build the ratio column, cap it at its configured quantile,
# then quantile-transform it
for ratio in ratios:
    config = ratios[ratio]
    x, y = ratio.split('/')  # "numerator/denominator" -> the two column names
    df = preprocess_gel(df, x, y, config['quantile'])

Si/Al
High val: 400.00000000000006
Number of NaNs: 0
Number of Infs: 0
Al/P
High val: 1.7179967159277212
Number of NaNs: 0
Number of Infs: 0
Si/Ge
High val: 98.9999999999999
Number of NaNs: 0
Number of Infs: 0
Si/B
High val: 250.00000000000003
Number of NaNs: 0
Number of Infs: 0
Na/T
High val: 1.920999102706711
Number of NaNs: 0
Number of Infs: 0
K/T
High val: 5.333333333333333
Number of NaNs: 0
Number of Infs: 0
OH/T
High val: 2.4341677246909406
Number of NaNs: 0
Number of Infs: 0
F/T
High val: 1.25
Number of NaNs: 0
Number of Infs: 0
H2O/T
High val: 200.00000000000006
Number of NaNs: 0
Number of Infs: 0
sda1/T
High val: 6.097582682238018
Number of NaNs: 0
Number of Infs: 0


In [5]:
# Preprocess conditions
# Missing condition values are imputed first; each condition is then quantile-transformed
df = preprocess_conditions(df=df, plot=False)

cryst_temp
Number of NaNs: 0
Number of Infs: 0
cryst_time
Number of NaNs: 0
Number of Infs: 0


In [6]:
# Save the preprocessed frame; the row index is written too, as the first column
df.to_csv('ZEOSYN_preprocessed.csv', index=True)

Unnamed: 0,Si,Al,P,Na,K,Li,Sr,Rb,Cs,Ba,...,OH/T,OH/T_qt,F/T,F/T_qt,H2O/T,H2O/T_qt,sda1/T,sda1/T_qt,cryst_temp_qt,cryst_time_qt
0,0.030769,0.000000,0.0,0.0,0.001538,0.000000,0.0,0.0,0.0,0.0,...,0.250000,0.686186,0.0,0.000000,31.000000,0.718827,0.200000,0.381882,0.301802,0.948949
1,0.105263,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.5,0.948949,7.500000,0.266266,0.500000,0.804805,0.722723,0.639139
2,0.021959,0.000000,0.0,0.0,0.002635,0.000000,0.0,0.0,0.0,0.0,...,0.270000,0.701168,0.0,0.000000,44.000000,0.858859,0.150000,0.311311,0.301802,0.546046
3,0.046512,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.5,0.920420,20.000000,0.555556,0.000000,0.000000,0.098186,0.970651
5,0.021368,0.008547,0.0,0.0,0.000000,0.004274,0.0,0.0,0.0,0.0,...,2.285714,0.982919,0.0,0.000000,28.571429,0.661662,1.142857,0.928929,0.116617,0.375375
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30159,0.967742,0.032258,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.144144,0.959434
30160,0.975610,0.024390,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.461461,0.777825
30161,0.985915,0.014085,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.592092,0.315791
30162,0.990099,0.009901,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.592092,0.733037
