In [1]:
# importing required libraries
import pandas as pd

# Notebook for Creating Blank Data Tables
This notebook creates 4 blank tables for storing sacrificial reagent data. These tables are stored in Parquet file format rather than CSV to decrease their file size and improve their loading speed:
- **compound_table:** The compound table holds the information about each unique compound, such as name, formula, whether or not it is recyclable, and the DOIs for the references where the compound was reported. The id of the compound table is used as a unique identifier for each compound and is referred to as compound_id in the other tables.
- **electrochem_table:** The electrochem table holds redox potential data from different experimental and simulation studies for the compound. Each table row corresponds to one redox couple for one compound in one environment. For example, a compound might have two redox couples and be measured in two different electrolyte solutions, in which case it will have four rows in the electrochem table. Each row should be referenced to a paper using a DOI or other identifier in the measurement_ref field. The compounds are referred to using the compound_id field, which corresponds to the id of the compound in the compound table.
- **photochem_table:** The photochem table holds the relevant spectroscopic information for each absorption peak of a compound. The same peak will have different entries in different rows for different solvents. Again, the compound is identified by compound_id and each entry should be referenced in the measurement_ref field.
- **solubility_table:** The solubility table holds solubility data for each compound in different solvents with one row per solvent system per compound. Again, the compound is identified by compound_id and each entry should be referenced in the measurement_ref field.

In [19]:
# Schema of the compound table: column name -> pandas dtype alias.
# NOTE(review): molecular_weight is declared as int32, so fractional
# values cannot be stored without a cast -- confirm this is intended.
compound_data_type_dict = dict(
    id='int32',
    name='string',
    formula='string',
    molecular_weight='int32',
    CAS='string',
    source_dois='string',
    recyclable='boolean',
    type='string',
    compound_family='string',
    used_in_photocat='boolean',
    used_in_rfbs='boolean',
    used_as_hcarrier='boolean',
)

# Assemble a zero-row frame from one empty, typed Series per column so
# that every column carries its declared dtype.
compound_df = pd.DataFrame(
    {
        column: pd.Series(dtype=dtype_alias)
        for column, dtype_alias in compound_data_type_dict.items()
    }
)
compound_df.dtypes


id                    int32
name                 string
formula              string
molecular_weight      int32
CAS                  string
source_dois          string
recyclable          boolean
type                 string
compound_family      string
used_in_photocat    boolean
used_in_rfbs        boolean
used_as_hcarrier    boolean
dtype: object

In [21]:
# Schema of the electrochem table: column name -> pandas dtype alias.
# Each key appears exactly once; a repeated key in a dict literal is
# silently overwritten by the later entry, which hides copy-paste slips.
electrochem_data_type_dict = {
    'id': 'int32',
    'compound_id': 'int32',
    'redox_potential_V': 'float32',
    'reference_electrode': 'string',
    'working_electrode': 'string',
    'electrolyte': 'string',
    'solvent': 'string',
    # NOTE(review): 'CAS' is also a column of the compound table; confirm
    # it is meant to be repeated here rather than looked up via compound_id.
    'CAS': 'string',
    'peak_to_peak_separation_mV': 'float32',
    'diffusion_coefficient': 'float32',
    'diffusion_coefficient_unit': 'string',
    'exchange_current_density': 'float32',
    'exchange_current_unit': 'string',
    'measurement_ref': 'string',
}

# Build a zero-row frame from one empty, typed Series per column so that
# every column carries its declared dtype.
electrochem_df = pd.DataFrame(
    {c: pd.Series(dtype=typ) for (c, typ) in electrochem_data_type_dict.items()}
)
electrochem_df.dtypes

id                              int32
compound_id                     int32
redox_potential_V             float32
reference_electrode            string
working_electrode              string
electrolyte                    string
solvent                        string
CAS                            string
peak_to_peak_separation_mV    float32
diffusion_coefficient         float32
diffusion_coefficient_unit     string
exchange_current_density      float32
exchange_current_unit          string
measurement_ref                string
dtype: object

In [22]:
# Schema of the photochem table, given as ordered (column, dtype alias)
# pairs and collected into a dict.
photochem_data_type_dict = dict([
    ('id', 'int32'),
    ('compound_id', 'int32'),
    ('maxima_nm', 'int32'),
    ('absorption_coefficient', 'float32'),
    ('lifetime_ns', 'float32'),
    ('solvent', 'string'),
    ('measurement_ref', 'string'),
])

# Zero-row frame: one empty Series per schema key, typed by lookup.
photochem_table = pd.DataFrame(
    {
        field: pd.Series(dtype=photochem_data_type_dict[field])
        for field in photochem_data_type_dict
    }
)
photochem_table.dtypes

id                          int32
compound_id                 int32
maxima_nm                   int32
absorption_coefficient    float32
lifetime_ns               float32
solvent                    string
measurement_ref            string
dtype: object

In [23]:
# Schema of the solubility table: column name -> pandas dtype alias.
sol_dtype_dict = {
    "id": "int32",
    "compound_id": "int32",
    "solvent": "string",
    "solubility_molarity": "float32",
    "molarity_unit": "string",
    "solubility_molality": "float32",
    "molality_unit": "string",
    "polarity": "float32",
    "polarity_measure": "string",
    "measurement_ref": "string",
}

# Pair each column name with an empty Series of its declared dtype and
# hand the resulting mapping to the DataFrame constructor.
solubility_table = pd.DataFrame(
    dict(
        (label, pd.Series(dtype=kind))
        for label, kind in sol_dtype_dict.items()
    )
)
solubility_table.dtypes

id                       int32
compound_id              int32
solvent                 string
solubility_molarity    float32
molarity_unit           string
solubility_molality    float32
molality_unit           string
polarity               float32
polarity_measure        string
measurement_ref         string
dtype: object

In [25]:
# Persist the blank compound table as a Parquet file in the working
# directory, using the pyarrow engine with compression switched off.
compound_df.to_parquet(
    path='compound_table.parquet',
    compression=None,
    engine='pyarrow',
)


Unnamed: 0,id,name,formula,molecular_weight,CAS,source_dois,recyclable,type,compound_family,used_in_photocat,used_in_rfbs,used_as_hcarrier


In [27]:
# Persist the three remaining blank tables as Parquet files in the working
# directory, each through the pyarrow engine with compression switched off.
electrochem_df.to_parquet(
    path='electrochem_table.parquet',
    compression=None,
    engine='pyarrow',
)
photochem_table.to_parquet(
    path='photochem_table.parquet',
    compression=None,
    engine='pyarrow',
)
solubility_table.to_parquet(
    path='solubility_table.parquet',
    compression=None,
    engine='pyarrow',
)