In [1]:
import pandas as pd

# Notebook for adding New compounds to table

### 1. Run this cell to load the table

In [16]:
# load table from parquet file
compound_table = pd.read_parquet('compound_table.parquet')

# sort values by unique idea so last has highest id
compound_table.sort_values('id', ascending=True, inplace=True)

# electrochem_table = pd.read_parquet('electrochem_table.parquet')
# photochem_table = pd.read_parquet('photochem_table.parquet')
# solubility_table = pd.read_parquet('solubility_table.parquet')


### 2. Enter the details of the compound you want to add below, then run the cell

In [21]:
compound_details = {
                  'name': 'potassium 2,5-hydroxybenzenesulfonate (potassium hydroquinone sulfonate)', 
                  'formula': 'C6H5KO5S', 
                  'molecular_weight': 228, 
                  'CAS': '21799-87-1', 
                  'source_dois': 'DOI:10.1002/aenm.202002453, https://pubchem.ncbi.nlm.nih.gov/compound/23672329', 
                  'recyclable': 1, 
                  'type': 'electron donor', 
                  'compound_family': 'quinone', 
                  'used_in_photocat': 0, 
                  'used_in_rfbs': 0, 
                  'used_as_hcarrier': 1}

### 3. Run this cell to add your compound to the table
This code includes a duplication check.

In [22]:
# create unique compound id and add compound

# if the table is empty add the compound with id 1
if compound_table.empty:
    compound_details['id'] = int(1)
    compound_table = pd.concat([compound_table, pd.DataFrame([compound_details,])])
    
    
else:
    # check if compounds already exist with the same combination of name, formula and CAS
    check_mask = compound_table.isin([compound_details['name'], compound_details['formula'], compound_details['CAS']])
    check_mask = check_mask.loc[(check_mask.name==True) & (check_mask.formula==True), :]
    
    # if the CAS is a blank string ignore CAS in your checks
    if compound_details['CAS'] == '':
        check_mask.CAS = False
    
    
    # if no compounds with matching name, formula and CAS are found add one to the last id number and add the compound with that id
    if check_mask.any(axis=1).empty:
        # this assumes table always ordered ascending to make addition faster
        compound_details['id'] = int(compound_table.id.iat[-1] + 1)
        compound_table = pd.concat([compound_table, pd.DataFrame([compound_details,])])
        
    else:
        # if matches are found print them with a warning message and don't add the compound yet
        print(f"The following compounds appear to match the one you are trying to add {check_table.name.to_list()}.")
        print("To edit an existing entry please use the compound editor notebook. Otherwise please change the name or formula of your compound to add it.")
        display(compound_table.loc[check_mask.any(axis=1), :])
        print("Your compound has not yet been added")
    

#### (Execute next cell to preview last 5 entries)

In [23]:
# code to preview last five entries
if compound_table.shape[0] > 6:
    display(compound_table.tail(5))
else:
    display(compound_table)


Unnamed: 0,id,name,formula,molecular_weight,CAS,source_dois,recyclable,type,compound_family,used_in_photocat,used_in_rfbs,used_as_hcarrier
0,19,"1,3-dimethyl-2-(2,4,6-trimethoxyphenyl)-2H-be...",C18H22N2O3,314.0,,"DOI:10.1021/acs.jpcc.2c03541, https://pubchem....",1,electron donor,benzimidazole,1,0,0
0,20,"5-methoxy-1,3-dimethyl-2-(4-methoxyphenyl)-2H...",C17H20N2O2,284.0,,DOI:10.1039/D1CY01209F,0,electron donor,benzimidazole,1,0,0
0,21,"5,6-dimethoxy-1,3-dimethyl-2-(phenyl)-2H-benz...",C17H20N2O2,284.0,,DOI:10.1039/D1CY01209F,0,electron donor,benzimidazole,1,0,0
0,22,"2,7-anthraquinonedisulfonic acid (AQDS)",C14H8O8S2,368.0,84-49-1,"DOI:10.1002/aenm.202002453, https://pubchem.nc...",1,electron donor,quinone,0,1,1
0,23,"potassium 2,5-hydroxybenzenesulfonate (potassi...",C6H5KO5S,228.0,21799-87-1,"DOI:10.1002/aenm.202002453, https://pubchem.nc...",1,electron donor,quinone,0,0,1


### 4. Run the next cell to save your changes to the table 

In [25]:
compound_table.to_parquet('compound_table.parquet', engine='pyarrow', compression=None)

In [7]:
254+16+12+2

284