# Read all isotherms from the ISO-DB

In [1]:
import git
import json
import pickle
from tqdm import tqdm
from pathlib import Path

isodb_git_path = Path("/home/daniele/Databases/isodb-library") # Cloned from https://github.com/NIST-ISODB/isodb-library

print("Last commit:", git.Repo(isodb_git_path).head.object.hexsha)

Last commit: c00557bf5173a83a7f0bde73bb96a162c2ce9f12


## List of isotherms

We first download lists of all isotherms, materials and adsorbents in the NIST database.

`Isotherms` are dictionaries like:
```
    {
        "filename": "10.1002adem.200500223.isotherm1",
        "DOI": "10.1002/adem.200500223",
        "adsorbent": {
            "hashkey": "NIST-MATDB-991daf7313251e7e607e2bab2da57e33"
        },
        "adsorbates": [
            {
                "InChIKey": "UFHFLCQGNIYNRP-UHFFFAOYSA-N"
            }
        ],
        "category": "",
        "temperature": 77,
        "tabular_data": 0,
        "isotherm_type": ""
    },
```
`Adsorbents` are dictionaries like:
```
    {
        "hashkey": "NIST-MATDB-991daf7313251e7e607e2bab2da57e33",
        "name": "CuBTC",
        "synonyms": [
            "Basolite C300",
            "C300",
            "Cu-BTC",
            "Cu3(BTC)2",
            "HKUST-1",
            "MOF-199"
        ]
    },
```
`Adsorbates` are dictionaries like:
```
    {
        "InChIKey": "CURLTUGMZLYLDI-UHFFFAOYSA-N",
        "name": "Carbon Dioxide",
        "synonyms": [
            "Carbon dioxide",
            "Carbon oxide",
            "CO2"
        ]
    },
```

`Bibliography` are dictionaries like:
```
{
        "DOI": "10.1002/adem.200500223",
        "title": "Improved Hydrogen Storage in the Metal-Organic Framework Cu3(BTC)2",
        "journal": "Advanced Engineering Materials",
        "year": 2006,
        "authors": [
            "Piotr Krawiec",
            "Markus Kramer",
            "Michal Sabo",
            "R. Kunschke",
            "Heidrun Fröde",
            "Stefan Kaskel"
        ],
        "categories": [
            "exp"
        ],
        "adsorbents": [
            {
                "hashkey": "NIST-MATDB-991daf7313251e7e607e2bab2da57e33",
                "name": "CuBTC"
            }
        ],
        "adsorbentMaterial": [
            "CuBTC"
        ],
        "adsorbates": [
            {
                "InChIKey": "IJGRMHOSHXDMSA-UHFFFAOYSA-N",
                "name": "Nitrogen"
            },
            {
                "InChIKey": "UFHFLCQGNIYNRP-UHFFFAOYSA-N",
                "name": "Hydrogen"
            }
        ],
        "adsorbateGas": [
            "Nitrogen",
            "Hydrogen"
        ],
        "temperatures": [
            77,
            87
        ],
        "pressures": [
            0,
            1
        ],
        "isotherms": [
            {
                "filename": "10.1002adem.200500223.isotherm1"
            },
            {
                "filename": "10.1002adem.200500223.isotherm2"
            },
            {
                "filename": "10.1002adem.200500223.isotherm3"
            }
        ]
    }
```

In [2]:
# load files to memory (to copy in other notebook!)
nistdb = {
    'Adsorbents': [],
    'Adsorbates': [],
    'Bibliography': [],
    'Isotherms': [],
}

for key in nistdb:
    if key=='Isotherms':
        for iso_path in tqdm((isodb_git_path / "Library").glob("10*/*")):
            with open(iso_path, "r") as iso_json:
                nistdb[key].append(json.load(iso_json))
    else:
        for json_path in (isodb_git_path / "Library").glob(f"{key}/*"):
            with open(json_path , "r", encoding='utf-8') as json_file:
                nistdb[key].append(json.load(json_file))
    print(f"Loaded {len(nistdb[key])} {key}")

Loaded 7386 Adsorbents
Loaded 434 Adsorbates
Loaded 4128 Bibliography


35482it [00:04, 7872.90it/s] 

Loaded 35482 Isotherms





> NOTE: the Adsorbates present in the isotherms are only 280 (not 434), as shown in a later cell

## Cleaning isotherms data
Inspired by 
* Add `total_adsorption`: due to the switch to a multicomponent JSON format, the *total_adsorption* field, which is supposed to show the total amount of all species adsorbed is sometimes left blank. here we iterate and generate this field when it is absent.
* Correct `category=""` when possible: for old data, `category` is left blank. We can correct it only for those isotherm whose biblio is only of one kind. 

In [3]:
# Add total_adsorption
import numpy as np
isos = nistdb['Isotherms']

fixes = 0
for iso in isos:
    if iso['isotherm_data'][0]['total_adsorption'] is None:
        fixes += 1
        for point in iso['isotherm_data']:
            point['total_adsorption'] = float(np.sum([dp['adsorption'] for dp in point['species_data']]))

print(f"Corrections performed in {fixes} isotherms over {len(isos)}.")

Corrections performed in 6538 isotherms over 35482.


In [4]:
# Correct category="": assign 'exp' or 'sim' to all isotherms, if the enclosing paper has only one category!

doi_to_category = { bib['DOI'] : bib['categories'] for bib in  nistdb['Bibliography']}

fixes, known, unknown = 0, 0, 0
for iso in isos:
    if iso['category']=="":
        if iso['DOI'] in doi_to_category and len(doi_to_category[iso['DOI']])==1:
            fixes += 1
            iso['category']=doi_to_category[iso['DOI']][0]
        else:
            unknown += 1
    else:
        known +=1
        

print(f"Corrections performed in {fixes} isotherms, {known} already assigned, and {unknown} left unknown.")

Corrections performed in 14378 isotherms, 12578 already assigned, and 8526 left unknown.


## Save (or load) pickled isotherms if needed

In [5]:
with open("data/nistdb.pickle", 'wb') as f:
    pickle.dump(nistdb, f)

In [6]:
# To copy in other notebooks to load the database

isos = nistdb['Isotherms']
print(f"Loaded {len(isos)} full isotherms.")

Loaded 35482 full isotherms.


# Analyse the database after data-cleaning

In [7]:
print("Total isotherms:", len(nistdb['Isotherms']))
print("Total adsorbents:", len(nistdb['Adsorbents']))
print('Total articles:', len(nistdb['Bibliography']))
adsorbates = set([ ads['name'] for isot in nistdb['Isotherms'] for ads in isot['adsorbates'] ])
print('Total adsorbates:', len(adsorbates))

Total isotherms: 35482
Total adsorbents: 7386
Total articles: 4128
Total adsorbates: 280


In [8]:
nistdb['Adsorbents'][100]

{'External_Resources': [],
 'formula': '',
 'hashkey': 'NIST-MATDB-2b2edf2ac6965fca1a5c8f1449476d04',
 'name': 'PEI(aq) with M.W. 10000',
 'synonyms': []}

In [9]:
# Identity of adsorbents
for match in ["zeolite", "carbon", "mof", "cof"]:
    count = 0
    for adsorbent in nistdb['Adsorbents']:
        if any([ match in x.lower() for x in [adsorbent['name']] + adsorbent['synonyms']]):
            count+=1
    perc = 100 * count/len(nistdb['Adsorbents'])
    print(f"'{match}' matches: {perc:.3f}")


'zeolite' matches: 3.249
'carbon' matches: 7.040
'mof' matches: 12.551
'cof' matches: 1.286


In [10]:
# Some statistics referring to the articles in the Bibliography

bib = nistdb['Bibliography']

count = len([art for art in bib if art['isotherms']])
print('Paper containing at least one isotherm:', count, f"({round(count/len(bib)*100,1)})%")

count = len([art for art in bib if art['isotherms'] and 'exp' in art['categories']])
print('Paper containing at least one isotherm, and HAS experimental:', count, f"({round(count/len(bib)*100,1)})%")

count = len([art for art in bib if art['isotherms'] and art['categories']==['exp']])
print('Paper containing at least one isotherm, and ONLY experimental:', count, f"({round(count/len(bib)*100,1)})%")


Paper containing at least one isotherm: 3307 (80.1)%
Paper containing at least one isotherm, and HAS experimental: 2951 (71.5)%
Paper containing at least one isotherm, and ONLY experimental: 2462 (59.6)%


In [11]:
# Some statistics on the isoterms
isos = nistdb['Isotherms']
print("Total isotherms:", len(isos))

count = len([isot for isot in isos if isot['category']=='exp'])
print('Experimental:', count, f"({round(count/len(isos)*100,1)})%")

for adsorbate in ['Nitrogen', 'Carbon Dioxide', 'Hydrogen', 'Methane', 'Water']:
    count = len([isot for isot in isos if isot['category']=='exp' and len(isot['adsorbates'])==1 and isot['adsorbates'][0]['name']==adsorbate])
    print(f'Experimental, {adsorbate}:', count, f"({round(count/len(isos)*100,1)})%")

print()
count = len([isot for isot in isos if isot['category']=='exp' and len(isot['adsorbates'])==1 and isot['adsorbates'][0]['name']=='Nitrogen' and isot['temperature']==77])
print(f'Experimental, Nitrogen@77:', count, f"({round(count/len(isos)*100,1)})%")
count = len([isot for isot in isos if isot['category']=='exp' and len(isot['adsorbates'])==1 and isot['adsorbates'][0]['name']=='Argon' and isot['temperature']==87])
print(f'Experimental, Argon@87:', count, f"({round(count/len(isos)*100,1)})%")

Total isotherms: 35482
Experimental: 21375 (60.2)%
Experimental, Nitrogen: 5153 (14.5)%
Experimental, Carbon Dioxide: 4366 (12.3)%
Experimental, Hydrogen: 2540 (7.2)%
Experimental, Methane: 2212 (6.2)%
Experimental, Water: 607 (1.7)%

Experimental, Nitrogen@77: 4003 (11.3)%
Experimental, Argon@87: 140 (0.4)%


In [12]:
# Some statistics on the adrsorbents
count = set()
for isot in nistdb['Isotherms']:
    if isot['category'] == 'exp':
        count.add(isot['adsorbent']['name'])
print("Adsorbents at least one exp isotherm:", len(count), end=" ")
print(f"({100*len(count)/len(nistdb['Adsorbents']):.2f}%)")
experimental_isotherms = count

Adsorbents at least one exp isotherm: 4834 (65.45%)


In [13]:
# Some statistics on the adrsorbents
count = set()
for isot in nistdb['Isotherms']:
    if isot['category'] == 'exp' and (
        (isot['adsorbates'][0]['name']=='Nitrogen' and isot['temperature']==77) or (
         isot['adsorbates'][0]['name']=='Argon'    and isot['temperature']==87)):
        count.add(isot['adsorbent']['name'])
print("Adsorbents with characterization:", len(count), end=" ")
print(f"({100*len(count)/len(nistdb['Adsorbents']):.2f}%)")

Adsorbents with characterization: 2675 (36.22%)


In [14]:
# Isotherms with volumtric unit
count=0
for isot in nistdb['Isotherms']:
    if isot['category']=='exp':
        unit = isot['adsorptionUnits']
        if "/" in unit and unit.split("/")[-1] in ['mol', 'volume', 'cm3', 'm3', 'cm2', 'm2', 'unit cell', 'unitcell', 'formula']:
            #print(unit.split("/"))
            count+=1
print("Adsorbents with volumetric (or weird) units:", count, end=" ")
print(f"({100*(count/len(experimental_isotherms)):.2f}%)")

Adsorbents with volumetric (or weird) units: 728 (15.06%)


> NOTE: in notebook `06` one can see that for our selected characterization isotherms the volumetric (or weird) units are just <1%

In [15]:
# Find possibly ambiguous names
possible_ambiguous_names = set([])
ambiguous_names = set([])

metals = [
    'Mg', 'Al',
    'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn', 
    'Zr', 'Ru', 'Rh',
    'Cd', 'In', 'Sn', 'Pb']

for m in metals: 
    for mat in nistdb['Adsorbents']:
        for syn in [mat['name']] + mat['synonyms']:
            if syn.endswith(f"({m})"): # name like: MOF-XX(M)
                possible_ambiguous_names.add(syn.replace(f"({m})",""))
            elif syn.startswith(f"{m}-"): # name like: M-MOF-XX
                possible_ambiguous_names.add(syn.replace(f"{m}-",""))

# Check if there is an ambiguous name which does not specify the metal
for mat in nistdb['Adsorbents']:
    if mat['name'] in possible_ambiguous_names:
        ambiguous_names.add(mat['name'])
        
ambiguous_dict = { k: [] for k in ambiguous_names}

# Link the ambiguous name (where the metal is not specified) to all its variants
for mat in nistdb['Adsorbents']:
    for ambiguous_name in ambiguous_names:
        if any(
            [f"{ambiguous_name}({m})"==syn for m in metals for syn in [mat['name']] + mat['synonyms']]
        ) or any(
            [f"{m}-{ambiguous_name}"==syn for m in metals for syn in [mat['name']] + mat['synonyms']]):
            ambiguous_dict[ambiguous_name].append(mat['name'])

for k in sorted(ambiguous_dict):
    print(f"{k:20s} >>> ", ambiguous_dict[k])
# NOTE: the selection of names to exclude will be manual (see Supporting Info of the draft).

(CH3)2-MIL-53        >>>  ['(CH3)2-MIL-53(Al)']
(OH)2-MIL-53         >>>  ['(OH)2-MIL-53(Al)']
C60@IRMOF-10         >>>  ['Mg-C60@IRMOF-10']
C60@IRMOF-8          >>>  ['Mg-C60@IRMOF-8']
CHA                  >>>  ['Ca-CHA']
COOH-MIL-53          >>>  ['Al-MIL-53-COOH']
Cu/MCM-41            >>>  ['Fe-Cu/MCM-41']
CuBTC                >>>  ['CuBTC(Cu)']
IRMOF-1              >>>  ['Zn-IRMOF-1', 'Mg-IRMOF-1', 'Ni-IRMOF-1', 'Cu-IRMOF-1']
MAMS-2               >>>  ['MAMS-2(Zn)']
MAMS-4               >>>  ['MAMS-4(Cu)']
MCM-48-ls            >>>  ['Mg-MCM-48-ls']
MIL-100              >>>  ['MIL-100(Fe)', 'MIL-100(Al)', 'MIL-100(Sc)', 'MIL-100(Cr)', 'Mn-MIL-100', 'MIL-100(V)']
MIL-101              >>>  ['MIL-101(V)', 'Cr-MIL(101)', 'MIL-101(Al)', 'MIL-101(Sc)']
MIL-101-NH2          >>>  ['Cr-MIL-101-NH2']
MIL-101-NO2          >>>  ['Cr-MIL-101-NO2']
MIL-101-bipy         >>>  ['Cr-MIL-101-bipy']
MIL-102              >>>  ['MIL-102(Fe)']
MIL-125              >>>  ['MIL-125(Ti)']
MIL-160             