# Check crystal structures and remove the ones with overlapping atoms or lone molecules

In [1]:
import pandas as pd
import pickle 
from tqdm import tqdm
import json
import yaml

In [2]:
# CSD identifiers that were excluded by hand.
MANUAL_EXCLUSIONS = list(pd.read_csv("data/manual_exclusions.csv")['identifier'])

# mofchecker results, nested as {csd_refcode: {structure_label: {descriptor: value}}}.
with open("data/csd_542_mofchecker.json", "r") as checker_file:
    mof_checker = json.load(checker_file)

# Fix the flags of every record in place:
# - 'has_atomic_overlaps' is inverted
#   (presumably the JSON stores it with the opposite meaning -- TODO confirm);
# - 'has_hydrogen' is replaced by its negation under the key 'has_not_hydrogen',
#   so that a True flag marks the suspicious case.
for structures in mof_checker.values():
    for record in structures.values():
        record['has_atomic_overlaps'] = not record['has_atomic_overlaps']
        record['has_not_hydrogen'] = not record.pop('has_hydrogen')

# Display one record to eyeball the corrected flags.
mof_checker['FIQCEN']['asr']

{'name': 'FIQCEN',
 'graph_hash': '521324a26079f95c5a55e7f417c3fa7f',
 'scaffold_hash': 'e1682de934a12384f1952d34148e3832',
 'symmetry_hash': 'mEopNc0Bbry+C0Mp9WB7PJqumiUvMB4qfFQS9GqwjvI=225',
 'formula': 'Cu12 H24 C72 O48',
 'path': None,
 'density': 0.879095741266423,
 'has_carbon': True,
 'has_atomic_overlaps': False,
 'has_overcoordinated_c': False,
 'has_overcoordinated_n': False,
 'has_overcoordinated_h': False,
 'has_undercoordinated_c': False,
 'has_undercoordinated_n': False,
 'has_undercoordinated_rare_earth': False,
 'has_metal': True,
 'has_lone_molecule': False,
 'has_high_charges': True,
 'is_porous': None,
 'has_suspicicious_terminal_oxo': False,
 'has_not_hydrogen': False}

In [3]:
# Load the NIST <-> CSD matches, keeping only rows with a NIST material name
# and dropping the manually excluded CSD identifiers.
df = pd.read_csv("data/step-08.csv").dropna(subset=['nist_mat'])
df = df[~df.identifier.isin(MANUAL_EXCLUSIONS)]

# Build the identifier -> sgh_asr lookup once instead of re-filtering the whole
# frame for every refcode. setdefault keeps the FIRST occurrence of each
# identifier, which is what `df[df['identifier']==x]['sgh_asr'].values[0]` returned.
sgh_by_refcode = {}
for refcode, sgh in zip(df['identifier'].values, df['sgh_asr'].values):
    sgh_by_refcode.setdefault(refcode, sgh)

mat_sel = sorted(set(df['nist_mat']))
mat_dict = {}           # {nist_mat: {csd_refcode: sgh_asr}}
csd_refcodes_mem = []   # every CSD refcode encountered, in material order
for nist_name in mat_sel:
    csd_refcodes = list(df.loc[df['nist_mat'] == nist_name, 'identifier'].values)
    csd_refcodes_mem += csd_refcodes
    mat_dict[nist_name] = {
        csd_refcode: sgh_by_refcode[csd_refcode] for csd_refcode in csd_refcodes
    }

with open("data/step-09.yml", "w") as f:
    yaml.dump(mat_dict, f)
# step-09.yml: { nist_adsorbent1: {csd1: SGH, csd2: SGH, ...}, ... }

print("... after manual exclusions ...")
print(f'All identified NIST-ISODB adsorbents: {len(mat_dict)}')
print(f'All identified CSD entries: {len(csd_refcodes_mem)}')

... after manual exclusions ...
All identified NIST-ISODB adsorbents: 676
All identified CSD entries: 777


In [4]:
# Read step-09.yml back from disk and display the CuBTC entry as a spot check.
with open("data/step-09.yml", "r") as yaml_in:
    mat_dict = yaml.safe_load(yaml_in)
mat_dict['CuBTC']

{'DIHVIB': '521324a26079f95c5a55e7f417c3fa7f',
 'DOTSOV42': '521324a26079f95c5a55e7f417c3fa7f',
 'FIQCEN': '521324a26079f95c5a55e7f417c3fa7f',
 'LUDLED': '521324a26079f95c5a55e7f417c3fa7f'}

In [5]:
# Report the NIST materials whose matched CSD entries do not all share the
# same graph hash.
# - used to verify and create manual_exclusions.csv
# - some are left because by visual inspection they are the same

with open("data/step-09.yml", "r") as yaml_in:
    mat_dict = yaml.safe_load(yaml_in)

count_unique = 0
count_multiple = 0
count_tocheck = 0
to_check = []   # CSD refcodes belonging to materials with conflicting hashes
for mat_name, csd_to_graph in mat_dict.items():
    n_matches = len(csd_to_graph)
    if n_matches == 0:
        print('ERROR', mat_name)
        continue
    if n_matches == 1:
        count_unique += 1
        continue

    # More than one CSD entry: compare every hash against the first one.
    count_multiple += 1
    hashes = list(csd_to_graph.values())
    reference = hashes[0]
    if any(graph_hash != reference for graph_hash in hashes):
        print(mat_name)
        print(csd_to_graph)
        to_check.extend(csd_to_graph.keys())
        count_tocheck += 1

print()
print('NOTE: these left are ok, possibly primitive/conventional cells ')
print()
print("Unique NIST - CSD matches:", count_unique)
print("NIST - multiple CSD matches:", count_multiple)
print("NIST - CSD matches, total:", count_unique+count_multiple)
print("NIST - CSD matches to check because of different graphs:", count_tocheck)

Cu-MOF-74
{'COKNIB01': '9aa2c07385f5f2341ea925427725a3c4', 'COKNIB02': 'MISSING_COKNIB02', 'COKNIB03': '9aa2c07385f5f2341ea925427725a3c4'}
MOF-HTB'
{'NIBHUC': 'MISSING_NIBHUC', 'NIBJAK': '40173a48e39019e3fa20d50edc7e9133'}
Mn-MOF-74
{'COKNUN01': '4c1a36d71cb621668c336926a7773534', 'COKNUN02': 'MISSING_COKNUN02', 'COKNUN03': '4c1a36d71cb621668c336926a7773534', 'ORIWET': '4c1a36d71cb621668c336926a7773534'}
PCN-12
{'HOGLEV': '7ba11b2d99e473be8262cad965df7332', 'HOGLEV01': 'MISSING_HOGLEV01'}

NOTE: these left are ok, possibly primitive/conventional cells 

Unique NIST - CSD matches: 623
NIST - multiple CSD matches: 53
NIST - CSD matches, total: 676
NIST - CSD matches to check because of different graphs: 4


In [6]:
# Inspection: for each flag of interest, collect the CSD refcodes whose 'asr'
# record in mof_checker has that flag set; refcodes absent from mof_checker
# are collected separately under 'missing'.

with open("data/step-09.yml", "r") as yaml_in:
    mat_dict = yaml.safe_load(yaml_in)

flags = ('has_not_hydrogen', 'has_atomic_overlaps', 'has_lone_molecule')

missing = [csd for csd in csd_refcodes_mem if csd not in mof_checker]
checked = [csd for csd in csd_refcodes_mem if csd in mof_checker]

count = {
    flag: [csd for csd in checked if mof_checker[csd]['asr'][flag]]
    for flag in flags
}
count['missing'] = missing
count['total'] = csd_refcodes_mem

for label, refcodes in count.items():
    print(label, len(refcodes))

has_not_hydrogen 47
has_atomic_overlaps 38
has_lone_molecule 80
missing 29
total 777


In [7]:
# Drop the CSD entries flagged for atomic overlaps or lone molecules, and drop
# any NIST material left without a CSD entry.
with open("data/step-09.yml", "r") as file:
    mat_dict = yaml.safe_load(file)

# Build the exclusion set once: the original concatenated the two lists and
# scanned the result linearly for every single CSD entry.
# NOTE: refcodes listed under count['missing'] (absent from mof_checker) are
# not part of this set, so they are kept.
excluded = set(count['has_atomic_overlaps']) | set(count['has_lone_molecule'])

mat_dict_new = {}
for nist_mat, nist_dict in mat_dict.items():
    nist_dict_new = {
        csd_mat: sgh for csd_mat, sgh in nist_dict.items() if csd_mat not in excluded
    }
    if nist_dict_new:
        mat_dict_new[nist_mat] = nist_dict_new

# step-10.yaml: same layout as step-09.yml, but without the CSD entries
# flagged for atomic overlaps or lone molecules.
with open("data/step-10.yaml", "w") as f:
    yaml.dump(mat_dict_new, f)

print("NIST adsorbents left after cleaning:", len(mat_dict_new) ) 
print("CSD entries adsorbents left after cleaning:", sum(len(v) for v in mat_dict_new.values()) )

NIST adsorbents left after cleaning: 569
CSD entries adsorbents left after cleaning: 666
