# Match adsorbents by synonyms

In [2]:
import pandas as pd
import pickle 
from tqdm import tqdm
from collections import Counter

In [16]:
# Adsorbent names excluded from the synonym matching below because they are
# ambiguous. Entries must be lower-case: the matching cell compares them
# against the lower-cased primary NIST name of each adsorbent.
ADSORBENTS_BLACKLISTED = [ # ambiguous names: see Supp. Info.
    "mil-53",
    "mil-100",
    "mil-101",
    "mil-102",
    "mof-74",
    "mil-68",
    "mil-88b",
]

# Load the pickled database object. Later cells iterate nistdb['Adsorbents']
# and read the 'name' and 'synonyms' entries of every record.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only use a data/nistdb.pickle file that comes from a trusted source.
with open("data/nistdb.pickle", 'rb') as f:
    nistdb = pickle.load(f)

In [10]:
# Diagnostic: print every CSD synonym that equals a NIST name/synonym only
# when letter case is ignored (the exact, case-sensitive comparison fails).
df = pd.read_csv("data/step-02.csv")
for raw_synonyms in df['synonyms']:  # Takes ca. 5 secs
    # NOTE(review): eval() executes whatever expression is stored in the CSV
    # cell; acceptable only because step-02.csv is produced by this pipeline.
    for csd_name in eval(raw_synonyms):
        csd_name_lower = csd_name.lower()
        for adsorbent in nistdb['Adsorbents']:
            nist_names = adsorbent['synonyms'] + [adsorbent['name']]
            if csd_name in nist_names:
                continue  # exact match: not a case mismatch
            if csd_name_lower in [name.lower() for name in nist_names]:
                print(csd_name, nist_names)

bio-MOF-14 ['Bio-MOF-14']
bio-MOF-13 ['Bio-MOF-13']
bio-MOF-12 ['Bio-MOF-12']
Cd-MOF-2 ['CD-MOF-2']
NJU-Bai7 ['NJU-BAi7']
NJU-Bai8 ['NJU-BAi8']
CD-MOF-1 ['Cd-MOF-1']
CD-MOF-1 ['Cd-MOF-1']
CD-MOF-1 ['Cd-MOF-1']
dehydroxylated UiO-66 ['Dehydroxylated UiO-66']
sod-ZMOF ['SodZMOF', 'Sod-ZMOF']
UIO-67 ['UiO-67']
bio-MOF-11 ['Bio-MOF-11']


In [28]:
# Match CSD entries to NIST adsorbents by synonym, ignoring letter case.
df = pd.read_csv("data/step-02.csv")


def _is_near_match(csd_syn, nist_syn):
    """Return True if two lower-cased names differ only by a tolerated variation.

    Tolerated variations, checked in both directions:
    "MOF"=="MOFa", "MOF"=="MOF'", "MOF-n"=="MOFn"=="MOF n" **
    """
    return (
        csd_syn + "a" == nist_syn
        or csd_syn == nist_syn + "a"
        or csd_syn + "'" == nist_syn
        or csd_syn == nist_syn + "'"
        or csd_syn.replace("-", "") == nist_syn
        or csd_syn == nist_syn.replace("-", "")
        or csd_syn.replace("-", " ") == nist_syn
        or csd_syn == nist_syn.replace("-", " ")
    )


# Build the lower-cased NIST names once instead of once per CSD synonym:
# (primary NIST name, [lower-cased primary name, lower-cased synonyms...]).
# Adsorbents whose primary name is blacklisted as ambiguous are dropped here.
nist_candidates = []
for mat in nistdb['Adsorbents']:
    nist_syns = [x.lower() for x in [mat['name']] + mat['synonyms']]
    if nist_syns[0] in ADSORBENTS_BLACKLISTED:  # ambiguous names skipped
        continue
    nist_candidates.append((mat['name'], nist_syns))

# `i` is used as a row label with df.at, which relies on the default
# 0..n-1 index that read_csv creates.
for i, csd_synonyms in enumerate(tqdm(df['synonyms'])):
    if df.at[i, "note"] == "Excluding more than 3 same MOFs from the same DOI":
        continue
    # NOTE(review): eval() executes whatever expression is stored in the CSV
    # cell; acceptable only because step-02.csv is produced by this pipeline.
    for csd_syn in eval(csd_synonyms):
        csd_syn = csd_syn.lower()

        for nist_name, nist_syns in nist_candidates:
            if csd_syn in nist_syns:
                # NOTE: adsorbent name will be always the main one for NIST,
                # not a synonym (e.g., no HKUST-1, only CuBTC)
                df.at[i, "nist_mat"] = nist_name
                df.at[i, "note"] = "Matched by synonym"
                break  # stop scanning NIST adsorbents for this synonym

            # Try to perform some further matches (see ** in _is_near_match).
            # NOTE(review): a near match does not stop the scan over NIST
            # adsorbents, so a later adsorbent can overwrite this assignment
            # (an exact match further down replaces it), and the remaining
            # synonyms of the same CSD entry are still processed. Kept
            # unchanged on purpose - confirm this precedence is intended.
            if any(_is_near_match(csd_syn, nist_syn) for nist_syn in nist_syns):
                df.at[i, "nist_mat"] = nist_name
                df.at[i, "note"] = "Matched by synonym (corrected)"

df.to_csv("data/step-03.csv", index=False)
# step-03.csv adds "note: Matched by synonym / Matched by synonym (corrected)" and the nist_name, if matched

100%|██████████| 105922/105922 [00:48<00:00, 2192.72it/s]


In [22]:
# Preview the first ten rows of step-03.csv whose note starts with "Matched".
df = pd.read_csv("data/step-03.csv")
is_matched = df['note'].str.startswith("Matched")
df.loc[is_matched].reset_index().head(10)

Unnamed: 0,index,identifier,ccdc_number,formula,has_disorder,publication_year,publication_doi,synonyms_orig,synonyms,note,nist_mat
0,244,ACIBEU,258461.0,(C21 H27 Co3 N3 O15)n,False,2006,10.1016/j.micromeso.2005.11.049,['MOF-CJ4'],['MOF-CJ4'],Matched by synonym,MOF-CJ4
1,245,ACIBIY,258462.0,"(C72 H72 Co36 O144)n,6n(H2 O1)",False,2006,10.1016/j.micromeso.2005.11.049,['MOF-CJ5'],['MOF-CJ5'],Matched by synonym,MOF-CJ5
2,246,ACIBOE,279674.0,"(C21 H19 N1 O16 Zn3)n,n(H2 O1)",False,2006,10.1016/j.micromeso.2005.11.049,['MOF-CJ3'],['MOF-CJ3'],Matched by synonym,MOF-CJ3
3,300,ACODUT,894905.0,"(C78 H42 O12 Yb2)n,6n(C4 H9 N1 O1)",True,2012,10.1039/c2cc35729a,['UTSA-30'],['UTSA-30'],Matched by synonym (corrected),UTSA-30a
4,357,ACUFEK,607441.0,"(C192 H120 Cu12 N24 O60)n,12(C2 H6 O1 S1),44(H...",True,2006,10.1021/ja058777l,"[""catena-(octakis(μ6-4,4',4''-s-Triazine-2,4,6...",['PCN-6'],Matched by synonym,PCN-6
5,588,ADUROI,922041.0,"(C21 H15 N3 O7 Zn1)n,n(C3 H7 N1 O1),3.5n(H2 O1)",True,2013,10.1002/anie.201302715,['NJU-Bai9'],['NJU-Bai9'],Matched by synonym,NJU-Bai9
6,1264,AHORAR,708306.0,"(C28 H18 Cu1 O5)n,2n(C4 H9 N1 O1)",True,2009,10.1021/ic900372t,['PCN-18'],['PCN-18'],Matched by synonym,PCN-18
7,3258,AWEYAE,1480229.0,"(C50 H32 N4 O12 Zn2)n,2n(C3 H7 N1 O1)",True,2016,10.1021/acs.cgd.6b01054,['TMU-24'],['TMU-24'],Matched by synonym,TMU-24
8,3389,AXICOB,1496410.0,(C56 H52 Cu3 O18 P6)n,True,2016,10.1002/anie.201607745,['CALF-33-Et2H'],['CALF-33-Et2H'],Matched by synonym,CALF-33-Et2H
9,3397,AXILOJ,812143.0,(C18 H12 N2 O4 Zn1)n,False,2011,10.1039/c1cc10983a,['CID-1'],['CID-1'],Matched by synonym,CID-1


## Get Statistics

In [29]:
# Summarise the synonym matches stored in step-03.csv.
df = pd.read_csv("data/step-03.csv")
print(f"(CSD matched thanks to correction: {len(df[df['note']=='Matched by synonym (corrected)'])}, see ** previous cell)")
df_matches_found = df[df['note'].str.startswith('Matched by synonym')] # can also be "Matched by synonym (corrected)"

print(f"Matches found CSD:  {df_matches_found['nist_mat'].count()} ... but they could be duplicates in the same paper!")

# Exclude those in the same paper: keep only the first row of every
# (NIST adsorbent, DOI) pair.
df_matches_found_differentpapers = df_matches_found.drop_duplicates(
    subset=["nist_mat", "publication_doi"], keep='first'
)

print(f"Matches found CSD:  {df_matches_found_differentpapers['nist_mat'].count()} ... from different papers!")

# NIST adsorbent name -> number of matched CSD entries
unique = dict(Counter(df_matches_found["nist_mat"]))

# NIST adsorbent name -> number of distinct papers (DOIs) with a match
unique_dp = dict(Counter(df_matches_found_differentpapers["nist_mat"]))

print()
print(f"Matches found NIST: {len(unique)} (by unique CSD enty)")
print(f"Matches found NIST: {len(unique_dp)} (by unique paper, should match with the previous!)")

(CSD matched thanks to correction: 40, see ** previous cell)
Matches found CSD:  434 ... but they could be duplicates in the same paper!
Matches found CSD:  383 ... from different papers!

Matches found NIST: 334 (by unique CSD enty)
Matches found NIST: 334 (by unique paper, should match with the previous!)


In [30]:
# (continues from previous cell: uses `unique` and `unique_dp` defined there)
# For every NIST-ISODB adsorbent, report how many CSD entries and how many
# papers it has been matched to.

# Adsorbents with at least this many associations are listed by name.
MIN_ASSOCIATIONS_TO_LIST = 4


def _report_associations(counts, unit, min_listed=MIN_ASSOCIATIONS_TO_LIST):
    """Print the distribution of association counts, then the top adsorbents.

    Parameters
    ----------
    counts : dict
        Maps an adsorbent name to its number of associations.
    unit : str
        Text appended to each distribution line (e.g. "CSD entries").
    min_listed : int
        Adsorbents with at least this many associations are printed by name.

    Returns
    -------
    tuple of (dict, dict)
        ``counts`` re-ordered by ascending number of associations, and the
        distribution {number of associations: number of adsorbents}
        ordered by ascending key.
    """
    distribution = dict(sorted(Counter(counts.values()).items(), key=lambda item: item[0]))
    for n_associations, n_adsorbents in distribution.items():
        print(f"{n_adsorbents}\t NIST-ISODB adsorbents associated to {n_associations} {unit}")

    print()
    ranked = dict(sorted(counts.items(), key=lambda item: item[1]))
    for name, n_associations in ranked.items():
        if n_associations >= min_listed:
            print(n_associations, name)
    return ranked, distribution


unique, count_unique = _report_associations(unique, "CSD entries")

print("\nNow considering the paper: more CSD entries can be associated to the same paper!")
print("NOTE: this statistic is more interesting, because we care if the same MOF has been used for different studies (papers)\n")

unique_dp, count_unique = _report_associations(unique_dp, "paper")

295	 NIST-ISODB adsorbents associated to 1 CSD entries
20	 NIST-ISODB adsorbents associated to 2 CSD entries
6	 NIST-ISODB adsorbents associated to 3 CSD entries
3	 NIST-ISODB adsorbents associated to 4 CSD entries
5	 NIST-ISODB adsorbents associated to 5 CSD entries
2	 NIST-ISODB adsorbents associated to 6 CSD entries
1	 NIST-ISODB adsorbents associated to 7 CSD entries
1	 NIST-ISODB adsorbents associated to 8 CSD entries
1	 NIST-ISODB adsorbents associated to 17 CSD entries

4 MOF-74-Ni
4 MOF-74-Co
4 ZIF-9
5 CuBTC
5 Cd-MOF-1
5 Cu-MOF-74
5 Mg-MOF-74
5 Zn-MOF-74
6 Mn-MOF-74
6 PCP-1
7 IRMOF-1
8 UiO-67
17 ZIF-8

Now considering the paper: more CSD entries can be associated to the same paper!
NOTE: this statistic is more interesting, because we care if the same MOF has been used for different studies (papers)

312	 NIST-ISODB adsorbents associated to 1 paper
13	 NIST-ISODB adsorbents associated to 2 paper
3	 NIST-ISODB adsorbents associated to 3 paper
2	 NIST-ISODB adsorbents associated t

In [31]:
# Exclude all the DOIs that already contain a match by synonym, or this can create problems when considering the number of structures/isotherms in the paper!
df = pd.read_csv("data/step-03.csv")
dois_already_matched = list(df[df['note'].str.startswith("Matched by synonym")]['publication_doi'])

# Rows to flag: still unprocessed (note == '-'), with a DOI, and that DOI
# already has a synonym match. The explicit notna() is required because
# isin() would otherwise pair a missing DOI with a missing DOI in the list.
to_exclude = (
    (df['note'] == '-')
    & df['publication_doi'].notna()
    & df['publication_doi'].isin(dois_already_matched)
)
df.loc[to_exclude, 'note'] = "Exclude DOI already matched by synonym"

df.to_csv("data/step-04.csv", index=False)
# step-04.csv: add note "Exclude" if material from same DOI already matched

print("CSD entries excluded:", len(df[df['note']=="Exclude DOI already matched by synonym"]))

100%|██████████| 105922/105922 [00:00<00:00, 165327.45it/s]


CSD entries excluded: 370
