# Match by DOI

In [24]:
import pandas as pd
import pickle 
from tqdm import tqdm
import json

In [27]:
with open("data/nistdb.pickle", 'rb') as f:
    nistdb = pickle.load(f)
    
with open("data/csd_542_mofchecker.json", "r") as file:
    mof_checker = json.load(file)

mof_checker['FIQCEN']

{'asr': {'name': 'FIQCEN',
  'graph_hash': '521324a26079f95c5a55e7f417c3fa7f',
  'scaffold_hash': 'e1682de934a12384f1952d34148e3832',
  'symmetry_hash': 'mEopNc0Bbry+C0Mp9WB7PJqumiUvMB4qfFQS9GqwjvI=225',
  'formula': 'Cu12 H24 C72 O48',
  'path': None,
  'density': 0.879095741266423,
  'has_carbon': True,
  'has_hydrogen': True,
  'has_atomic_overlaps': True,
  'has_overcoordinated_c': False,
  'has_overcoordinated_n': False,
  'has_overcoordinated_h': False,
  'has_undercoordinated_c': False,
  'has_undercoordinated_n': False,
  'has_undercoordinated_rare_earth': False,
  'has_metal': True,
  'has_lone_molecule': False,
  'has_high_charges': True,
  'is_porous': None,
  'has_suspicicious_terminal_oxo': False},
 'fsr': {'name': 'FIQCEN',
  'graph_hash': '7476b78aa05f38eafe32f4305b186a38',
  'scaffold_hash': '28d2fe83c2312ed56ab28a947e4bdad4',
  'symmetry_hash': 'XJpP2YtrKA/2eBwUPtWVkjwWOBwL2pI4SqWbrbxLluU=225',
  'formula': 'Cu12 H24 C72 O60',
  'path': None,
  'density': 0.94885443430

In [11]:
# Add a column with nist_mat matched by doi

df = pd.read_csv("data/step-04.csv")

for pub in nistdb['Bibliography']:
    pub['DOI'] = pub['DOI'].lower() # NOTE: I already lowered all DOIs from CSD

for i, doi_csd in enumerate(tqdm(df['publication_doi'])): # takes 1 minute
    if not doi_csd: # Missing DOI
        continue
    for pub in nistdb['Bibliography']:
        doi_nist = pub['DOI']
        if doi_csd==doi_nist:
            df.at[i,"nist_mat_by_doi"] = str(pub['adsorbentMaterial']) # NOTE 1: adsorbentMaterial will be always the main name for NIST, not a synonym (e.g., no HKUST-1, only CuBTC)
            break                                                      # NOTE 2: pub['adsorbentMaterial'] is a list that may contain more entries if the paper contains more materials!
df.to_csv("data/step-05.csv", index=False) 
# step-05.csv: add column nist_mat_by_doi with possible matches but not performing the match yet 

100%|█████████████████████████████████| 105922/105922 [00:49<00:00, 2154.41it/s]


In [15]:
df = pd.read_csv("data/step-05.csv")
df[~pd.isna(df.nist_mat_by_doi)].head(3)

Unnamed: 0,identifier,ccdc_number,formula,has_disorder,publication_year,publication_doi,synonyms_orig,synonyms,note,nist_mat,nist_mat_by_doi
34,ABEMIF,833721.0,"(C2 H8 N1 1+)3n,n(C72 H24 Cl3 Cu12 O48 3-),9n(...",True,2011,10.1039/c1cc14118j,[],[],-,,"['[(CH3)NH2]3[(Cu4Cl)3-(btc)8].xMeOH', '[(CH3)..."
195,ACAJIZ,853983.0,"(C22 H24 Cu2 N2 O8)n,n(C2 H3 N1)",True,2012,10.1039/c2dt31427d,[],[],-,,"['{Cu2(Glu)2(mu-bpa)}*(CH3CN)', '{Cu2(Glu)2(mu..."
196,ACAJOF,853984.0,"(C23 H26 Cu2 N2 O8)n,n(C3 H6 O1)",False,2012,10.1039/c2dt31427d,[],[],-,,"['{Cu2(Glu)2(mu-bpa)}*(CH3CN)', '{Cu2(Glu)2(mu..."


In [13]:
# Showing (but not doing anything) possible names that need correction: 
#   they match DOI and have CSD synonyms, but their synonyms are not matching
df = pd.read_csv("data/step-05.csv")
df = df[(df['note']=='-') & ~pd.isnull(df['nist_mat_by_doi']) & (df['synonyms']!='[]')]
df[['identifier','publication_doi','synonyms','nist_mat_by_doi']].reset_index()

Unnamed: 0,index,identifier,publication_doi,synonyms,nist_mat_by_doi
0,292,ACOCOM,10.1039/c2cc34840c,['DUT-49(Cu)'],['DUT-49']
1,293,ACOCUS,10.1039/c2cc34840c,['DUT-49(Zn)'],['DUT-49']
2,295,ACODAZ,10.1039/c2cc34840c,['DUT-49(Co)'],['DUT-49']
3,537,ADOGEH,10.1039/c2cc36220a,['UHM-30'],['Cu3(NH2btc)2']
4,1636,ALAMUW,10.1002/chem.201002135,['SNU-50'],['C26H8Cu2N2O12']
...,...,...,...,...,...
78,98691,XUPSAE,10.1038/nchem.2114,['NOTT-300 deuteroethane solvate'],['NOTT-300']
79,102337,YUBFIL,10.1021/ic802446m,['MAF-X3'],"['C5.33H8.67N3.17O2.25Zn', 'C4H6FN3Zn']"
80,102338,YUBFOR,10.1021/ic802446m,['MAF-X4'],"['C5.33H8.67N3.17O2.25Zn', 'C4H6FN3Zn']"
81,102339,YUBFUX,10.1021/ic802446m,['MAF-X5'],"['C5.33H8.67N3.17O2.25Zn', 'C4H6FN3Zn']"


In [16]:
# Flag the ones with multiple NIST-ISODB adsorbents, but if unique add them to the nist_mat
df = pd.read_csv("data/step-05.csv")

for row in tqdm(df.itertuples(), total=df.shape[0]):
    if row.note.startswith("Matched by synonym"):
        continue
    elif row.note=="Exclude DOI already matched by synonym":
        continue        
    elif row.note=="Excluding more than 3 same MOFs from the same DOI":
        continue
    elif pd.isnull(row.nist_mat_by_doi):
        continue
        
    else:
        synonyms = eval(row.nist_mat_by_doi)
        if len(synonyms)==1: # Only one nist-isodb
            df.at[row.Index,"nist_mat"] = synonyms[0]
            if len(df[df.publication_doi==row.publication_doi])==1: # check if there are multiple CSD from the same DOI
                df.at[row.Index,"note"] = "One-to-one match by DOI"
            else:
                df.at[row.Index,"note"] = "OneNIST-to-manyCSD matches by DOI (to check)"
                
        # Third choice, if there is >one nist_adsorbent_by_doi, write them all and mark that there is need for revision
        else:
            df.at[row.Index,"note"] = "VariousNIST-to-paper matched by DOI"

df.to_csv("data/step-06.csv", index=False) 
# step-06.csv: add nist_mat for unique match, and "note": still need to check for multiple CSD

100%|████████████████████████████████| 105922/105922 [00:04<00:00, 23766.22it/s]


In [18]:
df = pd.read_csv("data/step-06.csv")
df[~pd.isna(df.nist_mat_by_doi)].head(3)

Unnamed: 0,identifier,ccdc_number,formula,has_disorder,publication_year,publication_doi,synonyms_orig,synonyms,note,nist_mat,nist_mat_by_doi
34,ABEMIF,833721.0,"(C2 H8 N1 1+)3n,n(C72 H24 Cl3 Cu12 O48 3-),9n(...",True,2011,10.1039/c1cc14118j,[],[],VariousNIST-to-paper matched by DOI,,"['[(CH3)NH2]3[(Cu4Cl)3-(btc)8].xMeOH', '[(CH3)..."
195,ACAJIZ,853983.0,"(C22 H24 Cu2 N2 O8)n,n(C2 H3 N1)",True,2012,10.1039/c2dt31427d,[],[],VariousNIST-to-paper matched by DOI,,"['{Cu2(Glu)2(mu-bpa)}*(CH3CN)', '{Cu2(Glu)2(mu..."
196,ACAJOF,853984.0,"(C23 H26 Cu2 N2 O8)n,n(C3 H6 O1)",False,2012,10.1039/c2dt31427d,[],[],VariousNIST-to-paper matched by DOI,,"['{Cu2(Glu)2(mu-bpa)}*(CH3CN)', '{Cu2(Glu)2(mu..."


In [19]:
# Show statistics
df = pd.read_csv("data/step-06.csv")
df = df.sort_values('note', ascending=True) # When I keep=first, these are sorted by note

print('Statistics by CSD entries')
for note in df['note'].unique():
    count = len(df[df.note==note])
    print(count, "\t",note)
    
print('\nStatistics by Paper')
df1 = df[~pd.isnull(df['publication_doi'])]
df1 = df1[~df1.duplicated(subset='publication_doi', keep='first')]

for note in df['note'].unique():
    count = len(df1[df1.note==note])
    print(count, "\t",note)
    
print('\nStatistics by NIST matches')
df = df[~pd.isnull(df['nist_mat'])]
df = df[~df.duplicated(subset='nist_mat', keep='first')]

tot = 0
for note in df['note'].unique():
    count = len(df[df.note==note])
    tot+=count
    print(count, "\t",note)
print(tot, "\t","TOTAL")
    
print("\nNOTE: You can find some little mismatch in the NIST stats because df.duplicates() may remove rows from different categories,")
print("       but I used df.sort_values and ascending Notes to avoid it.")

Statistics by CSD entries
101360 	 -
370 	 Exclude DOI already matched by synonym
1913 	 Excluding more than 3 same MOFs from the same DOI
394 	 Matched by synonym
40 	 Matched by synonym (corrected)
325 	 One-to-one match by DOI
583 	 OneNIST-to-manyCSD matches by DOI (to check)
937 	 VariousNIST-to-paper matched by DOI

Statistics by Paper
42919 	 -
70 	 Exclude DOI already matched by synonym
8 	 Excluding more than 3 same MOFs from the same DOI
159 	 Matched by synonym
22 	 Matched by synonym (corrected)
325 	 One-to-one match by DOI
165 	 OneNIST-to-manyCSD matches by DOI (to check)
330 	 VariousNIST-to-paper matched by DOI

Statistics by NIST matches
302 	 Matched by synonym
32 	 Matched by synonym (corrected)
319 	 One-to-one match by DOI
157 	 OneNIST-to-manyCSD matches by DOI (to check)
810 	 TOTAL

NOTE: You can find some little mismatch in the NIST stats because df.duplicates() may remove rows from different categories,
       but I used df.sort_values and ascending Notes to 

In [29]:
# Add structure SGH (Structure Graph Hash) for AllSolventRemoved structures
df = pd.read_csv("data/step-06.csv")

missing = 0
for i, csd_id in enumerate(tqdm(df['identifier'])):
    try:
        graph = mof_checker[csd_id]['asr']['graph_hash']
    except:
        missing+=1
        graph = "MISSING_" + csd_id   
    df.at[i, 'sgh_asr'] = graph

print('Missing SGH:', missing)
df.to_csv("data/step-07.csv", index=False) 
# step-07.csv: add SGH

100%|███████████████████████████████| 105922/105922 [00:00<00:00, 162402.08it/s]


Missing SGH: 1128


In [30]:
df = pd.read_csv("data/step-07.csv")
df.head(3)

Unnamed: 0,identifier,ccdc_number,formula,has_disorder,publication_year,publication_doi,synonyms_orig,synonyms,note,nist_mat,nist_mat_by_doi,sgh_asr
0,ABACUF,1100034.0,(C6 H14 Ba2 Cu1 O16)n,False,1958,,[],[],-,,,e15bb6793b1e615a15d2086631ee9290
1,ABACUF01,230290.0,(C6 H14 Ba2 Cu1 O16)n,False,2004,10.1016/j.molstruc.2004.03.051,[],[],-,,,4a5274dbb47d6011b4ac1b11f5066f27
2,ABAFUH,1498688.0,(C9 H8 Cu2 O4)n,False,2016,10.1016/j.poly.2016.09.043,[],[],-,,,2eac7429e128fbdb695d7980ce88563a


In [41]:
# Identify if CSD structures linked to the same papers are actually the same once desolvated (ASR)

PRINT = True    

df = pd.read_csv("data/step-07.csv")
df_to_check = df[df['note'] == "OneNIST-to-manyCSD matches by DOI (to check)"]

# Loop over the papers with same DOI
all_identifiers = []
for i, doi in enumerate(set(df_to_check['publication_doi'])):
    df_same_doi = df[df['publication_doi'] == doi]
    nist_mat = df_same_doi['nist_mat'].values[0]
    all_identifiers+= list(df_same_doi['identifier'].values)
    if PRINT: print(i, nist_mat, doi)
    graphs = df_same_doi['sgh_asr'].values
    if all([x==graphs[0] for x in graphs]):
        if PRINT: print(' >>> ALL THE SAME!', *df_same_doi['identifier'].values)
        for index in df_same_doi.index:
            df.at[index, "note"] = "OneNIST-to-manyCSD matches by DOI >> MATCH, same SGH"
    else:
        if PRINT: print(' >>> SOME DIFFERENT, REMOVE!', *df_same_doi['identifier'].values)
        
        for index in df_same_doi.index:
            df.at[index, 'nist_mat'] = None
            df.at[index, "note"] = "OneNIST-to-manyCSD matches by DOI >> EXCLUDE, different SGH"

df.to_csv("data/step-08.csv", index=False) 
# data/step-08.csv: filter entries with multiple 

0 [Cd(PDCO)(bipy)*(H2o)*5H2O]n 10.1016/j.inoche.2008.09.002
 >>> SOME DIFFERENT, REMOVE! NONWUJ NONXAQ
1 C40H26CoN8O6S2 10.1021/ja039914m
 >>> SOME DIFFERENT, REMOVE! AVUBAU AVUBEY AVUBIC
2 [Ni2(NCS)4(azpy)4]n 10.1021/ic061052d
 >>> SOME DIFFERENT, REMOVE! XESBED XESBIH XETNUG XETPAO XETPES
3 [Zn3L2(HCOO)1.5][(CH3)2NH2]1.5 xDMF 10.1039/c4ta01993h
 >>> SOME DIFFERENT, REMOVE! HOKLIE HOKLOK
4 [Zn3-(ntb)2]n 10.1002/chem.200601534
 >>> ALL THE SAME! PIDMEV PIDMEV01 PIFPIE
5 C99H72Cd6O42 10.1021/cg101222w
 >>> SOME DIFFERENT, REMOVE! AROFAP AROFET EREGOY
6 ABT*2ClO4 10.1002/anie.201307650
 >>> SOME DIFFERENT, REMOVE! YIKJOT YIKJUZ
7 MIL-53(Cr) 10.1039/c0cc03882b
 >>> ALL THE SAME! GUSNEN01 QALGUH
8 C15H13O13La 10.1021/cg100078b
 >>> SOME DIFFERENT, REMOVE! RUWWOW RUWWOW01 RUWWUC RUWXAJ RUWXEN RUWXOX
9 RIWMUH 10.1039/c3ce41428k
 >>> SOME DIFFERENT, REMOVE! RIWMUH RIWNAO RIWQIZ RIWQOF
10 [Zn(TeGly)]n 10.1039/c5nj00011d
 >>> SOME DIFFERENT, REMOVE! REYNOA01 REYNUG01
11 C4N8O4Cd2 10.1039/c0cc03

In [42]:
df = pd.read_csv("data/step-08.csv")
df[df.note.str.startswith("OneNIST-to-manyCSD")].head(3)

Unnamed: 0,identifier,ccdc_number,formula,has_disorder,publication_year,publication_doi,synonyms_orig,synonyms,note,nist_mat,nist_mat_by_doi,sgh_asr
292,ACOCOM,889572.0,"(C40 H24 Cu2 N2 O10)n,5.2n(C5 H9 N1 O1)",True,2012,10.1039/c2cc34840c,['DUT-49(Cu)'],['DUT-49(Cu)'],"OneNIST-to-manyCSD matches by DOI >> EXCLUDE, ...",,['DUT-49'],16a310f2c3a7af6421186dc7747dca86
293,ACOCUS,890363.0,"(C40 H24 N2 O10 Zn2)n,5.3n(C5 H9 N1 O1)",True,2012,10.1039/c2cc34840c,['DUT-49(Zn)'],['DUT-49(Zn)'],"OneNIST-to-manyCSD matches by DOI >> EXCLUDE, ...",,['DUT-49'],94b4e7020e7afe887e01388b96c528af
295,ACODAZ,890364.0,"(C40 H24 Co2 N2 O10)n,4.6n(C5 H9 N1 O1)",True,2012,10.1039/c2cc34840c,['DUT-49(Co)'],['DUT-49(Co)'],"OneNIST-to-manyCSD matches by DOI >> EXCLUDE, ...",,['DUT-49'],d79b2122c49a9f0c3302a9c05240164f


In [43]:
# Shows statistics
df = pd.read_csv("data/step-08.csv")
df = df.sort_values('note', ascending=True) # When I keep=first, these are sorted by note

print('Statistics by CSD entries')
for note in set(df['note']):
    count = len(df[df.note==note])
    print(count, "\t",note)
    
tot = df[df.note.isin([
    'Matched by synonym', 
    'Matched by synonym (corrected)',
    'One-to-one match by DOI',
    'OneNIST-to-manyCSD matches by DOI >> MATCH, same SGH',
])]  

print("\n Total CSD matched:", len(tot))
    
print()
print('Statistics by NIST matches')
df = df[~pd.isnull(df['nist_mat'])]
df = df[~df.duplicated(subset='nist_mat', keep='first')]
tot=0
for note in set(df['note']):
    count = len(df[df.note==note])
    tot+=count
    print(count, "\t",note)

print("\n Total NIST matched:", tot)

Statistics by CSD entries
1913 	 Excluding more than 3 same MOFs from the same DOI
500 	 OneNIST-to-manyCSD matches by DOI >> EXCLUDE, different SGH
40 	 Matched by synonym (corrected)
370 	 Exclude DOI already matched by synonym
325 	 One-to-one match by DOI
937 	 VariousNIST-to-paper matched by DOI
394 	 Matched by synonym
83 	 OneNIST-to-manyCSD matches by DOI >> MATCH, same SGH
101360 	 -

 Total CSD matched: 842

Statistics by NIST matches
302 	 Matched by synonym
30 	 OneNIST-to-manyCSD matches by DOI >> MATCH, same SGH
319 	 One-to-one match by DOI
32 	 Matched by synonym (corrected)

 Total NIST matched: 683
