# Scrape CSD and curate the synonyms
NOTE: a CSD licence is required to install and use the CSD Python API (the `ccdc` package).

In [12]:
import ast
from pathlib import Path

import pandas as pd
from tqdm import tqdm

In [13]:
import ccdc
import ccdc.io
import ccdc.search
# Report the CSD Python API version and the location/version of the local CSD data.
print("ccdc.__version__", ccdc.__version__)
csd_dir = ccdc.io.csd_directory()
print("ccdc.io.csd_directory()", csd_dir)
print("ccdc.io.csd_version()", ccdc.io.csd_version())

# The MOF subset file is looked up under "subsets" inside the CSD directory.
subset_mofs_all = Path(csd_dir).joinpath("subsets", "MOF_subset.gcd")
n_mofs = len(ccdc.io.EntryReader(str(subset_mofs_all)))
print(f"Number of MOFs: {n_mofs}")

ccdc.__version__ 3.0.4
ccdc.io.csd_directory() /home/daniele/Programs/CCDC/CSD_2021/csd
ccdc.io.csd_version() 542
Number of MOFs: 105922


In [17]:
# Read every entry of the MOF subset and collect the fields used by the later steps.
# The recorded run took about 2 minutes.
ENTRY_COLUMNS = ['identifier', 'ccdc_number', 'formula', 'has_disorder']

rows = []
# Open the subset once and close it when done, instead of creating one reader
# for the count and a second one for the iteration.
with ccdc.io.EntryReader(str(subset_mofs_all)) as reader:
    n_entries = len(reader)
    for entry in tqdm(reader, total=n_entries):
        # One record per entry actually read, so the table can never contain
        # empty pre-allocated rows if the iteration yields fewer entries than len().
        row = {col: getattr(entry, col) for col in ENTRY_COLUMNS}
        row['publication_year'] = entry.publication.year
        doi = entry.publication.doi
        if doi:  # can be None
            doi = doi.lower()  # lowercase to improve matching in a later step
        row['publication_doi'] = doi
        row['synonyms_orig'] = list(entry.synonyms)
        rows.append(row)
df = pd.DataFrame(rows)

Path("data").mkdir(exist_ok=True)  # make sure the output folder exists on a fresh checkout
df.to_csv("data/step-00.csv", index=False)
# step-00.csv contains all CSD entries of MOF subset: id, formula, has_disorder, pub year, doi, synonyms_orig

100%|██████████| 105922/105922 [01:54<00:00, 921.87it/s]


In [18]:
# Preview the first three rows of the table built from the MOF subset.
df.head(3)

Unnamed: 0,identifier,ccdc_number,formula,has_disorder,publication_year,publication_doi,synonyms_orig
0,ABACUF,1100034.0,(C6 H14 Ba2 Cu1 O16)n,False,1958,,[]
1,ABACUF01,230290.0,(C6 H14 Ba2 Cu1 O16)n,False,2004,10.1016/j.molstruc.2004.03.051,[]
2,ABAFUH,1498688.0,(C9 H8 Cu2 O4)n,False,2016,10.1016/j.poly.2016.09.043,[]


In [21]:
# Reload the scraped table and count the entries whose original synonym list is not empty.
df = pd.read_csv("data/step-00.csv")
has_synonyms = df['synonyms_orig'] != "[]"  # the CSV stores each list as its string repr
print("CSD entries having synonyms:", int(has_synonyms.sum()))

CSD entries having synonyms: 8034


In [22]:
# Build a curated "synonyms" column from "synonyms_orig": drop misleading names and
# strip a trailing solvent/adsorbate word from the name.
# NOTE: this is done here, and not later during the name match, so that obviously
#       problematic names are removed completely and the later checks are easier.

# Second words that mark a guest molecule rather than part of the material name.
ADSORBATES_LIST = [
    "carbon", # carbon dioxide
    "deuteromethane",
    "methane",
    "ethane",
    "ethene",
    "dinitrogen",
    "acetylene",
    "acetaldehyde",
    "ethylamine",
]

# Synonyms starting with one of these prefixes are treated as fake names.
FAKE_NAME_PREFIXES = ('catena', 'Teaching Subset', 'DrugBank')

# Exact names that are always discarded (presumably too generic to identify a material).
# The original list also held the integers 1 and 2; they could never match a string name.
EXCLUDED_NAMES = {"", "MOF-1", "MOF-2", "1", "2"}


def curate_synonyms(names):
    """Return the names worth keeping from one entry's list of synonyms.

    Parameters
    ----------
    names : list of str
        Synonyms of a single entry (there may be more than one, but it is not frequent).

    Returns
    -------
    list of str
        The names that do not start with a fake-name prefix and are not in the
        exclusion set. When the second space-separated word of a name is in
        ADSORBATES_LIST, only the first word is kept.
    """
    curated = []
    for name in names:
        if name.startswith(FAKE_NAME_PREFIXES) or name in EXCLUDED_NAMES:
            continue
        name_split = name.split(" ")
        if len(name_split) > 1 and name_split[1] in ADSORBATES_LIST:  # remove adsorbate from name
            name = name_split[0]
        curated.append(name)
    return curated


df = pd.read_csv("data/step-00.csv")
df['synonyms'] = None  # create the new column first, so that .at can store one list per row
for i, synonyms in enumerate(tqdm(df['synonyms_orig'])):
    # The CSV holds each list as its Python repr; literal_eval parses it back
    # without executing arbitrary code, unlike eval.
    df.at[i, 'synonyms'] = curate_synonyms(ast.literal_eval(synonyms))

df.to_csv("data/step-01.csv", index=False)
# step-01.csv: add "synonyms" column with corrected CSD syns

100%|██████████| 105922/105922 [00:01<00:00, 90775.74it/s]


In [26]:
# Reload the curated table and count the entries that still have at least one synonym.
df = pd.read_csv("data/step-01.csv")
has_valid_synonyms = df['synonyms'] != "[]"  # the CSV stores each list as its string repr
print("CSD entries having valid synonyms:", int(has_valid_synonyms.sum()))

CSD entries having valid synonyms: 5759


In [28]:
# Preview the first three entries of step-01 whose curated synonym list is not empty.
df = pd.read_csv("data/step-01.csv")
with_synonyms = df.loc[df['synonyms'] != "[]"]
with_synonyms.head(3)

Unnamed: 0,identifier,ccdc_number,formula,has_disorder,publication_year,publication_doi,synonyms_orig,synonyms
47,ABEXEN,1029033.0,"(C20 H10 In1 O8 S1 1-)n,n(C2 H8 N1 1+),2.5n(C3...",True,2016,10.1039/c6ta07939c,['MROF-1'],['MROF-1']
150,ABUWOJ,156028.0,"(C32 H38 O18 S4 Zn4)n,2n(H2 O1)",False,2001,10.1107/s1600536801009175,['C32 H42 O20 S4 Zn4'],['C32 H42 O20 S4 Zn4']
244,ACIBEU,258461.0,(C21 H27 Co3 N3 O15)n,False,2006,10.1016/j.micromeso.2005.11.049,['MOF-CJ4'],['MOF-CJ4']


In [30]:
# Inspect names that have a space: one can see that many more can be fixed,
# but it is hard to find general rules.
df = pd.read_csv("data/step-01.csv")
for synonyms_repr in df['synonyms']:
    # Parse the stored list repr safely (no code execution, unlike eval).
    synonyms = ast.literal_eval(synonyms_repr)
    # Only the first synonym of each entry is inspected.
    if synonyms and " " in synonyms[0]:
        print(synonyms[0])

C32 H42 O20 S4 Zn4
di-ammonium catena-(diuranyl trioxalate)
Anhydrous sodium naproxen
PNU 21
Sodium Acetate
Sodium Acetate
Uranyl phenylphosphinate
Im Fe MOF
Hexa-aqua-nickel(ii) dipotassium tetrahydrogen tetra-o-phthalate tetrahydrate
NJU-Bai 14
C14 H21 Na O13
MIL89 Lutidine
F-MOF 2
F-MOF 3
F-MOF 4
F-MOF 5
PCN-224Co(ii) plus CO
PCN-224Co(ii) plus CO
PCN-224Co(ii) plus CO
PCN-224Co(ii) plus CO
JLU-Liu 33
COV-1 CdS-TMDPY
COV-2 CdS-TMDPy-TPhP
COV-3 CdS-BPy
COV-4 CdS-TMDPy
COV-3 CdSeS-BPy
AlFFIVE-1-Ni (KAUST-8) dehydrated
AlFFIVE-1-Ni (KAUST-8) rehydrated
AlFFIVE-1-Ni (KAUST-8) as-synthesised
AlFFIVE-1-Ni (KAUST-8) CO2
bis(Cimetidine)-copper(ii) dinitrate
MFM-115a pentakis(deuteromethane) clathrate
MFM-115a nonakis(deuteromethane) clathrate
MFM-132a bis(deuteromethane) clathrate
MFM-132a tris(deuteromethane) clathrate
Anhydrous zinc(ii) heptanoate
PCM-101 (partially oxidized)
Bionectriol D
the activated NJU-Bai62
Silver pefloxacin trihydrate
MIL-53 mη-xylene
MIL-53 ortho-cymene
MIL-53as p

In [31]:
# Check papers with a lot of entries
doi_note = {
    "10.1002/cssc.201601752": "in-situ study of M-MOF-74",
}

for doi, note in doi_note.items():
    df_doi = df[df['publication_doi']==doi]
    print(doi, f"({note})", f"Number of CIFs: {len(df_doi)}", 'Synonyms:', *list(set(df_doi['synonyms'])))

10.1002/cssc.201601752 (in-situ study of M-MOF-74) Number of CIFs: 1853 Synonyms: ['CPO-27-Zn'] ['CPO-27-Mg'] ['CPO-27-Cu'] [] ['CPO-27-Co'] ['CPO-27-Mn'] ['CPO-27-Ni']


In [32]:
# Keep only `max_same` MOFs for the same DOI with the same synonyms, to avoid having
# papers with hundreds of structures. Entries beyond the limit are not dropped: they
# get an explanatory "note", so later steps can filter on it.
df = pd.read_csv("data/step-01.csv")
df['note'] = "-" # Assign a string for all the notes, to make the later filtering easier
# Sorting puts entries sharing DOI and synonyms on consecutive rows.
df = df.sort_values(by=['publication_doi','synonyms','identifier'])
df = df.reset_index() # keeping the old as column "index"

max_same = 3
count_same = 0 # how many consecutive previous rows share DOI and synonyms with the current one
for i in tqdm(df.index[1:]): # skipping the first row because each row is compared with the previous one
    if df.at[i,'synonyms']!='[]':
        # NOTE: a missing DOI is read back as NaN, and NaN != NaN, so entries
        # without a DOI are never counted as coming from the same paper.
        if df.at[i,'publication_doi']==df.at[i-1,'publication_doi'] and df.at[i,'synonyms']==df.at[i-1,'synonyms']:
            count_same+=1
        else:
            count_same=0
        # Compare against max_same (not a hard-coded 3) so that the limit and the
        # message stay consistent when max_same is changed.
        if count_same>=max_same:
            df.at[i, 'note'] = f"Excluding more than {max_same} same MOFs from the same DOI"

# Restore the original row order before saving.
df = df.sort_values(by='index')
df = df.drop(columns='index') # remove the dummy column
df.to_csv("data/step-02.csv", index=False)
# step-02.csv: contains a "note" for excluding structures if more than max_same come from the same paper

100%|██████████| 105921/105921 [00:00<00:00, 229919.82it/s]


In [34]:
# Count the entries that have synonyms and carry no exclusion note.
df = pd.read_csv("data/step-02.csv")
keep_mask = (df['note'] == '-') & (df['synonyms'] != "[]")
len_after_exclusions = int(keep_mask.sum())
print("CSD entries having synonyms:", len_after_exclusions, f"(including only {max_same} same materials per paper)")

CSD entries having synonyms: 3846 (including only 3 same materials per paper)


In [36]:
# Preview the first three entries of step-02 whose curated synonym list is not empty.
df = pd.read_csv("data/step-02.csv")
with_synonyms = df.loc[df['synonyms'] != "[]"]
with_synonyms.head(3)

Unnamed: 0,identifier,ccdc_number,formula,has_disorder,publication_year,publication_doi,synonyms_orig,synonyms,note
47,ABEXEN,1029033.0,"(C20 H10 In1 O8 S1 1-)n,n(C2 H8 N1 1+),2.5n(C3...",True,2016,10.1039/c6ta07939c,['MROF-1'],['MROF-1'],-
150,ABUWOJ,156028.0,"(C32 H38 O18 S4 Zn4)n,2n(H2 O1)",False,2001,10.1107/s1600536801009175,['C32 H42 O20 S4 Zn4'],['C32 H42 O20 S4 Zn4'],-
244,ACIBEU,258461.0,(C21 H27 Co3 N3 O15)n,False,2006,10.1016/j.micromeso.2005.11.049,['MOF-CJ4'],['MOF-CJ4'],-
