In [None]:
import json
import csv
import os
from tqdm.notebook import tqdm

from rdkit import RDLogger
# Suppress RDKit warnings
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)


# Export the CO2 adsorbate records of every hMOF JSON file into one CSV.
output_file = "MOFX.csv"
directory = "hMOF_10"  # Default directory
max_file_index = 7003381  # Exclusive upper bound on N in hMOF-N.json (same range as before)
file_prefix, file_suffix = "hMOF-", ".json"

# List the directory once instead of probing millions of candidate paths with
# os.path.exists(); only names of the exact form hMOF-<N>.json are kept.
file_indices = []
if os.path.isdir(directory):  # a missing directory simply yields a header-only CSV
    for entry in os.listdir(directory):
        if not (entry.startswith(file_prefix) and entry.endswith(file_suffix)):
            continue
        stem = entry[len(file_prefix):-len(file_suffix)]
        # Accept only plain ASCII digits in canonical form (no sign, no leading
        # zeros), i.e. exactly what f"hMOF-{i}.json" can produce.
        if not stem or any(ch not in "0123456789" for ch in stem):
            continue
        index = int(stem)
        if str(index) == stem and index < max_file_index:
            file_indices.append(index)
file_indices.sort()  # process files in ascending numeric order

with open(output_file, mode="w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["MOF_ID", "MOFid", "InChIKey", "Name", "Formula"])  # CSV header

    for i in tqdm(file_indices, desc="Processing files"):
        filename = f"{directory}/hMOF-{i}.json"

        try:
            # Binary mode lets json.load detect the UTF-8/16/32 encoding itself
            # instead of depending on the platform's locale encoding.
            with open(filename, "rb") as f:
                data = json.load(f)

            # Extract MOF identifiers
            mof_id = data.get("id", "")
            mofid = data.get("mofid", "")

            # One CSV row per CO2 adsorbate entry that carries an InChIKey
            for adsorbate in data.get("adsorbates", []):
                formula = adsorbate.get("formula", "")
                inchikey = adsorbate.get("InChIKey", "")
                if formula != "CO2" or not inchikey:
                    continue
                name = adsorbate.get("name", "")
                writer.writerow([mof_id, mofid, inchikey, name, formula])

        except Exception as e:
            # Best effort: report the unreadable/malformed file and keep going.
            print(f"Error processing file {filename}: {e}")
            continue


Processing files:   0%|          | 0/7003381 [00:00<?, ?it/s]

In [15]:
# Display the Series of first-token MOFid strings.
# NOTE(review): in the visible notebook `smi` is only assigned in the later
# `In [35]` cell, so this cell depends on that cell having been run first.
smi

0                                      O.[Ba].[Cu].[O-]C=O
2                                   [Co].[O-]C(=O)c1ccncc1
3                                   [Co].[O-]C(=O)c1ccncc1
4        [O-]C(=O)c1cccc(c1)c1nccs1.[Zn][Zn].n1ccc(cc1)...
5        [O-]C(=O)C1=NN=C([CH]1)C(=O)[O-].[O-]C(=O)C1=N...
                               ...                        
20369    N1=C[C](C=N1)C=Cc1cc(C=CC2=C[N]N=C2)cc(c1)C=CC...
20370    N1=C[C](C=N1)n1nnc(c1)c1cc(cc(c1)c1nnn(c1)C1=C...
20371    N1=C[C](C=N1)c1cc(cc(c1)C1=CN=N[CH]1)C1=C[N]N=...
20372    N1=C[C](C=N1)C=CC1=CC2=CC(=CC3=CC(=CC(=C1)[C]2...
20373    N1=C[C](C=N1)C=Cn1cc2c(c1)c1cn(cc1c1c2cn(c1)C=...
Name: info.mofid.smiles, Length: 17679, dtype: object

In [35]:
import pandas as pd
def _leading_token(column):
    """Return, for each string in `column`, the text before the first space."""
    return column.str.split(" ", expand=True)[0]


# QMOF table: keep only rows that have a value in the MOFid SMILES column.
df = pd.read_csv("qmof.csv").dropna(subset=['info.mofid.smiles'])
smi = _leading_token(df['info.mofid.smiles'])

# hMOF export: keep only rows that have a MOFid string.
df2 = pd.read_csv("MOFX.csv").dropna(subset=['MOFid'])
smi2 = _leading_token(df2['MOFid'])

  df = pd.read_csv("qmof.csv")


In [None]:
from rdkit import Chem, RDLogger
# Suppress RDKit warnings
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)
def canonicalize(smiles: str):
    """
    Canonicalize the input SMILES.

    A multi-fragment SMILES (fragments separated by ".") is canonicalized
    fragment by fragment: fragments whose canonical form is empty (for
    example because they failed to parse) are dropped, and the remaining
    canonical fragments are sorted and re-joined with ".".

    Args:
        smiles: SMILES string to canonicalize. Non-string values (None, NaN,
            numbers) are treated as a failure.

    Returns:
        str: canonicalized SMILES or empty str on failure
    """
    # Missing values coming out of pandas (None / NaN) are not parseable text.
    if not isinstance(smiles, str):
        return ""
    if "." in smiles:
        # Canonicalize each fragment exactly once, then drop the empty results.
        fragments = (canonicalize(part) for part in smiles.split("."))
        return ".".join(sorted(frag for frag in fragments if frag))
    try:
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles reports an unparsable SMILES by returning None.
        if mol is None:
            return ""
        return Chem.MolToSmiles(mol)
    except Exception:
        return ""

In [38]:
# Canonicalize both SMILES series first, then attach each result to its
# source table and write that table out as a new CSV file.
cano_column = 'info.mofid.cano_smiles'
cano_smi, cano_smi2 = (series.map(canonicalize) for series in (smi, smi2))

for frame, canonical, out_path in (
    (df, cano_smi, "qmof_cano.csv"),
    (df2, cano_smi2, "MOFX_cano.csv"),
):
    frame[cano_column] = canonical
    frame.to_csv(out_path, index=False)


[22:07:07] Can't kekulize mol.  Unkekulized atoms: 4 5 6
[22:07:07] Can't kekulize mol.  Unkekulized atoms: 4 5 6
[22:07:07] Explicit valence for atom # 4 N, 4, is greater than permitted
[22:07:07] Can't kekulize mol.  Unkekulized atoms: 1 2 3
[22:07:07] Can't kekulize mol.  Unkekulized atoms: 1 2 3
[22:07:07] Can't kekulize mol.  Unkekulized atoms: 0 1 2
[22:07:07] Can't kekulize mol.  Unkekulized atoms: 1 2 3
[22:07:07] Can't kekulize mol.  Unkekulized atoms: 1 2 3
[22:07:07] Can't kekulize mol.  Unkekulized atoms: 1 2 3
[22:07:07] Explicit valence for atom # 1 Cs, 2, is greater than permitted
[22:07:07] Explicit valence for atom # 1 O, 3, is greater than permitted
[22:07:07] Explicit valence for atom # 7 N, 4, is greater than permitted
[22:07:07] Can't kekulize mol.  Unkekulized atoms: 1 4 5
[22:07:07] Can't kekulize mol.  Unkekulized atoms: 2 3 4
[22:07:07] Explicit valence for atom # 1 N, 4, is greater than permitted
[22:07:07] Explicit valence for atom # 2 N, 4, is greater than p

In [39]:
# Canonical SMILES present in both datasets.
common_smiles = set(cano_smi) & set(cano_smi2)
# canonicalize() returns "" when a SMILES cannot be processed, so "" would
# otherwise be counted as a structure shared by both datasets.
common_smiles.discard("")
len(common_smiles)

59

In [40]:
# Overlap of the un-canonicalized first-token strings, for comparison.
common_smiles_raw = set(smi) & set(smi2)
len(common_smiles_raw)

53

In [41]:
# Display every canonical SMILES string found in both datasets.
common_smiles

{'.',
 '..',
 '...',
 '..O=C([O-])C#CC#CC(=O)[O-].[Zn][Zn]',
 '.CCCC(C#CC(=O)[O-])=CC#CC(=O)[O-].O=C([O-])C#CC=CC#CC(=O)[O-]',
 '.COC12C3C4(OC)C5(OC)C(C1(OC)C35C(=O)[O-])C24C(=O)[O-].O=C([O-])C#CC(=O)[O-]',
 '.Cc1cc(C(=O)[O-])ccc1C(=O)[O-]',
 '.Nc1cc(C(=O)[O-])ccc1C(=O)[O-]',
 '.O=C([O-])C#CC#CC(=O)[O-].O=C([O-])c1ccc(C#Cc2ccc(C(=O)[O-])cc2)cc1',
 '.O=C([O-])C#CC(=O)[O-]',
 '.O=C([O-])C#CC(=O)[O-].O=C([O-])C12C3C4C1C1C2C3C41C(=O)[O-]',
 '.O=C([O-])C#Cc1ccc(C#CC(=O)[O-])cc1.O=C([O-])c1ccc(C#Cc2ccc(C(=O)[O-])cc2)cc1',
 '.O=C([O-])C12C3C4C1C1C2C3C41C(=O)[O-]',
 '.O=C([O-])C12CCC(C(=O)[O-])(CC1)CC2',
 '.O=C([O-])C=CC(=O)[O-]',
 '.O=C([O-])C=CC=CC(=O)[O-]',
 '.O=C([O-])C=CC=CC(=O)[O-].O=C([O-])c1c2ccccc2c(C(=O)[O-])c2ccccc12',
 '.O=C([O-])C=CC=CC(=O)[O-].O=C([O-])c1ccc2ccc3c4cc(C(=O)[O-])ccc4ccc3c2c1',
 '.O=C([O-])c1c2ccccc2c(C(=O)[O-])c2ccccc12',
 '.O=C([O-])c1cc2ccc3cc(C(=O)[O-])cc4ccc(c1)c2c34',
 '.O=C([O-])c1cc2ccc3cc(C(=O)[O-])cc4ccc(c1)c2c34.O=C([O-])c1ccc(-c2ccc(-c3ccc(C(=O)[O-])cc3)