In [None]:
# Install pymatgen and matminer
!pip install pymatgen
!pip install matminer

Collecting pymatgen
[?25l  Downloading https://files.pythonhosted.org/packages/ce/c4/60295df146bdc5905c31b3b366004cd42343412f2c4f513c63becac2482d/pymatgen-2022.0.6.tar.gz (3.3MB)
[K     |████████████████████████████████| 3.3MB 5.6MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting scipy>=1.5.0
[?25l  Downloading https://files.pythonhosted.org/packages/7d/e8/43ffca541d2f208d516296950b25fe1084b35c2881f4d444c1346ca75815/scipy-1.6.3-cp37-cp37m-manylinux1_x86_64.whl (27.4MB)
[K     |████████████████████████████████| 27.4MB 143kB/s 
Collecting uncertainties>=3.1.4
[?25l  Downloading https://files.pythonhosted.org/packages/45/41/fc7e7b73b603e7c2c9e040b7aa8caf4a88d74b6faa567601ed82b6f0d8e1/uncertainties-3.1.5-py2.py3-none-any.whl (246kB)
[K     |████████████████████████████████| 256kB 47.7MB/s 
Collecti

Collecting matminer
[?25l  Downloading https://files.pythonhosted.org/packages/18/67/319db03366448bf367f6239598da2da0021389b02a7f874380ee3c193890/matminer-0.6.5.tar.gz (5.8MB)
[K     |████████████████████████████████| 5.8MB 3.7MB/s 
Collecting tqdm>=4.46.0
[?25l  Downloading https://files.pythonhosted.org/packages/72/8a/34efae5cf9924328a8f34eeb2fdaae14c011462d9f0e3fcded48e1266d1c/tqdm-4.60.0-py2.py3-none-any.whl (75kB)
[K     |████████████████████████████████| 81kB 7.9MB/s 
Collecting pint>=0.11
[?25l  Downloading https://files.pythonhosted.org/packages/33/de/53a77b82553579affab7438d299f850acbc1c4dd741c5ce52594513cb0ef/Pint-0.17-py2.py3-none-any.whl (204kB)
[K     |████████████████████████████████| 215kB 40.6MB/s 
Collecting future>=0.18.2
[?25l  Downloading https://files.pythonhosted.org/packages/45/0b/38b06fd9b92dc2b68d58b75f900e97884c45bedd2ff83203d933cf5851c9/future-0.18.2.tar.gz (829kB)
[K     |████████████████████████████████| 829kB 39.9MB/s 
Collecting scikit_learn>=0.23

In [None]:
from google.colab import drive

# Directory inside the Colab runtime where Google Drive is attached.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import os

import numpy as np
from matminer.datasets import load_dataset
from pymatgen.core.structure import Structure
from tqdm import tqdm


# Matbench dataset names, split into two groups that are processed separately.
# NOTE(review): the small/large split presumably reflects the number of rows
# in each dataset -- confirm against the matbench documentation.
small_datasets = [
    'matbench_jdft2d',
    'matbench_phonons',
    'matbench_dielectric',
    'matbench_log_kvrh',
    'matbench_log_gvrh',
    'matbench_perovskites',
]
large_datasets = [
    'matbench_mp_gap',
    'matbench_mp_is_metal',
    'matbench_mp_e_form',
]
# Full list: the small group first, followed by the large group.
datasets = small_datasets + large_datasets

def get_neighbors(structure, cutoff):
    """Return the neighbor list of every site, each ordered by its second field.

    Parameters
    ----------
    structure
        Object exposing ``get_all_neighbors(cutoff)``, which must return one
        sequence of neighbor entries per site.
    cutoff
        Search radius forwarded unchanged to ``get_all_neighbors``.

    Returns
    -------
    list of list
        One list per site, sorted in ascending order of ``entry[1]``.
        NOTE(review): ``entry[1]`` is presumably the neighbor distance --
        confirm for the installed pymatgen version.
    """
    def _sort_key(entry):
        return entry[1]

    ordered = []
    for site_neighbors in structure.get_all_neighbors(cutoff):
        ordered.append(sorted(site_neighbors, key=_sort_key))
    return ordered


def cif_parse(dataset, n_atoms=80, cutoff=8):
    """Turn every entry of a matminer dataset into a fixed-size cluster of sites.

    For each row, a sorted neighbor list is fetched with ``get_neighbors``.
    While it holds fewer than ``n_atoms`` entries, the search radius is raised
    in steps of 1 and the neighbor list of site 0 is fetched again. The first
    ``n_atoms`` entries are then packed into a new ``Structure``.

    Parameters
    ----------
    dataset : str
        Dataset name passed to ``matminer.datasets.load_dataset``. Column 0 of
        each row is used as the structure and column 1 as the target property.
    n_atoms : int, optional
        Number of neighbor sites kept per entry. Defaults to 80.
    cutoff : int or float, optional
        Initial search radius handed to ``get_neighbors``. Defaults to 8.

    Returns
    -------
    list of dict
        One dict per row with the keys:
        ``'R'`` (Cartesian coordinates of the kept sites), ``'Z'`` (their
        atomic numbers), ``'N'`` (number of kept sites, as a 0-d array),
        ``'Y'`` (the target property as an array) and ``'formula'`` (the
        formula of the original structure with spaces removed).

    Raises
    ------
    ValueError
        If a structure yields no per-site neighbor lists at all (no sites).
    """
    rows = load_dataset(dataset).values
    all_data = []
    for row in tqdm(rows):
        struct = row[0]
        prop = row[1]

        neighbors = get_neighbors(struct, cutoff)
        if len(neighbors) == 0:
            raise ValueError(
                f'{dataset}: encountered a structure without any sites')

        # Start from the first site that has neighbors. The empty-list default
        # resets `sites` on every row: previously a row with no non-empty
        # neighbor list raised NameError (first row) or silently reused the
        # sites left over from the previous row.
        sites = next((nbrs for nbrs in neighbors if nbrs), [])

        # Widen the search sphere until enough neighbors are available.
        # NOTE(review): the expansion always reads site 0, while the initial
        # pick is the first site with a non-empty list -- confirm intended.
        # NOTE(review): neighbor lists presumably exclude the central site
        # itself, so it is not part of the saved cluster -- confirm intended.
        r = cutoff + 1
        while len(sites) < n_atoms:
            neighbors = get_neighbors(struct, r)
            sites = neighbors[0]
            r += 1

        # Lists are sorted by get_neighbors, so this keeps the first n_atoms.
        supercell = Structure.from_sites(sites[:n_atoms])
        coords = supercell.cart_coords
        all_data.append({
            'R': coords,
            'Z': np.asarray(supercell.atomic_numbers),
            'N': np.asarray(len(coords)),
            'Y': np.asarray(prop),
            'formula': struct.formula.replace(' ', ''),
        })
    return all_data

# Output folder, relative to the current working directory.
# NOTE(review): this only resolves to the mounted Drive when the working
# directory is /content -- confirm for the runtime in use.
path = 'drive/My Drive/DimeNet/eighty_atoms'

# Create the folder up front: parsing a single large dataset takes hours, and
# a missing directory would otherwise only surface when np.save is called.
os.makedirs(path, exist_ok=True)

for dataset in large_datasets:  # this run covers only the large datasets
    parsed = cif_parse(dataset)
    # `parsed` is a list of dicts, so np.save stores it as a pickled object
    # array; reading it back requires np.load(..., allow_pickle=True).
    np.save(f'{path}/{dataset}.npy', parsed)
    print(f'\n{dataset} parsed!\n')

Fetching matbench_mp_gap.json.gz from https://ml.materialsproject.org/projects/matbench_mp_gap.json.gz to /usr/local/lib/python3.7/dist-packages/matminer/datasets/matbench_mp_gap.json.gz


100%|██████████| 106113/106113 [1:46:35<00:00, 16.59it/s]



matbench_mp_gap parsed!

Fetching matbench_mp_is_metal.json.gz from https://ml.materialsproject.org/projects/matbench_mp_is_metal.json.gz to /usr/local/lib/python3.7/dist-packages/matminer/datasets/matbench_mp_is_metal.json.gz


100%|██████████| 106113/106113 [1:46:35<00:00, 16.59it/s]



matbench_mp_is_metal parsed!

Fetching matbench_mp_e_form.json.gz from https://ml.materialsproject.org/projects/matbench_mp_e_form.json.gz to /usr/local/lib/python3.7/dist-packages/matminer/datasets/matbench_mp_e_form.json.gz


100%|██████████| 132752/132752 [2:08:06<00:00, 17.27it/s]



matbench_mp_e_form parsed!

