<a href="https://colab.research.google.com/github/dmamur/elementsem/blob/main/element_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import re, glob,os,sys,pickle,random
from collections import defaultdict

In [2]:
!pip install ase
!git clone https://github.com/dmamur/elementsem.git

Collecting ase
  Downloading ase-3.22.1-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ase
Successfully installed ase-3.22.1
Cloning into 'elementsem'...
remote: Enumerating objects: 525, done.[K
remote: Counting objects: 100% (80/80), done.[K
remote: Compressing objects: 100% (79/79), done.[K
remote: Total 525 (delta 48), reused 1 (delta 1), pack-reused 445[K
Receiving objects: 100% (525/525), 91.77 MiB | 11.22 MiB/s, done.
Resolving deltas: 100% (265/265), done.


# Load data from matbench

In [3]:
from ase.io import read,write
from scipy.ndimage import gaussian_filter1d
from ase.neighborlist import neighbor_list
from joblib import dump, load
from sklearn.cluster import KMeans
from sklearn import decomposition
from ase import Atoms

# Directory (inside the cloned repository) holding the element list and the
# per-element PCA / KMeans model files.
MODEL_DIR = '/content/elementsem/models/pcakm/'

# One element symbol per line. The `with` block closes the file handle, and
# blank lines are skipped so a trailing newline cannot yield an empty symbol
# (which would otherwise make the loads below look for '_kmeans.pkl').
with open(MODEL_DIR + 'ellist.txt', 'r') as ellist_file:
    ellst = [symbol for symbol in (line.strip() for line in ellist_file) if symbol]
print(ellst)

# NOTE: joblib.load unpickles the files, which can execute arbitrary code;
# only load model files that come from a trusted source. scikit-learn may also
# warn here when its version differs from the one that saved the models.
km = {el: load(MODEL_DIR + el + '_kmeans.pkl') for el in ellst}
pca = {el: load(MODEL_DIR + el + '_pca.pkl') for el in ellst}

def getRawInputs(types,atoms,x,v):
    """Label every atom with ``<element symbol><cluster id>``.

    For each atom, the distances to its neighbours within a cutoff of 10.0
    (ASE length units) are histogrammed into 100 bins of width 0.1, divided
    by ``v``, smoothed with a Gaussian filter (sigma of one bin), transformed
    by the element's PCA model and assigned to a cluster by the element's
    KMeans model.

    Parameters
    ----------
    types : sequence of str
        Element symbol of each atom, indexed like ``atoms``. Every symbol must
        be a key of the module-level ``km`` and ``pca`` dictionaries.
    atoms : ASE structure accepted by ``neighbor_list``
        The structure whose atoms are labelled.
    x : any
        Unused; kept so existing callers keep working.
    v : array-like broadcastable against the 100 histogram bins
        Per-bin normalisation the raw histogram is divided by.

    Returns
    -------
    list of str
        One label per atom, in atom order, e.g. ``'C10'``.

    Raises
    ------
    KeyError
        If an element symbol has no entry in ``km`` / ``pca``.
    """
    n_bins = 100

    # 'i': index of the central atom of each pair, 'd': the pair distance.
    centers, dists = neighbor_list('id', atoms, 10.0, self_interaction=False)

    # Bin index is round(d*10) - 1, so d ~ 0.1 falls in bin 0 and d ~ 10.0 in
    # bin 99. The indices are clipped to the valid range: without this, a
    # distance below 0.05 rounds to 0, becomes index -1, and would be counted
    # in the *last* bin through negative indexing.
    bins = np.clip(np.round(dists*10).astype(int) - 1, 0, n_bins - 1)

    ntypes = []
    for k in range(len(atoms)):
        el = types[k]
        # Neighbour-distance histogram of atom k (all zeros if it has none).
        y = np.bincount(bins[centers == k], minlength=n_bins).astype(float)
        values = gaussian_filter1d(y/v, 1)
        # Replace NaN/inf (possible if v contains zeros) before the models see them.
        features = np.nan_to_num([values], nan=0, posinf=0, neginf=0)
        num = km[el].predict(pca[el].transform(features))[0]
        ntypes.append(el + str(num))
    return ntypes

['H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne', 'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'Ar', 'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr', 'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In', 'Sn', 'Sb', 'Te', 'I', 'Xe', 'Cs', 'Ba', 'La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi', 'Po', 'Ra', 'Ac', 'Th', 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Rn']


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [4]:
from ase.build import molecule
# Build the 'CH3CH2NH2' molecule with ASE's molecule builder; it is bound to
# the global `atoms` that the following cells pass to getRawInputs.
atoms = molecule('CH3CH2NH2')
# NOTE(review): leftover commented-out call; neither getModelInputs nor
# train_inputs is defined in the visible cells.
#typesLstTest = getModelInputs(train_inputs)

In [5]:
# Radial grid: 0.0, 0.1, ... in steps of 0.1, stopping before 10.
x = np.arange(0, 10, 0.1)

# Per-bin normalisation: volume of the spherical shell between consecutive
# grid radii, 4*pi/3 * (r_outer**3 - r_inner**3). The first entry is set to 1
# so the array has the same length as `x`.
outer_cubed = x[1:]**3
inner_cubed = x[:-1]**3
v = np.concatenate(([1], 4*np.pi/3*(outer_cubed - inner_cubed)))
# Label every atom of `atoms` with its element symbol plus cluster id.
typesN=getRawInputs(atoms.get_chemical_symbols(),atoms,x,v)

In [6]:
# Show the per-atom labels, in atom order.
print(typesN)

['C10', 'C13', 'N4', 'H5', 'H5', 'H5', 'H5', 'H5', 'H4', 'H4']


In [7]:
from ase.io import read, write

In [12]:
# Read the Figure2.xyz structure from the cloned repository (XYZ format).
# This rebinds the global `atoms`, so cells run after this one see this
# structure instead of the molecule built earlier.
atoms=read('/content/elementsem/models/pcakm/Figure2.xyz',format='xyz')
# Label its atoms, reusing the `x` and `v` arrays defined in an earlier cell.
typesN1=getRawInputs(atoms.get_chemical_symbols(),atoms,x,v)

C13
C7
C5
C5
S12
C7
C5
C13
N4
C5
C13
S16
C5
C18
O6
O11
H5
H5
H11
H5
H11
H11
H11
H4
H4
H11
H11
H5
H4


In [11]:
from ase.visualize import view
# Write whatever structure `atoms` currently refers to into image.png with
# ase.io.write (output format presumably inferred from the file extension).
write('image.png', atoms)

## Mat2Spec to input

In [None]:
!wget https://data.caltech.edu/records/y7nkc-g8k29/files/Mat2Spec_DATA.zip
!unzip -q Mat2Spec_DATA.zip

--2023-08-12 18:12:42--  https://data.caltech.edu/records/y7nkc-g8k29/files/Mat2Spec_DATA.zip
Resolving data.caltech.edu (data.caltech.edu)... 35.155.11.48
Connecting to data.caltech.edu (data.caltech.edu)|35.155.11.48|:443... connected.
HTTP request sent, awaiting response... 302 FOUND
Location: https://s3.us-west-2.amazonaws.com/caltechdata/5d/28/db0b-7fad-491b-8af7-439e6bfc4880/data?response-content-type=application%2Foctet-stream&response-content-disposition=attachment%3B%20filename%3DMat2Spec_DATA.zip&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIARCVIVNNAP7NNDVEA%2F20230812%2Fus-west-2%2Fs3%2Faws4_request&X-Amz-Date=20230812T181243Z&X-Amz-Expires=60&X-Amz-SignedHeaders=host&X-Amz-Signature=ddcade38252649e3d0e67ae6b144ad43b213527bbb6d8e1fdec02b45398359b2 [following]
--2023-08-12 18:12:43--  https://s3.us-west-2.amazonaws.com/caltechdata/5d/28/db0b-7fad-491b-8af7-439e6bfc4880/data?response-content-type=application%2Foctet-stream&response-content-disposition=attachment%3B%20f