# Loading of data

In [2]:
import numpy as np
import pandas as pd
import json
from ase import Atoms
import matplotlib.pyplot as plt
import matplotlib

### Description of function
Input: string giving the path to where the data is located

Output:
lists of    
species = []
number_of_atoms = []
atomic_numbers = []

In [3]:
### Set directory.
# With the './xxx' syntax, the data folder is assumed to be in the same directory as the current file.

def data_load(data_dir):
    """Load the train and test sets and convert their 'atoms' entries to ASE objects.

    Reads ``train.json`` and ``test.json`` from ``data_dir`` into pandas
    DataFrames, then replaces each entry of the ``atoms`` column (a dict of
    keyword arguments) with ``Atoms(**entry)``.

    Input:
        data_dir (str): directory of the data folder, relative to the current
            working directory. A trailing path separator is optional.
    Returns:
        train, test (pd.DataFrame): dataframes with info about 2D materials
    """
    # Local import keeps this cell self-contained; the notebook's import cell
    # does not bring in `os`.
    import os

    def _read_frame(filename):
        # Read one JSON file into a DataFrame; the context manager closes the
        # file handle even if parsing fails.
        with open(os.path.join(data_dir, filename), "rb") as handle:
            return pd.DataFrame(json.load(handle))

    # Loading the data as pandas DataFrames
    train = _read_frame("train.json")
    test = _read_frame("test.json")

    # Transform each atoms entry to an ASE Atoms object.
    # OBS: this step is important -- later cells call Atoms methods on the column.
    train.atoms = train.atoms.apply(lambda x: Atoms(**x))
    test.atoms = test.atoms.apply(lambda x: Atoms(**x))
    return train, test


# Load both splits from the project data folder (path is relative to the current working directory).
train, test = data_load(data_dir="./project_data/")
# Last expression of the cell: displays the type of the loaded training object.
type(train)

pandas.core.frame.DataFrame

In [4]:
### Viewing train or test data
def check_data(data, label='Train'):
    """Print the dimensions and the first rows of a loaded dataframe.

    Input:
        data (pd dataframe): dataframe to inspect
        label (str): name used in the printed shape line. Defaults to 'Train'
            so existing calls print exactly as before; pass e.g. 'Test' when
            inspecting the test split so the output is not mislabelled.
    """
    print('{} data shape: {}'.format(label, data.shape))
    print(data.head())

# Inspect the test split. Note: the shape line is printed under the label "Train data shape" even though the test frame is passed here.
check_data(test)

Train data shape: (4000, 3)
      id     formula                                              atoms
0   8538    HfZrSTe2  (Atom('S', [-1.0327079693, 4.2080182974, 22.81...
1   9052    Nb2P2Te6  (Atom('Nb', [0.3517632032, -0.2042419274, 9.20...
2  10503  Au2InI2Se2  (Atom('Se', [0.2238411856, 0.8131388722, 26.83...
3    611   AuTlO6Sb2  (Atom('Au', [0.0002076097, -0.0001198635, 16.6...
4   7212       NbBr2  (Atom('Nb', [6.208919093e-19, -1.371643362e-17...


In [5]:
def summarize_1(train,test):
    """Summarize species, atomic numbers and atom counts over train and test.

    Iterates over the 'atoms' columns of both dataframes (train first, then
    test) and collects the distinct chemical symbols, the distinct atomic
    numbers and the number of atoms of every structure.

    Input:
        train (pd dataframe): dataframe with 'atoms' column, containing instances of ASE's atoms class
        test (pd dataframe): dataframe with 'atoms' column, containing instances of ASE's atoms class
    Returns:
        dict with 3 lists and 3 vals
        species (list of str): distinct atomic species in all materials, sorted alphabetically
        atomic_numbers (list of int): distinct atomic numbers represented in the materials, sorted ascending
        number_of_atoms (list of int): number of atoms of each structure, in iteration order
        max_number_of_atoms (int): largest entry of number_of_atoms
        min_atomic_number (int): smallest atomic number encountered
        max_atomic_number (int): largest atomic number encountered
    Raises:
        ValueError: raised by numpy's max/min when both dataframes are empty
    """
    species_seen = set()
    atomic_numbers_seen = set()
    number_of_atoms = []
    for atoms in pd.concat([train.atoms, test.atoms]):
        # Grow the sets in place instead of rebuilding list(set(...)) on every
        # iteration, which made the loop accidentally quadratic.
        species_seen.update(atoms.get_chemical_symbols())
        atomic_numbers_seen.update(atoms.get_atomic_numbers())
        number_of_atoms.append(len(atoms))

    # Sort so the returned lists are reproducible between runs; bare set
    # iteration order is arbitrary (and varies with string hashing).
    species = sorted(species_seen)
    atomic_numbers = sorted(atomic_numbers_seen)

    max_number_of_atoms = np.max(number_of_atoms)
    min_atomic_number = np.min(atomic_numbers)
    max_atomic_number = np.max(atomic_numbers)

    dic = {'species': species,
           'atomic_numbers': atomic_numbers,
           'number_of_atoms': number_of_atoms,
           'max_number_of_atoms': max_number_of_atoms,
           'min_atomic_number': min_atomic_number,
           'max_atomic_number': max_atomic_number}

    return dic

# Collect species / atomic-number / atom-count statistics over train and test together.
data_preproc = summarize_1(train,test)

# Largest number of atoms found in any single structure.
print(data_preproc['max_number_of_atoms'])


20
