In [24]:
import numpy as np
import pandas as pd
import pymatgen as mg
import matplotlib.pyplot as plt
from matminer.featurizers.site import CrystalNNFingerprint,AGNIFingerprints, OPSiteFingerprint, VoronoiFingerprint
from matminer.featurizers.structure import SiteStatsFingerprint
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances,pairwise_distances_argmin

%matplotlib inline

In [None]:
def init_mprester():
    """
    Define The API object that is responsibile for connecting to the Materials PRoject back-end
    
    Keyword Arguments:
        None
    
    Returns:
        MPRester object
    """
    
    return mg.MPRester(api_key='******')

In [None]:
def load_materials_database(mat_type,mpr):
    """
    Create dataset based on choice of material type.
    Queries Materials Project based on datatype to get a list of dicts containing material properties
    Converts this list of dicts to a dataframe
    Each material in the dataframe (each row) of dicts has a unique ID - material_id.
    Using the material_id column, creates a 'structure' column containing Pymatgen structure objects
    Structure objects contain information about atom type, atom positions and relative positions of each atoms' neighbors.
    
    Keyword Arguments: 
        mat_type: str
            One of 'unary','binary','ternary'
        mpr: MPRester object
            Initialized with suitable API key
                           
    Returns:
        Pandas DataFrame containing dataset with 'structure' column.
    """
    
    dbdict = {'unary':'*',
              'binary':'**',
              'ternary':'***'
             }
    #Query Materials Project back-end
    materials_data = mpr.get_data(dbdict[mat_type])
    
    #Convert list of dicts into Pandas DataFrame
    materials_database = pd.DataFrame(materials_data)
    
    #For each material entry in the dataframe, get a pymatgen structure object using mpr.get_structure_by_material_id
    materials_database['structure'] = materials_database['material_id'].apply(lambda x: mpr.get_structure_by_material_id(x))
    
    return materials_database

In [None]:
#For each structure object, get an atomic fingerprint for all atoms in the structure
def get_fingerprint_for_material(structure,site_featurizer):
    """
    Get a fingerprint for each atom in each structure
    
    Keyword Arguments:
        structure: Pymatgen structure object.
           A structure object contains the coordinates for each atom in the structure.
           It also contains information about atom type and how atoms are arranged to form a crystal
        site_featurizer: A matminer site featurizer object.
           Converts structure information into a vector of numbers unique for each unique material.
           This vector is called a 'fingerprint' for that material.
           See https://hackingmaterials.github.io/matminer/featurizer_summary.html for full list of site
           featurizers.
           
    Returns:
        A 1D-numpy array of numbers corresponding to structure object                             
    """
    
    return np.array(site_featurizer.featurize(structure)).reshape((1,-1))

In [None]:
def get_ssf_featurizer(fingerprint_name, stats):
    """
    Return a SiteStatsFingerprint object that can compute statistics for each column of the fingerprint 
    for a multi-atom system. For  instance, return a SiteStatsFingerprint object that computes min, max,
    std, mean values of each column in fingerprint for all atoms.
    
    Keyword Arguments:
        fingerprint_name: str
            'CrystalNN' is the only acceptable option for now. Plan to add more in coming iterations.
        stats: tuple of strs
            Choose from 'mean','std_dev','minimum','maximum' for the str options.
    
    Returns:
        SiteStatsFingerprint object corresponding to the Fingerprint name
    """
    
    #Define dict mapping name to fingerprint object
    fingerprint_name_to_obj_dict = {'CrystalNN':CrystalNNFingerprint.from_preset('ops', distance_cutoffs=None, x_diff_weight=0)
                                   }
    
    return SiteStatsFingerprint(fingerprint_name_to_obj_dict[fingerprint_name],stats=stats)

In [None]:
def get_fingerprint_df_for_db(db,featurizer):
    """
    'Featurizes' a materials dataframe based on Pymatgen structure objects.
    
    Keyword Arguments:
        db:
            Pandas dataframe containing (at minimum) 'structure' and 'material_id' columns
        featurizer:
            A Pymatgen structure-based featurizer. Full list of structure featurizers at
            https://hackingmaterials.github.io/matminer/featurizer_summary.html
    Returns:
        Pandas dataframe containing site-averaged fingerprints
    """
    # Create a sub-dataframe containing only 'structure' and 'material_id'
    db_struct = db[['structure','material_id']]
    
    # Featurize the dataframe usinng the featurizer provided
    sitestats_db = featurizer.featurize_dataframe(db_struct,col_id='structure',ignore_errors=True)
    
    # Drop the structure column - this will not be needed in the analysis
    sitestats_db.drop(['structure'],axis=1,inplace=True)
    
    # In case there were errors in the process, np.nan is returned. Fill this with a large number.
    # Alternately, just drop nans in future iterations of this code.
    sitestats_db.fillna(1000000,inplace=True)
    
    return sitestats_db

In [23]:
def compute_pairwise_distances(material_fingerprint,fingerprints_database,scale=False):
    """
    Computes Pairwise distances between a target material and a material database using material fingerprints.
    
    Keyword Arguments:
        material_fingerprint: 
            A numpy vector generated using a structure-based matminer featurizers for a selected material
        fingerprint_database: 
            A Dataframe containing fingerprints of the same type as material_fingerprint.
            The Dataframe should be computed using get_fingerprint_df_for_db()
        scale: boolean
            True if the vectors need to be scaled before computing distances. False by default.
            This is preferable since large differences between features can skew distance results.
    
    Returns:
        Vector of pairwise distances between the target material and each material in the database.
    
    """
    
    if(scale==True):
        scaler = StandardScaler()
        Y = scaler.fit_transform(fingerprints_database)
        X = scaler.transform(material_fingerprint)
    else:
        Y = fingerprints_database.values
        X = material_fingerprint

    distances = pairwise_distances(X=X,Y=Y,metric='euclidean')
    
    return distances

In [64]:
def n_closest_materials(material_id,dbtype,num_closest=10):
    """
    Find the closest 'n' materials to a given material.
    The function first creates a materials database.
    It then computes fingerprints for all materials in the database.
    It finally computes pairwise distances from each material in the database to the target metal.
    
    Keyword Arguments:
        material_id - str
            Valid Materials Project material id
        dbtype - str
            The database type for which closest 'n' materials need to be found.
            Valid types are 'unary', 'binary' and 'ternary' for mono, di and tri-atomic systems.
        num_closest - int
            Number of closest neighbors sought
    
    Returns:
        A Pandas DataFrame sorted by distance of material to neighbors in database
    """
    print('Initializing connection to Materials Project...')
    # Create object to interface with API
    mpr = init_mprester()
    
    print('Creating materials database...')
    # Create materials database
    materials_database = load_materials_database('unary',mpr)
    
    print('Creating Site Stats fingerprint object...')
    # Create fingerprint object to compute statistics of fingerprints for all atoms in a structure.
    ssf = get_ssf_featurizer(fingerprint_name='CrystalNN',stats=('minimum','maximum','mean','std_dev'))
    
    print('Creating fingerprint database...')
    # Create a fingerprint database for all materials in the materials database
    fingerprints_db = get_fingerprint_df_for_db(db=materials_database,featurizer=ssf)

    print('Retrieving fingerprint for targeting material')
    # Compute fingerprint for target material
    material_fingerprint = get_fingerprint_for_material(site_featurizer=ssf,
                                                        structure=mpr.get_structure_by_material_id(material_id='mp-30'))
    
    #Drop material_id before computing pairwise distances since it is of str type
    fingerprints_db_no_material_id = fingerprints_db.drop(columns=['material_id'],axis=1)

    print('Compute pairwise distances between target material and each material in database...')
    # Compute pairwise distances
    distances = compute_pairwise_distances(fingerprints_database=fingerprints_db_no_material_id,
                                           material_fingerprint=material_fingerprint,
                                           scale=True,
                                          )
    
    print('Distance computation done...')
    # Add a distance column
    fingerprints_db['distance'] = distances.reshape((-1,1))
    
    # Add a formula column to inspect material name
    fingerprints_db['pretty_formula'] = materials_database['pretty_formula']

    # Sort dataframe by distance. Remove the superfluous fingerprinnt stuff - we don't need it to know what material is closest
    closest_neighbors = fingerprints_db[['material_id',
                                         'pretty_formula',
                                         'distance']].sort_values(ascending=True,by='distance')
    
    print('Returning closest {} neighbors'.format(num_closest))
    
    return closest_neighbors.head(num_closest)

In [65]:
closest_10_monoatomic_to_TiN = n_closest_materials(material_id='mp-492',dbtype='unary',num_closest=20)

Initializing connection to Materials Project...
Creating materials database...
Creating Site Stats fingerprint object...
Creating fingerprint database...


HBox(children=(IntProgress(value=0, description='SiteStatsFingerprint', max=716, style=ProgressStyle(descripti…


Retrieving fingerprint for targeting material
Compute pairwise distances between target material and each material in database...
Distance computation done...
Returning closest 20 neighbors


In [66]:
closest_10_monoatomic_to_TiN.head(20)

Unnamed: 0,material_id,pretty_formula,distance
401,mp-111,Ne,0.0
534,mp-684673,S,0.0
443,mp-674158,P,0.0
398,mp-159,Nd,0.0
501,mp-8642,Re,0.0
305,mp-101,Ir,0.0
669,mp-10660,Tm,0.0
258,mp-614456,He,0.0
10,mp-134,Al,0.0
639,mp-7163,Tb,0.0
