Questions:
- How should the from_preset method be implemented for the de Jong presets?
- How can the SD portion be sped up? (The implementation copied from LocalPropertyDifference takes more time.)
- PropertyStats.HolderMeans seems to differ from the formula used in de Jong's paper (because of the weights). Is it okay to use it as is? And how should the weights be passed in as a function parameter?
- Division by zero and log of zero are encountered frequently when computing the std_dev stat values.

NOTE: I did not run this notebook locally; I ran it remotely on JetStream because it requires too much computation power to run locally. The VBS_mean featurization works: it brought the mean absolute error for log K prediction down from 0.129 to 0.100 (but I am not sure whether my implementation is accurate).

In [None]:
import numpy as np 
import pandas as pd 

from matminer.featurizers.site import LocalPropertyDifference
from matminer.featurizers.structure import SiteStatsFingerprint
from matminer.datasets.dataframe_loader import load_elastic_tensor
from matminer.featurizers.base import MultipleFeaturizer, BaseFeaturizer
from matminer.featurizers.stats import PropertyStats
from matminer.utils.data import MagpieData
from matminer.utils.caching import get_nearest_neighbors
from pymatgen.analysis.local_env import VoronoiNN

class VBS_mean(BaseFeaturizer):
    """Structure featurizer built from statistics of per-site property differences.

    For every site in a structure, the difference between an elemental
    property of the site and that of its Voronoi neighbors is computed,
    weighted by the ``weight`` reported for each neighbor. Two kinds of
    difference are supported:

    * ``'AD'`` - the values returned by ``LocalPropertyDifference``.
    * ``'SD'`` - the same weighted mean, but without taking the absolute
      value of each neighbor difference (signed difference).

    The per-site values of each property are then reduced with the requested
    ``PropertyStats`` statistics.

    Args:
        properties ([str]): Names of the elemental properties to use.
        stats (str or [str]): Names of the statistics passed to
            ``PropertyStats.calc_stat``. A single string is wrapped into a
            one-element tuple. If falsy, the raw per-site values are returned
            instead of statistics.
        deviation (str or [str]): ``'AD'``, ``'SD'``, or a sequence of them.
            A single string is wrapped into a one-element tuple.
    """

    def __init__(self, properties, stats=('mean', 'std_dev'), deviation='AD'):
        self.properties = properties
        self.stats = (stats,) if isinstance(stats, str) else stats
        self.deviation = (deviation,) if isinstance(deviation, str) else deviation

        # Largest leading digit among the "<n>_mode" style statistics.
        # Always defined (0 when no such statistic is requested) so the
        # attribute can be read without an AttributeError.
        self.nmodes = max(
            (int(stat[0]) for stat in (self.stats or ()) if '_mode' in stat),
            default=0,
        )

    @staticmethod
    def from_preset(preset):
        """
        Create a new VBS_mean featurizer according to a preset

        Args:
            preset (str) - Name of preset; one of "dejong2016_AD" or
                "dejong2016_SD"
        Returns:
            (VBS_mean) featurizer configured for the preset
        Raises:
            ValueError: if the preset name is not recognized
        """
        # Both presets share the same elemental properties and differ only in
        # the Holder-mean powers and the kind of difference.
        properties = ["Number", "AtomicWeight",
                      "Column", "Row", "CovalentRadius",
                      "Electronegativity"]

        if preset == "dejong2016_AD":
            powers = range(0, 4 + 1)
            deviation = 'AD'
        elif preset == 'dejong2016_SD':
            powers = [1, 2, 4]
            deviation = 'SD'
        else:
            raise ValueError('Unrecognized preset: ' + preset)

        return VBS_mean(
            properties=properties,
            stats=['holder_mean::%d' % d for d in powers] + ['std_dev'],
            deviation=deviation,
        )

    def _absolute_differences(self, strc):
        """Collect the LocalPropertyDifference value of every site, grouped by property."""
        ft = LocalPropertyDifference(properties=self.properties)
        per_property = [[] for _ in ft.feature_labels()]

        for idx, _ in enumerate(strc.sites):
            for j, opval in enumerate(ft.featurize(strc, idx)):
                # Missing values are recorded as 0.0 so every site contributes.
                per_property[j].append(0.0 if opval is None else opval)
        return per_property

    def _signed_differences(self, strc):
        """Collect the weighted signed neighbor difference of every site, grouped by property."""
        # The data source and the neighbor finder do not depend on the site
        # or the property, so they are created once per structure rather
        # than inside the loops.
        data_source = MagpieData()
        voronoi = VoronoiNN(weight='area')
        per_property = [[] for _ in self.properties]

        for idx, my_site in enumerate(strc.sites):
            # Get the tessellation of the site
            nn = get_nearest_neighbors(voronoi, strc, idx)

            # Get the element and weight of each neighbor
            elems = [n['site'].specie for n in nn]
            weights = [n['weight'] for n in nn]
            total_weight = np.sum(weights)

            # Weighted mean of (neighbor property - site property)
            for j, p in enumerate(self.properties):
                my_prop = data_source.get_elemental_property(my_site.specie, p)
                n_props = data_source.get_elemental_properties(elems, p)
                per_property[j].append(
                    np.dot(weights, np.subtract(n_props, my_prop)) / total_weight)
        return per_property

    def featurize(self, strc):
        """Compute the features of a structure.

        Args:
            strc: structure to featurize; it is indexed by site and passed to
                ``LocalPropertyDifference.featurize`` / ``get_nearest_neighbors``
        Returns:
            ([float]) one value per (deviation, property, statistic), in that
            nesting order, when ``stats`` is set; otherwise a list holding,
            for each (deviation, property), the list of per-site values
        Raises:
            ValueError: if a deviation other than 'AD' or 'SD' was requested
        """
        vals = []
        for dev in self.deviation:
            if dev == "AD":
                vals += self._absolute_differences(strc)
            elif dev == "SD":
                vals += self._signed_differences(strc)
            else:
                raise ValueError('Unrecognized mean difference type ' + dev)

        if not self.stats:
            return vals

        # Reduce the per-site values of each property with every statistic
        calc_stat = PropertyStats().calc_stat
        return [calc_stat(op, stat) for op in vals for stat in self.stats]

    def feature_labels(self):
        """Return the names of the features, in the order produced by ``featurize``.

        When no statistics are requested the labels of
        ``LocalPropertyDifference`` are returned, i.e. one label per property
        regardless of how many deviations are configured.
        """
        if self.stats and self.deviation:
            return ["%s %s %s" % (dev, stat, prop)
                    for dev in self.deviation
                    for prop in self.properties
                    for stat in self.stats]
        return LocalPropertyDifference(properties=self.properties).feature_labels()

Featurization on Elastic Tensor data.

In [None]:
# The loader is also imported in the first cell alongside the featurizer imports.
from matminer.datasets.dataframe_loader import load_elastic_tensor

# Load the elastic tensor dataset; the cells below featurize its 'structure' column.
data = load_elastic_tensor()

In [None]:
# AD: absolute-difference features, configured through the matching preset
# (same properties, Holder means of power 0-4 plus std_dev, deviation 'AD').
ft = VBS_mean.from_preset("dejong2016_AD")
d = ft.featurize_dataframe(data, col_id='structure')

In [None]:
# SD: signed-difference features, configured through the matching preset
# (same properties, Holder means of power 1, 2, 4 plus std_dev, deviation 'SD').
ft = VBS_mean.from_preset("dejong2016_SD")
d = ft.featurize_dataframe(data, col_id='structure')