# Explain models on tidy data with `lime` and `sklearn`

In [179]:
import pandas, sklearn.base, sklearn.pipeline
from toolz.curried import *

In [180]:
class DataFrameMixin(pandas.DataFrame):
    """Base class for ``pandas.DataFrame`` subclasses that survive pandas operations.

    The ``_constructor*`` properties are the hooks pandas consults when it has
    to build a new object from an existing one.  Two-dimensional and
    "expanded" results are rebuilt with the subclass itself; one-dimensional
    slices are built with ``_series``.
    """

    # Class used for one-dimensional results; subclasses may override it.
    _series = pandas.Series

    @property
    def _constructor_sliced(self):
        """Class used when a single row or column is extracted."""
        return self._series

    @property
    def _constructor_expanddim(self):
        """Class used when a result gains a dimension (the subclass itself)."""
        return self.__class__

    @property
    def _constructor(self):
        """Class used for same-dimensional results (the subclass itself)."""
        return self.__class__

    def __dir__(self):
        """Extend the default attribute listing with the names in ``_metadata``."""
        return concatv(super().__dir__(), self._metadata)

In [183]:
import sklearn.ensemble, sklearn.pipeline
from copy import copy

In [185]:
def create_combo(object, **params):
    """Coerce ``object`` into a single scikit-learn estimator and configure it.

    Parameters
    ----------
    object
        Either an estimator, which is used as is, or an iterable of
        estimators / ``(name, estimator)`` tuples.  Unnamed entries are named
        after their position (``'0'``, ``'1'``, ...).  An iterable made only
        of classifiers becomes a ``VotingClassifier``; one made only of
        transformers becomes a ``Pipeline``.
    **params
        Forwarded to ``set_params`` on the resulting estimator.

    Returns
    -------
    The (possibly combined) estimator, after ``set_params(**params)``.

    Raises
    ------
    TypeError
        If an iterable mixes estimators that are neither all classifiers nor
        all transformers, so no combination rule applies.
    """
    # Estimators such as Pipeline or the forest/bagging ensembles implement the
    # sequence protocol themselves; they are already a single model and must
    # not be unpacked into steps.
    if not isinstance(object, sklearn.base.BaseEstimator) and isiterable(object):
        steps = [
            step if isinstance(step, tuple) else (str(position), step)
            for position, step in enumerate(object)
        ]
        estimators = [estimator for _, estimator in steps]

        if all(isinstance(e, sklearn.base.ClassifierMixin) for e in estimators):
            object = sklearn.ensemble.VotingClassifier(steps)
        elif all(isinstance(e, sklearn.base.TransformerMixin) for e in estimators):
            object = sklearn.pipeline.Pipeline(steps)
        else:
            raise TypeError(
                'cannot combine estimators: expected all classifiers or all '
                'transformers, got %r' % [type(e).__name__ for e in estimators]
            )
    object.set_params(**params)
    return object

In [195]:
class FramePipe(DataFrameMixin):
    """A DataFrame that carries a scikit-learn estimator in ``model``.

    The frame's values are the features.  When the index is not a plain
    ``RangeIndex`` it is passed to ``fit`` as the target.  The estimator
    methods (``fit``, ``predict``, ``transform``, ...) are applied to the
    frame itself through :meth:`pipe`.
    """

    # Listing ``model`` in ``_metadata`` lets it be stored as a plain instance
    # attribute on the frame.
    _metadata = pandas.DataFrame._metadata + ['model']

    def __init__(self, data=None, index=None, columns=None, dtype=None, copy=False, steps=None, memory=None, model=None, **params):
        """Build the frame and optionally attach ``model``.

        ``data``, ``index``, ``columns``, ``dtype`` and ``copy`` are forwarded
        to ``pandas.DataFrame``.  ``model`` (an estimator or an iterable of
        estimators) and ``**params`` are forwarded to :meth:`steps`.
        ``steps`` and ``memory`` are accepted but unused.
        """
        super().__init__(data=data, index=index, columns=columns, dtype=dtype, copy=copy)
        self.model = None
        # Compare against None: some estimators define ``__len__`` and cannot
        # be truth-tested before they are fitted.
        if model is not None:
            self.steps(model, **params)

    def fit(self, object=None, **params):
        """Fit the model on this frame and return the frame.

        If ``object`` is given it replaces the current model first (see
        :meth:`steps`).  A ``RangeIndex`` means there are no labels, so the
        model is fitted on the values alone; any other index is passed as the
        target.
        """
        if object is not None:
            self.steps(object, **params)
        if isinstance(self.index, pandas.RangeIndex):
            return self.pipe(self.model.fit)
        return self.pipe(self.model.fit, self.index)

    def fold(self, n):
        """Yield one ``FramePipe`` per ``KFold(n)`` split.

        Each yielded frame has a two-level index whose outer level is
        ``'train'`` or ``'test'``; it keeps this frame's column names and
        shares this frame's model.
        """
        from sklearn.model_selection import KFold

        for train_rows, test_rows in KFold(n).split(self.values, self.index):
            parts = pandas.concat({'train': self.iloc[train_rows], 'test': self.iloc[test_rows]})
            yield FramePipe(parts.values, parts.index, parts.columns, model=self.model)

    def predict(self):
        """Return the model's predictions for this frame."""
        return self.pipe(self.model.predict)

    def predict_proba(self, *args, **kwargs):
        """Return the model's class probabilities for this frame."""
        return self.pipe(self.model.predict_proba, *args, **kwargs)

    def predict_log_proba(self, *args, **kwargs):
        """Return the model's log class probabilities for this frame."""
        return self.pipe(self.model.predict_log_proba, *args, **kwargs)

    def transform(self):
        """Return this frame transformed by the model."""
        return self.pipe(self.model.transform)

    def pipe(self, func, *args, **kwargs):
        """Apply ``func(self, *args, **kwargs)`` and keep the result in frame form.

        * ``None`` or an estimator (what ``fit`` returns) -> this frame.
        * A non-DataFrame result with one entry per row -> a new frame of the
          same type indexed like this one.
        * Anything else is returned unchanged.
        """
        result = super().pipe(func, *args, **kwargs)
        if result is None or isinstance(result, sklearn.base.BaseEstimator):
            return self
        try:
            row_aligned = len(result) == len(self)
        except TypeError:
            # Scalars and other unsized results cannot be re-indexed.
            row_aligned = False
        if row_aligned and not isinstance(result, pandas.DataFrame):
            return type(self)(result, index=self.index)
        return result

    def steps(self, object=None, **params):
        """Replace the model with ``create_combo(object)`` and return the frame.

        Passing ``None`` clears the model.  ``**params`` are applied to the
        new model with ``set_params``.
        """
        self.model = None if object is None else create_combo(object)
        if params:
            self.model.set_params(**params)
        return self

    def __copy__(self, deep=True):
        """Copy the frame and give the copy its own shallow copy of the model."""
        model = copy(self.model)
        return self.copy(deep=deep).steps(model)

    def __finalize__(self, other=None, method=None, **kwargs):
        """Inherit ``model`` from ``other`` when this frame does not have one.

        For ``'merge'`` the left operand is the source and for ``'concat'``
        the first object is; ``other`` is otherwise used directly.
        """
        if method == 'merge':
            other = other.left
        elif method == 'concat':
            other = other.objs[0]
        if getattr(self, 'model', None) is None:
            self.model = getattr(other, 'model', None)
        return self

    def set_params(self, **params):
        """Set parameters on the model and return the frame."""
        self.model.set_params(**params)
        return self

    def get_params(self, deep=True):
        """Return the model's parameters (``deep`` as in scikit-learn)."""
        return self.model.get_params(deep=deep)

    def explain(self, id=None):
        """Explain the model.

        With a row position ``id``, show a LIME explanation of that row in
        the notebook and return ``None``.  Without it, fit a decision tree to
        the model's own predictions on this frame and return it as a
        ``graphviz.Source``.
        """
        if id is not None:  # position 0 is a valid row
            # Imported locally because the name is not imported at module level.
            from lime.lime_tabular import LimeTabularExplainer

            explainer = LimeTabularExplainer(self.values, 'classification', feature_names=self.columns)
            # lime expects the row as a 1-d numpy array, not a Series.
            explainer.explain_instance(
                self.iloc[id, :].values, self.model.predict_proba
            ).show_in_notebook(show_all=False)
        else:
            from graphviz import Source
            from sklearn.tree import DecisionTreeClassifier, export_graphviz

            surrogate = DecisionTreeClassifier()
            surrogate.fit(self, self.predict().values)
            # out_file=None makes export_graphviz return the dot source.
            return Source(export_graphviz(surrogate, None))

In [198]:
def _lda():
    """Exercise ``FramePipe`` end to end on the iris data set.

    Builds a frame whose values are the iris measurements and whose index is
    the species name, fits a linear discriminant analysis on the training part
    of the first of four folds, predicts three sampled rows, and then requests
    both kinds of explanation.  Returns nothing; it is run for its side
    effects and to check that nothing raises.
    """
    from sklearn.datasets import load_iris
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

    data = load_iris()

    df = FramePipe(
        data['data'], list(map(data['target_names'].__getitem__, data['target'])),
        data['feature_names']
    )

    # iris has three classes, and LDA allows at most n_classes - 1 components.
    train = next(df.steps(LinearDiscriminantAnalysis(n_components=2)).fold(4))
    train.loc['train'].fit().sample(3).predict()
    df.explain()
    df.explain(1)
In [None]:
# NOTE: the leading `!` is IPython shell-escape syntax, so this cell only runs
# inside IPython/Jupyter and is not valid plain Python.  It invokes pytest on
# the notebook file named on the command line below.
if __name__ == '__main__':
    !ipython -m pytest -- 2018-08-28-A-dataframe-to-explain-models.ipynb