https://www.oreilly.com/learning/introduction-to-local-interpretable-model-agnostic-explanations-lime

https://homes.cs.washington.edu/~marcotcr/blog/lime/

https://github.com/TeamHG-Memex/eli5/blob/master/notebooks/TextExplainer.ipynb

https://github.com/marcotcr/lime/issues/39

In [1]:
from sklearn.datasets import fetch_20newsgroups

# The four newsgroups used as classes throughout the notebook.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]


def _fetch_split(subset):
    """Fetch one split of 20 newsgroups with the settings shared by train and test."""
    return fetch_20newsgroups(
        subset=subset,
        categories=categories,
        shuffle=True,
        random_state=42,
        remove=('headers', 'footers'),
    )


twenty_train = _fetch_split('train')
twenty_test = _fetch_split('test')

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline, make_pipeline

# Text -> TF-IDF over unigrams and bigrams (English stop words removed,
# min_df=3) -> 100 SVD components.
vec = TfidfVectorizer(min_df=3, stop_words='english',
                      ngram_range=(1, 2))
svd = TruncatedSVD(n_components=100, n_iter=7, random_state=42)
lsa = make_pipeline(vec, svd)

# probability=True makes SVC fit its probability estimates with an internal,
# randomised cross-validation. Seed it like every other stochastic step in
# this notebook so that predict_proba is reproducible across runs.
clf = SVC(C=150, gamma=2e-2, probability=True, random_state=42)
pipe = make_pipeline(lsa, clf)
pipe.fit(twenty_train.data, twenty_train.target)
# Accuracy of the fitted pipeline on the test split.
pipe.score(twenty_test.data, twenty_test.target)

0.89014647137150471

In [3]:
def print_prediction(doc):
    """Print one "<probability> <class name>" line per class for ``doc``.

    Probabilities come from ``pipe.predict_proba`` and are paired with
    ``twenty_train.target_names`` in order.
    """
    class_probs = pipe.predict_proba([doc])[0]
    names = twenty_train.target_names
    for name, p in zip(names, class_probs):
        print("{:.3f} {}".format(p, name))

# Pick the first document of the test split and print the pipeline's
# predicted probability for each class.
doc = twenty_test.data[0]
print_prediction(doc)

0.000 alt.atheism
0.000 comp.graphics
0.996 sci.med
0.004 soc.religion.christian


In [4]:
# Show the raw text of the selected test document.
doc

"As I recall from my bout with kidney stones, there isn't any\nmedication that can do anything about them except relieve the pain.\n\nEither they pass, or they have to be broken up with sound, or they have\nto be extracted surgically.\n\nWhen I was in, the X-ray tech happened to mention that she'd had kidney\nstones and children, and the childbirth hurt less."

In [5]:
# Raw class-probability array returned by the pipeline for this single document.
pipe.predict_proba([doc])

array([[  1.39773295e-05,   1.62757989e-05,   9.95975563e-01,
          3.99418417e-03]])

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Same idea as scikit-learn's default token pattern, except that it also
# matches single-character tokens.
# The leading (?u) simply switches on unicode matching, see
# http://stackoverflow.com/questions/35043085/what-does-u-do-in-a-regex
DEFAULT_TOKEN_PATTERN = r'(?u)\b\w+\b'
token_pattern = DEFAULT_TOKEN_PATTERN

# Word-level (not char-based) vectorizer over unigrams and bigrams, fitted
# on the single document so its vocabulary is that document's n-grams.
vec = CountVectorizer(ngram_range=(1, 2), token_pattern=token_pattern)
vec_ = vec.fit([doc])
vec_

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w+\\b', tokenizer=None,
        vocabulary=None)

In [7]:
import numpy as np
from sklearn.utils import check_random_state

# Number of perturbed documents to generate.
n_samples = 5000

# One parameter set per masking sampler, and the relative share of the
# samples that each sampler should produce.
sampler_params = [{'bow': False}, {'bow': True}]
weights = np.array([0.7, 0.3])
# Normalise so the weights sum to 1; they are later passed as `p=` to
# rng_.choice.
weights = weights / weights.sum()

# Seeded random state (previously check_random_state(None)) so that the
# sampling cells give the same result after Restart & Run All.
rng_ = check_random_state(42)
rng_

<mtrand.RandomState at 0x10d29cea0>

In [8]:
from eli5.lime.samplers import MaskingTextSampler

def create_sampler(extra):
    """Build a MaskingTextSampler from the shared settings plus ``extra``.

    The shared settings are the notebook-level ``token_pattern`` and ``rng_``;
    entries in ``extra`` override them or add further keyword arguments.
    """
    settings = {'token_pattern': token_pattern, 'random_state': rng_}
    settings.update(extra)
    return MaskingTextSampler(**settings)

# Build one MaskingTextSampler per entry of sampler_params (here one with
# bow=False and one with bow=True), presumably the same way
# MaskingTextSamplers creates its inner samplers.
samplers = [create_sampler(params) for params in sampler_params]
samplers

[MaskingTextSampler(bow=False, group_size=1, max_replace=1.0, min_replace=1,
           random_state=<mtrand.RandomState object at 0x10d29cea0>,
           replacement='', token_pattern='(?u)\\b\\w+\\b'),
 MaskingTextSampler(bow=True, group_size=1, max_replace=1.0, min_replace=1,
           random_state=<mtrand.RandomState object at 0x10d29cea0>,
           replacement='', token_pattern='(?u)\\b\\w+\\b')]

In [9]:
from scipy.stats import itemfreq

# For each of the n_samples documents, draw the index of the sampler that
# will generate it; `weights` gives the probability of each sampler.
sampler_indices = rng_.choice(range(len(samplers)),
                              size=n_samples,
                              replace=True,
                              p=weights)
# Count how often each sampler index was drawn.
# np.unique(..., return_counts=True) replaces scipy.stats.itemfreq, which is
# deprecated and no longer available in current SciPy releases.
drawn_indices, drawn_counts = np.unique(sampler_indices, return_counts=True)
# Look at the first (lowest) drawn index and the number of times it was drawn.
idx, freq = drawn_indices[0], drawn_counts[0]
samplers[idx]

MaskingTextSampler(bow=False, group_size=1, max_replace=1.0, min_replace=1,
          random_state=<mtrand.RandomState object at 0x10d29cea0>,
          replacement='', token_pattern='(?u)\\b\\w+\\b')

In [17]:
from functools import partial
from eli5.lime.textutils import TokenizedText, generate_samples

# Tokenize the document with the shared token pattern, then generate
# n_samples perturbed documents from it.
# generate_samples returns three values, bound here as the perturbed texts
# (`docs`), their `similarity` values and the `mask` that was applied.
doc_ = TokenizedText(doc, token_pattern=token_pattern)
gen_samples = partial(generate_samples, doc_, n_samples=n_samples)
docs, similarity, mask = gen_samples(bow= True)

In [21]:
# Inspect one generated sample: parts of the original text are blanked out
# (see the output below).
docs[1]

"As I recall      stones,  ' \nmedication    anything about   relieve the .\n\n they pass,  they  to be    ,  they \nto be extracted .\n\n I was , the -ray tech  to mention  'd  \nstones  children,  the childbirth hurt ."

In [15]:
# Direct call to generate_samples without the partial wrapper; this rebinds
# docs, similarity and mask.
# NOTE(review): presumably equivalent to gen_samples(bow=True), assuming
# n_samples is the second positional parameter - confirm.
docs, similarity, mask = generate_samples(doc_, n_samples, bow = True)

In [28]:
from eli5.lime.samplers import MaskingTextSamplers

# Composite sampler configured with the same two parameter sets and weights
# as above, sharing the notebook's token pattern and random state.
sampler = MaskingTextSamplers(
    sampler_params=[{'bow': False}, {'bow': True}],
    weights=[0.7, 0.3],
    token_pattern=token_pattern,
    random_state=rng_,
)

In [47]:
# Draw perturbed versions of the document; `sims` holds the similarity of
# each perturbed sample to the original document.
samples, sims = sampler.sample_near(doc=doc, n_samples=n_samples)

# Vectorize the sampled texts with the vectorizer fitted on the original
# document.
X = vec_.transform(samples)
X

<5000x113 sparse matrix of type '<class 'numpy.int64'>'
	with 240600 stored elements in Compressed Sparse Row format>

In [57]:
from eli5.lime import TextExplainer

# black box model predicted probability
y_proba = pipe.predict_proba(samples)

# `te` was previously created only in the last cell of the notebook, so this
# cell raised NameError on a fresh top-to-bottom run. Build and fit the
# explainer here with the same arguments as that later cell.
te = TextExplainer(random_state=42)
te.fit(doc, pipe.predict_proba)

expand_factor = te.expand_factor
test_size = 0.3
# a simple classifier: the one held by the explainer (an SGDClassifier in the
# output below). This rebinds `clf`; `pipe` keeps its own SVC.
clf = te.clf_
clf

SGDClassifier(alpha=0.001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', n_iter=5, n_jobs=1,
       penalty='elasticnet', power_t=0.5,
       random_state=<mtrand.RandomState object at 0x12376ccf0>,
       shuffle=True, verbose=0, warm_start=False)

In [58]:
from sklearn.model_selection import train_test_split

# Split the sampled texts, their similarities to the original document and
# the black-box probabilities in one call so that the three stay row-aligned.
# `sims` is used because it was returned together with `samples` by
# sampler.sample_near; `similarity` belongs to the separately generated
# `docs` and does not correspond row-by-row to `samples`.
(X_train, X_test,
 similarity_train, similarity_test,
 y_proba_train, y_proba_test) = train_test_split(samples,
                                                 sims,
                                                 y_proba,
                                                 test_size=test_size,
                                                 random_state=rng_)

In [60]:
# No additional per-sample arrays are carried along.
extra_arrays = []
# One column of y_proba per class; classes are labelled 0 .. n_classes - 1.
n_classes = y_proba.shape[1]
classes = np.arange(n_classes, dtype=int)
# Training rows are weighted by their similarity values.
sample_weight = similarity_train

In [62]:
# Number of samples in the training part of the split.
len(X_train)

3500

In [64]:
# Black-box probabilities for the training samples, one column per class.
y_proba_train

array([[  2.37788848e-02,   3.59063710e-01,   4.73205720e-01,
          1.43951685e-01],
       [  9.68979692e-06,   1.15610580e-05,   9.96674834e-01,
          3.30391468e-03],
       [  7.66760986e-03,   2.12091170e-02,   9.18889919e-01,
          5.22333537e-02],
       ..., 
       [  2.07350047e-02,   2.18546176e-01,   6.76744716e-01,
          8.39741037e-02],
       [  2.30409684e-03,   3.98893329e-03,   9.77908761e-01,
          1.57982090e-02],
       [  7.06938773e-04,   6.78381551e-04,   9.84683161e-01,
          1.39315183e-02]])

In [None]:
def expand_dataset(X, y_proba, factor, rng, extra_arrays=None):
    """
    Turn probabilistic labels into hard labels by sampling.

    For every row ``x`` of ``X`` with class probabilities ``probs``, draw
    ``factor`` class labels according to ``probs`` and yield one
    ``(x, label) + rest`` tuple per draw, where ``rest`` holds the entries
    of ``extra_arrays`` that belong to the same row.

    The loop used to sit at the top level of the cell, where ``yield`` is a
    SyntaxError, and relied on undefined names ``rng`` and ``factor``; both
    are explicit parameters now.

    Parameters
    ----------
    X : iterable of samples.
    y_proba : iterable of per-sample class-probability vectors, aligned
        with ``X``; each vector must sum to 1.
    factor : int, number of labels drawn per sample.
    rng : object with a ``choice(a, size=..., p=...)`` method, e.g. a
        ``numpy.random.RandomState``.
    extra_arrays : optional list of iterables aligned with ``X`` whose
        entries are appended to every yielded tuple.

    Yields
    ------
    tuple
        ``(x, label, *extras)``.
    """
    if extra_arrays is None:
        extra_arrays = []
    for el in zip(X, y_proba, *extra_arrays):
        x, probs = el[0:2]
        rest = el[2:]
        # Class labels are the column positions of the probability vector.
        classes = np.arange(len(probs), dtype=int)
        for label in rng.choice(classes, size=factor, p=probs):
            yield (x, label) + rest

In [34]:
def rbf(distance, sigma=1.0):
    """
    Map a distance to a similarity with the RBF (Gaussian) kernel
    ``exp(-distance**2 / (2 * sigma**2))``.

    A distance of 0 gives 1; larger distances give smaller values.
    ``sigma`` controls how quickly the similarity falls off.
    """
    exponent = -distance ** 2 / (2 * sigma ** 2)
    return np.exp(exponent)

# Treat 1 - similarity as a distance and pass it through the RBF kernel
# with sigma = 1.
sims_ = rbf(1-sims, sigma=1)

In [36]:
# Average kernel similarity over all perturbed samples.
sims_.mean()

0.92189977073259399

In [None]:
   def sample_near(self, doc, n_samples=1):
        # type: (str, int) -> Tuple[List[str], np.ndarray]
        """
        Generate ``n_samples`` documents near ``doc`` using several samplers.

        ``self._sampler_n_samples`` decides how many documents each sampler
        contributes. The documents of all samplers are concatenated into one
        list and their similarity arrays are stacked with ``np.hstack``.
        """
        assert n_samples >= 1
        collected_docs = []  # type: List[str]
        sims_per_sampler = []
        for child, count in self._sampler_n_samples(n_samples):
            child_docs, child_sims = child.sample_near(doc, n_samples=count)
            collected_docs += child_docs
            sims_per_sampler.append(child_sims)
        return collected_docs, np.hstack(sims_per_sampler)

In [None]:
    def _sampler_n_samples(self, n_samples):
        """ Return (sampler, n_samplers) tuples """
        sampler_indices = self.rng_.choice(range(len(self.samplers)),
                                           size=n_samples,
                                           replace=True,
                                           p=self.weights)
        return [
            (self.samplers[idx], freq)
            for idx, freq in itemfreq(sampler_indices)
        ]

In [41]:
# Show the per-class feature weights (tables below), labelled with the
# newsgroup names.
# NOTE(review): `te` must already be a fitted TextExplainer when this cell
# runs - confirm the execution order.
te.show_weights(target_names = twenty_train.target_names)

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3
+0.467,to be,,
+0.394,isn t,,
+0.301,or they,,
+0.296,have to,,
+0.285,do anything,,
+0.256,isn,,
… 16 more positive …,… 16 more positive …,,
… 33 more negative …,… 33 more negative …,,
-0.261,ray,,
-0.267,relieve,,

Weight?,Feature
+0.467,to be
+0.394,isn t
+0.301,or they
+0.296,have to
+0.285,do anything
+0.256,isn
… 16 more positive …,… 16 more positive …
… 33 more negative …,… 33 more negative …
-0.261,ray
-0.267,relieve

Weight?,Feature
+1.027,ray
+0.218,to mention
+0.189,ray tech
+0.184,to be
+0.156,they pass
+0.152,my bout
… 17 more positive …,… 17 more positive …
… 22 more negative …,… 22 more negative …
-0.180,tech
-0.208,<BIAS>

Weight?,Feature
+2.034,pain
+0.998,kidney
+0.881,stones
+0.845,medication
+0.544,recall
+0.373,tech
+0.257,hurt
+0.212,the pain
+0.107,relieve
… 14 more positive …,… 14 more positive …

Weight?,Feature
+0.379,children
+0.265,happened
+0.231,be extracted
+0.229,bout with
+0.219,mention
+0.216,broken up
+0.212,x ray
+0.212,they have
… 34 more positive …,… 34 more positive …
… 26 more negative …,… 26 more negative …


In [39]:
# Show the per-class contributions for the explained document (tables below).
# NOTE(review): `te` must already be a fitted TextExplainer when this cell
# runs - confirm the execution order.
te.explain_prediction(target_names = twenty_train.target_names)

Contribution?,Feature
-0.359,<BIAS>
-9.428,Highlighted in text (sum)

Contribution?,Feature
-0.208,<BIAS>
-8.106,Highlighted in text (sum)

Contribution?,Feature
5.981,Highlighted in text (sum)
-0.15,<BIAS>

Contribution?,Feature
-0.328,<BIAS>
-5.258,Highlighted in text (sum)


In [37]:
import eli5
from eli5.lime import TextExplainer

# Create a seeded TextExplainer, fit it on the document with the pipeline's
# predict_proba as the function to explain, then display the explanation.
te = TextExplainer(random_state = 42)
te.fit(doc, pipe.predict_proba)
te.show_prediction(target_names = twenty_train.target_names)

Contribution?,Feature
-0.359,<BIAS>
-9.428,Highlighted in text (sum)

Contribution?,Feature
-0.208,<BIAS>
-8.106,Highlighted in text (sum)

Contribution?,Feature
5.981,Highlighted in text (sum)
-0.15,<BIAS>

Contribution?,Feature
-0.328,<BIAS>
-5.258,Highlighted in text (sum)
