### Import

In [35]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import sklearn
import sklearn.ensemble
import sklearn.feature_extraction.text
import sklearn.metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm

### Load Data

In [36]:
# Load the pickled dataset. The context manager guarantees the file handle is
# closed even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load this file if it comes from a trusted source.
with open("../../data/cancer_corpi/articles/pickle/processed.p", "rb") as corpus_file:
    data = pickle.load(corpus_file)

### Transform data

In [77]:
# Walk the records once, pairing each record's first field (the model input)
# with its last field (the class label).
pairs = [(record[0], record[-1]) for record in data['records']]

# Turn both columns into arrays and append a trailing axis, which is the
# 2-D layout OneHotEncoder expects for the labels.
X = np.array([text for text, _ in pairs])[..., np.newaxis]
y_names = np.array([label for _, label in pairs])[..., np.newaxis]

# Distinct label values, in sorted order (np.unique sorts its result).
classes = np.unique(y_names)

# One-hot encode the labels; .toarray() converts the encoder's output to a
# dense array.
enc = OneHotEncoder(handle_unknown='ignore').fit(y_names)
y = enc.transform(y_names).toarray()

### Count examples per class and restrict to two classes

In [78]:
# NOTE: this cell overwrites X and y with the two-class subset, so it is not
# idempotent: re-run the "Transform data" cell before running it again.

# Number of examples per class over the full one-hot label matrix.
counts = np.sum(y, axis=0)

# The two classes kept for the binary task.
selected_classes = ['breast_cancer', 'leukemia']

# Row indices of the examples belonging to each selected class.
idx1 = np.where(y_names == selected_classes[0])[0]
idx2 = np.where(y_names == selected_classes[1])[0]

# Balance the classes by keeping the first `mincount` rows of each.
mincount = min(len(idx1), len(idx2))
idx1 = idx1[:mincount]
idx2 = idx2[:mincount]

# Merge the two index sets and shuffle them. A local, seeded generator keeps
# the shuffle (and therefore the later seeded train/test split) reproducible
# without touching NumPy's global random state.
idx = np.union1d(idx1, idx2)
shuffle_rng = np.random.RandomState(42)
randperm = shuffle_rng.permutation(len(idx))
idx = idx[randperm]

# Look up the one-hot columns of the selected classes from the fitted encoder
# instead of hard-coding their positions; raises ValueError if a selected
# class is absent from the data.
fitted_labels = list(enc.categories_[0])
class_columns = [fitted_labels.index(name) for name in selected_classes]

X = X[idx, :]
y = y[idx, :][:, class_columns]

### Divide into train and test

In [81]:
# Hold out 33% of the examples for testing; random_state fixes the split made
# by train_test_split itself.
split_parts = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

### Get the vocabulary and vectorize the text

In [82]:
# Build TF-IDF features from the raw text column. Input is lowercased and
# scikit-learn's built-in English stop-word list is removed.
# The sklearn.feature_extraction.text submodule must be imported explicitly in
# the import cell: a bare `import sklearn` does not load submodules.
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
    lowercase=True,
    stop_words='english',
)

# The vocabulary and IDF weights are learned from the training texts only; the
# test texts are transformed with the already-fitted vectorizer.
train_vectors = vectorizer.fit_transform(X_train[:, 0])
test_vectors = vectorizer.transform(X_test[:, 0])

### Create Random Forest Classifier

In [84]:
# Train a 500-tree random forest on the TF-IDF training matrix.
# random_state pins the forest's internal randomness so that re-running the
# notebook yields the same model.
# NOTE(review): y_train is a two-column one-hot matrix, so scikit-learn fits a
# multi-output model and predict_proba returns a list with one array per
# column rather than a single (n_samples, 2) array. If one binary classifier
# is intended (e.g. for a LIME pipeline), fit on y_train.argmax(axis=1)
# instead — confirm the intended usage.
rf = sklearn.ensemble.RandomForestClassifier(n_estimators=500, random_state=42)
rf.fit(train_vectors, y_train);

RandomForestClassifier(n_estimators=500)

In [None]:
# vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(lowercase=False)
# train_vectors = vectorizer.fit_transform(newsgroups_train.data)
# test_vectors = vectorizer.transform(newsgroups_test.data)
# Now, let's say we want to use random forests for classification. It's usually hard to understand what random forests are doing, especially with many trees.

# rf = sklearn.ensemble.RandomForestClassifier(n_estimators=500)
# rf.fit(train_vectors, newsgroups_train.target)
# RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
#             max_depth=None, max_features='auto', max_leaf_nodes=None,
#             min_samples_leaf=1, min_samples_split=2,
#             min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
#             oob_score=False, random_state=None, verbose=0,
#             warm_start=False)
# pred = rf.predict(test_vectors)
# sklearn.metrics.f1_score(newsgroups_test.target, pred, average='binary')
# 0.92093023255813955
# We see that this classifier achieves a very high F score. The sklearn guide to 20 newsgroups indicates that Multinomial Naive Bayes overfits this dataset by learning irrelevant stuff, such as headers. Let's see if random forests do the same.

# Explaining predictions using lime
# Lime explainers assume that classifiers act on raw text, but sklearn classifiers act on vectorized representations of texts. For this purpose, we use sklearn's pipeline, which implements predict_proba on lists of raw text.

# from lime import lime_text
# from sklearn.pipeline import make_pipeline
# c = make_pipeline(vectorizer, rf)
# print(c.predict_proba([newsgroups_test.data[0]]))
# [[ 0.274  0.726]]
# Now we create an explainer object. We pass the class_names as an argument for prettier display.

# from lime.lime_text import LimeTextExplainer
# explainer = LimeTextExplainer(class_names=class_names)
# We then generate an explanation with at most 6 features for an arbitrary document in the test set.

# idx = 83
# exp = explainer.explain_instance(newsgroups_test.data[idx], c.predict_proba, num_features=6)
# print('Document id: %d' % idx)
# print('Probability(christian) =', c.predict_proba([newsgroups_test.data[idx]])[0,1])
# print('True class: %s' % class_names[newsgroups_test.target[idx]])
# Document id: 83
# Probability(christian) = 0.414
# True class: atheism
# The classifier got this example right (it predicted atheism).
# The explanation is presented below as a list of weighted features.

# exp.as_list()
# [(u'Posting', -0.15748303818990594),
#  (u'Host', -0.13220892468795911),
#  (u'NNTP', -0.097422972255878093),
#  (u'edu', -0.051080418945152584),
#  (u'have', -0.010616558305370854),
#  (u'There', -0.0099743822272458232)]
# These weighted features are a linear model, which approximates the behaviour of the random forest classifier in the vicinity of the test example. Roughly, if we remove 'Posting' and 'Host' from the document, the prediction should move towards the opposite class (Christianity) by about 0.27 (the sum of the weights for both features). Let's see if this is the case.

# print('Original prediction:', rf.predict_proba(test_vectors[idx])[0,1])
# tmp = test_vectors[idx].copy()
# tmp[0,vectorizer.vocabulary_['Posting']] = 0
# tmp[0,vectorizer.vocabulary_['Host']] = 0
# print('Prediction removing some features:', rf.predict_proba(tmp)[0,1])
# print('Difference:', rf.predict_proba(tmp)[0,1] - rf.predict_proba(test_vectors[idx])[0,1])
# Original prediction: 0.414
# Prediction removing some features: 0.684
# Difference: 0.27
# Pretty close!
# The words that explain the model around this document seem very arbitrary - not much to do with either Christianity or Atheism.
# In fact, these are words that appear in the email headers (you will see this clearly soon), which make distinguishing between the classes much easier.

# Visualizing explanations
# The explanations can be returned as a matplotlib barplot:

# %matplotlib inline
# fig = exp.as_pyplot_figure()

# The explanations can also be exported as an html page (which we can render here in this notebook), using D3.js to render graphs.

# exp.show_in_notebook(text=False)
# Prediction probabilities
# 0.59
# atheism
# 0.41
# christian
# atheism
# christian
# Posting
# 0.16
# Host
# 0.13
# NNTP
# 0.10
# edu
# 0.05
# have
# 0.01
# There
# 0.01
# Alternatively, we can save the fully contained html page to a file:

# exp.save_to_file('/tmp/oi.html')
# Finally, we can also include a visualization of the original document, with the words in the explanations highlighted. Notice how the words that affect the classifier the most are all in the email header.

# exp.show_in_notebook(text=True)
# Prediction probabilities
# 0.59
# atheism
# 0.41
# christian
# atheism
# christian
# Posting
# 0.16
# Host
# 0.13
# NNTP
# 0.10
# edu
# 0.05
# have
# 0.01
# There
# 0.01
# Text with highlighted words
# From: johnchad@triton.unm.edu (jchadwic)
# Subject: Another request for Darwin Fish
# Organization: University of New Mexico, Albuquerque
# Lines: 11
# NNTP-Posting-Host: triton.unm.edu

# Hello Gang,

# There have been some notes recently asking where to obtain the DARWIN fish.
# This is the same question I have and I have not seen an answer on the
# net. If anyone has a contact please post on the net or email me.

# Thanks,

# john chadwick
# johnchad@triton.unm.edu
# or
# That's it for this tutorial. Random forests were just an example, this explainer works for any classifier you may want to use, as long as it implements predict_proba.

In [1]:
# Draw 1000 random orderings. This cell depends on `np` and `X_train` from the
# cells above, so it must run after them (the recorded NameError shows it was
# executed before `np` was imported).
# np.random.permutation requires an argument; the original call passed none
# and would raise TypeError.
# assumes the intent is a random ordering of the training rows — TODO confirm
for _ in range(1000):
    order = np.random.permutation(len(X_train))

NameError: name 'np' is not defined