In [1]:
# `from __future__` imports must be the first statement of a module. Keeping it
# at the top keeps this cell valid when it is compiled as a single unit
# (e.g. when the notebook is exported to a script).
from __future__ import print_function

import lime
import numpy as np
import pandas as pd
import sklearn
import sklearn.ensemble
import sklearn.feature_extraction
import sklearn.metrics
import spacy

# Seed NumPy's global random number generator so repeated runs are comparable.
np.random.seed(500)
# Load the spaCy pipeline "fr_core_news_md"; `preprocess` feeds texts through it.
nlp = spacy.load("fr_core_news_md")

In [None]:
# CSV locations of the two splits.
# Alternative files: "datasets/articles/train_text_dataset.csv" and
# "datasets/articles/test_text_dataset.csv".
TRAIN_PATH, TEST_PATH = "train_disjoint.csv", "test_disjoint.csv"

# Only these two columns are read from each CSV.
fields = ["label", "article"]

train_df, test_df = (
    pd.read_csv(csv_path, usecols=fields) for csv_path in (TRAIN_PATH, TEST_PATH)
)

def cleansing(doc):
    """Drop stop words from an iterable of tokens.

    Args:
        doc: iterable of token objects exposing an ``is_stop`` attribute.

    Returns:
        A list with the tokens whose ``is_stop`` is falsy, in original order.
    """
    kept_tokens = []
    for token in doc:
        if token.is_stop:
            continue
        kept_tokens.append(token)
    return kept_tokens

def keep_specific_pos(doc, pos=["ADV", "ADJ", "VERB", "NOUN"]):
    """Keep only the tokens whose part-of-speech tag is in ``pos``.

    Args:
        doc: iterable of token objects exposing a ``pos_`` attribute.
        pos: collection of accepted ``pos_`` values (never mutated here).

    Returns:
        A list with the matching tokens, in original order.
    """
    return list(filter(lambda token: token.pos_ in pos, doc))

def preprocess(data):
    """Turn raw texts into space-joined lemma strings.

    Each text is run through the module-level ``nlp`` pipeline, stop words are
    removed with ``cleansing``, tokens are filtered with ``keep_specific_pos``
    (default tag set), and the surviving tokens are replaced by their lemma.

    Args:
        data: iterable of strings accepted by ``nlp.pipe``.

    Returns:
        A list of strings, one per input text, in the same order.
    """
    output_texts = []
    for doc in nlp.pipe(data):
        kept_tokens = keep_specific_pos(cleansing(doc))
        # Doc -> text, using the lemma of each remaining token.
        output_texts.append(" ".join(token.lemma_ for token in kept_tokens))
    return output_texts

# Cast every article to str before preprocessing.
x_train = preprocess(list(map(str, train_df["article"].values)))
x_test = preprocess(list(map(str, test_df["article"].values)))

# Labels are shifted down by one; presumably the CSV stores them 1-based so
# that the result indexes into `class_names` -- TODO confirm against the data.
y_train = train_df["label"].values - 1
y_test = test_df["label"].values - 1

In [None]:
# Short display names for the classes; the list position is used as the class
# index when labels and predictions are printed.
class_names = ["true", "biased", "fake"]
print(*class_names, sep=',')

In [None]:
# Build TF-IDF features. The vectorizer is fitted on the training texts only
# and then reused, unchanged, to transform the test texts.
# lowercase=False: the vectorizer does not lowercase the preprocessed texts.
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(lowercase=False)
train_vectors = vectorizer.fit_transform(x_train)
test_vectors = vectorizer.transform(x_test)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes on the TF-IDF features; alpha is the smoothing term.
# `fit` returns the estimator itself, so construction and fitting are chained.
nb = MultinomialNB(alpha=0.01).fit(train_vectors, y_train)
# Display the fitted estimator as the cell output.
nb

In [None]:
# Predict a class for every test document, then show the weighted F1 score
# as the cell output.
pred = nb.predict(test_vectors)
sklearn.metrics.f1_score(y_true=y_test, y_pred=pred, average="weighted")

In [None]:
# Confusion matrix of the test predictions (true labels first, predictions second).
print(sklearn.metrics.confusion_matrix(y_true=y_test, y_pred=pred))

In [None]:
from lime import lime_text
from sklearn.pipeline import make_pipeline
# Chain the already-fitted vectorizer and classifier so that raw text strings
# can be passed directly to `c.predict_proba`, as the LIME explainer cells do.
c = make_pipeline(vectorizer, nb)

In [None]:
# Sanity check: class probabilities for the first training text, rounded to 3 decimals.
print(np.round(c.predict_proba([x_train[0]]), 3))

In [None]:
from lime.lime_text import LimeTextExplainer
# Text explainer; `class_names` supplies the display name for each class index.
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
# Index of the test document to explain.
idx = 3
# Explain the *test* document at `idx`. The predicted and true classes printed
# below are read from `test_vectors[idx]` and `y_test[idx]`, so the explanation
# has to be computed on `x_test[idx]` as well for all three to describe the
# same document (the previous code explained `x_train[idx]`).
exp = explainer.explain_instance(x_test[idx], c.predict_proba, num_features=6, labels=[0, 1, 2])
print('Document id: %d' % idx)
# `predict` on a single row returns a length-1 array; take its only element.
print('Predicted class =', class_names[nb.predict(test_vectors[idx])[0]])
print('True class: %s' % class_names[y_test[idx]])

In [None]:
# Print the explanation entries for each of the three classes, one entry per
# line, with a blank line between consecutive classes.
for class_id in (0, 1, 2):
    if class_id > 0:
        print()
    print('Explanation for class %s' % class_names[class_id])
    print('\n'.join(map(str, exp.as_list(label=class_id))))

In [None]:
# Re-explain the same test document, this time requesting explanations via
# top_labels=2 instead of an explicit label list. This rebinds `exp`.
exp = explainer.explain_instance(x_test[idx], c.predict_proba, num_features=6, top_labels=2)
# Show which class indices this explanation actually covers.
print(exp.available_labels())

In [None]:
# Render the explanation inline in the notebook; text=False is passed so the
# document text itself is not requested in the rendering.
exp.show_in_notebook(text=False)

In [None]:
# `exp` was produced with top_labels=2, so only two of the three classes are
# guaranteed to have an explanation. Asking for a label that is missing raises
# a KeyError, so fall back to the first available label when class 2 is absent.
label_to_show = 2
if label_to_show not in exp.available_labels():
    label_to_show = exp.available_labels()[0]
exp.show_in_notebook(text=x_test[idx], labels=(label_to_show,))