In [None]:
!pip install lime

* Reference: https://github.com/marcotcr/lime

In [None]:
# importing libraries
import pandas as pd
import matplotlib.pyplot as plt
import string
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import sklearn.metrics

# fetch the NLTK resources used for pre-processing
for nltk_resource in ('stopwords', 'punkt'):
  nltk.download(nltk_resource)

# load the train and test sets as pandas dataframes
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [None]:
# preview the first rows of the training dataframe loaded from train.csv
train.head()

In [None]:
# NLP pre-processing
# remove urls, handles, and the hashtag from hashtags 
# (taken from https://stackoverflow.com/questions/8376691/how-to-remove-hashtag-user-link-of-a-tweet-using-regular-expression)
def remove_urls(text):
  """Strip handles, URLs and every non-alphanumeric character from `text`.

  Each match is replaced by a single space; the result is then split on
  whitespace and re-joined with single spaces, so whitespace runs collapse
  and the ends are trimmed. For hashtags only the leading `#` is removed,
  the word itself is kept.
  """
  # Raw string: "\w", "\/" and "\S" are invalid escape sequences in a normal
  # string literal (DeprecationWarning since 3.6, SyntaxWarning since 3.12).
  pattern = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"
  return ' '.join(re.sub(pattern, " ", text).split())

# make all text lowercase
def text_lowercase(text):
  """Return `text` with every cased character converted to lower case."""
  lowered = text.lower()
  return lowered

# remove numbers
def remove_numbers(text):
  """Delete every run of digits from `text`; all other characters are kept."""
  digit_run = re.compile(r'\d+')
  return digit_run.sub('', text)

# remove punctuation
def remove_punctuation(text):
  """Drop every character listed in `string.punctuation` from `text`."""
  # mapping a code point to None tells str.translate to delete that character
  drop_table = dict.fromkeys(map(ord, string.punctuation))
  return text.translate(drop_table)

# function for all pre-processing steps
def preprocessing(text):
  """Run the full cleaning pipeline on one string and return the result.

  Steps, in order: lowercase, strip handles/URLs/symbols, drop digits,
  drop punctuation.
  """
  pipeline = (text_lowercase, remove_urls, remove_numbers, remove_punctuation)
  for step in pipeline:
    text = step(text)
  return text

# pre-processing the text body column
# non-string entries (e.g. missing values) become NaN placeholders so the new
# column stays aligned with the dataframe rows
# note: `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0
pp_text = [
  preprocessing(text_data) if isinstance(text_data, str) else np.nan
  for text_data in train['text']
]

# add pre-processed column to dataset
train['pp_text'] = pp_text

In [None]:
# split the data into training and testing sets
# rows whose pre-processed text is NaN are dropped first: the TF-IDF vectoriser
# rejects NaN documents
# a fixed random_state keeps the split (and the tweet picked by index for the
# LIME explanation) the same on every run
labelled = train.dropna(subset=["pp_text"])
X_train, X_test, y_train, y_test = train_test_split(
  labelled["pp_text"], labelled["target"], random_state=0
)

In [None]:
# create bag-of-words with weights using the TF-IDF vectoriser
# strip accents and remove stop words during vectorisation
tf = TfidfVectorizer(strip_accents='ascii', stop_words='english')

# fit the vectoriser on the training set and transform it in one step
X_train_tf = tf.fit_transform(X_train)
# transform the test set with the already-fitted vectoriser (no re-fitting)
X_test_tf = tf.transform(X_test)

In [None]:
# create logistic regression model
logreg = LogisticRegression(verbose=1, random_state=0, penalty='l2', solver='newton-cg')
# train model on vectorised training data
model = logreg.fit(X_train_tf, y_train)
# evaluate model performance on the test set
pred = model.predict(X_test_tf)
# weighted F1 on the held-out split; left as the last expression so the cell displays it
sklearn.metrics.f1_score(y_test, pred, average='weighted')

### Using LIME

In [None]:
# importing the libraries
# (a __future__ import has to be the first statement of the cell)
from __future__ import print_function
import lime
import sklearn.ensemble
from lime import lime_text
from sklearn.pipeline import make_pipeline
from lime.lime_text import LimeTextExplainer

# converting the vectoriser and model into a pipeline
# this is necessary as LIME calls the prediction function with raw strings,
# so the vectoriser has to run before the model
c = make_pipeline(tf, model)

# saving a list of strings version of the X_test object
ls_X_test = list(X_test)

# saving the class names in a dictionary to increase interpretability
class_names = {0: 'non-disaster', 1: 'disaster'}

In [None]:
# plain-list copy of the test documents, so they can be indexed by position
ls_X_test = [document for document in X_test]

In [None]:
# preview the pre-processed test tweet at position 15
# (the same position is used as `idx` for the LIME explanation)
ls_X_test[15]

In [None]:
# create the LIME explainer
# LIME expects the class names as a list ordered by class index; given the dict
# directly it would label the classes with the dict keys (0 / 1) when rendering
# random_state makes the sampled perturbations, and so the explanation, repeatable
LIME_explainer = LimeTextExplainer(
  class_names=[class_names[label] for label in sorted(class_names)],
  random_state=0,
)

# choose a single prediction to explain (fixed position in the test list)
idx = 15
tweet = ls_X_test[idx]
# explain the chosen prediction
# use the probability results of the logistic regression
# can also add num_features parameter to reduce the number of features explained
LIME_exp = LIME_explainer.explain_instance(tweet, c.predict_proba)
# print results
print('Document id: %d' % idx)
print('Tweet: ', tweet)
# column 1 of predict_proba is the probability of class 1 (disaster)
print('Probability disaster =', c.predict_proba([tweet]).round(3)[0,1])
# positional lookup, because y_test keeps the shuffled dataframe index
print('True class: %s' % class_names.get(y_test.iloc[idx]))

In [None]:
# legend mapping the numeric class labels to their meaning
print("1 = disaster class, 0 = non-disaster class")
# show the explainability results with highlighted text
# (LIME_exp is the explanation returned by explain_instance for the chosen tweet)
LIME_exp.show_in_notebook(text=True)