# NLP: Model Interpretability Using LIME

In [None]:
# General imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords

Data source (Kaggle, IMDB Dataset of 50K Movie Reviews): https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [None]:
# Load the IMDB reviews CSV into a DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
df = pd.read_csv('data/IMDB_Reviews.csv')

In [None]:
# Preview the first few rows to see which columns we have
df.head()

In [None]:
# Summary of the columns: dtypes and non-null counts
df.info()

In [None]:
# Our target value: list the distinct labels found in the `sentiment` column
df['sentiment'].unique()

### Pre-Split Preprocessing

Doing some initial preprocessing that can be done before the train/test split

In [None]:
# Let's check out an example review...
index_num = 15903 # Row position (used with .iloc) of the review to explore

# Show the raw text of that review
df['review'].iloc[index_num]

We have some HTML tags inside these texts... will want to remove them. But how?

Enter: Regular Expressions (regex).

Testing: https://regexr.com/

In [None]:
# Find the pattern to remove html tags
import re

# Matches "<", then any run of characters other than ">", then the closing ">"
html_tag_pattern = re.compile(r'<[^>]*>')

# Try it on the example review.
# Each tag is swapped for a space (not an empty string) so the words on either
# side of a tag -- e.g. "great.<br /><br />The" -- are not glued together.
test = html_tag_pattern.sub(' ', df['review'].iloc[index_num])

In [None]:
# Display the example review after the HTML tag pattern was applied to it
test

In [None]:
# Apply our pattern to the whole dataset.
# Tags are replaced with a space rather than an empty string: reviews often
# contain text like "great.<br /><br />The", and deleting the tags outright
# would fuse the neighbouring words into one token ("great.The").
df['review'] = df['review'].map(lambda x: re.sub(r'<[^>]*>', ' ', x))

# Same as
# df['review'] = df['review'].map(lambda x: html_tag_pattern.sub(' ', x))

In [None]:
# Sanity check: the same example review should no longer contain HTML tags
df['review'].iloc[index_num]

Let's also remove stopwords

In [None]:
# Load NLTK's English stopword list.
# On a fresh environment the corpus is not installed and NLTK raises
# LookupError, so download it on demand and retry.
try:
    stop_words = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stop_words = stopwords.words('english')

In [None]:
# Drop stopwords from every review.
# `stop_words` is a list, so testing membership against it for every word of
# every review is a linear scan each time; a set makes each lookup constant-time.
stop_word_set = set(stop_words)

# Split on whitespace, keep words whose lowercase form is not a stopword,
# then stitch the survivors back together with single spaces.
df['review'] = df['review'].apply(
    lambda text: ' '.join(
        word for word in text.split() if word.lower() not in stop_word_set))

Can also pre-process our target variable

In [None]:
# Lookup table that turns each text label into its numeric class
target_map = dict(positive=1, negative=0)

In [None]:
# Map it: replace each text label with its number from target_map.
# NOTE: this overwrites the column in place. Running the cell a second time
# would look up 1/0 in target_map, find no such keys, and leave NaN everywhere,
# so reload the data before re-running.
df['sentiment'] = df['sentiment'].map(target_map)

In [None]:
# Sanity check: `sentiment` should now show the mapped numbers instead of text labels
df.head()

### Split, and then Post-Split Processing
Now let's perform a train/test split:

In [None]:
# Features are the review text, labels are the sentiment column
X, y = df['review'], df['sentiment']

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# How many reviews ended up in the training split
X_train.shape

In [None]:
# Need to find that same review now that the index is shuffled.
# Take the index label of the review picked earlier (row position `index_num`
# in df) rather than repeating the number by hand, then ask X_train where that
# label sits positionally.
# Raises KeyError if the review landed in the test split instead.
train_index_num = X_train.index.get_loc(df.index[index_num])
X_train.iloc[train_index_num]

### Vanilla Text Classification... What Would We Do?

Aka what would this look like without a NN?

In [None]:
# Let's use a TF-IDF vectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# What parameters should we set? What steps have we already done, what do we still need to do?
# Already removed stopwords!
vectorizer = TfidfVectorizer(
    max_df=.95,  # ignores words that appear in more than 95% of docs
    min_df=2 # ignores words that appear in fewer than 2 docs (i.e. in only a single review)
)

In [None]:
# Learn the vocabulary and IDF weights from the training text only,
# vectorizing it in the same pass
X_train_vec = vectorizer.fit_transform(X_train)

# Re-use the fitted vocabulary and weights on the held-out text
X_test_vec = vectorizer.transform(X_test)

#### Explore Our Vectorized Text

In [None]:
# Let's look at our example review again, now located by its position in X_train
X_train.iloc[train_index_num]

In [None]:
# Positional location of the example review inside X_train
train_index_num

In [None]:
# Boolean mask of the training reviews whose text contains this phrase
phrase_mask = X_train.str.contains("CHILDREN'S MOVIE!!!")

# Show only the matching reviews
X_train.loc[phrase_mask]

In [None]:
# Creating a df of tf-idf values, where each column is a word in the vocabulary.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. Fall back to the old name only
# when running on an older install that lacks the new method.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab_terms = vectorizer.get_feature_names_out()
else:
    vocab_terms = vectorizer.get_feature_names()

# NOTE: .toarray() turns the sparse matrix into a dense
# (documents x vocabulary) array, which can take a lot of memory.
tfidf_train_df = pd.DataFrame(X_train_vec.toarray(),
                              columns=vocab_terms,
                              index=X_train.index)

In [None]:
# Pull out the tf-idf row that belongs to our example review
test_doc = tfidf_train_df.iloc[train_index_num]

# Keep only the words that actually occur in it (weight above zero)...
nonzero_weights = test_doc[test_doc > 0]

# ...and list the 15 heaviest ones, largest first
nonzero_weights.sort_values(ascending=False).head(15)

What does this tell you about the word "censure" in this document?

- 


In [None]:
# Now let's model: a multinomial Naive Bayes classifier with its default settings
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()

In [None]:
# Train on the vectorized training reviews and their labels
classifier.fit(X_train_vec, y_train)

# Score on the held-out test set (only this last expression is displayed by the cell)
classifier.score(X_test_vec, y_test)

Evaluate:

- 


## Model Interpretability Using LIME!

We'll follow this example: https://marcotcr.github.io/lime/tutorials/Lime%20-%20basic%20usage%2C%20two%20class%20case.html

We'll need to install LIME!

In [None]:
!pip install lime

### Let's Follow the Example And Put This Into Practice!

In [None]:
from lime import lime_text
from sklearn.pipeline import make_pipeline

In [None]:
# Chain the vectorizer and the classifier into one pipeline object,
# so raw review text can be passed in and vectorized before classification
c = make_pipeline(vectorizer, classifier)

In [None]:
# Keep following this example!