# DEPENDENCIES

In [None]:
# if needed, install required packages
# %pip install -r ../requirements.txt

In [None]:
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from lime.lime_text import LimeTextExplainer
import matplotlib.pyplot as plt

In [None]:
# Hugging Face checkpoint id shared by the model and its tokenizer
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Inference only: switch the model to evaluation mode
model.eval()

In [None]:
def classifier_fn(texts, batch_size=64):
    """Return class probabilities for one or more texts.

    LIME calls this function with every perturbed sample at once (5000 by
    default), so the inputs are pushed through the model in mini-batches
    instead of a single large forward pass.

    Args:
        texts: A string or a sequence of strings to classify.
        batch_size: Maximum number of texts per forward pass.

    Returns:
        A numpy array of shape (len(texts), 2) with softmax probabilities,
        one row per input text.
    """
    # Accept a bare string as a single-element batch
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)

    # Nothing to classify: return an empty array with the documented width
    if not texts:
        return np.empty((0, 2), dtype=np.float32)

    batch_probs = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            inputs = tokenizer(
                texts[start:start + batch_size],
                padding=True,
                truncation=True,
                return_tensors="pt",
            )
            logits = model(**inputs).logits
            batch_probs.append(torch.softmax(logits, dim=1).numpy())
    return np.concatenate(batch_probs, axis=0)

In [None]:
# Label order matches the probability columns: index 0 = NEGATIVE, 1 = POSITIVE
class_names = ["NEGATIVE", "POSITIVE"]
# LIME samples perturbations randomly; a fixed seed makes the explanations
# reproducible from one notebook run to the next.
explainer = LimeTextExplainer(class_names=class_names, random_state=42)

### Basic LIME Explanation

In [None]:
# Single example sentence to explain
sentence = "I don't like this film."

# Ask LIME for up to 10 weighted words for this sentence
exp = explainer.explain_instance(sentence, classifier_fn, num_features=10)
def lime_plot(exp, path="lime_explanation.html"):
    """Save a LIME explanation as HTML and render it inline in the notebook.

    Args:
        exp: Explanation object exposing ``as_html()`` (as returned by
            ``explainer.explain_instance``).
        path: Output HTML file; it is overwritten on every call.
    """
    # Imported locally and explicitly so the helper does not rely on the
    # `display` global that IPython injects into notebook sessions.
    from IPython.display import HTML, display

    html = exp.as_html()

    # Explicit UTF-8 so non-ASCII characters in the HTML do not depend on the
    # platform's default encoding.
    with open(path, "w", encoding="utf-8") as f:
        f.write(html)

    # Render the in-memory HTML directly; reading the file back is unnecessary.
    display(HTML(html))

# Show the explanation for `sentence` inline (also saved to lime_explanation.html)
lime_plot(exp)

### Negation Tests

In [None]:
# Sentence variants probing how the model handles negation
negation_sentences = [
    "I like this movie.",
    "I don't like this movie.",
    "I do not like this movie.",
    "This movie is not good.",
    "This movie is not bad.",
    "This movie is not bad at all.",
]

negation_results = []

for sent in negation_sentences:
    # Probabilities for the unperturbed sentence, ordered as in class_names
    pred = classifier_fn([sent])[0]
    exp = explainer.explain_instance(sent, classifier_fn, num_features=8)
    important_words = exp.as_list()

    negation_results.append(
        dict(
            sentence=sent,
            prediction_NEG=float(pred[0]),
            prediction_POS=float(pred[1]),
            important_words=important_words,
        )
    )

    lime_plot(exp)

# Summary table: one row per sentence
pd.DataFrame(negation_results)

### Emotional Word Weighting

In [None]:
# Sentences that pair a positive and a negative emotional word
emotional_tests = [
    "The plot was boring but the cinematography was stunning.",
    "The movie was absolutely amazing but the acting was awful.",
    "The movie was decent but not great.",
    "The characters were fantastic and the music was terrible.",
]

emotional_results = []

for sent in emotional_tests:
    # Probabilities for the unperturbed sentence, ordered as in class_names
    probs = classifier_fn([sent])[0]
    exp = explainer.explain_instance(sent, classifier_fn, num_features=8)
    important = exp.as_list()

    emotional_results.append(
        dict(
            sentence=sent,
            positive_prob=float(probs[1]),
            negative_prob=float(probs[0]),
            important_words=important,
        )
    )

    lime_plot(exp)

# Summary table: one row per sentence
pd.DataFrame(emotional_results)


### Sarcasm Tests

In [None]:
# Sarcastic sentences whose surface wording is positive
sarcasm_tests = [
    "Great. Just what I needed today.",
    "Fantastic job ruining everything.",
    "I totally loved waiting 45 minutes in line.",
]

sarcasm_results = []

for sent in sarcasm_tests:
    # Probabilities for the unperturbed sentence, ordered as in class_names
    probs = classifier_fn([sent])[0]
    exp = explainer.explain_instance(sent, classifier_fn, num_features=8)
    important = exp.as_list()

    sarcasm_results.append(
        dict(
            sentence=sent,
            positive_prob=float(probs[1]),
            negative_prob=float(probs[0]),
            important_words=important,
        )
    )

    lime_plot(exp)

# Summary table: one row per sentence
pd.DataFrame(sarcasm_results)

### Fairness Tests

In [None]:
# Sentence pairs that differ only in the gendered pronoun
fairness_tests = [
    "He is a doctor.",
    "She is a doctor.",
    "He is a nurse.",
    "She is a nurse.",
    "He is a leader.",
    "She is a leader.",
]

fairness_results = []

for sent in fairness_tests:
    # Probabilities for the unperturbed sentence, ordered as in class_names
    probs = classifier_fn([sent])[0]
    exp = explainer.explain_instance(sent, classifier_fn, num_features=5)
    important = exp.as_list()

    fairness_results.append(
        dict(
            sentence=sent,
            positive_prob=float(probs[1]),
            negative_prob=float(probs[0]),
            important_words=important,
        )
    )

    lime_plot(exp)

# Summary table: one row per sentence
pd.DataFrame(fairness_results)

### Adversarial / Misspelling Robustness

In [None]:
# Misspelled / re-punctuated variants of "good", plus a "bad" sentence
adversarial_tests = [
    "This movie was good.",
    "This movie was gooood.",
    "This movie was gud.",
    "This movie was goood!",
    "This movie was good??",
    "This movie was bad.",
]

adv_results = []

for sent in adversarial_tests:
    # Probabilities for the unperturbed sentence, ordered as in class_names
    probs = classifier_fn([sent])[0]
    exp = explainer.explain_instance(sent, classifier_fn, num_features=6)
    important = exp.as_list()

    adv_results.append(
        dict(
            sentence=sent,
            positive_prob=float(probs[1]),
            negative_prob=float(probs[0]),
            important_words=important,
        )
    )

    lime_plot(exp)

# Summary table: one row per sentence
pd.DataFrame(adv_results)

### Mixed Sentiment / Multi-Clause

In [None]:
# Multi-clause sentences mixing positive and negative statements
mixed_tests = [
    "The acting was amazing but the plot was boring.",
    "The visuals were incredible but the writing was weak.",
    "The first half was great, the second half was terrible.",
]

mixed_results = []

for sent in mixed_tests:
    # Probabilities for the unperturbed sentence, ordered as in class_names
    probs = classifier_fn([sent])[0]
    exp = explainer.explain_instance(sent, classifier_fn, num_features=8)
    important = exp.as_list()

    mixed_results.append(
        dict(
            sentence=sent,
            positive_prob=float(probs[1]),
            negative_prob=float(probs[0]),
            important_words=important,
        )
    )

    lime_plot(exp)

# Summary table: one row per sentence
pd.DataFrame(mixed_results)

### How LIME Perturbation Works with LIME Maps

In [None]:
# Re-explain the basic example sentence
sentence = "I don't like this film."

exp = explainer.explain_instance(
    sentence,
    classifier_fn,
    num_features=10,
)

# Raw importance map as exposed by the explanation's as_map()
lime_map = exp.as_map()
lime_map

### Most influential words (mean LIME weight) across all categories:

In [None]:
# Pool the sentences from every test category. The adversarial/misspelling
# set was previously left out even though this summary is meant to span all
# categories.
all_sentences = (
    negation_sentences
    + emotional_tests
    + sarcasm_tests
    + fairness_tests
    + adversarial_tests
    + mixed_tests
)

# Flat list of (word, LIME weight) pairs collected over all sentences
word_importance_records = []

for sent in all_sentences:
    exp = explainer.explain_instance(sent, classifier_fn, num_features=10)
    word_importance_records.extend(exp.as_list())

df_importance = pd.DataFrame(word_importance_records, columns=["word", "importance"])
# Ten words with the lowest mean weight
df_importance.groupby("word")["importance"].mean().sort_values(ascending=True).head(10)

In [None]:
# Ten words with the highest mean weight
(
    df_importance
    .groupby("word")["importance"]
    .mean()
    .sort_values(ascending=False)
    .head(10)
)