In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

from datasets.utils import disable_progress_bar
disable_progress_bar()

from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import urllib.request
import os
import json
import gzip
from transformers import create_optimizer
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TFAutoModelForSequenceClassification
from datasets import Dataset
from lime.lime_text import LimeTextExplainer
import pandas as pd
import nltk
stop_words_list = nltk.corpus.stopwords.words('english')
[stop_words_list.append(arg) for arg in [".", "*", "!", "'", ":", '"', "?!", ",", "(", ")", "?"]]

from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/pwesolowski/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [4]:
DATA_PATH = "./data"

In [5]:
def get_reviews_from_tropes(filename):
    with open(filename, "rb") as f:
        reviews = []
        for line in tqdm(f):
            book = json.loads(line)
            reviews.append(
                {
                    "label": book["has_spoiler"],
                    "sentences": book["sentences"]
                }
            )
        return reviews


tropes_train = get_reviews_from_tropes(
    f"{DATA_PATH}/tropes/tvtropes_books-train.json"
)
tropes_test = get_reviews_from_tropes(
    f"{DATA_PATH}/tropes/tvtropes_books-test.json"
)
tropes_val = get_reviews_from_tropes(
    f"{DATA_PATH}/tropes/tvtropes_books-val.json"
)

273262it [00:02, 93616.80it/s] 
34158it [00:00, 209026.16it/s]
34158it [00:00, 52728.06it/s] 


In [6]:
model_name = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
)


def preprocess_function(data):
    return tokenizer(data["text"], truncation=True)


data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
if model_name == "distilbert-base-uncased":
    model = TFAutoModelForSequenceClassification.from_pretrained(
        model_name, dropout=0.2
    )
elif model_name == "bert-base-uncased":
    model = TFAutoModelForSequenceClassification.from_pretrained(
        model_name, classifier_dropout=0.1
    )

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
checkpoint_name = f"./checkpoints/best_val_tropes_{model_name}"
model.load_weights(checkpoint_name)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f7425eea860>

In [8]:
def create_tf_dataset(review_list):
    tokenized_reviews = Dataset.from_list([{"text": x} for x in review_list]).map(preprocess_function, batched=True)
    return model.prepare_tf_dataset(
        tokenized_reviews, shuffle=False, batch_size=32, collate_fn=data_collator
    )

In [9]:
def predictor(texts):
    tf_data = create_tf_dataset(texts)
#     outputs = model(**tokenizer(texts, return_tensors="tf", truncation=True))
    outputs = model.predict(tf_data, verbose=0)
    probas = tf.nn.softmax(outputs.logits).numpy()
    return probas

In [10]:
class_names = ["nonspoiler", "spoiler"]
explainer = LimeTextExplainer(class_names=class_names)

In [11]:
def merge_sentences(data_item):
    return " ".join(sentence[1] for sentence in data_item["sentences"])

In [12]:
def word_(word: str) -> str:
    for i in [".", "*", "!", "'", ":", '"', "?!", ",", "(", ")", "?", ";"]:
        word = word.replace(i, "")
    return word.lower()

In [13]:
def dict_try(word):
    try:
        return exp_dct[word]
    except:
        return 0

In [14]:
def explain_sentece(sentence):
    review_to_predict = merge_sentences(sentence)
    
    review = [word_(word) for word in review_to_predict.split(" ") if word_(word) not in stop_words_list]
    review = " ".join(review)
    
    exp = explainer.explain_instance(review, predictor, num_features=10, num_samples=1000)
    
    exp_dct = dict(exp.as_list())
    exp_dct = {key: value for key, value in exp_dct.items() if value > 0}

    lst = []
    for sen in tropes_train[0]['sentences']:
        sen_ = [word_(word) for word in sen[1].split(" ")]        
        lst.append(sum([dict_try(word) for word in sen_]))
        
    arr_troops = np.array([false_true[0] for false_true in sentence['sentences']])
    max_int = arr_troops.sum()
    choice_troop = arr_troops.argsort()[-max_int:][::-1]    
    
    arr_exp = np.array(lst)
    chooice_exp = arr_exp.argsort()[-max_int:][::-1]
    
    if list(set(chooice_exp).intersection(set(choice_troop))):
        val = 1
    else:
        val = 0
    
    return val, sum([1 for chooice in chooice_exp.tolist() if chooice in choice_troop.tolist()])/len(choice_troop)

In [15]:
t_t = [tropes_train_ for tropes_train_ in tropes_train if tropes_train_['label'] == True and len(tropes_train_['sentences']) > 4 and len(tropes_train_['sentences']) < 12]
len(t_t)

5337

In [16]:
t_t = t_t[:200]
len(t_t)

200

In [17]:
lst = [explain_sentece(tropes_train_) for tropes_train_ in tqdm(t_t, position=0, leave=True)]

  0%|                                                   | 0/200 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|█████████████████████████████████████████| 200/200 [11:44<00:00,  3.52s/it]


In [18]:
def empty_lst(lst):
    if not lst:
        return 0
    else:
        return 1

In [19]:
def intersection_of_explenation(review_):
    review_to_predict = merge_sentences(review_)

    review = [word_(word) for word in review_to_predict.split(" ") if word_(word) not in stop_words_list]
    review = " ".join(review)

    exp = explainer.explain_instance(review, predictor, num_features=10, num_samples=1000)

    exp_dct = dict(exp.as_list())
    post_val = {key: value for key, value in exp_dct.items() if value > 0}

    sens = [sen[1][sen[2][0][0]: sen[2][0][1]] for sen in review_['sentences'] if sen[0] == True]

    spoilers = word_tokenize(" ".join(sens))
    spoilers = [spoiler.lower() for spoiler in spoilers]
    
    post_val_5 = list(post_val.keys())[0:5]
    post_val_1 = list(post_val.keys())[0]
    
    find_words_10 = list(set(post_val).intersection(set(spoilers)))
    find_words_5 = list(set(post_val_5).intersection(set(spoilers)))
    find_words_1 = list(set(post_val_1).intersection(set(spoilers)))
    
    return (empty_lst(find_words_10), empty_lst(find_words_5), empty_lst(find_words_1))

In [20]:
results = [intersection_of_explenation(sen) for sen in tqdm(t_t, position=0, leave=True)]

100%|█████████████████████████████████████████| 200/200 [11:41<00:00,  3.51s/it]


In [21]:
print(lst)

[(1, 1.0), (1, 0.42857142857142855), (1, 0.16666666666666666), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (1, 0.25), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (1, 0.16666666666666666), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (1, 0.16666666666666666), (1, 0.25), (0, 0.0), (0, 0.0), (0, 0.0), (1, 0.3333333333333333), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (1, 0.3333333333333333), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (1, 0.3333333333333333), (0, 0.0), (1, 0.16666666666666666), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (1, 0.5), (0, 0.0), (1, 0.5), (0, 0.0), (0, 0.0), (1, 0.5714285714285714), (0, 0.0), (1, 0.2), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (0, 0.0), (1, 0.16666666666666666), (0, 0.0), (0, 0.0), (

In [22]:
print(results)

[(1, 1, 0), (1, 1, 1), (1, 1, 1), (1, 1, 0), (1, 1, 1), (1, 1, 0), (1, 1, 1), (1, 1, 0), (0, 0, 0), (1, 1, 0), (1, 1, 1), (1, 1, 0), (1, 1, 0), (1, 1, 0), (1, 1, 0), (1, 1, 1), (1, 1, 1), (1, 1, 1), (1, 1, 0), (1, 1, 1), (1, 1, 0), (1, 1, 0), (1, 1, 1), (1, 1, 1), (1, 1, 0), (1, 1, 0), (1, 1, 0), (0, 0, 0), (1, 1, 0), (1, 1, 0), (1, 1, 1), (1, 1, 0), (1, 1, 1), (1, 1, 1), (1, 1, 0), (0, 0, 0), (1, 1, 0), (0, 0, 0), (1, 1, 1), (0, 0, 0), (1, 1, 0), (1, 1, 0), (1, 1, 0), (1, 1, 1), (1, 1, 1), (1, 1, 0), (0, 0, 0), (1, 1, 1), (1, 1, 0), (1, 1, 0), (1, 1, 1), (1, 1, 0), (1, 1, 0), (1, 1, 1), (0, 0, 0), (1, 1, 0), (0, 0, 0), (1, 1, 0), (0, 0, 0), (0, 0, 0), (1, 1, 0), (1, 1, 0), (1, 1, 0), (1, 1, 0), (1, 1, 0), (1, 1, 1), (1, 1, 0), (1, 1, 0), (0, 0, 1), (1, 1, 0), (1, 1, 0), (1, 1, 1), (1, 1, 1), (1, 1, 1), (1, 1, 0), (1, 1, 1), (1, 1, 1), (1, 1, 0), (1, 1, 0), (0, 0, 0), (1, 1, 0), (1, 1, 0), (0, 0, 0), (1, 1, 1), (1, 1, 0), (1, 1, 0), (1, 1, 1), (1, 1, 0), (1, 1, 0), (1, 1, 0), (1, 1, 0)

## Średnia po pierwszym i drugim elemencie w lst

In [23]:
np.mean([x[0] for x in lst]), np.mean([x[1] for x in lst])

(0.16, 0.0638095238095238)

## Średnie po trzech elementach w results

In [24]:
np.mean([x[0] for x in results]), np.mean([x[1] for x in results]), np.mean([x[2] for x in results])

(0.825, 0.815, 0.335)