In [1]:
from datasets import load_dataset
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
from transformers import create_optimizer, AutoTokenizer, DefaultDataCollator, TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = load_dataset('lucadiliello/newsqa')

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'answers', 'key', 'labels'],
        num_rows: 74160
    })
    validation: Dataset({
        features: ['context', 'question', 'answers', 'key', 'labels'],
        num_rows: 4212
    })
})

In [4]:
# Train/test split
train_test_split = dataset['train'].train_test_split(test_size=0.3)
train_partition = train_test_split['train']
test_partition = train_test_split['test']

# Update the dataset
dataset['train'] = train_partition
dataset['test'] = test_partition
dataset

DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'answers', 'key', 'labels'],
        num_rows: 51912
    })
    validation: Dataset({
        features: ['context', 'question', 'answers', 'key', 'labels'],
        num_rows: 4212
    })
    test: Dataset({
        features: ['context', 'question', 'answers', 'key', 'labels'],
        num_rows: 22248
    })
})

In [6]:
# Tokenizer and model
# model_checkpoint = 'distilbert/distilbert-base-uncased'
model_checkpoint = 'mrbach/extractive_question_answering'

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

2024-11-06 11:36:38.237438: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2024-11-06 11:36:38.237469: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 32.00 GB
2024-11-06 11:36:38.237480: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 10.67 GB
2024-11-06 11:36:38.237528: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-06 11:36:38.237547: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
All PyTorch model weights were used when initializing TFDistilBertForQuestionAnswering.

All the weights of TFDistilBertForQuestionAnswering were initialized from the PyTorch model.

In [7]:
# Note - this function is based on some example code found at https://huggingface.co/docs/transformers/tasks/question_answering
def tokenize_and_label(examples):
    questions = [q.strip() for q in examples['question']]
    inputs = tokenizer(
        questions,
        examples['context'],
        max_length=384,
        truncation='only_second',  # Only truncate the context, if needed
        return_offsets_mapping=True,
        padding='max_length',
    )

    offset_mapping = inputs.pop('offset_mapping')
    answers = examples['answers']
    labels = examples['labels']
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        label = labels[i][0]
        start_char = label['start'][0]
        end_char = label['end'][0]
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
            
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs['start_positions'] = start_positions
    inputs['end_positions'] = end_positions
    
    return inputs

In [8]:
# Tokenize and label
tokenized_dataset = dataset.map(tokenize_and_label, batched=True)

# Remove the original label columns (otherwise the data collator gets confused)
tokenized_dataset['train'] = tokenized_dataset['train'].remove_columns(['labels'])
tokenized_dataset['test'] = tokenized_dataset['test'].remove_columns(['labels'])
tokenized_dataset['validation'] = tokenized_dataset['validation'].remove_columns(['labels'])

tokenized_dataset

Map: 100%|██████████| 51912/51912 [00:21<00:00, 2418.80 examples/s]
Map: 100%|██████████| 4212/4212 [00:01<00:00, 2392.49 examples/s]
Map: 100%|██████████| 22248/22248 [00:09<00:00, 2416.89 examples/s]


DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'answers', 'key', 'input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 51912
    })
    validation: Dataset({
        features: ['context', 'question', 'answers', 'key', 'input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 4212
    })
    test: Dataset({
        features: ['context', 'question', 'answers', 'key', 'input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 22248
    })
})

In [9]:
def answer_first_n(n=5):
    for i in range(n):
        # Inputs
        question = tokenized_dataset['test'][i]['question']
        context = tokenized_dataset['test'][i]['context']
        true_answer = tokenized_dataset['test'][i]['answers'][0]
        inputs = tokenized_dataset['test'][i]['input_ids']
        
        # Make prediction
        scores = model(input_ids=tf.convert_to_tensor(inputs))
        start_index = tf.math.argmax(scores['start_logits'], axis=1).numpy()[0]
        end_index = tf.math.argmax(scores['end_logits'], axis=1).numpy()[0]
        pred_answer = tokenizer.decode(inputs[start_index:end_index + 1])
        
        # Print the results
        print(f'Question: {question}')
        print(f'True answer: {true_answer}')
        print(f'Predicted answer: {pred_answer}\n')

In [10]:
answer_first_n()

Question: Who was in a standoff with the U.S. over nuclear weapons?
True answer: North Korea
Predicted answer: tense

Question: Which player scored a hat-trick?
True answer: Bendtner
Predicted answer: nicklas bendtner

Question: Where is Momeni being held?
True answer: Tehran's notorious Evin Prison
Predicted answer: 

Question: How many people died?
True answer: 25
Predicted answer: 25

Question: What injury was he suffering from?
True answer: lost his entire body below the hips.
Predicted answer: triumph and unbearable tragedy. andrew kinard testifies before a senate armed services subcommittee on april 29, 2009. but i would not actually know everything that happened until the night was long over. a couple of weeks before july 15, a friend who works with injured troops emailed me to say it was time for andrew's going away party. andrew kinard is a young marine i first met a few years ago at walter reed army medical center in washington where he was recovering from a devastating ied a

In [11]:
def test_metrics(n=1000):
    exact_matches, f1_scores = [], []

    for i in range(n):
        true_answer = tokenized_dataset['test'][i]['answers'][0]
        inputs = tokenized_dataset['test'][i]['input_ids']
        scores = model(input_ids=tf.convert_to_tensor(inputs))
        start_index = tf.math.argmax(scores['start_logits'], axis=1).numpy()[0]
        end_index = tf.math.argmax(scores['end_logits'], axis=1).numpy()[0]
        pred_answer = tokenizer.decode(inputs[start_index:end_index + 1])

        exact_matches.append(pred_answer.strip().lower() == true_answer.strip().lower())
        true_tokens = set(true_answer.split())
        pred_tokens = set(pred_answer.split())
        common_tokens = true_tokens.intersection(pred_tokens)
        precision = len(common_tokens) / len(pred_tokens) if pred_tokens else 0
        recall = len(common_tokens) / len(true_tokens) if true_tokens else 0
        if precision + recall == 0:
            f1 = 0
        else:
            f1 = 2 * (precision * recall) / (precision + recall)
        f1_scores.append(f1)

    # Exact matches
    em = sum(exact_matches) / len(exact_matches)

    # F1 score
    f1 = sum(f1_scores) / len(f1_scores)

    # Print the metrics
    print(f'Exact matches: {round(em, 3)}')
    print(f'F1: {round(f1, 3)}')

In [12]:
test_metrics()

Exact matches: 0.046
F1: 0.062


In [13]:
batch_size = 16
n_epochs = 1
total_train_steps = (len(tokenized_dataset['train']) // batch_size) * n_epochs
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)



In [14]:
data_collator = DefaultDataCollator(return_tensors='tf')

tf_train_set = model.prepare_tf_dataset(
    tokenized_dataset['train'],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_dataset['validation'],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)

In [15]:
model.compile(optimizer=optimizer)

In [16]:
model_checkpoint = ModelCheckpoint(f'./finetuned_{model_checkpoint}', save_best_only=True, verbose=1)

model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=n_epochs, callbacks=[model_checkpoint])

2024-11-06 11:40:11.822493: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.
2024-11-06 11:40:12.249477: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] PluggableGraphOptimizer failed: INVALID_ARGUMENT: Failed to deserialize the `graph_buf`.




2024-11-06 12:30:10.637515: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] PluggableGraphOptimizer failed: INVALID_ARGUMENT: Failed to deserialize the `graph_buf`.



Epoch 1: val_loss improved from inf to 1.94571, saving model to ./finetuned_mrbach/extractive_question_answering
























INFO:tensorflow:Assets written to: ./finetuned_mrbach/extractive_question_answering/assets


INFO:tensorflow:Assets written to: ./finetuned_mrbach/extractive_question_answering/assets




<keras.src.callbacks.History at 0x379c7ba30>

In [17]:
answer_first_n()

Question: Who was in a standoff with the U.S. over nuclear weapons?
True answer: North Korea
Predicted answer: christiane amanpour

Question: Which player scored a hat-trick?
True answer: Bendtner
Predicted answer: nicklas bendtner

Question: Where is Momeni being held?
True answer: Tehran's notorious Evin Prison
Predicted answer: in a section of tehran's notorious evin prison

Question: How many people died?
True answer: 25
Predicted answer: 25

Question: What injury was he suffering from?
True answer: lost his entire body below the hips.
Predicted answer: a devastating ied attack in iraq



In [18]:
test_metrics()

Exact matches: 0.172
F1: 0.17


In [19]:
print(tokenized_dataset['test'][0]['context'])

(CNN Student News) -- Record the CNN Special Investigations Unit Classroom Edition: Notes from North Korea when it airs commercial-free on CNN. (A short feature begins at 4:00 a.m. and precedes the program.)



Program Overview



CNN chief international correspondent Christiane Amanpour travels to North Korea as the New York Philharmonic Orchestra makes a historic visit to one of the world's most closed societies. She examines the tense standoff with the U.S. over nuclear weapons and provides a rare look inside a notorious, top-secret nuclear facility.



Grade Levels: 9 -- 12, College



Subject Areas: U.S. History, World History, Current Events, Political Science, Government



Objectives



The CNN Special Investigations Unit Classroom Edition: Notes from North Korea and its corresponding discussion questions and suggested activities challenge students to:



Curriculum Connections



Social Studies



Standard VI. Power, Authority, and Governance: Social studies programs should in

In [20]:
print(tokenized_dataset['test'][3]['context'])

(CNN) -- Trains and text messages made a deadly combination when two locomotives collided head-on last year near Los Angeles, California, witnesses told an investigative panel this week.



Firefighters and investigators inspect the wreckage a day after a train collision in California killed 25 people.



Metrolink commuter train engineer Robert Sanchez missed a stop signal while trading text messages with a friend on September 12, leading to a collision with a Union Pacific freight train that killed Sanchez and 24 other people in Chatsworth, California.



The accident injured 101 people and caused $10.6 million in damages, according to a report by federal investigators.



One National Transportation Safety Board member worries other disasters loom on the nation's rail system.



"One train, one day, one crew. It raises questions for me as to what the heck else is going on out there," said Kitty Higgins, chairwoman of a two-day NTSB hearing in Washington on the accident.



Sanchez v

In [22]:
positive_question = 'What does Rihanna think about the heavy chain?'
negative_question = 'What is the U.S. military doing?'
positive_context = tokenized_dataset['test'][0]['context']
negative_context = tokenized_dataset['test'][3]['context']

positive_encoding = tokenizer.encode_plus(text=positive_question, text_pair=positive_context)
negative_encoding = tokenizer.encode_plus(text=negative_question, text_pair=negative_context)
positive_inputs = positive_encoding['input_ids']
negative_inputs = negative_encoding['input_ids']

positive_scores = model(input_ids=tf.convert_to_tensor(positive_inputs))
negative_scores = model(input_ids=tf.convert_to_tensor(negative_inputs))
start_index_p = tf.math.argmax(positive_scores['start_logits'], axis=1).numpy()[0]
end_index_p = tf.math.argmax(positive_scores['end_logits'], axis=1).numpy()[0]
start_index_n = tf.math.argmax(negative_scores['start_logits'], axis=1).numpy()[0]
end_index_n = tf.math.argmax(negative_scores['end_logits'], axis=1).numpy()[0]
positive_answer = tokenizer.decode(positive_inputs[start_index_p:end_index_p + 1])
negative_answer = tokenizer.decode(negative_inputs[start_index_n:end_index_n + 1])

print(f'Positive answer: {positive_answer}')
print(f'Negative answer: {negative_answer}')

Positive answer: [CLS]
Negative answer: [CLS]


In [47]:
classifier_tok_checkpoint = 'albert/albert-base-v2'
classifier_model_checkpoint = './albert/albert-base-v2_finetuned_sentiment/checkpoint-10000/'

classifier_tokenizer = AutoTokenizer.from_pretrained(classifier_tok_checkpoint)
classifier_model = TFAutoModelForSequenceClassification.from_pretrained(classifier_model_checkpoint, local_files_only=True, num_labels=2)

All PyTorch model weights were used when initializing TFAlbertForSequenceClassification.

All the weights of TFAlbertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFAlbertForSequenceClassification for predictions without further training.


In [79]:
positive_encoded = tokenizer(positive_answer, truncation=True, padding='max_length', max_length=35, return_tensors='tf')['input_ids']
negative_encoded = tokenizer(negative_answer, truncation=True, padding='max_length', max_length=35, return_tensors='tf')['input_ids']

In [80]:
positive_pred = classifier_model(input_ids=positive_encoded).logits.numpy()
negative_pred = classifier_model(input_ids=negative_encoded).logits.numpy()

print(f'Positive article predicted class: {positive_pred.argmax()}')
print(f'Positive article predicted logits: {positive_pred[0]}')
print(f'Negative article predicted class: {negative_pred.argmax()}')
print(f'Negative article predicted logits: {negative_pred[0]}')

Positive article predicted class: 0
Positive article predicted logits: [0.7568402  0.04390718]
Negative article predicted class: 0
Negative article predicted logits: [0.60612404 0.01932076]
