## Linear Probe for Word Boundaries

Adding a linear probe to the pre-trained model to see if it can detect word boundaries.

In [1]:
import sys
sys.path.append("../")
import pandas as pd
import numpy as np
import torch
from torch.nn import CrossEntropyLoss
from typing import Optional, Tuple, Union
from transformers import AutoTokenizer, GPT2ForTokenClassification
from transformers.modeling_outputs import TokenClassifierOutput
from datasets import load_dataset

## Load Corpus

In [2]:
from datasets import load_dataset
dataset = load_dataset('transformersegmentation/CHILDES_EnglishNA')


No config specified, defaulting to: childes_english_na/full
Found cached dataset childes_english_na (/Users/zebulongoriely/.cache/huggingface/datasets/transformersegmentation___childes_english_na/full/1.0.0/fae6154891a4bae471c04c7f324ef76b5da7f8c32981606c5ceec3d1373563cf)


  0%|          | 0/3 [00:00<?, ?it/s]

## Define New Model

In [3]:
class GPT2Segmenter:
    def __init__(self, model_path, tokenizer_path):
        self.model = GPT2ForTokenClassification.from_pretrained(model_path)
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        
    def segment(self, text):
        input_ids = self.tokenizer.encode(text, return_tensors='pt')
        output = self.model(input_ids)
        logits = output.logits
        predictions = np.argmax(logits.detach().numpy(), axis=2)
        tokens = self.tokenizer.convert_ids_to_tokens(input_ids[0])
        return pd.DataFrame({'token': tokens, 'prediction': predictions[0]})

In [4]:
class GPT2Probe(GPT2ForTokenClassification):

    def __init__(self, config):
        config.num_labels = 2
        super().__init__(config)

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Freeze the transformer layer
        with torch.no_grad():
            transformer_outputs = self.transformer(
                input_ids,
                past_key_values=past_key_values,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                use_cache=use_cache,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )

        hidden_states = transformer_outputs[0]
        hidden_states = self.dropout(hidden_states)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + transformer_outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

## Load Model

In [5]:
model = GPT2Probe.from_pretrained('transformersegmentation/GPT2-gpt2_lm_head_model-model')
tokenizer = AutoTokenizer.from_pretrained('transformersegmentation/Space-Tokenizer')

Some weights of the model checkpoint at transformersegmentation/GPT2-gpt2_lm_head_model-model were not used when initializing GPT2Probe: ['lm_head.weight']
- This IS expected if you are initializing GPT2Probe from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2Probe from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPT2Probe were not initialized from the model checkpoint at transformersegmentation/GPT2-gpt2_lm_head_model-model and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
from src.preprocessing import DataPreprocessor
from src.config import DataPreprocessingParams

dp = DataPreprocessor(DataPreprocessingParams(max_input_length=64, join_utts=False), tokenizer, labels_are_word_boundaries=True)

In [7]:
processed_dataset = dataset.map(
        dp,
        batched=True,
        # num_proc=64,
        remove_columns=["text"],
)

# Remove all items that are shorter than the minimum length
processed_dataset = processed_dataset.filter(
    lambda x: len(x["input_ids"]) == 64
)


Map:   0%|          | 0/898356 [00:00<?, ? examples/s]

Loading cached processed dataset at /Users/zebulongoriely/.cache/huggingface/datasets/transformersegmentation___childes_english_na/full/1.0.0/fae6154891a4bae471c04c7f324ef76b5da7f8c32981606c5ceec3d1373563cf/cache-61e2693e224df5e6.arrow
Loading cached processed dataset at /Users/zebulongoriely/.cache/huggingface/datasets/transformersegmentation___childes_english_na/full/1.0.0/fae6154891a4bae471c04c7f324ef76b5da7f8c32981606c5ceec3d1373563cf/cache-fe3230628325abab.arrow


Filter:   0%|          | 0/898356 [00:00<?, ? examples/s]

Loading cached processed dataset at /Users/zebulongoriely/.cache/huggingface/datasets/transformersegmentation___childes_english_na/full/1.0.0/fae6154891a4bae471c04c7f324ef76b5da7f8c32981606c5ceec3d1373563cf/cache-5d3c002a83769b83.arrow
Loading cached processed dataset at /Users/zebulongoriely/.cache/huggingface/datasets/transformersegmentation___childes_english_na/full/1.0.0/fae6154891a4bae471c04c7f324ef76b5da7f8c32981606c5ceec3d1373563cf/cache-ac3151bdb6ab61e2.arrow


In [8]:
model(torch.tensor(processed_dataset['input_ids'][0]), labels=torch.tensor(processed_dataset['labels'][0], dtype=torch.long))

KeyError: 'input_ids'

In [None]:
# Train the model
from transformers import Trainer, TrainingArguments

import evaluate
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return clf_metrics.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,             # total # of training epochs
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=100,
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=processed_dataset,         # training dataset
    eval_dataset=processed_dataset,          # evaluation dataset
)


In [None]:
NUM_EPOCHS = 10
# BEST_MODEL_PATH = 'best_model.pth'
best_accuracy = 0.0

import torch.optim as optim
import torch.nn.functional as F
from src.data.data import subsequent_mask
from src.segmentation.evaluate import evaluate

optimizer = optim.SGD(probe.classifier_layer.parameters(), lr=0.001, momentum=0.9)
length = len(train_dataloader)

for epoch in range(NUM_EPOCHS):
    
    i = 0
    for phonemes, boundaries in iter(train_dataloader):
        phonemes = phonemes.to(device)
        boundaries = boundaries.to(device)
        optimizer.zero_grad()
        mask = subsequent_mask(phonemes.shape[1])
        outputs = probe(phonemes, mask)
        loss = F.cross_entropy(outputs[0], boundaries[0])
        loss.backward()
        optimizer.step()
        i+=1
        if i % 100 == 0:
            print('Epoch: %d, Loss: %f, Batch: %d/%d' % (epoch, loss.item(), i, length))
    
    test_error_count = 0.0
    total_boundaries = 0
    gold_utterances = []
    predicted_utterances = []
    for phonemes, boundaries in iter(test_dataloader):
        phonemes = phonemes.to(device)
        boundaries = boundaries.to(device)
        mask = subsequent_mask(phonemes.shape[1])
        outputs = probe(phonemes, mask)
        test_error_count += float(torch.sum(torch.abs(boundaries[0] - outputs[0].argmax(1))))
        total_boundaries += outputs[0].shape[0]
        predicted_boundaries = outputs[0].argmax(1)
        gold_utterances.append(' '.join([(';eword ' if b.item() else '') + corpus.dictionary.idx2word[c.item()] for c, b in zip(phonemes[0,1:], boundaries[0,1:])]))
        predicted_utterances.append(' '.join([(';eword ' if b.item() else '') + corpus.dictionary.idx2word[c.item()] for c, b in zip(phonemes[0,1:], predicted_boundaries[1:])]))
    
    results = evaluate(gold_utterances, predicted_utterances)
    test_accuracy = 1.0 - float(test_error_count) / total_boundaries
    print('%d: %f' % (epoch, test_accuracy))
    print(results)
    if test_accuracy > best_accuracy:
        # torch.save(model.state_dict(), BEST_MODEL_PATH)
        best_accuracy = test_accuracy

ModuleNotFoundError: No module named 'src.data.data'