In [1]:
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
import data
import embedding
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ridge_utils.ridge import bootstrap_ridge, ridge_corr, ridge_corr_pred, zs
from transformers import (
    BertTokenizer, BertForMaskedLM,
    Trainer, TrainingArguments,
    DataCollatorForLanguageModeling,
    BertTokenizerFast
)
from datasets import Dataset
from peft import PeftModel, get_peft_model, LoraConfig, TaskType
import torch

In [2]:
with open("../data/raw_text.pkl", "rb") as file:
    rawdata = pickle.load(file)

In [3]:
del rawdata['dialogue1']
del rawdata['dialogue2']
del rawdata['dialogue3']
del rawdata['dialogue4']
del rawdata['dialogue5']
del rawdata['dialogue6']
del rawdata['myfirstdaywiththeyankees']
del rawdata['onlyonewaytofindout']

In [4]:
full_stories = list(rawdata.keys())
train_stories, test_stories = train_test_split(full_stories, test_size = 0.3, random_state = 1)

print(len(train_stories))
print(len(test_stories))

model_train_stories, model_eval_stories = train_test_split(train_stories, test_size = 0.3, random_state = 1)

print(len(model_train_stories))
print(len(model_eval_stories))

70
31
49
21


In [5]:
train_sentences = data.split_based_on_flags(rawdata, model_train_stories, 0.7, 30, 100)
eval_sentences = data.split_based_on_flags(rawdata, model_eval_stories, 0.7, 30, 100)

In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(sentences):
    return tokenizer(sentences["text"], is_split_into_words=True, truncation=True, padding="max_length", max_length=128)

train_dataset = Dataset.from_dict({"text": train_sentences})
eval_dataset = Dataset.from_dict({"text": eval_sentences})

train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15  # Mask 15% of all tokens
)

# Define Base Model
model = BertForMaskedLM.from_pretrained("bert-base-uncased")

# Define LoRA (MLM training)
peft_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,  # Masked Language Model
    inference_mode=False,
    r=4,  # Rank
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["query", "value"]  # Where to use LoRA
)

model = get_peft_model(model, peft_config)


training_args = TrainingArguments(
    output_dir="./lora-bert-mlm",
    evaluation_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    learning_rate=2e-4,
    logging_dir="./logs",
    save_strategy="epoch",
    label_names=["labels"],
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator, 
    tokenizer=tokenizer,
)

trainer.train()

Map:   0%|          | 0/2278 [00:00<?, ? examples/s]

Map:   0%|          | 0/1025 [00:00<?, ? examples/s]

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another archite

  0%|          | 0/2850 [00:00<?, ?it/s]

{'loss': 2.6853, 'grad_norm': 5.5580973625183105, 'learning_rate': 0.0001649122807017544, 'epoch': 0.88}


  0%|          | 0/257 [00:00<?, ?it/s]

{'eval_loss': 2.4195563793182373, 'eval_runtime': 92.0091, 'eval_samples_per_second': 11.14, 'eval_steps_per_second': 2.793, 'epoch': 1.0}
{'loss': 2.5962, 'grad_norm': 6.568733215332031, 'learning_rate': 0.0001298245614035088, 'epoch': 1.75}


  0%|          | 0/257 [00:00<?, ?it/s]

{'eval_loss': 2.3729665279388428, 'eval_runtime': 116.0928, 'eval_samples_per_second': 8.829, 'eval_steps_per_second': 2.214, 'epoch': 2.0}
{'loss': 2.468, 'grad_norm': 5.166666030883789, 'learning_rate': 9.473684210526316e-05, 'epoch': 2.63}


  0%|          | 0/257 [00:00<?, ?it/s]

{'eval_loss': 2.285646915435791, 'eval_runtime': 87.9047, 'eval_samples_per_second': 11.66, 'eval_steps_per_second': 2.924, 'epoch': 3.0}
{'loss': 2.5328, 'grad_norm': 5.008162975311279, 'learning_rate': 5.9649122807017544e-05, 'epoch': 3.51}


  0%|          | 0/257 [00:00<?, ?it/s]

{'eval_loss': 2.4378578662872314, 'eval_runtime': 81.2395, 'eval_samples_per_second': 12.617, 'eval_steps_per_second': 3.163, 'epoch': 4.0}
{'loss': 2.5036, 'grad_norm': 5.775305271148682, 'learning_rate': 2.456140350877193e-05, 'epoch': 4.39}


  0%|          | 0/257 [00:00<?, ?it/s]

{'eval_loss': 2.4063055515289307, 'eval_runtime': 83.1816, 'eval_samples_per_second': 12.322, 'eval_steps_per_second': 3.09, 'epoch': 5.0}
{'train_runtime': 2891.2559, 'train_samples_per_second': 3.939, 'train_steps_per_second': 0.986, 'train_loss': 2.549360779879386, 'epoch': 5.0}


TrainOutput(global_step=2850, training_loss=2.549360779879386, metrics={'train_runtime': 2891.2559, 'train_samples_per_second': 3.939, 'train_steps_per_second': 0.986, 'total_flos': 750765578757120.0, 'train_loss': 2.549360779879386, 'epoch': 5.0})

In [None]:
model.save_pretrained("./lora-bert-mlm-final")
tokenizer.save_pretrained("./lora-bert-mlm-final")

In [None]:
base_model = BertForMaskedLM.from_pretrained("bert-base-uncased")
model = PeftModel.from_pretrained(base_model, "./lora-bert-mlm-final")
model.eval()  


tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
def auto_embeddings_pretrained(data, model, tokenizer, stories, delay = [1, 2, 3, 4]):

    dict_embeddings = {}

    for story in stories:
        text = data[story].data
        encoded_input = tokenizer(text, is_split_into_words=True, return_overflowing_tokens=True,
                            return_token_type_ids=True, padding=True, truncation=True, return_tensors='pt')
        
        inputs = {
            "input_ids": encoded_input["input_ids"],
            "attention_mask": encoded_input["attention_mask"],
        }

        with torch.no_grad():
            output = model.bert(**inputs)
        
        hidden_state = output.last_hidden_state.cpu().numpy()
        embed = np.zeros((len(text), len(hidden_state[0][0])))
        count = np.zeros(len(text))

        for i in range(len(hidden_state)):
            word_ids = encoded_input.word_ids(batch_index = i)
            for j in range(len(hidden_state[i])):
                idx = word_ids[j]
                if idx is not None:
                    embed[idx, :] += hidden_state[i][j, :]
                    count[idx] += 1
                
        embed = np.where(count[:, np.newaxis] != 0, embed / count[:, np.newaxis], 0)
        dict_embeddings[story] = embed


    dict_embeddings = embedding.dict_downsample(data, dict_embeddings)
    dict_embeddings = embedding.dict_makedelayed(dict_embeddings, delay)

    return dict_embeddings

In [15]:
finetuned_embedding = auto_embeddings_pretrained(rawdata, model, tokenizer, train_stories)

  embed = np.where(count[:, np.newaxis] != 0, embed / count[:, np.newaxis], 0)


In [16]:
finetuned_embedding["sweetaspie"]

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.0324818 , -0.10738009,  1.08426415, ...,  0.        ,
         0.        ,  0.        ],
       [-3.30375685,  0.07242581,  1.86833126, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.10034275,  0.3476674 ,  1.22564087, ..., -4.73889171,
        -1.42923256,  0.65081707],
       [ 0.05154551, -0.56172777,  1.49509351, ..., -2.14137482,
         0.62682603,  0.01403062],
       [ 1.66828381, -0.24144794,  0.7808191 , ...,  0.40249676,
        -0.01989881, -0.0352881 ]])