In [3]:
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
sys.path.append('/content/drive/MyDrive/lab3.3/lab3.3')

import data

import embedding
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ridge_utils.ridge import bootstrap_ridge, ridge_corr, ridge_corr_pred, zs
from transformers import (
    BertTokenizer, BertForMaskedLM,
    Trainer, TrainingArguments,
    DataCollatorForLanguageModeling,
    BertTokenizerFast
)
from datasets import Dataset
from peft import PeftModel, get_peft_model, LoraConfig, TaskType
import torch

In [4]:
with open("../data/raw_text.pkl", "rb") as file:
    rawdata = pickle.load(file)

  rawdata = pickle.load(file)


# New Section

In [5]:
del rawdata['dialogue1']
del rawdata['dialogue2']
del rawdata['dialogue3']
del rawdata['dialogue4']
del rawdata['dialogue5']
del rawdata['dialogue6']
del rawdata['myfirstdaywiththeyankees']
del rawdata['onlyonewaytofindout']

In [6]:
full_stories = list(rawdata.keys())
train_stories, test_stories = train_test_split(full_stories, test_size = 0.3, random_state = 1)

print(len(train_stories))
print(len(test_stories))

model_train_stories, model_eval_stories = train_test_split(train_stories, test_size = 0.3, random_state = 1)

print(len(model_train_stories))
print(len(model_eval_stories))

70
31
49
21


In [7]:
train_sentences = data.split_based_on_flags(rawdata, model_train_stories, 0.7, 30, 100)
eval_sentences = data.split_based_on_flags(rawdata, model_eval_stories, 0.7, 30, 100)

In [8]:
from transformers import BertTokenizer, BertForMaskedLM, DataCollatorForLanguageModeling, TrainingArguments, Trainer
from datasets import Dataset
from peft import LoraConfig, TaskType, get_peft_model

# Your existing data preparation
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(sentences):
    return tokenizer(sentences["text"], is_split_into_words=True, truncation=True, padding="max_length", max_length=128)

train_dataset = Dataset.from_dict({"text": train_sentences})
eval_dataset = Dataset.from_dict({"text": eval_sentences})

train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)

# Focused parameter sets to try (ordered by likely impact)
param_sets = [
    # Baseline (your original)
    {"rank": 4, "alpha": 16, "dropout": 0.1, "target_modules": ["query", "value"], "lr": 2e-4, "batch_size": 4, "epochs": 3},

    # Higher rank
    {"rank": 8, "alpha": 32, "dropout": 0.1, "target_modules": ["query", "value"], "lr": 2e-4, "batch_size": 4, "epochs": 3},

    # More modules
    {"rank": 8, "alpha": 32, "dropout": 0.1, "target_modules": ["query", "key", "value"], "lr": 2e-4, "batch_size": 4, "epochs": 3},

    # Higher learning rate
    {"rank": 8, "alpha": 32, "dropout": 0.1, "target_modules": ["query", "key", "value"], "lr": 5e-4, "batch_size": 4, "epochs": 3},

    # Lower dropout
    {"rank": 8, "alpha": 32, "dropout": 0.05, "target_modules": ["query", "key", "value"], "lr": 5e-4, "batch_size": 4, "epochs": 3},
]
best_loss = float('inf')
best_params = None

for i, params in enumerate(param_sets):
    print(f"\n=== Trial {i+1}/{len(param_sets)} ===")
    print(f"Testing params: {params}")

    model = BertForMaskedLM.from_pretrained("bert-base-uncased")

    peft_config = LoraConfig(
        task_type=TaskType.FEATURE_EXTRACTION,
        inference_mode=False,
        r=params['rank'],
        lora_alpha=params['alpha'],
        lora_dropout=params['dropout'],
        target_modules=params['target_modules']
    )

    model = get_peft_model(model, peft_config)

    training_args = TrainingArguments(
        output_dir=f"./lora-bert-mlm-trial-{i}",
        eval_strategy="epoch",
        per_device_train_batch_size=params['batch_size'],
        per_device_eval_batch_size=params['batch_size'],
        num_train_epochs=params['epochs'],
        learning_rate=params['lr'],
        logging_dir="./logs",
        save_strategy="epoch",
        label_names=["labels"],
        # Added for efficiency
        gradient_accumulation_steps=2,
        warmup_steps=100,
        logging_steps=50,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    trainer.train()
    eval_results = trainer.evaluate()

    if eval_results['eval_loss'] < best_loss:
        best_loss = eval_results['eval_loss']
        best_params = params
        print(f"New best! Loss: {best_loss:.4f}")

    print(f"Current best loss: {best_loss:.4f} with params: {best_params}")

print("\n=== Best Configuration Found ===")
print(f"Loss: {best_loss:.4f}")
print(f"Parameters: {best_params}")

# Final training with best params (you can increase epochs)
print("\nTraining final model with best parameters...")
final_model = BertForMaskedLM.from_pretrained("bert-base-uncased")

# Using this to select best parameters for training
peft_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    inference_mode=False,
    r=best_params['rank'],
    lora_alpha=best_params['alpha'],
    lora_dropout=best_params['dropout'],
    target_modules=best_params['target_modules']
)

final_model = get_peft_model(final_model, peft_config)

training_args = TrainingArguments(
    output_dir="./lora-bert-mlm-final",
    eval_strategy="epoch",
    per_device_train_batch_size=best_params['batch_size'],
    per_device_eval_batch_size=best_params['batch_size'],
    num_train_epochs=15,  # Increased for final training
    learning_rate=best_params['lr'],
    logging_dir="./logs",
    save_strategy="epoch",
    label_names=["labels"],
    warmup_steps=100,
    gradient_accumulation_steps=2,
)

trainer = Trainer(
    model=final_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/2278 [00:00<?, ? examples/s]

Map:   0%|          | 0/1025 [00:00<?, ? examples/s]


=== Trial 1/5 ===
Testing params: {'rank': 4, 'alpha': 16, 'dropout': 0.1, 'target_modules': ['query', 'value'], 'lr': 0.0002, 'batch_size': 4, 'epochs': 3}


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33m211980067[0m ([33m211980067-gift-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,2.6345,2.415262
2,2.5109,2.505315
3,2.552,2.404398


New best! Loss: 2.4762
Current best loss: 2.4762 with params: {'rank': 4, 'alpha': 16, 'dropout': 0.1, 'target_modules': ['query', 'value'], 'lr': 0.0002, 'batch_size': 4, 'epochs': 3}

=== Trial 2/5 ===
Testing params: {'rank': 8, 'alpha': 32, 'dropout': 0.1, 'target_modules': ['query', 'value'], 'lr': 0.0002, 'batch_size': 4, 'epochs': 3}


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.6247,2.398872
2,2.4953,2.485325
3,2.5344,2.385466


New best! Loss: 2.4527
Current best loss: 2.4527 with params: {'rank': 8, 'alpha': 32, 'dropout': 0.1, 'target_modules': ['query', 'value'], 'lr': 0.0002, 'batch_size': 4, 'epochs': 3}

=== Trial 3/5 ===
Testing params: {'rank': 8, 'alpha': 32, 'dropout': 0.1, 'target_modules': ['query', 'key', 'value'], 'lr': 0.0002, 'batch_size': 4, 'epochs': 3}


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.6202,2.398003
2,2.4807,2.478733
3,2.5214,2.379773


New best! Loss: 2.4471
Current best loss: 2.4471 with params: {'rank': 8, 'alpha': 32, 'dropout': 0.1, 'target_modules': ['query', 'key', 'value'], 'lr': 0.0002, 'batch_size': 4, 'epochs': 3}

=== Trial 4/5 ===
Testing params: {'rank': 8, 'alpha': 32, 'dropout': 0.1, 'target_modules': ['query', 'key', 'value'], 'lr': 0.0005, 'batch_size': 4, 'epochs': 3}


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.594,2.362696
2,2.4459,2.449733
3,2.4891,2.351205


New best! Loss: 2.4070
Current best loss: 2.4070 with params: {'rank': 8, 'alpha': 32, 'dropout': 0.1, 'target_modules': ['query', 'key', 'value'], 'lr': 0.0005, 'batch_size': 4, 'epochs': 3}

=== Trial 5/5 ===
Testing params: {'rank': 8, 'alpha': 32, 'dropout': 0.05, 'target_modules': ['query', 'key', 'value'], 'lr': 0.0005, 'batch_size': 4, 'epochs': 3}


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.593,2.362209
2,2.4384,2.451414
3,2.4926,2.35005


Current best loss: 2.4070 with params: {'rank': 8, 'alpha': 32, 'dropout': 0.1, 'target_modules': ['query', 'key', 'value'], 'lr': 0.0005, 'batch_size': 4, 'epochs': 3}

=== Best Configuration Found ===
Loss: 2.4070
Parameters: {'rank': 8, 'alpha': 32, 'dropout': 0.1, 'target_modules': ['query', 'key', 'value'], 'lr': 0.0005, 'batch_size': 4, 'epochs': 3}

Training final model with best parameters...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,2.368922
2,2.634800,2.455584
3,2.634800,2.383236
4,2.485100,2.371396
5,2.485100,2.431938
6,2.414800,2.266464
7,2.414800,2.332599
8,2.427700,2.332047
9,2.380900,2.356575
10,2.380900,2.320402


TrainOutput(global_step=4275, training_loss=2.4219236710754752, metrics={'train_runtime': 866.861, 'train_samples_per_second': 39.418, 'train_steps_per_second': 4.932, 'total_flos': 2260035982126080.0, 'train_loss': 2.4219236710754752, 'epoch': 15.0})

In [9]:
# Saving my model
final_model.save_pretrained("./lora-bert-mlm-final")
tokenizer.save_pretrained("./lora-bert-mlm-final")

('./lora-bert-mlm-final/tokenizer_config.json',
 './lora-bert-mlm-final/special_tokens_map.json',
 './lora-bert-mlm-final/vocab.txt',
 './lora-bert-mlm-final/added_tokens.json')

In [23]:
base_model = BertForMaskedLM.from_pretrained("bert-base-uncased")
model = PeftModel.from_pretrained(base_model, "./lora-bert-mlm-final")
model.eval()


tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [26]:
def auto_embeddings_pretrained(data, model, tokenizer, stories, delay = [1, 2, 3, 4]):

    dict_embeddings = {}

    for story in stories:
        text = data[story].data
        encoded_input = tokenizer(text, is_split_into_words=True, return_overflowing_tokens=True,
                            return_token_type_ids=True, padding=True, truncation=True, return_tensors='pt')

        inputs = {
            "input_ids": encoded_input["input_ids"],
            "attention_mask": encoded_input["attention_mask"],
        }

        with torch.no_grad():
            output = model.bert(**inputs)

        hidden_state = output.last_hidden_state.cpu().numpy()
        embed = np.zeros((len(text), len(hidden_state[0][0])))
        count = np.zeros(len(text))

        for i in range(len(hidden_state)):
            word_ids = encoded_input.word_ids(batch_index = i)
            for j in range(len(hidden_state[i])):
                idx = word_ids[j]
                if idx is not None:
                    embed[idx, :] += hidden_state[i][j, :]
                    count[idx] += 1

        embed = np.where(count[:, np.newaxis] != 0, embed / count[:, np.newaxis], 0)
        dict_embeddings[story] = embed


    dict_embeddings = embedding.dict_downsample(data, dict_embeddings)
    dict_embeddings = embedding.dict_makedelayed(dict_embeddings, delay)

    return dict_embeddings

In [27]:
finetuned_embedding = auto_embeddings_pretrained(rawdata, model, tokenizer, train_stories)

  embed = np.where(count[:, np.newaxis] != 0, embed / count[:, np.newaxis], 0)


In [28]:
finetuned_embedding["sweetaspie"]

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.06677172,  0.09042621,  1.03845564, ...,  0.        ,
         0.        ,  0.        ],
       [-2.78565026,  0.54313905,  1.65991041, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.20097093,  0.42542226,  1.0338737 , ..., -4.79346807,
        -0.7036959 ,  0.70413964],
       [ 0.2645505 , -0.30262116,  1.10821335, ..., -2.24329931,
         0.97797726, -0.02003607],
       [ 2.19194306,  0.14173025,  1.06896852, ...,  0.434722  ,
        -0.10733182,  0.02338018]])