In [77]:
import torch
import pandas as pd
from transformers import DataCollatorForLanguageModeling, AutoModelForCausalLM, TrainingArguments, Trainer, \
    AutoTokenizer, GPT2LMHeadModel, GPT2Config, GPT2Tokenizer, OpenAIGPTLMHeadModel, OpenAIGPTConfig, \
    AutoModelForSequenceClassification, pipeline
from datasets import load_dataset
import numpy as np
import evaluate
from sklearn.metrics import classification_report
import math


In [3]:
# Set to GPU if available.
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('Using GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using CPU instead.')

Using GPU: NVIDIA A100 80GB PCIe


In [80]:
dataset = load_dataset("wikitext", 'wikitext-2-raw-v1', download_mode="force_redownload", ignore_verifications=True)



Downloading readme:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/733k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [81]:
#tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer = AutoTokenizer.from_pretrained('openai-gpt')
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
if tokenizer.eos_token is None:
    tokenizer.add_special_tokens({'eos_token': '[EOS]'})


Using pad_token, but it is not set yet.
Using eos_token, but it is not set yet.


In [82]:
#print(dataset["train"][3])
print(tokenizer.vocab_size)

40478


In [83]:
def preproc_data(data):
    return tokenizer([s for s in data["text"]])

In [84]:
tokenized_dataset = dataset.map(preproc_data, batched=True, remove_columns=dataset["train"].column_names)
# tokenized_dataset = tokenized_dataset.remove_columns('text')

Map:   0%|          | 0/4358 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (535 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

Map:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [85]:
print(tokenized_dataset['train'][1])
#tokenizer([' = Valkyria Chronicles III = \n'])

{'input_ids': [303, 2007, 3680, 8593, 32543, 17887, 303], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}


In [86]:
block_size = 128

def group_texts(examples):
    # This partially copied from hugging face causal modeling tutorial.
    # Concatenate all the tokens in a batch
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    
    # Cut off any extra that won't be divisible by block size
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    #result["labels"] = result["input_ids"].copy()
    return result

grouped_dataset = tokenized_dataset.map(group_texts, batched=True)
#print(concat)
#tokenized_dataset.map(test_f)

Map:   0%|          | 0/4358 [00:00<?, ? examples/s]

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

Map:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [87]:
print(grouped_dataset['train'][1])

{'input_ids': [481, 244, 18220, 244, 240, 246, 1922, 679, 4381, 6611, 6255, 481, 7037, 498, 2857, 6149, 2167, 481, 1420, 10291, 6847, 2557, 763, 9036, 2752, 1301, 10940, 488, 640, 29185, 1006, 481, 12514, 6611, 244, 926, 14231, 253, 6996, 244, 239, 481, 2467, 1351, 10051, 500, 27812, 240, 3807, 715, 246, 1719, 8487, 498, 481, 1129, 1256, 504, 2007, 3680, 8593, 32543, 7735, 239, 1000, 507, 19914, 481, 7789, 4374, 498, 481, 5025, 240, 507, 1359, 1079, 1040, 9163, 25112, 240, 1389, 557, 1457, 481, 2467, 725, 14741, 562, 5025, 24019, 239, 6371, 11723, 565, 6017, 1642, 832, 254, 488, 35526, 1617, 490, 571, 8944, 540, 951, 31990, 1109, 2118, 617, 5144, 23688, 240, 1412, 556, 2007, 3680, 8593, 32543, 7735, 6760, 14152, 8944, 21821, 30937, 239, 246], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [88]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [89]:
N_HEAD = 4
N_DIM = 128
N_LAYER = 4
VOCAB_SIZE = tokenizer.vocab_size

configuration = OpenAIGPTConfig(n_head=N_HEAD, n_embd=N_DIM, n_layer=N_LAYER, vocab_size=VOCAB_SIZE)

In [90]:
model = OpenAIGPTLMHeadModel(configuration)

In [91]:
print(configuration)

OpenAIGPTConfig {
  "afn": "gelu",
  "attn_pdrop": 0.1,
  "embd_pdrop": 0.1,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "openai-gpt",
  "n_embd": 32,
  "n_head": 2,
  "n_layer": 2,
  "n_positions": 512,
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "transformers_version": "4.32.1",
  "vocab_size": 40478
}


In [92]:
metric = evaluate.load('accuracy')
def compute_metrics(model_out):
    # Also from huggingface tutorial (not used)
    logits, labels = model_out
    print(logits)
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [93]:
model_dir = 'trained_hw2_model'

In [131]:
training_args = TrainingArguments(
    output_dir='trained_hw2_model',
    evaluation_strategy='epoch',
    disable_tqdm=False,
    learning_rate=1e-4,
    num_train_epochs=50,
    log_level='info',
    logging_strategy='epoch',
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64)

Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [132]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=grouped_dataset['train'],
    eval_dataset=grouped_dataset['validation'],
    #compute_metrics=compute_metrics,
    data_collator=data_collator
)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [133]:
#print(h for h in trainer.state.log_history[2])

In [134]:
trainer.train()

***** Running training *****
  Num examples = 18,391
  Num Epochs = 50
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 14,400
  Number of trainable parameters = 1,337,088


Epoch,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 1912
  Batch size = 64
Saving model checkpoint to trained_hw2_model/checkpoint-500
Configuration saved in trained_hw2_model/checkpoint-500/config.json
Configuration saved in trained_hw2_model/checkpoint-500/generation_config.json
Model weights saved in trained_hw2_model/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1912
  Batch size = 64
***** Running Evaluation *****
  Num examples = 1912
  Batch size = 64
Saving model checkpoint to trained_hw2_model/checkpoint-1000
Configuration saved in trained_hw2_model/checkpoint-1000/config.json
Configuration saved in trained_hw2_model/checkpoint-1000/generation_config.json
Model weights saved in trained_hw2_model/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1912
  Batch size = 64
***** Running Evaluation *****
  Num examples = 1912
  Batch size = 64
Saving model checkpoint to trained_hw2_model/checkpoint-1500
Configuration save

TrainOutput(global_step=14400, training_loss=5.33978251139323, metrics={'train_runtime': 440.5097, 'train_samples_per_second': 2087.468, 'train_steps_per_second': 32.689, 'total_flos': 17943495475200.0, 'train_loss': 5.33978251139323, 'epoch': 50.0})

In [136]:
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

***** Running Evaluation *****
  Num examples = 1912
  Batch size = 64


Perplexity: 319.66


In [137]:
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(model_dir)

Configuration saved in trained_hw2_model/config.json
Configuration saved in trained_hw2_model/generation_config.json
Model weights saved in trained_hw2_model/pytorch_model.bin


In [138]:
# Finetuning
intent_train_path = "./nlp244_hw2_data/hw2_train.csv"
intent_data = load_dataset("csv", data_files=intent_train_path)['train']

intent_data = intent_data.train_test_split(test_size=0.2)
intent_train = intent_data['train'].flatten()
intent_val = intent_data['test'].flatten()


In [139]:
all_relations = list(sorted(set([j for d in intent_train["Core Relations"] if d is not None for j in d.split()] + ["None"])))
relation_to_idx = {r:i for i, r in enumerate(all_relations)}
idx_to_relation = {i:r for i, r in enumerate(all_relations)}

def preproc_intent_data(d):
    # Tokenize
    res = tokenizer(d['utterances'], max_length=50, padding='longest', truncation=True)
    
    # Encode core relations
    if d["Core Relations"] == None:
        res['label'] = relation_to_idx["None"]
    else:
        for r in d["Core Relations"].split():
            res["label"] = relation_to_idx[r]
    return res

In [140]:
#intent_data = intent_data.map(preproc_intent_data, remove_columns=intent_train.column_names)
intent_train_tok = intent_train.map(preproc_intent_data, remove_columns=intent_train.column_names)
intent_val_tok = intent_val.map(preproc_intent_data, remove_columns=intent_val.column_names)
print(intent_train_tok)


Map:   0%|          | 0/1802 [00:00<?, ? examples/s]

Map:   0%|          | 0/451 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'label'],
    num_rows: 1802
})


In [141]:
print(intent_train_tok['input_ids'][0:10])

[[249, 823, 485, 1085, 6445, 702, 6760, 2703, 7738], [890, 562, 24997, 885, 522, 1785, 500, 8725], [626, 6097, 8395, 483, 5673, 775, 6445], [1572, 1085, 246, 4121, 500, 37088], [14121, 562, 27306], [640, 655, 775, 6445, 885, 500, 16237], [1085, 6445, 702, 23261], [1788, 510, 481, 11394, 498, 481, 4121, 2866, 15548], [1788, 4413, 20267, 17591], [1085, 589, 4132, 670, 3876, 11152, 260, 246, 783, 1777, 4121]]


In [142]:
from transformers import DataCollatorWithPadding
ft_data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [143]:
ft_model = AutoModelForSequenceClassification.from_pretrained(model_dir, num_labels=len(all_relations), id2label=idx_to_relation, label2id=relation_to_idx)
ft_model.resize_token_embeddings(len(tokenizer))
ft_model.config.pad_token_id = tokenizer.pad_token_id
ft_model.to(device)

loading configuration file trained_hw2_model/config.json
Model config OpenAIGPTConfig {
  "_name_or_path": "trained_hw2_model",
  "afn": "gelu",
  "architectures": [
    "OpenAIGPTLMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "embd_pdrop": 0.1,
  "id2label": {
    "0": "None",
    "1": "actor.gender",
    "2": "gr.amount",
    "3": "movie.country",
    "4": "movie.directed_by",
    "5": "movie.estimated_budget",
    "6": "movie.genre",
    "7": "movie.gross_revenue",
    "8": "movie.initial_release_date",
    "9": "movie.language",
    "10": "movie.locations",
    "11": "movie.music",
    "12": "movie.produced_by",
    "13": "movie.production_companies",
    "14": "movie.rating",
    "15": "movie.starring.actor",
    "16": "movie.starring.character",
    "17": "movie.subjects",
    "18": "person.date_of_birth"
  },
  "initializer_range": 0.02,
  "label2id": {
    "None": 0,
    "actor.gender": 1,
    "gr.amount": 2,
    "movie.country": 3,
    "movie.directed_by": 4,
    "movie.estimated_bu

OpenAIGPTForSequenceClassification(
  (transformer): OpenAIGPTModel(
    (tokens_embed): Embedding(40480, 32)
    (positions_embed): Embedding(512, 32)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-1): 2 x Block(
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (score): Linear(in_features=32, out_features=19, bias=False)
)

In [145]:
ft_training_args = TrainingArguments(
    output_dir='ft_trained_hw2_model',
    evaluation_strategy='epoch',
    disable_tqdm=False,
    learning_rate=1e-4,
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    log_level='info'
)

ft_trainer = Trainer(
    model=ft_model,
    args=ft_training_args,
    train_dataset=intent_train_tok,
    eval_dataset=intent_val_tok,
    data_collator=ft_data_collator,
    #compute_metrics=compute_metrics
)


Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [146]:
ft_trainer.train()

***** Running training *****
  Num examples = 1,802
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1,130
  Number of trainable parameters = 1,337,760


Epoch,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 451
  Batch size = 16
***** Running Evaluation *****
  Num examples = 451
  Batch size = 16
***** Running Evaluation *****
  Num examples = 451
  Batch size = 16
***** Running Evaluation *****
  Num examples = 451
  Batch size = 16
Saving model checkpoint to ft_trained_hw2_model/checkpoint-500
Configuration saved in ft_trained_hw2_model/checkpoint-500/config.json
Model weights saved in ft_trained_hw2_model/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 451
  Batch size = 16
***** Running Evaluation *****
  Num examples = 451
  Batch size = 16
***** Running Evaluation *****
  Num examples = 451
  Batch size = 16
***** Running Evaluation *****
  Num examples = 451
  Batch size = 16
Saving model checkpoint to ft_trained_hw2_model/checkpoint-1000
Configuration saved in ft_trained_hw2_model/checkpoint-1000/config.json
Model weights saved in ft_trained_hw2_model/checkpoint-1000/pytorch_model.bin
***** Running E

TrainOutput(global_step=1130, training_loss=1.913348469691994, metrics={'train_runtime': 9.9398, 'train_samples_per_second': 1812.917, 'train_steps_per_second': 113.685, 'total_flos': 34773818112.0, 'train_loss': 1.913348469691994, 'epoch': 10.0})

In [147]:
ft_trainer.save_model(model_dir)
tokenizer.save_pretrained(model_dir)

Saving model checkpoint to trained_hw2_model
Configuration saved in trained_hw2_model/config.json
Model weights saved in trained_hw2_model/pytorch_model.bin
tokenizer config file saved in trained_hw2_model/tokenizer_config.json
Special tokens file saved in trained_hw2_model/special_tokens_map.json


('trained_hw2_model/tokenizer_config.json',
 'trained_hw2_model/special_tokens_map.json',
 'trained_hw2_model/vocab.json',
 'trained_hw2_model/merges.txt',
 'trained_hw2_model/added_tokens.json',
 'trained_hw2_model/tokenizer.json')

In [148]:
# Evaluate
infer = pipeline(task="text-classification", model=model_dir)
val_preds = [infer(utt)[0]["label"] for utt in intent_val["utterances"]]
val_true =  [relation for relation in intent_val["Core Relations"]]
val_true =  ['None' if relation is None else relation for relation in val_true]
print(classification_report(val_preds, val_true, zero_division=0.0))

loading configuration file trained_hw2_model/config.json
Model config OpenAIGPTConfig {
  "_name_or_path": "trained_hw2_model",
  "afn": "gelu",
  "architectures": [
    "OpenAIGPTForSequenceClassification"
  ],
  "attn_pdrop": 0.1,
  "embd_pdrop": 0.1,
  "id2label": {
    "0": "None",
    "1": "actor.gender",
    "2": "gr.amount",
    "3": "movie.country",
    "4": "movie.directed_by",
    "5": "movie.estimated_budget",
    "6": "movie.genre",
    "7": "movie.gross_revenue",
    "8": "movie.initial_release_date",
    "9": "movie.language",
    "10": "movie.locations",
    "11": "movie.music",
    "12": "movie.produced_by",
    "13": "movie.production_companies",
    "14": "movie.rating",
    "15": "movie.starring.actor",
    "16": "movie.starring.character",
    "17": "movie.subjects",
    "18": "person.date_of_birth"
  },
  "initializer_range": 0.02,
  "label2id": {
    "None": 0,
    "actor.gender": 1,
    "gr.amount": 2,
    "movie.country": 3,
    "movie.directed_by": 4,
    "movi

                                                       precision    recall  f1-score   support

                                                 None       0.82      0.54      0.65       100
                                        movie.country       0.53      0.64      0.58        25
                         movie.country movie.language       0.00      0.00      0.00         0
movie.country movie.language movie.genre movie.rating       0.00      0.00      0.00         0
                                    movie.directed_by       0.83      0.60      0.70        96
         movie.directed_by movie.initial_release_date       0.00      0.00      0.00         0
                               movie.estimated_budget       0.27      1.00      0.43         3
                                          movie.genre       0.19      0.50      0.27         6
                           movie.genre movie.subjects       0.00      0.00      0.00         0
                                  movie.gross_rev