In [1]:
import argparse
import logging
import os
import random
from dataclasses import dataclass
from itertools import chain
from typing import Optional, Union
import csv
import math

import datasets
import torch
from datasets import load_dataset, load_metric
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

import transformers
from accelerate import Accelerator
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    PreTrainedTokenizerBase,
    default_data_collator,
    DataCollatorForSeq2Seq,
    AdamW,
    SchedulerType,
    get_scheduler,
    set_seed,
)
from transformers.file_utils import PaddingStrategy
from promptsource.templates import DatasetTemplates


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data_files = {"train": "train.csv"}
raw_train_dataset = load_dataset('data_test', data_files=data_files, split="train")

Using custom data configuration data_test-7d72f7f8f62e79b5
Reusing dataset csv (/home/gikok/.cache/huggingface/datasets/csv/data_test-7d72f7f8f62e79b5/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)
Using custom data configuration data_test-7d72f7f8f62e79b5
Reusing dataset csv (/home/gikok/.cache/huggingface/datasets/csv/data_test-7d72f7f8f62e79b5/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


In [3]:
#Prepare training data
column_names = raw_train_dataset.column_names
padding='max_length'
max_length=1024
tokenizer = AutoTokenizer.from_pretrained("/home/transformers2/", use_fast=not True)
def preprocess_train(examples):
    bs = len(examples)
    input_texts = []
    target_texts = []
    for i in range(bs):
        ex = examples[column_names[1]][i].strip('][').split(',')[:10]
        ex = [x.rsplit(' ', 1)[0] for x in ex]
        input = f'With the following query: {examples[column_names[0]][i]}. Which is the best answer?'
        target = ex[0]
        input_texts.append(input)
        target_texts.append(target)

    model_inputs = tokenizer(
        input_texts,
        padding=padding,
        max_length=max_length,
        truncation=True,
        add_special_tokens=True,
    )

    with tokenizer.as_target_tokenizer():
        tokenized_targets = tokenizer(
            target_texts,
            padding=padding,
            max_length=max_length,
            truncation=True,
            add_special_tokens=False,
        )
        model_inputs['labels'] = [
            [(t if t != tokenizer.pad_token_id else -100) for t in targets]
            for targets in tokenized_targets["input_ids"]
        ]
    return model_inputs
train_dataset = raw_train_dataset.map(preprocess_train, batched=True, remove_columns=column_names)


Loading cached processed dataset at /home/gikok/.cache/huggingface/datasets/csv/data_test-7d72f7f8f62e79b5/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-2a01debe3f3c17d9.arrow


In [None]:
#Load model
accelerator = Accelerator()
config = AutoConfig.from_pretrained('/home/transformers2/')
model = AutoModelForSeq2SeqLM.from_pretrained(
    '/home/transformers2/',
    from_tf=bool(".ckpt" in '/home/transformers2/'),
    config=config,
)


In [5]:
train_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=-100,
    pad_to_multiple_of=8 if accelerator.use_fp16 else None
)
train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=train_collator,
    batch_size=1
)

In [15]:
eval_collator = default_data_collator
eval_dataloader = DataLoader(eval_dataset, collate_fn=eval_collator, batch_size=1)
eval_dataloader = accelerator.prepare(eval_dataloader)


In [7]:
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": 0.01,
    },
    {
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=1e-4)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=10*math.ceil(len(train_dataloader)),
)



In [8]:
#Parallelize model and inputs
model.parallelize()
optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
            optimizer, train_dataloader, eval_dataloader)

In [9]:
#Initialize first epoch
model.train()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 2048)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 2048)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=2048, out_features=2048, bias=False)
              (k): Linear(in_features=2048, out_features=2048, bias=False)
              (v): Linear(in_features=2048, out_features=2048, bias=False)
              (o): Linear(in_features=2048, out_features=2048, bias=False)
              (relative_attention_bias): Embedding(32, 32)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedGeluDense(
              (wi_0): Linear(in_features=2048, out_features=5120, bias=False)
              (wi_1): Linear(in_features=2048, out_features=5120, bias=False)
      

In [10]:
#Do one training step
train_features = next(iter(train_dataloader))
outputs = model(**train_features)
loss = outputs.loss
accelerator.backward(loss)
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
loss = loss.item()

In [17]:
#initiate metrics
metric = load_metric("accuracy")


In [23]:
#Initiate eval
model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 2048)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 2048)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=2048, out_features=2048, bias=False)
              (k): Linear(in_features=2048, out_features=2048, bias=False)
              (v): Linear(in_features=2048, out_features=2048, bias=False)
              (o): Linear(in_features=2048, out_features=2048, bias=False)
              (relative_attention_bias): Embedding(32, 32)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedGeluDense(
              (wi_0): Linear(in_features=2048, out_features=5120, bias=False)
              (wi_1): Linear(in_features=2048, out_features=5120, bias=False)
      

In [24]:
#Do one eval timestep
batch = next(iter(eval_dataloader))
model_inputs = {
    k: batch[k]
    for k in ["input_ids", "attention_mask", "labels"]
}
with torch.no_grad():
    logits = model(**model_inputs).logits
masked_log_probs = batch["labels_attention_mask"].unsqueeze(-1) * torch.log_softmax(logits, dim=-1)
seq_token_log_probs = torch.gather(masked_log_probs, -1, batch["labels"].unsqueeze(-1))
seq_log_prob = seq_token_log_probs.squeeze(dim=-1).sum(dim=-1)
seq_log_prob = seq_log_prob.view(batch["targets"].size(0), -1) 
predictions = seq_log_prob.argmax(dim=-1)

metric.add_batch(
    predictions=accelerator.gather(predictions),
    references=accelerator.gather(batch["targets"]),
)

ValueError: too many values to unpack (expected 2)