In [3]:
#!/usr/bin/env python
# coding=utf-8
# Copyright BigScience, The HuggingFace Team and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning T0 in PyTorch, optionally few-shot.
This script is adapted from
https://github.com/huggingface/transformers/blob/master/examples/pytorch/multiple-choice/run_swag_no_trainer.py
as well as
https://github.com/huggingface/transformers/blob/master/examples/pytorch/summarization/run_summarization_no_trainer.py
"""

import argparse
import logging
import os
import random
from dataclasses import dataclass
from itertools import chain
from typing import Optional, Union
import csv
import math

import datasets
import torch
from datasets import load_dataset, load_metric
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

import transformers
from accelerate import Accelerator
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    PreTrainedTokenizerBase,
    default_data_collator,
    DataCollatorForSeq2Seq,
    AdamW,
    SchedulerType,
    get_scheduler,
    set_seed,
)
from transformers.file_utils import PaddingStrategy
from promptsource.templates import DatasetTemplates


logger = logging.getLogger(__name__)
accelerator = Accelerator()


  from .autonotebook import tqdm as notebook_tqdm


In [12]:
args = {}
args['dataset_name'] = 'super_glue'
args['dataset_config_name'] = 'rte'
args['template_name'] = 'does this imply'
args['config_name'] = None
args['model_name_or_path'] = 'bigscience/T0_3B'
args['output_dir'] = '/home/gikok/output'
args['num_train_epochs'] = 1
args['per_device_train_batch_size'] = 4
args['per_device_eval_batch_size'] = 4
args['freeze_encoder'] = True
args['learning_rate'] = 10
args['parallelize'] = True
args['seed'] = 42
args['pad_to_max_length'] = True
args['input_eos'] = False
args['target_max_length'] = 64
args['max_length'] = 128
args['num_warmup_steps'] = 0
args['debug'] = False
args['lr_scheduler_type'] = 'linear'
args['num_shots'] = None
args['weight_decay'] = 0
args['gradient_checkpoint'] = False
args['gradient_accumulation_steps'] = 1
args['max_train_steps'] = None

In [13]:

# Initialize the accelerator. We will let the accelerator handle device placement for us.
accelerator = Accelerator()
# Make one log on every process with the configuration for debugging.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)
logger.info(accelerator.state)

# Setup logging, we only want one process per machine to log things on the screen.
# accelerator.is_local_main_process is only True for one process per machine.
logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR)
if accelerator.is_local_main_process:
    datasets.utils.logging.set_verbosity_warning()
    transformers.utils.logging.set_verbosity_info()
else:
    datasets.utils.logging.set_verbosity_error()
    transformers.utils.logging.set_verbosity_error()

accelerator.wait_for_everyone()

# In distributed evaluation, the load_dataset function guarantee that only one local process can concurrently
# download the dataset.

raw_train_dataset = load_dataset(args['dataset_name'], args['dataset_config_name'], split="train")
raw_eval_dataset = load_dataset(args['dataset_name'], args['dataset_config_name'], split="validation")

column_names = raw_eval_dataset.column_names
# Load pretrained model and tokenizer
#
# In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.
if args['config_name']:
    config = AutoConfig.from_pretrained(args['config_name'])
elif args['model_name_or_path']:
    config = AutoConfig.from_pretrained(args['model_name_or_path'])

tokenizer = AutoTokenizer.from_pretrained(args['model_name_or_path'])

if args['model_name_or_path']:
    model = AutoModelForSeq2SeqLM.from_pretrained(
        args['model_name_or_path'],
        from_tf=bool(".ckpt" in args['model_name_or_path']),
        config=config,
    )
else:
    logger.info("Training new model from scratch")
    model = AutoModelForSeq2SeqLM.from_config(config)

# Preprocessing the datasets.
# First we tokenize all the texts.
padding = "max_length" if args['pad_to_max_length'] else False

# Get the prompt to apply and the possible targets.
# TODO(Victor): If pulling from pre-processed data, remove this logic.
prompts = DatasetTemplates(
    f"{args['dataset_name']}"
    if args['dataset_config_name'] is None
    else f"{args['dataset_name']}/{args['dataset_config_name']}"
    )
template = prompts[args['template_name']]

def preprocess_train(examples):
    bs = len(examples[column_names[0]])

    input_texts = []
    target_texts = []
    for i in range(bs):
        ex = {
            k: examples[k][i]
            for k in column_names
        }
        input, target = template.apply(ex)
        ex_answer_choices = template.get_answer_choices_list(ex)
        assert target in ex_answer_choices
        input_texts.append(input)
        target_texts.append(target)

    model_inputs = tokenizer(
        input_texts,
        padding=padding,
        max_length=args['max_length'],
        truncation=True,
        add_special_tokens=args['input_eos'],
    )

    with tokenizer.as_target_tokenizer():
        tokenized_targets = tokenizer(
            target_texts,
            padding=padding,
            max_length=args['target_max_length'],
            truncation=True,
            add_special_tokens=False,
        )
        model_inputs['labels'] = [
            [(t if t != tokenizer.pad_token_id else -100) for t in targets]
            for targets in tokenized_targets["input_ids"]
        ]
    return model_inputs

def preprocess_eval(examples):
    bs = len(examples[column_names[0]])

    input_texts = []
    target_texts = []
    answer_choices_texts = []
    for i in range(bs):
        ex = {
            k: examples[k][i]
            for k in column_names
        }
        input, target = template.apply(ex)
        ex_answer_choices = template.get_answer_choices_list(ex)
        assert target in ex_answer_choices
        input_texts.append(input)
        target_texts.append(target)
        answer_choices_texts.append(ex_answer_choices)

    tokenized_inputs = tokenizer(
        input_texts,
        padding=padding,
        max_length=args['max_length'],
        truncation=True,
        add_special_tokens=False,
    )
    tokenized_targets = [
        tokenizer(
            ans_choi,
            padding=True,
            max_length=args['target_max_length'],
            truncation=True,
        )
        for ans_choi in answer_choices_texts
    ]

    features = {
        k: [
            [elem for _ in range(len(tokenized_targets[idx]["input_ids"]))]
            for idx, elem in enumerate(v)
        ]
        for k, v in tokenized_inputs.items()
    }

    features["labels"] = [
        tokenized_targets[idx]["input_ids"]
        for idx in range(bs)
    ]
    features["labels_attention_mask"] = [
        tokenized_targets[idx]["attention_mask"]
        for idx in range(bs)
    ]
    features["targets"] = [
        answer_choices_texts[idx].index(t)
        for idx, t in enumerate(target_texts)
    ]

    return features

with accelerator.main_process_first():
    eval_dataset = raw_eval_dataset.map(preprocess_eval, batched=True, remove_columns=column_names)
    train_dataset = raw_train_dataset.map(preprocess_train, batched=True, remove_columns=column_names)

# DataLoaders creation:
train_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=-100,
    pad_to_multiple_of=8 if accelerator.use_fp16 else None
)
train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=train_collator,
    batch_size=args['per_device_train_batch_size']
)

if args['pad_to_max_length']:
    # If padding was already done ot max length, we use the default data collator that will just convert everything
    # to tensors.
    eval_collator = default_data_collator
else:
    # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
    # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
    # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
    eval_collator = DataCollatorForMultipleChoice(
        tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)
    )
eval_dataloader = DataLoader(eval_dataset, collate_fn=eval_collator, batch_size=args['per_device_eval_batch_size'])


08/30/2022 10:07:54 - INFO - __main__ - Distributed environment: NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cuda

loading configuration file https://huggingface.co/bigscience/T0_3B/resolve/main/config.json from cache at /home/gikok/.cache/huggingface/transformers/7b128e6b48089ae556964fea17b39635abd0124e77f8fa30267896af500a4d6d.a54ecffc6881ea8ae0af8a0dca40a7bcd51ccf51d434d2f7d0569844f6fb1c60
Model config T5Config {
  "_name_or_path": "bigscience/T0_3B",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 5120,
  "d_kv": 64,
  "d_model": 2048,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "gradient_checkpointing": false,
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "num_decoder_layers": 24,
  "num_heads": 32,
  "num_layers": 24,
  "output_past": true,
  "pad_

In [26]:
eval_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels', 'labels_attention_mask', 'targets'],
    num_rows: 277
})

In [7]:
lm

Parameter containing:
tensor([[-0.0476, -0.0623, -0.0977,  ...,  0.0262,  0.0427, -0.0295],
        [-0.0674, -0.0129, -0.1523,  ..., -0.1436, -0.0286,  0.0645],
        [-0.0310, -0.1582,  0.2676,  ...,  0.0461, -0.0747,  0.1699],
        ...,
        [ 0.0055,  0.0084, -0.0009,  ..., -0.0090,  0.0089,  0.0145],
        [ 0.0079, -0.0142,  0.0160,  ..., -0.0086,  0.0213,  0.0113],
        [ 0.0073,  0.0013,  0.0113,  ...,  0.0068,  0.0009,  0.0005]],
       device='cuda:0', requires_grad=True)

In [10]:
import seaborn as sns
import numpy as np; np.random.seed(0)
import seaborn as sns; sns.set_theme()
import torch
with torch.no_grad():
    heat_map = torch.matmul(lm.cpu(), emb.T.cpu())
    ax = sns.heatmap(heat_map)

In [6]:
a = embedding_module.register_full_backward_hook(big_hook)
b = embedding_module.weight.register_hook(hook_fkn)

In [49]:
a.remove()
b.remove()


UsageError: Line magic function `%which` not found.


In [6]:
for epoch in range(1, args['num_train_epochs'] + 1):
    #model.train()

    # freeze encoder updates if specified
    if args['freeze_encoder']:
        for name, param in model.named_parameters():
            if name.startswith("encoder") or name.startswith("decoder") or name.startswith("lm_head"):
                param.requires_grad = False
    for step, batch in enumerate(train_dataloader):
        
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        progress_bar.update(1)
        global_steps += 1
        loss = loss.item()
        if step>=0:
            break


  0%|          | 1/24144 [00:01<7:20:18,  1.09s/it]

STEP WITH ADAMW LOL
Parameter containing:
tensor([[  0.1436,   3.8750,   0.5352,  ...,  30.8750,   1.3281, -21.5000],
        [ -4.7812,   7.3125,   3.3438,  ...,  10.3125,  -0.8711,  -1.3047],
        [ -0.4902,   2.3906,  -5.1562,  ...,  -0.5430,   9.8750, -13.5625],
        ...,
        [ -0.3020,  -0.3723,   1.2172,  ...,   1.8101,   1.0143,  -0.7559],
        [ -0.7375,   0.0380,   0.1171,  ...,   1.1145,  -0.9926,   1.0823],
        [  0.4283,   0.9865,   0.6769,  ...,  -1.0338,  -0.2796,  -0.3376]],
       device='cuda:0', requires_grad=True) PRINTING p
tensor([[-1.3869e-04,  2.3134e-04, -1.8927e-05,  ...,  5.3890e-04,
          4.5927e-04,  1.4674e-04],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        ...,
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        

In [13]:
p[1].data.add_

<function Tensor.add_>

In [46]:
p[-1][1][3,5]/p[0][1][3,5]

tensor(-0.0780, device='cuda:0', grad_fn=<DivBackward0>)

In [8]:
for name, param in model.named_parameters():
    if name.startswith("encoder") or name.startswith("decoder") or name.startswith("lm_head"):
        param.requires_grad = False

In [4]:

def fhook(module: nn.Module, _inputs, _outputs):
    print("FORWARD_HOOK", module)
    print(type(_inputs), len(_inputs), type(_outputs), len(_outputs))
    print("FORWARD_HOOK INPUT SHAPE",_inputs[0].shape)
    print("FORWARD_HOOK OUTUT SHAPE",_outputs[0].shape, _outputs[1].shape, _outputs[2].shape, _outputs[3].shape)
    print("")


In [5]:

def bhook(module: nn.Module, _inputs, _outputs):
    print("BACKWARD", module)
    print(type(_inputs), len(_inputs), type(_outputs), len(_outputs))
    if _inputs[0] is not None:
        print("BINPUT SHAPE",_inputs[0].shape)
    if _outputs[0] is not None:
        print("BOUTUT SHAPE",_outputs[0].shape)
    print("")

In [40]:
l = []
for m in model.modules():
    print(m)
    print("******")

T5ForConditionalGeneration(
  (shared): Embedding(38136, 2048)
  (encoder): T5Stack(
    (embed_tokens): Embedding(38136, 2048)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=2048, out_features=2048, bias=False)
              (k): Linear(in_features=2048, out_features=2048, bias=False)
              (v): Linear(in_features=2048, out_features=2048, bias=False)
              (o): Linear(in_features=2048, out_features=2048, bias=False)
              (relative_attention_bias): Embedding(32, 32)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedGeluDense(
              (wi_0): Linear(in_features=2048, out_features=5120, bias=False)
              (wi_1): Linear(in_features=2048, out_features=5120, bias=False)
      

In [29]:
m = list(model.modules())

In [39]:
m

[T5ForConditionalGeneration(
   (shared): Embedding(38136, 2048)
   (encoder): T5Stack(
     (embed_tokens): Embedding(38136, 2048)
     (block): ModuleList(
       (0): T5Block(
         (layer): ModuleList(
           (0): T5LayerSelfAttention(
             (SelfAttention): T5Attention(
               (q): Linear(in_features=2048, out_features=2048, bias=False)
               (k): Linear(in_features=2048, out_features=2048, bias=False)
               (v): Linear(in_features=2048, out_features=2048, bias=False)
               (o): Linear(in_features=2048, out_features=2048, bias=False)
               (relative_attention_bias): Embedding(32, 32)
             )
             (layer_norm): T5LayerNorm()
             (dropout): Dropout(p=0.1, inplace=False)
           )
           (1): T5LayerFF(
             (DenseReluDense): T5DenseGatedGeluDense(
               (wi_0): Linear(in_features=2048, out_features=5120, bias=False)
               (wi_1): Linear(in_features=2048, out_features=51

In [37]:
m[-1].weight.shape

torch.Size([38136, 2048])

In [25]:

model.train()
for name, param in model.named_parameters():
    if name.startswith("encoder") or name.startswith("decoder"):
        param.requires_grad = False
    if name.startswith('shared') or name.startswith("lm_head"):
        print("lol")
        t = torch.tensor(np.zeros((param.shape[0], param.shape[1])))
        t[-len(items):,:] = 1
        param.register_hook(lambda grad: grad*t)
outputs = model(**batch)
loss = outputs.loss
accelerator.backward(loss)
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
progress_bar.update(1)
global_steps += 1
loss = loss.item()
if accelerator.is_main_process:
    tqdm.write(f"epoch = {1}, step = {global_steps}, loss = {loss}")


lol
lol


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!

In [60]:
last(model.children())

NameError: name 'last' is not defined

In [32]:
old_values[0][1].grad

In [9]:
t = torch.tensor(np.zeros((new_values[0][1].shape[0], new_values[0][1].shape[1])))

In [10]:
t[-len(items):,:] = 1

In [62]:
hook.remove()

In [28]:
l = 0
for mod in model.modules():
    l +=1
    if isinstance(mod, Embedding):
        if mod.num_embeddings == 38136:
            print(l,mod)
            print(type(mod))
            print(mod.num_embeddings)
            print("****************")

2 Embedding(38136, 2048)
<class 'torch.nn.modules.sparse.Embedding'>
38136
****************


In [38]:
l = 0
for mod in model.modules():
    print(mod)
    print("**************")

T5ForConditionalGeneration(
  (shared): Embedding(38136, 2048)
  (encoder): T5Stack(
    (embed_tokens): Embedding(38136, 2048)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=2048, out_features=2048, bias=False)
              (k): Linear(in_features=2048, out_features=2048, bias=False)
              (v): Linear(in_features=2048, out_features=2048, bias=False)
              (o): Linear(in_features=2048, out_features=2048, bias=False)
              (relative_attention_bias): Embedding(32, 32)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedGeluDense(
              (wi_0): Linear(in_features=2048, out_features=5120, bias=False)
              (wi_1): Linear(in_features=2048, out_features=5120, bias=False)
      

torch.Size([2048, 2048])

T5Attention(
  (q): Linear(in_features=2048, out_features=2048, bias=False)
  (k): Linear(in_features=2048, out_features=2048, bias=False)
  (v): Linear(in_features=2048, out_features=2048, bias=False)
  (o): Linear(in_features=2048, out_features=2048, bias=False)
  (relative_attention_bias): Embedding(32, 32)
)