In [15]:
import argparse
import logging
import os
import random
import pandas as pd
from dataclasses import dataclass
from itertools import chain
from typing import Optional, Union
import csv
import math

import datasets
import torch
from datasets import load_dataset, load_metric
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

import transformers
from accelerate import Accelerator
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    PreTrainedTokenizerBase,
    default_data_collator,
    DataCollatorForSeq2Seq,
    AdamW,
    SchedulerType,
    get_scheduler,
    set_seed,
)
from transformers.file_utils import PaddingStrategy
from promptsource.templates import DatasetTemplates
# Let pandas render up to 1200 rows before truncating a displayed frame.
pd.options.display.max_rows = 1200


In [4]:
# Load the prompt shard stored under the local `data` directory.
# No `split` argument is given, so the result is whatever container
# `load_dataset` returns for all splits it finds (presumably a DatasetDict).
raw_train_dataset = load_dataset(
    path="data",
    data_files="prompts_001.parquet.gzip",
)

Using custom data configuration data-73d83a3e944add1f


Downloading and preparing dataset parquet/data to /home/gikok/.cache/huggingface/datasets/parquet/data-73d83a3e944add1f/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901...


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 4364.52it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 882.08it/s]


Dataset parquet downloaded and prepared to /home/gikok/.cache/huggingface/datasets/parquet/data-73d83a3e944add1f/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:00<00:00, 66.69it/s]


In [5]:
# Load the tokenizer and the seq2seq model from the same local checkpoint directory.
model_name = "/home/transformers2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
print("Model and tokenizer loaded")

# parallelize() is expected to spread the model over the available GPUs
# (see the message printed below) -- NOTE(review): confirm for this model class.
model.parallelize()
print("Moved model to GPUs")

Model and tokenizer loaded
Moved model to GPUs


In [8]:
# Read every item number and register each one as a token in the tokenizer's vocabulary.
# NOTE(review): `add_tokens` expects strings -- confirm the `item_no` column is stored as str.
items = pd.read_parquet('data/item_no_6k.parquet.gzip')['item_no'].values.tolist()
num_added_tokens = tokenizer.add_tokens(items)

# Adding tokens only grows the tokenizer; the model's embedding matrix has to be
# grown separately, otherwise ids of the new tokens would index past its last row.
# The size comparison (instead of `num_added_tokens > 0`) also covers the case where
# the saved tokenizer already contains the items but the model was never resized,
# and it avoids shrinking a model whose embedding table is larger than the tokenizer.
if len(tokenizer) > model.get_input_embeddings().num_embeddings:
    model.resize_token_embeddings(len(tokenizer))

# Keep the cell's displayed value: the number of tokens that were newly added.
num_added_tokens

0

In [20]:
# Load the prompt shard from the local `data` directory; this is the dataset
# that the tokenization step below maps over.
data = load_dataset(
    path="data",
    data_files="prompts_001.parquet.gzip",
)

Using custom data configuration data-73d83a3e944add1f
Reusing dataset parquet (/home/gikok/.cache/huggingface/datasets/parquet/data-73d83a3e944add1f/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)
100%|██████████| 1/1 [00:00<00:00, 84.20it/s]


6180864

In [24]:
# Every input and every target is padded (or truncated) to exactly this many tokens.
padding = "max_length"
max_length = 1024


def tokenize_train(examples):
    """Tokenize one batch of prompt/target pairs for seq2seq training.

    Args:
        examples: batch mapping with an "input" and a "target" entry, each a
            list of texts (this function is used with ``map(..., batched=True)``).

    Returns:
        The tokenizer output for the inputs, with an extra "labels" entry that
        holds the target token ids. Targets are tokenized without special
        tokens, and every padding id in them is replaced by -100 (the default
        ignore index of PyTorch's cross-entropy loss).
    """
    sources = examples["input"]
    references = examples["target"]

    # Settings shared by the input-side and target-side tokenizer calls.
    common_kwargs = dict(padding=padding, max_length=max_length, truncation=True)

    batch = tokenizer(sources, **common_kwargs)

    with tokenizer.as_target_tokenizer():
        target_batch = tokenizer(
            references,
            add_special_tokens=False,
            **common_kwargs,
        )
        pad_id = tokenizer.pad_token_id
        masked_labels = []
        for token_ids in target_batch["input_ids"]:
            # Swap padding positions for -100 and keep real tokens as they are.
            masked_labels.append(
                [-100 if token == pad_id else token for token in token_ids]
            )
        batch["labels"] = masked_labels

    return batch

In [25]:
# Tokenize the whole dataset; batched=True hands `tokenize_train` a batch of
# examples per call instead of a single example.
train_dataset = data.map(function=tokenize_train, batched=True)

 15%|█▌        | 932/6181 [21:27<2:01:53,  1.39s/ba]