In [None]:
# 代码参考了 https://github.com/facebookresearch/MetaICL

In [20]:
import json
import os
import numpy as np
import pickle as pkl
import random
import transformers
import torch


In [5]:
# Show the kernel's working directory: the relative "data/..." path used to
# load the training file is resolved against it.
!pwd

152.95s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


/home/PJLAB/gaoyufei/workdir/llm-deploy/MetaICL


# 数据准备

In [25]:
# Load the training data: one list of parsed JSONL records per input file.
train_data_files = ["data/financial_phrasebank/financial_phrasebank_16_100_train.jsonl"]
train_data = []
for train_data_file in train_data_files:
    # Open the file named by the loop variable. The path was previously
    # hard-coded here, so every entry of train_data_files re-read the same file.
    with open(train_data_file, "r", encoding="utf-8") as f:
        # Blank lines (e.g. a trailing newline at end of file) are skipped
        # because json.loads rejects them.
        data = [json.loads(line) for line in f if line.strip()]
    train_data.append(data)
train_data

[[{'task': 'financial_phrasebank',
   'input': 'In Finland , the city of Forssa has said it will not pay compensation to food industry companies HK Ruokatalo and Atria for the lye leak into tap water that occurred in March 2008 .',
   'output': 'negative',
   'options': ['negative', 'neutral', 'positive']},
  {'task': 'financial_phrasebank',
   'input': "In the reporting period , the company 's operating profit grew by 43.2 % to EUR 6 million .",
   'output': 'positive',
   'options': ['negative', 'neutral', 'positive']},
  {'task': 'financial_phrasebank',
   'input': 'Circulation revenue has increased by 5 % in Finland and 4 % in Sweden in 2008 .',
   'output': 'positive',
   'options': ['negative', 'neutral', 'positive']},
  {'task': 'financial_phrasebank',
   'input': 'The business had gross written premiums of EUR152 .4 m (  91.5 m ) in 2000 , a net combined ratio of 133 % and 175 staff in total with offices in the UK , Germany and Benelux .',
   'output': 'neutral',
   'options': 

In [26]:

# Length every encoded training sequence is padded to
# (passed to prepro_sentence_pair_single as its max_length).
max_length=1024
# Token budget for a single example: _prepro_each_datapoint requires
# len(input) + len(output) + 2 to stay within this value.
max_length_per_example=256

from transformers import AutoTokenizer
# Pretrained "gpt2" tokenizer; the preprocessing functions in this cell read
# this module-level name directly.
tokenizer = AutoTokenizer.from_pretrained("gpt2")



def prepro_sentence_pair_single(ids1, ids2, max_length,
                                bos_token_id, eos_token_id,
                                allow_truncation=False):
    """Concatenate two token-id lists and right-pad the result to max_length.

    Args:
        ids1: Token ids of the first segment (list of int).
        ids2: Token ids of the second segment (list of int).
        max_length: Length of each returned list.
        bos_token_id: Accepted for interface compatibility; not used here.
        eos_token_id: Accepted for interface compatibility; not used here.
        allow_truncation: When True and the pair is longer than max_length,
            tokens are dropped from the FRONT of ids1 until the pair fits.

    Returns:
        (input_ids, attention_mask, token_type_ids), each of length max_length:
        input_ids is ids1 + ids2 padded with 0; attention_mask is 1 on real
        tokens and 0 on padding; token_type_ids is 0 for ids1, 1 for ids2 and
        0 for padding.

    Raises:
        AssertionError: If the pair does not fit into max_length (truncation
            disabled, or ids2 alone is longer than max_length).
    """
    overflow = len(ids1) + len(ids2) - max_length
    if allow_truncation and overflow > 0:
        # Keep the tail of the first segment; the second segment is never cut.
        ids1 = ids1[overflow:]
        assert len(ids1) + len(ids2) == max_length

    first_len, second_len = len(ids1), len(ids2)
    pad_len = max_length - first_len - second_len
    assert pad_len >= 0, (max_length, len(ids1), len(ids2))

    padding = [0] * pad_len
    input_ids = ids1 + ids2 + padding
    attention_mask = [1] * (first_len + second_len) + padding
    token_type_ids = [0] * first_len + [1] * second_len + padding
    return input_ids, attention_mask, token_type_ids



def _prepro_each_datapoint(dp, is_first=True, is_training=False, for_demonstrations=False):
    dp = dp.copy()

    no_label = np.all([option=="" for option in dp["options"]])
    no_input = dp["input"]==""
    if not is_first:
        dp["output"] = "\n\n\n" + dp["output"]
        if "options" in dp:
            dp["options"] = ["\n\n\n" + opt for opt in dp["options"]]
    if not no_input:
        if not no_label:
            dp["input"] = "\n" + dp["input"]

    input_tokens = tokenizer(dp["input"])["input_ids"]

    if is_training or for_demonstrations:
        output_tokens = tokenizer(dp["output"])["input_ids"]

        if "task" in dp:
            if len(input_tokens)>=max_length_per_example - 2 - len(output_tokens):
                if dp["task"].startswith("inst:") and len(input_tokens)<len(output_tokens):
                    output_tokens = output_tokens[:max_length_per_example - 2 - len(input_tokens)]
                else:
                    input_tokens = input_tokens[:max_length_per_example - 2 - len(output_tokens)]

        assert len(input_tokens)+len(output_tokens)+2<=max_length_per_example, \
            (dp.get("task", None), len(input_tokens), len(output_tokens), max_length_per_example)

        return output_tokens, input_tokens


    else:
        assert len(dp["options"])>=2, dp
        assert dp["output"] in dp["options"]
        option_tokens = [tokenizer(option)["input_ids"] for option in dp["options"]]
        option_length = np.max([len(option) for option in option_tokens])

        if len(input_tokens)>=max_length_per_example - 2 - option_length:
            input_tokens = input_tokens[:max_length_per_example - 2 - option_length]

        input_tokens = [input_tokens for _ in option_tokens]
        output_tokens = option_tokens
        option_tokens = [dp["options"].index(dp["output"])]

        return output_tokens, input_tokens, option_tokens


def _tensorize_for_training(train_data):
    for dp in train_data:
        assert type(dp)==dict, ("Each example should be a dictionary", dp)
        assert "input" in dp and "output" in dp, ("Training example should contain input and output", dp)

    # each datapoint: passage, question, options, output
    bos_token_id = tokenizer.bos_token_id
    eos_token_id = tokenizer.eos_token_id

    input_ids, attention_mask, token_type_ids = [], [], []


    for dp in train_data:
        inputs, outputs = _prepro_each_datapoint(
            dp, is_first=True, is_training=True)

        encoded = prepro_sentence_pair_single(
            inputs, outputs, max_length, bos_token_id, eos_token_id)

        input_ids.append(encoded[0])
        attention_mask.append(encoded[1])
        token_type_ids.append(encoded[2])

    return dict(input_ids=torch.LongTensor(input_ids),
                attention_mask=torch.LongTensor(attention_mask),
                token_type_ids=torch.LongTensor(token_type_ids))

# Convert the raw examples to arrays, shuffle them and pickle the result.
def tensorize_for_training(train_data, output_path="preprocessed_data", seed=None):
    """Encode, shuffle and save all training examples.

    Args:
        train_data: Iterable of example lists; each list is encoded with
            ``_tensorize_for_training`` and the rows are concatenated.
        output_path: File the pickled result is written to. The default keeps
            the original hard-coded location.
        seed: Optional integer seed for the shuffle. With the default ``None``
            the global NumPy random state is used, as before; passing a seed
            makes the row order reproducible.

    Returns:
        None. The pickle holds a dict with "input_ids", "attention_mask" and
        "token_type_ids", each a NumPy array with one row per example, all
        shuffled with the same permutation.
    """
    keys = ["input_ids", "attention_mask", "token_type_ids"]
    inputs = {key: [] for key in keys}

    for in_ in train_data:
        out = _tensorize_for_training(in_)
        for key in keys:
            inputs[key] += out[key].numpy().tolist()

    N = len(inputs["input_ids"])
    rng = np.random if seed is None else np.random.RandomState(seed)
    # permutation(N) always yields integer indices, including for N == 0.
    indices = rng.permutation(N)
    # One shared permutation keeps the three arrays row-aligned.
    shuffled = {key: np.array(rows)[indices] for key, rows in inputs.items()}

    with open(output_path, "wb") as f:
        pkl.dump(shuffled, f)
    print("Finish saving preprocessed data ...")

# Preprocess the loaded training data; the result is pickled to the file
# "preprocessed_data" in the working directory.
tensorize_for_training(train_data)




Finish saving preprocessed data ...


# 训练模型

In [27]:
import logging

# Log file path and output directory.
log_file = "log.txt"
out_dir = "output"
# Log to both a stream handler and log_file.
handlers = [logging.StreamHandler()]
handlers.append(logging.FileHandler(log_file))
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO,
                    handlers=handlers)
# Raise the threshold of the transformers tokenization logger to ERROR.
logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR)
logger = logging.getLogger(__name__)
logger.info(out_dir)

os.makedirs(out_dir, exist_ok=True)

# NOTE(review): MetaICLModel, args, num_training_steps, save_period, log_period
# and metaicl_data are not defined or imported in any cell visible here, so this
# part depends on kernel state from elsewhere. The recorded output of this cell
# is "NameError: name 'metaicl_data' is not defined".
# NOTE(review): the model is given args.out_dir rather than the out_dir created
# above - confirm that both refer to the same directory.
model = MetaICLModel(logger, args.out_dir, args.fp16, args.local_rank)
model.load(args.init_checkpoint, args.gpt2)
model.to_device()
model.setup_optimizer(args.optimization, num_training_steps, args.lr,
                                args.weight_decay, args.warmup_steps)
model.parallel()
model.train()
# NOTE(review): the preprocessing cell only pickles its result to
# "preprocessed_data"; nothing visible builds the metaicl_data object passed here.
model.do_train(metaicl_data, args.batch_size, num_training_steps, save_period, log_period)

NameError: name 'metaicl_data' is not defined