In [None]:
# 代码参考了 https://github.com/facebookresearch/MetaICL

In [1]:
import json
import os
import numpy as np
import pickle as pkl
import random
import transformers
import torch


In [5]:
!pwd

152.95s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


/home/PJLAB/gaoyufei/workdir/llm-deploy/MetaICL


# 数据准备

In [2]:
# 读取数据
with open("data/financial_phrasebank/financial_phrasebank_16_100_train.jsonl", "r") as f:
    train_data = []
    for line in f:
        train_data.append(json.loads(line))

train_data[:3]

max_length=1024
max_length_per_example=256

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")

def prepro_sentence_pair_single(ids1, ids2, max_length,
                                bos_token_id, eos_token_id,
                                allow_truncation=False):

    #if bos_token_id is not None:
    #    ids1 = [bos_token_id] + ids1
    #if eos_token_id is not None:
    #    ids2 = ids2 + [eos_token_id]
    if allow_truncation and len(ids1)+len(ids2) > max_length:
        ids1 = ids1[len(ids1)+len(ids2)-max_length:] # len = max_length-len(ids2)
        assert len(ids1)+len(ids2)==max_length

    n_mask = max_length-len(ids1)-len(ids2)
    assert n_mask>=0, (max_length, len(ids1), len(ids2))
    input_ids = ids1+ids2+[0 for _ in range(n_mask)]
    attention_mask = [1 for _ in ids1+ids2] + [0 for _ in range(n_mask)]
    token_type_ids = [0 for _ in ids1] + [1 for _ in ids2] + [0 for _ in range(n_mask)]
    return input_ids, attention_mask, token_type_ids


def _prepro_each_datapoint(dp, is_first=True, is_training=False, for_demonstrations=False):
    dp = dp.copy()

    no_label = np.all([option=="" for option in dp["options"]])
    no_input = dp["input"]==""
    if not is_first:
        dp["output"] = "\n\n\n" + dp["output"]
        if "options" in dp:
            dp["options"] = ["\n\n\n" + opt for opt in dp["options"]]
    if not no_input:
        if not no_label:
            dp["input"] = "\n" + dp["input"]

    input_tokens = tokenizer(dp["input"])["input_ids"]

    if is_training or for_demonstrations:
        output_tokens = tokenizer(dp["output"])["input_ids"]

        if "task" in dp:
            if (dp["task"].startswith("inst:piqa") or dp["task"].startswith("inst:yahoo_answers_topics")) and \
                    len(input_tokens)+len(output_tokens)+2>max_length_per_example:
                input_tokens = input_tokens[:max_length_per_example // 2]
                output_tokens = output_tokens[:max_length_per_example // 2 - 2]

            elif len(input_tokens)>=max_length_per_example - 2 - len(output_tokens):
                if dp["task"].startswith("inst:") and len(input_tokens)<len(output_tokens):
                    output_tokens = output_tokens[:max_length_per_example - 2 - len(input_tokens)]
                else:
                    input_tokens = input_tokens[:max_length_per_example - 2 - len(output_tokens)]

        assert len(input_tokens)+len(output_tokens)+2<=max_length_per_example, \
            (dp.get("task", None), len(input_tokens), len(output_tokens), max_length_per_example)

        return output_tokens, input_tokens


    else:
        assert len(dp["options"])>=2, dp
        assert dp["output"] in dp["options"]
        option_tokens = [tokenizer(option)["input_ids"] for option in dp["options"]]
        option_length = np.max([len(option) for option in option_tokens])

        if len(input_tokens)>=max_length_per_example - 2 - option_length:
            input_tokens = input_tokens[:max_length_per_example - 2 - option_length]

        input_tokens = [input_tokens for _ in option_tokens]
        output_tokens = option_tokens
        option_tokens = [dp["options"].index(dp["output"])]

        return output_tokens, input_tokens, option_tokens


def _tensorize_for_training(train_data):
    for dp in train_data:
        assert type(dp)==dict, ("Each example should be a dictionary", dp)
        assert "input" in dp and "output" in dp, ("Training example should contain input and output", dp)

    # each datapoint: passage, question, options, output
    bos_token_id = tokenizer.bos_token_id
    eos_token_id = tokenizer.eos_token_id

    input_ids, attention_mask, token_type_ids = [], [], []
    n_answers = []


    for dp in train_data:
        inputs, outputs = _prepro_each_datapoint(
            dp, is_first=True, is_training=True)

        encoded = prepro_sentence_pair_single(
            inputs, outputs, max_length, bos_token_id, eos_token_id)

        input_ids.append(encoded[0])
        attention_mask.append(encoded[1])
        token_type_ids.append(encoded[2])

    return dict(input_ids=torch.LongTensor(input_ids),
                attention_mask=torch.LongTensor(attention_mask),
                token_type_ids=torch.LongTensor(token_type_ids))

# 数据转为tensor
def tensorize_for_training(train_data, keyword="SST-2", seed=0):

    # tensorize_dir = "tensorized"
    # if not os.path.exists(tensorize_dir):
    #     os.makedirs(tensorize_dir)

    # method = "channel"
    # method_name = method
    # k_name = len(train_data)
    # length_name = max_length
    # postfix = ""

    # tensorize_path = os.path.join(tensorize_dir,
    #                                 "{}_{}_k={}_seed={}_length={}{}-rank=%d.pkl".format(
    #                                     keyword, method_name, k_name, seed, length_name,
    #                                     postfix))

    # print(tensorize_path)
    # all_tensorize_paths = [tensorize_path % i for i in range(n_gpu)]

    # unique_task_names = set([dp["task"] for dp in train_data])
    sharded_inputs = train_data

    inputs = {"input_ids": [], "attention_mask": [], "token_type_ids": []}

    for in_ in sharded_inputs:
        out = _tensorize_for_training(in_)
        for key in ["input_ids", "attention_mask", "token_type_ids"]:
            inputs[key] += out[key].numpy().tolist()

    N = len(inputs["input_ids"])
    indices = np.random.permutation(range(N))
    for k, v in inputs.items():
        inputs[k] = np.array(v)[indices]

    with open("preprocessed_data", "wb") as f:
        pkl.dump({k:v for k, v in inputs.items()}, f)
    print("Finish saving preprocessed data ...")



  from .autonotebook import tqdm as notebook_tqdm


torch.Size([3, 1024])
[tensor([[31591,   198,   818,  ...,     0,     0,     0],
        [31591,   198,   818,  ...,     0,     0,     0]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])]


In [None]:

from metaicl.data import MetaICLData
from metaicl.model import MetaICLModel

data = MetaICLData(method="channel", max_length=1024, max_length_per_example=256)
input1 = "Both operating profit and net sales for the six-month period increased as compared to the corresponding period in 2007."
data.tensorize(train_data, [input1], options=["positive", "neutral", "negative"])
dataloader = data.get_dataloader(2, is_training=True)
for batch in dataloader:
     

In [2]:
from metaicl.data import MetaICLData
from metaicl.model import MetaICLModel

# Load the model
data = MetaICLData(method="channel", max_length=1024, max_length_per_example=256)
model = MetaICLModel()
model.load("channel-metaicl")
model.cuda()
model.eval()

# Make a prediction for `input1`
input1 = "Both operating profit and net sales for the six-month period increased as compared to the corresponding period in 2007."
data.tensorize(train_data, [input1], options=["positive", "neutral", "negative"])
prediction = model.do_predict(data)[0]
print (prediction) # positive

# Make another prediction for `input2`
input2 = "The deal will have no significant effect on the acquiring company's equity ratio."
data.tensorize(train_data, [input2], options=["positive", "neutral", "negative"])
prediction = model.do_predict(data)[0]
print (prediction) # neutral



Logging from MetaICLModel:	 Setting up for local_rank=-1, world_size=1


TypeError: info() takes 2 positional arguments but 4 were given