In [1]:
# 基本パラメータ
model_name = "cyberagent/open-calm-3b"   # モデルの名前
peft_name  = "lora-calm-3b"              # 学習用モデル(PEFTモデル)の名前
output_dir = "lora-calm-3b-results"      # 学習結果の出力先

In [2]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

# モデルの準備
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_8bit=True,
    device_map="auto",
)

# トークンナイザーの準備
tokenizer = AutoTokenizer.from_pretrained(model_name)

# LoRAモデルの準備
model = PeftModel.from_pretrained(
    model, 
    # peft_name, 
    output_dir + "/checkpoint-1140",
    device_map="auto"
)

# 評価モード
model.eval()

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
This can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.
If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=
If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH
For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path_to_cuda_dir/lib64



PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPTNeoXForCausalLM(
      (gpt_neox): GPTNeoXModel(
        (embed_in): Embedding(52224, 2560)
        (emb_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-31): 32 x GPTNeoXLayer(
            (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
            (post_attention_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
            (post_attention_dropout): Dropout(p=0.0, inplace=False)
            (post_mlp_dropout): Dropout(p=0.0, inplace=False)
            (attention): GPTNeoXSdpaAttention(
              (rotary_emb): GPTNeoXRotaryEmbedding()
              (query_key_value): lora.Linear8bitLt(
                (base_layer): Linear8bitLt(in_features=2560, out_features=7680, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
         

In [3]:
def generate_query_prompt(data_point):
    if data_point['関係条件'].startswith('先生'):
        role = "先生"
    else:
        role = "後輩"
    if data_point['口調条件'].startswith('敬語'):
        tone = "敬語"
    else:
        tone = "タメ口"

    # roleによって少し様子が変わりそうなので入れることにしました．
    # toneは使わない方がよさそうなので外しました．

    prompt = f"""role: {role}
content: {data_point['内容']} 

### Response:
"""
    # print(prompt)
    return prompt

In [4]:
# テキスト生成関数の定義
def generate(instruction,role='先生', input=None,maxTokens=512):
    # 推論
    prompt = generate_query_prompt({'内容':instruction, '関係条件':role, '口調条件': ''})
    input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()
    outputs = model.generate(
        input_ids=input_ids, 
        max_new_tokens=maxTokens, 
        do_sample=True,
        temperature=0.7, 
        top_p=0.75, 
        top_k=40,         
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.eos_token_id,
    )
    outputs = outputs[0].tolist()

    # EOSトークンにヒットしたらデコード完了
    if tokenizer.eos_token_id in outputs:
        eos_index = outputs.index(tokenizer.eos_token_id)
        decoded = tokenizer.decode(outputs[:eos_index])

        # ラベル内容のみ抽出
        sentinel = "### Response:\n"
        sentinelLoc = decoded.find(sentinel)
        if sentinelLoc >= 0:
            response = decoded[sentinelLoc+len(sentinel):]
            # print(response)
        else:
            print('Warning: Expected prompt template to be emitted.  Ignoring output.')
            response = None
    else:
        print('Warning: no <eos> detected ignoring output')
        response = None
        decoded = None

    return response, prompt, decoded

In [5]:
generate("テスト")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


('テスト',
 'role: 先生\ncontent: テスト \n\n### Response:\n',
 'role: 先生\ncontent: テスト \n\n### Response:\nテスト')

In [6]:
from datasets import Dataset, DatasetDict

dataset_dict = DatasetDict.load_from_disk('dataset_tokenized_simple')

In [7]:
data_set_test = dataset_dict["valid"]

In [8]:
len(data_set_test)

175

In [9]:
# data = dataset_dict["test"][46]
data = data_set_test[2]

response, prompt, decoded = generate(data['内容'], data['関係条件'])
print("---- PROMPT ----")
print(prompt)
print("---- RESPONSE ----")
print(response)
print("---- TARGET ----")
print(data['添削'])

---- PROMPT ----
role: 先生
content: どういたしまして！間違いを見つけて解決できて良かったね。他にも何か質問があれば気軽に聞いてね。 

### Response:

---- RESPONSE ----
どういまして!間違えを見つけてくれて良かったですね。他に何か聞きたいことあれば気軽に聞いちゃってください。
---- TARGET ----
どういたしまして！間違いを見つけて解決できて良かったね。他にも何か質問があれば気軽に聞いてね。


In [10]:
results = []
for d in data_set_test:
    response, prompt, decoded = generate(d['内容'], d['関係条件'])
    results.append({
        'conv_id': d['会話ID'],
        'utt_id': d['発話ID'],
        'role': d['関係条件'],
        'text': d['内容'],
        'target': d['添削'],
        'predicted': response,
    })

In [11]:
# results

In [12]:
import pandas as pd
df = pd.DataFrame(results)
df.to_excel('valid-results.xlsx')

In [13]:
data_set_train = dataset_dict["train"]

results = []
for d in data_set_train:
    response, prompt, decoded = generate(d['内容'], d['関係条件'])
    results.append({
        'conv_id': d['会話ID'],
        'utt_id': d['発話ID'],
        'role': d['関係条件'],
        'text': d['内容'],
        'target': d['添削'],
        'predicted': response,
    })

df = pd.DataFrame(results)
df.to_excel('train-results.xlsx')