## https://mp.weixin.qq.com/s/zjV2w8WxgLo5FUbTtibBdA
## 环境：python

In [1]:
import os
# 仅设置一块可见
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [2]:
import torch
print(torch.cuda.current_device())

0


In [4]:
import torch
from transformers import T5Tokenizer, T5Config, T5ForConditionalGeneration

# load tokenizer and model 
pretrained_model = "/home/asus/文档/AIModel/Randeng-T5-784M-MultiTask-Chinese"

special_tokens = ["<extra_id_{}>".format(i) for i in range(100)]
tokenizer = T5Tokenizer.from_pretrained(
    pretrained_model,
    do_lower_case=True,
    max_length=512,
    truncation=True,
    additional_special_tokens=special_tokens,
)
config = T5Config.from_pretrained(pretrained_model)
model = T5ForConditionalGeneration.from_pretrained(pretrained_model, config=config)
model.resize_token_embeddings(len(tokenizer))
model.eval()

device = torch.device("cuda")# ("cuda:1")

# device = 'cpu'
model.to(device)


T5ForConditionalGeneration(
  (shared): Embedding(32596, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32596, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=1024, out_features=2816, bias=False)
              (wi_1): Linear(in_features=1024, out_features=2816, bias=False)
       

In [5]:
text = "意图识别任务：还有双鸭山到淮阴的汽车票吗13号的 这篇文章的类别是什么？Travel-Query/Music-Play/FilmTele-Play/Video-Play/Radio-Listen/HomeAppliance-Control/Weather-Query/Alarm-Update/Calendar-Query/TVProgram-Play/Audio-Play/Other"
encode_dict = tokenizer(text, max_length=512, padding='max_length',truncation=True)

inputs = {
  "input_ids": torch.tensor([encode_dict['input_ids']]).long().to(device),
  "attention_mask": torch.tensor([encode_dict['attention_mask']]).long().to(device),
}

# generate answer
logits = model.generate(
  input_ids = inputs['input_ids'],
  max_length=100, 
  do_sample= True
)

logits=logits[:,1:]
predict_label = [tokenizer.decode(i,skip_special_tokens=True) for i in logits]

print(predict_label)

['Travel-Query']


In [6]:
# 读取数据集，这里是直接联网读取，也可以通过下载文件，再读取
import pandas as pd
import matplotlib.pyplot as plt

data_dir = 'https://mirror.coggle.club/dataset/coggle-competition/'
train_data = pd.read_csv(data_dir + 'intent-classify/train.csv', sep='\t', header=None)
test_data = pd.read_csv(data_dir + 'intent-classify/test.csv', sep='\t', header=None)

In [7]:
train_data

Unnamed: 0,0,1
0,还有双鸭山到淮阴的汽车票吗13号的,Travel-Query
1,从这里怎么回家,Travel-Query
2,随便播放一首专辑阁楼里的佛里的歌,Music-Play
3,给看一下墓王之王嘛,FilmTele-Play
4,我想看挑战两把s686打突变团竞的游戏视频,Video-Play
...,...,...
12095,一千六百五十三加三千一百六十五点六五等于几,Calendar-Query
12096,稍小点客厅空调风速,HomeAppliance-Control
12097,黎耀祥陈豪邓萃雯畲诗曼陈法拉敖嘉年杨怡马浚伟等到场出席,Radio-Listen
12098,百事盖世群星星光演唱会有谁,Video-Play


In [8]:
'/'.join(train_data[1].unique())

'Travel-Query/Music-Play/FilmTele-Play/Video-Play/Radio-Listen/HomeAppliance-Control/Weather-Query/Alarm-Update/Calendar-Query/TVProgram-Play/Audio-Play/Other'

In [9]:
%%time

text = "意图识别任务：【播放周杰伦的歌曲】 这篇文章的类别是什么？Travel-Query/Music-Play/FilmTele-Play/Video-Play/Radio-Listen/HomeAppliance-Control/Weather-Query/Alarm-Update/Calendar-Query/TVProgram-Play/Audio-Play/Other"
encode_dict = tokenizer(text)

inputs = {
  "input_ids": torch.tensor([encode_dict['input_ids']]).long().to(device),
  "attention_mask": torch.tensor([encode_dict['attention_mask']]).long().to(device),
}

logits = model.generate(
    input_ids = inputs['input_ids'],
    # attention_mask = inputs['attention_mask'],
    max_length=20, 
    do_sample= False
)

logits=logits[:,1:]
predict_label = [tokenizer.decode(i,skip_special_tokens=True) for i in logits]
predict_label

CPU times: user 238 ms, sys: 0 ns, total: 238 ms
Wall time: 237 ms


['Music-Play']

In [10]:
from tqdm import tqdm_notebook

pred_label = []
for train_text in tqdm_notebook(test_data[0].values):
    text = f"意图识别任务：{train_text} 这篇文章的类别是什么？Travel-Query/Music-Play/FilmTele-Play/Video-Play/Radio-Listen/HomeAppliance-Control/Weather-Query/Alarm-Update/Calendar-Query/TVProgram-Play/Audio-Play/Other"
    encode_dict = tokenizer(text, max_length=512, padding='max_length', truncation=True)
    
    inputs = {
      "input_ids": torch.tensor([encode_dict['input_ids']]).long().to(device),
      "attention_mask": torch.tensor([encode_dict['attention_mask']]).long().to(device),
    }
    
    # generate answer
    logits = model.generate(
        input_ids = inputs['input_ids'],
        max_length=100, 
        do_sample= True
    )
    
    logits = logits[:,:]
    pred_label += [tokenizer.decode(i,skip_special_tokens=True) for i in logits]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for train_text in tqdm_notebook(test_data[0].values):


  0%|          | 0/3000 [00:00<?, ?it/s]

In [11]:
pd.DataFrame({
    'ID': range(1, len(pred_label) + 1),
    'Target': pred_label,
}).to_csv('nlp_submit.csv', index=None)

In [12]:
from datasets import Dataset
train_data.columns = ["document", "summary"]
train_ds = Dataset.from_pandas(train_data.iloc[:-2000])
eval_ds = Dataset.from_pandas(train_data.iloc[-2000:])

In [13]:
from transformers import T5Tokenizer

pretrained_model = "/home/asus/文档/AIModel/Randeng-T5-784M-MultiTask-Chinese/"

special_tokens = ["<extra_id_{}>".format(i) for i in range(100)]
tokenizer = T5Tokenizer.from_pretrained(
    pretrained_model,
    do_lower_case=True,
    max_length=512,
    truncation=True,
    additional_special_tokens=special_tokens,
)

In [14]:
head_prefix = "意图识别任务: "
tail_prefix = " Travel-Query/Music-Play/FilmTele-Play/Video-Play/Radio-Listen/HomeAppliance-Control/Weather-Query/Alarm-Update/Calendar-Query/TVProgram-Play/Audio-Play/Other"

In [15]:
max_input_length = 60
max_target_length = 20

def preprocess_function(examples):
    model_inputs = tokenizer(head_prefix + examples["document"], max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    labels = tokenizer(text_target=examples["summary"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [16]:
train_tokenized_id = train_ds.map(preprocess_function, remove_columns=train_ds.column_names)
eval_tokenized_id = eval_ds.map(preprocess_function, remove_columns=train_ds.column_names)

Map:   0%|          | 0/10100 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [17]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model,  device_map="cuda")# "cuda:1")

2024-05-15 21:01:19.181821: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [18]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

batch_size = 4
args = Seq2SeqTrainingArguments(
    "t5-finetuned",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    gradient_accumulation_steps=10,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=50,
    num_train_epochs=5,
    save_steps=50,
    save_on_each_node=True,
    gradient_checkpointing=True,
    load_best_model_at_end=True,

    fp16=True
)

In [19]:
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_tokenized_id,
    eval_dataset=eval_tokenized_id,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
)

In [20]:
# 指定GPU
# device = torch.device("cuda:1")
# trainer.model.to(device)

# trainer.model.train()

trainer.train()

  0%|          | 0/1260 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


  0%|          | 0/500 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 200}


{'eval_loss': 2.0035078525543213, 'eval_runtime': 49.3272, 'eval_samples_per_second': 40.546, 'eval_steps_per_second': 10.136, 'epoch': 0.2}




  0%|          | 0/500 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 200}


{'eval_loss': 1.6100071668624878, 'eval_runtime': 49.219, 'eval_samples_per_second': 40.635, 'eval_steps_per_second': 10.159, 'epoch': 0.4}




  0%|          | 0/500 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 200}


{'eval_loss': 1.5971646308898926, 'eval_runtime': 49.3465, 'eval_samples_per_second': 40.53, 'eval_steps_per_second': 10.132, 'epoch': 0.59}




  0%|          | 0/500 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 200}


{'eval_loss': 1.5969420671463013, 'eval_runtime': 48.9166, 'eval_samples_per_second': 40.886, 'eval_steps_per_second': 10.221, 'epoch': 0.79}




  0%|          | 0/500 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 200}


{'eval_loss': 1.597001314163208, 'eval_runtime': 49.2408, 'eval_samples_per_second': 40.617, 'eval_steps_per_second': 10.154, 'epoch': 0.99}




  0%|          | 0/500 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 200}


{'eval_loss': 1.5970066785812378, 'eval_runtime': 48.9656, 'eval_samples_per_second': 40.845, 'eval_steps_per_second': 10.211, 'epoch': 1.19}




  0%|          | 0/500 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 200}


{'eval_loss': 1.5969178676605225, 'eval_runtime': 48.614, 'eval_samples_per_second': 41.14, 'eval_steps_per_second': 10.285, 'epoch': 1.39}




  0%|          | 0/500 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 200}


{'eval_loss': 1.5969864130020142, 'eval_runtime': 49.1051, 'eval_samples_per_second': 40.729, 'eval_steps_per_second': 10.182, 'epoch': 1.58}




  0%|          | 0/500 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 200}


{'eval_loss': 1.597011685371399, 'eval_runtime': 49.1839, 'eval_samples_per_second': 40.664, 'eval_steps_per_second': 10.166, 'epoch': 1.78}




{'loss': 4.4109, 'grad_norm': nan, 'learning_rate': 1.4269841269841272e-05, 'epoch': 1.98}


  0%|          | 0/500 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 200}


{'eval_loss': 1.5969576835632324, 'eval_runtime': 49.5989, 'eval_samples_per_second': 40.323, 'eval_steps_per_second': 10.081, 'epoch': 1.98}




  0%|          | 0/500 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 200}


{'eval_loss': nan, 'eval_runtime': 46.7539, 'eval_samples_per_second': 42.777, 'eval_steps_per_second': 10.694, 'epoch': 2.18}




  0%|          | 0/500 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 200}


{'eval_loss': nan, 'eval_runtime': 46.5356, 'eval_samples_per_second': 42.978, 'eval_steps_per_second': 10.744, 'epoch': 2.38}




  0%|          | 0/500 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 200}


{'eval_loss': nan, 'eval_runtime': 46.3958, 'eval_samples_per_second': 43.107, 'eval_steps_per_second': 10.777, 'epoch': 2.57}




  0%|          | 0/500 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 200}


{'eval_loss': nan, 'eval_runtime': 46.9519, 'eval_samples_per_second': 42.597, 'eval_steps_per_second': 10.649, 'epoch': 2.77}




  0%|          | 0/500 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 200}


{'eval_loss': nan, 'eval_runtime': 47.6397, 'eval_samples_per_second': 41.982, 'eval_steps_per_second': 10.495, 'epoch': 2.97}




  0%|          | 0/500 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 200}


{'eval_loss': nan, 'eval_runtime': 48.8887, 'eval_samples_per_second': 40.909, 'eval_steps_per_second': 10.227, 'epoch': 3.17}




  0%|          | 0/500 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 200}


{'eval_loss': nan, 'eval_runtime': 48.5618, 'eval_samples_per_second': 41.185, 'eval_steps_per_second': 10.296, 'epoch': 3.37}




  0%|          | 0/500 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 200}


{'eval_loss': nan, 'eval_runtime': 48.5598, 'eval_samples_per_second': 41.186, 'eval_steps_per_second': 10.297, 'epoch': 3.56}




  0%|          | 0/500 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 200}


{'eval_loss': nan, 'eval_runtime': 48.7327, 'eval_samples_per_second': 41.04, 'eval_steps_per_second': 10.26, 'epoch': 3.76}




{'loss': 144296053309.44, 'grad_norm': nan, 'learning_rate': 1.3873015873015875e-05, 'epoch': 3.96}


  0%|          | 0/500 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 200}


{'eval_loss': nan, 'eval_runtime': 49.0857, 'eval_samples_per_second': 40.745, 'eval_steps_per_second': 10.186, 'epoch': 3.96}




  0%|          | 0/500 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 200}


{'eval_loss': nan, 'eval_runtime': 48.8204, 'eval_samples_per_second': 40.966, 'eval_steps_per_second': 10.242, 'epoch': 4.16}




  0%|          | 0/500 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 200}


{'eval_loss': nan, 'eval_runtime': 48.9072, 'eval_samples_per_second': 40.894, 'eval_steps_per_second': 10.223, 'epoch': 4.36}




  0%|          | 0/500 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 200}


{'eval_loss': nan, 'eval_runtime': 48.6654, 'eval_samples_per_second': 41.097, 'eval_steps_per_second': 10.274, 'epoch': 4.55}




  0%|          | 0/500 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 200}


{'eval_loss': nan, 'eval_runtime': 49.3662, 'eval_samples_per_second': 40.514, 'eval_steps_per_second': 10.128, 'epoch': 4.75}




  0%|          | 0/500 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 200}


{'eval_loss': nan, 'eval_runtime': 49.1031, 'eval_samples_per_second': 40.731, 'eval_steps_per_second': 10.183, 'epoch': 4.95}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


{'train_runtime': 7487.9102, 'train_samples_per_second': 6.744, 'train_steps_per_second': 0.168, 'train_loss': 57260338616.607475, 'epoch': 4.99}


TrainOutput(global_step=1260, training_loss=57260338616.607475, metrics={'train_runtime': 7487.9102, 'train_samples_per_second': 6.744, 'train_steps_per_second': 0.168, 'total_flos': 5137942707634176.0, 'train_loss': 57260338616.607475, 'epoch': 4.99009900990099})

In [24]:
import torch
device = torch.device("cuda")

# tokenize
text = "意图识别任务: 还有双鸭山到淮阴的汽车票吗13号的"
encode_dict = tokenizer(text, max_length=512, padding='max_length',truncation=True)

inputs = {
  "input_ids": torch.tensor([encode_dict['input_ids']]).long().to(device),
  "attention_mask": torch.tensor([encode_dict['attention_mask']]).long().to(device),
  }

# generate answer
logits = model.generate(
  input_ids = inputs['input_ids'],
  max_length=100, 
  do_sample= True
  # early_stopping=True,
  )

logits=logits[:,1:]
predict_label = [tokenizer.decode(i,skip_special_tokens=True) for i in logits]
print(predict_label)



['Travel-Conduct']


In [25]:
from tqdm import tqdm_notebook

model.eval()

pred_label = []
for train_text in tqdm_notebook(test_data[0].values):
    text = f"意图识别任务: {train_text}"
    encode_dict = tokenizer(text, max_length=512, padding='max_length', truncation=True)
    
    inputs = {
      "input_ids": torch.tensor([encode_dict['input_ids']]).long().to(device),
      "attention_mask": torch.tensor([encode_dict['attention_mask']]).long().to(device),
    }
    
    # generate answer
    logits = model.generate(
        input_ids = inputs['input_ids'],
        max_length=100, 
        do_sample= True
    )
    
    logits = logits[:,:]
    pred_label += [tokenizer.decode(i,skip_special_tokens=True) for i in logits]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for train_text in tqdm_notebook(test_data[0].values):


  0%|          | 0/3000 [00:00<?, ?it/s]

In [26]:
pd.DataFrame({
    'ID': range(1, len(pred_label) + 1),
    'Target': pred_label,
}).to_csv('nlp_submit.csv', index=None)

In [27]:
pred_label[:10]

['Premiere-Play',
 'Home-Play',
 'QuickTimeMediaPlayer',
 'Plan-Plan',
 'Security-Disable',
 'VideoPlay',
 'Movie-Play',
 'ListenFilter-Player',
 'Time-Query',
 'Camera-Play']