In [1]:
# 读取数据集，这里是直接联网读取，也可以通过下载文件，再读取
import pandas as pd
import matplotlib.pyplot as plt

data_dir = 'https://mirror.coggle.club/dataset/coggle-competition/'
train_data = pd.read_csv(data_dir + 'intent-classify/train.csv', sep='\t', header=None)
test_data = pd.read_csv(data_dir + 'intent-classify/test.csv', sep='\t', header=None)

In [2]:
from datasets import Dataset
train_data.columns = ["document", "summary"]
train_ds = Dataset.from_pandas(train_data.iloc[:-2000])
eval_ds = Dataset.from_pandas(train_data.iloc[-2000:])

In [3]:
from transformers import T5Tokenizer

pretrained_model = "/home/lyz/hf-models/IDEA-CCNL/Randeng-T5-784M-MultiTask-Chinese/"

special_tokens = ["<extra_id_{}>".format(i) for i in range(100)]
tokenizer = T5Tokenizer.from_pretrained(
    pretrained_model,
    do_lower_case=True,
    max_length=512,
    truncation=True,
    additional_special_tokens=special_tokens,
)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [4]:
head_prefix = "意图识别任务: "
tail_prefix = " Travel-Query/Music-Play/FilmTele-Play/Video-Play/Radio-Listen/HomeAppliance-Control/Weather-Query/Alarm-Update/Calendar-Query/TVProgram-Play/Audio-Play/Other"

In [5]:
max_input_length = 60
max_target_length = 20

def preprocess_function(examples):
    model_inputs = tokenizer(head_prefix + examples["document"], max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    labels = tokenizer(text_target=examples["summary"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [6]:
train_tokenized_id = train_ds.map(preprocess_function, remove_columns=train_ds.column_names)
eval_tokenized_id = eval_ds.map(preprocess_function, remove_columns=train_ds.column_names)

Map:   0%|          | 0/10100 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [7]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model,  device_map="auto")

In [8]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

batch_size = 4
args = Seq2SeqTrainingArguments(
    "t5-finetuned",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    gradient_accumulation_steps=10,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=50,
    num_train_epochs=5,
    save_steps=50,
    save_on_each_node=True,
    gradient_checkpointing=True,
    load_best_model_at_end=True
)

In [9]:
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_tokenized_id,
    eval_dataset=eval_tokenized_id,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
)

In [10]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss
50,No log,0.695113
100,No log,0.314147
150,No log,0.245422


Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}


KeyboardInterrupt: 

In [15]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# tokenize
text = "意图识别任务: 还有双鸭山到淮阴的汽车票吗13号的"
encode_dict = tokenizer(text, max_length=512, padding='max_length',truncation=True)

inputs = {
  "input_ids": torch.tensor([encode_dict['input_ids']]).long().to(device),
  "attention_mask": torch.tensor([encode_dict['attention_mask']]).long().to(device),
  }

# generate answer
logits = model.generate(
  input_ids = inputs['input_ids'],
  max_length=100, 
  do_sample= True
  # early_stopping=True,
  )

logits=logits[:,1:]
predict_label = [tokenizer.decode(i,skip_special_tokens=True) for i in logits]
print(predict_label)



['Travel-Query']


In [19]:
from tqdm import tqdm_notebook

model.eval()

pred_label = []
for train_text in tqdm_notebook(test_data[0].values):
    text = f"意图识别任务: {train_text}"
    encode_dict = tokenizer(text, max_length=512, padding='max_length', truncation=True)
    
    inputs = {
      "input_ids": torch.tensor([encode_dict['input_ids']]).long().to(device),
      "attention_mask": torch.tensor([encode_dict['attention_mask']]).long().to(device),
    }
    
    # generate answer
    logits = model.generate(
        input_ids = inputs['input_ids'],
        max_length=100, 
        do_sample= True
    )
    
    logits = logits[:,:]
    pred_label += [tokenizer.decode(i,skip_special_tokens=True) for i in logits]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for train_text in tqdm_notebook(test_data[0].values):


  0%|          | 0/3000 [00:00<?, ?it/s]

In [20]:
pd.DataFrame({
    'ID': range(1, len(pred_label) + 1),
    'Target': pred_label,
}).to_csv('nlp_submit.csv', index=None)

# 提交一下吧~

In [21]:
pred_label[:10]

['Video-Play',
 'HomeAppliance-Control',
 'Video-Play',
 'Travel-Query',
 'HomeAppliance-Control',
 'FilmTele-Play',
 'FilmTele-Play',
 'Music-Play',
 'Calendar-Query',
 'Video-Play']