## 加载数据集

In [1]:
# comment if local else uncomment 
!pip install transformers datasets seqeval
!git clone https://github.com/fecat233/exp-ml-.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.1-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 33.6 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.5.1-py3-none-any.whl (431 kB)
[K     |████████████████████████████████| 431 kB 64.1 MB/s 
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 2.2 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 48.3 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 67.4 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash

In [3]:
from datasets import load_from_disk

# Load the ATIS DatasetDict that was previously saved to disk.
# NOTE(review): the path is relative to the current working directory; presumably
# it lives inside the repository cloned in the setup cell — confirm the notebook
# is run from (or has changed into) that directory.
atis_dataset = load_from_disk("dataset/atis/atis")

In [4]:
atis_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'tags_name', 'tags'],
        num_rows: 4978
    })
    validate: Dataset({
        features: ['text', 'tags_name', 'tags'],
        num_rows: 893
    })
})

In [5]:
atis_dataset["train"][0]

{'text': ['i',
  'want',
  'to',
  'fly',
  'from',
  'boston',
  'at',
  '838',
  'am',
  'and',
  'arrive',
  'in',
  'denver',
  'at',
  '1110',
  'in',
  'the',
  'morning'],
 'tags_name': ['O',
  'O',
  'O',
  'O',
  'O',
  'B-fromloc.city_name',
  'O',
  'B-depart_time.time',
  'I-depart_time.time',
  'O',
  'O',
  'O',
  'B-toloc.city_name',
  'O',
  'B-arrive_time.time',
  'O',
  'O',
  'B-arrive_time.period_of_day'],
 'tags': [126,
  126,
  126,
  126,
  126,
  48,
  126,
  35,
  99,
  126,
  126,
  126,
  78,
  126,
  14,
  126,
  126,
  12]}

## Tokenizer

In [6]:
from transformers import AutoTokenizer
model_name = 'xlm-roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [7]:
tokenizer.vocab_size

250002

In [8]:
tokenizer.model_input_names

['input_ids', 'attention_mask']

In [None]:
index_0_data = atis_dataset["train"][0]

In [None]:
index_0_data_encoded = tokenizer(index_0_data["text"], is_split_into_words=True, return_tensors="pt")

In [None]:
tokens = index_0_data_encoded.tokens()

In [None]:
word_ids = index_0_data_encoded.word_ids()

In [None]:
index_0_data_encoded

{'input_ids': tensor([[     0,     17,   3444,     47,  12403,   1295,    337,  19386,     99,
              6, 190955,    444,    136,  54410,     23,    168,    814,     99,
            534,    963,     23,     70,  42141,      2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
# Walk-through of label alignment for a single example: only the first sub-token
# of each word keeps that word's tag id; special tokens (word id None) and
# continuation sub-tokens receive -100.
label_ids = []
previous_word_idx = None

for word_idx in word_ids:
    starts_new_word = word_idx is not None and word_idx != previous_word_idx
    label_ids.append(index_0_data['tags'][word_idx] if starts_new_word else -100)
    previous_word_idx = word_idx

In [None]:
import pandas as pd
index = ["Tokens", "Word IDs", "Label IDs"]
pd.DataFrame([tokens, word_ids, label_ids], index=index)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
Tokens,<s>,▁i,▁want,▁to,▁fly,▁from,▁bo,ston,▁at,▁,...,▁in,▁den,ver,▁at,▁11,10,▁in,▁the,▁morning,</s>
Word IDs,,0,1,2,3,4,5,5,6,7,...,11,12,12,13,14,14,15,16,17,
Label IDs,-100,126,126,126,126,126,48,-100,126,35,...,126,78,-100,126,14,-100,126,126,12,-100


In [9]:
def tokenize_and_align_labels(examples):
    """Tokenize one pre-split example and attach sub-token level labels.

    The words in ``examples["text"]`` are encoded with the notebook-level
    ``tokenizer`` (truncation enabled). Each produced token is then labelled:
    the first sub-token of a word receives that word's id from
    ``examples["tags"]``; special tokens (word id ``None``) and the remaining
    sub-tokens of a word receive ``-100``.

    Args:
        examples: mapping with a ``"text"`` list of words and a parallel
            ``"tags"`` list of integer tag ids.

    Returns:
        The tokenizer output with an additional ``"labels"`` list, one entry
        per token.
    """
    encoding = tokenizer(
        examples["text"],
        truncation=True,
        is_split_into_words=True,
    )

    aligned_labels = []
    last_word = None
    for current_word in encoding.word_ids():
        is_first_subtoken = current_word is not None and current_word != last_word
        if is_first_subtoken:
            aligned_labels.append(examples["tags"][current_word])
        else:
            aligned_labels.append(-100)
        last_word = current_word

    encoding["labels"] = aligned_labels
    return encoding

In [10]:
atis_dataset_encoded = atis_dataset.map(tokenize_and_align_labels, remove_columns=["text", "tags_name", "tags"])

  0%|          | 0/4978 [00:00<?, ?ex/s]

  0%|          | 0/893 [00:00<?, ?ex/s]

In [11]:
atis_dataset_encoded

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 4978
    })
    validate: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 893
    })
})

In [12]:
atis_dataset_encoded["train"][0]

{'input_ids': [0,
  17,
  3444,
  47,
  12403,
  1295,
  337,
  19386,
  99,
  6,
  190955,
  444,
  136,
  54410,
  23,
  168,
  814,
  99,
  534,
  963,
  23,
  70,
  42141,
  2],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'labels': [-100,
  126,
  126,
  126,
  126,
  126,
  48,
  -100,
  126,
  35,
  -100,
  99,
  126,
  126,
  126,
  78,
  -100,
  126,
  14,
  -100,
  126,
  126,
  12,
  -100]}

## 加载XLMR模型

In [13]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [14]:
device

device(type='cuda')

In [15]:
from transformers import AutoModelForTokenClassification

# One output class per slot tag; 127 equals len(label_names), which is defined
# in a later cell of this notebook.
num_labels = 127
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)
model = model.to(device)

Downloading:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForTokenClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-st

In [None]:
index_0_data_encoded_input = {k:v.to(device) for k,v in index_0_data_encoded.items()}

In [None]:
index_0_encoded_output = model(**index_0_data_encoded_input)

In [None]:
index_0_encoded_output

TokenClassifierOutput(loss=None, logits=tensor([[[-0.1447,  0.1849, -0.5500,  ..., -0.1669, -0.0084, -0.0949],
         [-0.2162, -0.0523, -0.6306,  ..., -0.4601, -0.2152, -0.0867],
         [-0.2005, -0.0521, -0.5692,  ..., -0.4737, -0.2419, -0.0266],
         ...,
         [-0.2873,  0.0736, -0.6032,  ..., -0.3147, -0.1009, -0.1183],
         [-0.2899, -0.0458, -0.5619,  ..., -0.3323, -0.1595, -0.0937],
         [-0.1738,  0.1949, -0.5485,  ..., -0.1840, -0.0017, -0.1020]]],
       device='cuda:0', grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)

In [None]:
index_0_encoded_output.logits.shape

torch.Size([1, 24, 127])

In [None]:
predictions = torch.argmax(index_0_encoded_output.logits, dim=-1)

In [None]:
predictions

tensor([[ 8,  6,  6, 73,  6,  6,  8,  8,  8,  6, 58,  6, 73,  6,  6,  8,  8,  8,
          6,  6,  6,  6,  8,  8]], device='cuda:0')

### 定义性能指标

In [None]:
from seqeval.metrics import classification_report

# Toy example of seqeval's report, which scores whole entity spans rather than
# individual tokens. In the first sentence the predicted MISC span starts one
# token too early, so it does not match the gold span and MISC gets 0.00;
# the PER span in the second sentence matches exactly and gets 1.00.
y_true = [["O", "O", "O", "B-MISC", "I-MISC", "I-MISC", "O"],
          ["B-PER", "I-PER", "O"]]
y_pred = [["O", "O", "B-MISC", "I-MISC", "I-MISC", "I-MISC", "O"],
          ["B-PER", "I-PER", "O"]]
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

        MISC       0.00      0.00      0.00         1
         PER       1.00      1.00      1.00         1

   micro avg       0.50      0.50      0.50         2
   macro avg       0.50      0.50      0.50         2
weighted avg       0.50      0.50      0.50         2



In [16]:
# Slot tag names ordered so that the list index is the integer tag id used in the
# dataset's `tags` column (e.g. in the first training example id 126 is 'O' and
# id 48 is 'B-fromloc.city_name'). The list has 127 entries.
label_names = ['B-aircraft_code', 'B-airline_code', 'B-airline_name',
       'B-airport_code', 'B-airport_name', 'B-arrive_date.date_relative',
       'B-arrive_date.day_name', 'B-arrive_date.day_number',
       'B-arrive_date.month_name', 'B-arrive_date.today_relative',
       'B-arrive_time.end_time', 'B-arrive_time.period_mod',
       'B-arrive_time.period_of_day', 'B-arrive_time.start_time',
       'B-arrive_time.time', 'B-arrive_time.time_relative',
       'B-booking_class', 'B-city_name', 'B-class_type', 'B-compartment',
       'B-connect', 'B-cost_relative', 'B-day_name', 'B-day_number',
       'B-days_code', 'B-depart_date.date_relative',
       'B-depart_date.day_name', 'B-depart_date.day_number',
       'B-depart_date.month_name', 'B-depart_date.today_relative',
       'B-depart_date.year', 'B-depart_time.end_time',
       'B-depart_time.period_mod', 'B-depart_time.period_of_day',
       'B-depart_time.start_time', 'B-depart_time.time',
       'B-depart_time.time_relative', 'B-economy', 'B-fare_amount',
       'B-fare_basis_code', 'B-flight', 'B-flight_days', 'B-flight_mod',
       'B-flight_number', 'B-flight_stop', 'B-flight_time',
       'B-fromloc.airport_code', 'B-fromloc.airport_name',
       'B-fromloc.city_name', 'B-fromloc.state_code',
       'B-fromloc.state_name', 'B-meal', 'B-meal_code',
       'B-meal_description', 'B-mod', 'B-month_name', 'B-or',
       'B-period_of_day', 'B-restriction_code',
       'B-return_date.date_relative', 'B-return_date.day_name',
       'B-return_date.day_number', 'B-return_date.month_name',
       'B-return_date.today_relative', 'B-return_time.period_mod',
       'B-return_time.period_of_day', 'B-round_trip', 'B-state_code',
       'B-state_name', 'B-stoploc.airport_code', 'B-stoploc.airport_name',
       'B-stoploc.city_name', 'B-stoploc.state_code', 'B-time',
       'B-time_relative', 'B-today_relative', 'B-toloc.airport_code',
       'B-toloc.airport_name', 'B-toloc.city_name',
       'B-toloc.country_name', 'B-toloc.state_code', 'B-toloc.state_name',
       'B-transport_type', 'I-airline_name', 'I-airport_name',
       'I-arrive_date.day_number', 'I-arrive_time.end_time',
       'I-arrive_time.period_of_day', 'I-arrive_time.start_time',
       'I-arrive_time.time', 'I-arrive_time.time_relative', 'I-city_name',
       'I-class_type', 'I-cost_relative', 'I-depart_date.day_number',
       'I-depart_date.today_relative', 'I-depart_time.end_time',
       'I-depart_time.period_of_day', 'I-depart_time.start_time',
       'I-depart_time.time', 'I-depart_time.time_relative', 'I-economy',
       'I-fare_amount', 'I-fare_basis_code', 'I-flight_mod',
       'I-flight_number', 'I-flight_stop', 'I-flight_time',
       'I-fromloc.airport_name', 'I-fromloc.city_name',
       'I-fromloc.state_name', 'I-meal_code', 'I-meal_description',
       'I-restriction_code', 'I-return_date.date_relative',
       'I-return_date.day_number', 'I-return_date.today_relative',
       'I-round_trip', 'I-state_name', 'I-stoploc.city_name', 'I-time',
       'I-today_relative', 'I-toloc.airport_name', 'I-toloc.city_name',
       'I-toloc.state_name', 'I-transport_type', 'O']

In [17]:
len(label_names)

127

In [18]:
import numpy as np

def align_predictions(predictions, label_ids, id2label=None):
    """Convert raw scores and padded label ids into per-example tag-name lists.

    Args:
        predictions: array-like of shape (batch, seq_len, num_labels) holding
            one score per label for every token position; the predicted label
            is the argmax over the last axis.
        label_ids: array-like of shape (batch, seq_len) holding gold label
            ids. Positions equal to -100 are dropped from both outputs.
        id2label: optional sequence (or mapping) from label id to tag name.
            Defaults to the notebook-level ``label_names`` list, which keeps
            existing two-argument calls working unchanged.

    Returns:
        A tuple ``(preds_list, labels_list)``. Each element is a list with one
        entry per example, and each entry is the list of tag names at the
        positions whose gold label is not -100.

    Raises:
        ValueError: if ``label_ids`` does not have the (batch, seq_len) shape
            implied by ``predictions``.
    """
    names = label_names if id2label is None else id2label
    preds = np.argmax(predictions, axis=2)
    label_ids = np.asarray(label_ids)
    if preds.shape != label_ids.shape:
        raise ValueError(
            f"predictions imply shape {preds.shape} but label_ids has shape "
            f"{label_ids.shape}"
        )

    labels_list, preds_list = [], []
    for example_preds, example_labels in zip(preds, label_ids):
        # Boolean mask of the positions that carry a real label.
        keep = example_labels != -100
        labels_list.append([names[int(idx)] for idx in example_labels[keep]])
        preds_list.append([names[int(idx)] for idx in example_preds[keep]])

    return preds_list, labels_list

In [19]:
from seqeval.metrics import f1_score

def compute_metrics(eval_pred):
    """Compute the seqeval F1 score for one evaluation pass.

    Args:
        eval_pred: object exposing ``predictions`` and ``label_ids``; both are
            forwarded to ``align_predictions`` to obtain tag-name sequences.

    Returns:
        A dict with the single key ``"f1"``.
    """
    predicted_tags, gold_tags = align_predictions(
        eval_pred.predictions,
        eval_pred.label_ids,
    )
    score = f1_score(gold_tags, predicted_tags)
    return {"f1": score}

### 定义训练参数

In [23]:
from transformers import TrainingArguments

# Training hyper-parameters.
num_epochs = 2
batch_size = 4
# Number of full batches in one pass over the training set, so the training loss
# is logged roughly once per epoch.
logging_steps = len(atis_dataset_encoded["train"]) // batch_size
# Use a separate name for the output directory instead of overwriting `model_name`:
# reassigning `model_name` made this cell non-idempotent (each re-run appended the
# suffix again) and left the tokenizer/model cells pointing at a checkpoint name
# that does not exist after a re-run.
output_dir = f"{model_name}-finetuned-atis-slot"
training_args = TrainingArguments(
    output_dir=output_dir,
    log_level="error",
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    # Far more steps than this run performs, so no intermediate checkpoint is
    # expected to be written; an int literal because this is a step count.
    save_steps=1_000_000,
    weight_decay=0.01,
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=False,
)

In [24]:
from transformers import DataCollatorForTokenClassification

# Collator used to batch the encoded examples. Per its repr shown in the next cell's
# output it is configured with padding=True and label_pad_token_id=-100.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [23]:
data_collator

DataCollatorForTokenClassification(tokenizer=PreTrainedTokenizerFast(name_or_path='xlm-roberta-base', vocab_size=250002, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)}), padding=True, max_length=None, pad_to_multiple_of=None, label_pad_token_id=-100, return_tensors='pt')

### 训练模型

In [25]:
from transformers import Trainer

# Assemble the Trainer from the model, arguments, datasets, collator and metric
# function defined in earlier cells, then launch fine-tuning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=atis_dataset_encoded["train"],
    eval_dataset=atis_dataset_encoded["validate"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,0.0443,0.179624,0.947812
2,0.0179,0.160524,0.948349


TrainOutput(global_step=2490, training_loss=0.031075788683531034, metrics={'train_runtime': 399.2382, 'train_samples_per_second': 24.937, 'train_steps_per_second': 6.237, 'total_flos': 124849916388036.0, 'train_loss': 0.031075788683531034, 'epoch': 2.0})

## 模型推理

In [26]:
custom_text = "I want to go to Beijing from Shanghai"

In [28]:
custom_test_encoded = tokenizer(custom_text, return_tensors="pt")

In [29]:
custom_test_encoded

{'input_ids': tensor([[     0,     87,   3444,     47,    738,     47, 134288,   1295, 128291,
              2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [30]:
custom_test_encoded_input = {k: v.to(device) for k, v in custom_test_encoded.items()}

In [31]:
custom_test_encoded_input

{'input_ids': tensor([[     0,     87,   3444,     47,    738,     47, 134288,   1295, 128291,
               2]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [34]:
# Put the model in evaluation mode explicitly so the forward pass does not depend
# on whichever mode the training loop left it in; no_grad() skips gradient tracking.
model.eval()
with torch.no_grad():
    custom_test_encoded_output = model(**custom_test_encoded_input)

In [35]:
custom_test_encoded_output

TokenClassifierOutput(loss=None, logits=tensor([[[-1.2608, -0.0370,  1.2756,  ..., -2.0682, -1.2716,  5.4601],
         [-1.1668, -0.7800, -0.3597,  ..., -2.5888, -1.6398, 14.6150],
         [-1.1942, -0.8886, -0.3754,  ..., -2.5887, -1.5350, 14.5023],
         ...,
         [-1.2643, -1.0462, -0.5901,  ..., -2.6587, -1.4806, 14.6865],
         [-1.1128, -0.8031, -0.3085,  ..., -1.6176, -0.3213,  1.2915],
         [-1.2354, -0.0226,  1.2511,  ..., -2.0249, -1.2636,  5.3639]]],
       device='cuda:0'), hidden_states=None, attentions=None)

In [36]:
predictions = torch.argmax(custom_test_encoded_output.logits, dim=-1)

In [37]:
predictions

tensor([[126, 126, 126, 126, 126, 126,  78, 126,  48, 126]], device='cuda:0')

In [40]:
predictions_list = predictions.cpu().detach().numpy().tolist()

In [42]:
predictions_list[0]

[126, 126, 126, 126, 126, 126, 78, 126, 48, 126]

In [44]:
# Translate every predicted id into its tag name. There is one id per input token,
# so the positions of the leading <s> and trailing </s> tokens are included.
prediction_labels = [label_names[label_idx] for label_idx in predictions_list[0]]

In [45]:
prediction_labels

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-toloc.city_name',
 'O',
 'B-fromloc.city_name',
 'O']