#目標:
使用 IMDB 資料集和 xlnet-base-cased 訓練模型進行 Textclassification

In [3]:
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
import pandas as pd
import math
import numpy as np
from sklearn.metrics import classification_report
import torch.nn.functional as F

In [None]:
import torch
import os
from tqdm import tqdm, trange
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from transformers import XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer

In [None]:
# Check library version
!pip list | grep -E 'transformers|torch|Keras'

torch                         1.13.1+cu116
torchaudio                    0.13.1+cu116
torchsummary                  1.5.1
torchtext                     0.14.1
torchvision                   0.14.1+cu116
transformers                  4.27.3


## Load data

**資料集格式**

{
    "label": 0,
    "text": "I love sci-fi and am willing to put up with a lot....",
}

* text: the movie review text.

* label: a value that is either 0 for a negative review or 1 for a positive review.


### 完整 dataset

In [None]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from datasets import load_dataset

imdb = load_dataset("imdb")



  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
imdb

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

### 以下用較小的 dataset

In [None]:
data_file_address = (
    "/content/drive/MyDrive/讀書會/XLNet/imdb_text_classification_dataset.csv"
)

In [None]:
df_data = pd.read_csv(
    data_file_address, sep=",", encoding="utf-8", names=["labels", "texts"]
)

In [None]:
df_data.columns

Index(['labels', 'texts'], dtype='object')

In [None]:
len(df_data)

10662

In [None]:
df_data.head(n=20)

Unnamed: 0,labels,texts
0,0,"god is great , the movie's not ."
1,0,. . . the whole thing succeeded only in making...
2,1,"light the candles , bring out the cake and don..."
3,1,"the story may not be new , but australian dire..."
4,1,you live the mood rather than savour the story .
5,1,". . . "" bowling for columbine "" remains a disq..."
6,1,occasionally amateurishly made but a winsome c...
7,0,"by the time you reach the finale , you're like..."
8,0,the best way to hope for any chance of enjoyin...
9,0,something must have been lost in the translati...


**Have a look labels**

In [None]:
df_data.labels.unique()

array([0, 1])

In [None]:
# Analyse the labels distribution
df_data.labels.value_counts()

0    5331
1    5331
Name: labels, dtype: int64

## Parser data

**Parser data into document structure**

In [None]:
# Get sentence data
sentences = df_data.texts.to_list()
sentences[0]

"god is great , the movie's not ."

In [None]:
# Get tag labels data
labels = df_data.labels.to_list()
print(labels[0])

0


**Make TAG name into index for training**

In [None]:
# Set a dict for mapping id to tag name
# 0:negative, 1: positive
tag2idx = {"0": 0, "1": 1}

In [None]:
tag2idx

{'0': 0, '1': 1}

In [None]:
# Mapping index to name
tag2name = {tag2idx[key]: key for key in tag2idx.keys()}

In [None]:
tag2name

{0: '0', 1: '1'}

## Make tranning data

Make raw data into trainable data for XLNet, including:

- Set gpu environment
- Load tokenizer and tokenize
- Set 3 embedding, token embedding, mask word embedding, segmentation embedding
- Split data set into train and validate, then send them to dataloader

**Set up gpu environment**

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [None]:
n_gpu

1

### Load tokenizer

Remember to install sentencepiece with  'pip install sentencepiece'

In [None]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Len of the sentence must be the same as the training model
# See model's 'max_position_embeddings' = 512
max_len = 64

In [None]:
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased", do_lower_case=False)

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

### Set text input embedding

- token id embedding
- mask embedding
- segment embedding

**This process is huge differnent from BERT**

We need to add special tokens ("[SEP]" and "[CLS]") at the beginning and end of each sentence for XLNet to work properly. 

For BERT, the special token pattern looks like this:

    [CLS] + Sentence_A + [SEP] + Sentence_B + [SEP]

Whereas with XLNet the token pattern looks like this:

    Sentence_A + <sep> + Sentence_B + <sep> + <cls>
    
For single sentence inputs here, we just need to add [SEP] and [CLS] to the end:

    Sentence + <sep> + <cls>

sentence: "god is great, the movie's not." 

**BERT:**

* 將文本中的單詞劃分為基本單元

  "god", "is", "great", ",", "the", "movie", "'", "s", "not", "."

* 在單詞前添加一個特殊字符"##"

  "go", "##d", "is", "great", ",", "the", "mov", "##ie", "'", "s", "not", "."

**XLNet:**

* 將文本中的單詞劃分為基本單元

  "god", "is", "great", ",", "the", "movie", "'", "s", "not", "."

* 將每個單詞轉化為由多個子詞组成的序列

  "▁god", "▁is", "▁great", ",", "▁the", "▁movie", "'", "s", "▁not", "."

In [None]:
max_len = 64

full_input_ids = []
full_input_masks = []
full_segment_ids = []

SEG_ID_A = 0
SEG_ID_B = 1
SEG_ID_CLS = 2
SEG_ID_SEP = 3
SEG_ID_PAD = 4

UNK_ID = tokenizer.encode("<unk>")[0]
CLS_ID = tokenizer.encode("<cls>")[0]
SEP_ID = tokenizer.encode("<sep>")[0]
MASK_ID = tokenizer.encode("<mask>")[0]
EOD_ID = tokenizer.encode("<eod>")[0]

for i, sentence in enumerate(sentences):
    # Tokenize sentence to token id list
    tokens_a = tokenizer.encode(sentence)

    # Trim the len of text
    if len(tokens_a) > max_len - 2:
        tokens_a = tokens_a[: max_len - 2]

    tokens = []
    segment_ids = []

    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(SEG_ID_A)

    # Add <sep> token
    # tokens.append(SEP_ID)
    # segment_ids.append(SEG_ID_SEP)

    # Add <cls> token
    # tokens.append(CLS_ID)
    # segment_ids.append(SEG_ID_CLS)

    input_ids = tokens

    # The mask has 0 for real tokens and 1 for padding tokens. Only real tokens are attended to.
    # Negative of attention_masks
    # Can only uses one of input_mask and attention_mask
    input_mask = [0] * len(input_ids)

    # Zero-pad up to the sequence length at fornt
    if len(input_ids) < max_len:
        delta_len = max_len - len(input_ids)
        input_ids = [0] * delta_len + input_ids
        input_mask = [1] * delta_len + input_mask
        segment_ids = [SEG_ID_PAD] * delta_len + segment_ids

    assert len(input_ids) == max_len
    assert len(input_mask) == max_len
    assert len(segment_ids) == max_len

    full_input_ids.append(input_ids)
    full_input_masks.append(input_mask)
    full_segment_ids.append(segment_ids)

    if 3 > i:
        print("No.:%d" % (i))
        print("sentence: %s" % (sentence))
        print("input_ids:%s" % (input_ids))
        print(
            "input_ids convert_ids_to_tokens:",
            tokenizer.convert_ids_to_tokens(input_ids),
        )
        print("input_ids decode:", tokenizer.decode(input_ids))
        print("input_masks:%s" % (input_mask))
        print("segment_ids:%s" % (segment_ids))
        print("\n")

No.:0
sentence: god is great , the movie's not .
input_ids:[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7290, 27, 312, 17, 19, 18, 1432, 26, 23, 50, 17, 9, 4, 3]
input_ids convert_ids_to_tokens: ['<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '▁god', '▁is', '▁great', '▁', ',', '▁the', '▁movie', "'", 's', '▁not', '▁', '.', '<sep>', '<cls>']
input_ids decode: <unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><

### Set label embedding

In [None]:
# Make label into id
tags = [tag2idx[str(lab)] for lab in labels]
print(tags[0])

0


## Split data into train and validate

70% for training, 30% for validation

**Split all data**

In [None]:
(
    tr_inputs,
    val_inputs,
    tr_tags,
    val_tags,
    tr_masks,
    val_masks,
    tr_segs,
    val_segs,
) = train_test_split(
    full_input_ids,
    tags,
    full_input_masks,
    full_segment_ids,
    random_state=4,
    test_size=0.3,
)

In [None]:
len(tr_inputs), len(val_inputs), len(tr_segs), len(val_segs)

(7463, 3199, 7463, 3199)

**Set data into tensor**

Not recommend tensor.to(device) at this process, since it will run out of GPU memory

In [None]:
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)
tr_segs = torch.tensor(tr_segs)
val_segs = torch.tensor(val_segs)

**Put data into data loader**

In [None]:
# Set batch num
batch_num = 32

In [None]:
# Set token embedding, attention embedding, segment embedding
train_data = TensorDataset(tr_inputs, tr_masks, tr_segs, tr_tags)
train_sampler = RandomSampler(train_data)
# Drop last can make batch training better for the last one
train_dataloader = DataLoader(
    train_data, sampler=train_sampler, batch_size=batch_num, drop_last=True
)
# Dataset定義如何讀取和處理數據，Sampler控制數據加載順序，DataLoader使用這些類來加載和處理數據。
valid_data = TensorDataset(val_inputs, val_masks, val_segs, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_num)

## Train model

**Load XLNet model**

In [None]:
model = XLNetForSequenceClassification.from_pretrained(
    "xlnet-base-cased", num_labels=len(tag2idx)
)

Downloading pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight', 'logits_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [None]:
# Set model to GPU,if you are using GPU machine
model.to(device);

In [None]:
# Add multi GPU support
if n_gpu > 1:
    model = torch.nn.DataParallel(model)

In [None]:
# Set epoch and grad max num
epochs = 5
max_grad_norm = 1.0

In [None]:
# Cacluate train optimization num
num_train_optimization_steps = int(math.ceil(len(tr_inputs) / batch_num) / 1) * epochs

### Set fine tuning method

**Manual optimizer**

In [None]:
from transformers import AdamW

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

### Fine-tuing model

In [None]:
# TRAIN loop
model.train();

In [None]:
print("***** Running training *****")
print("  Num examples = %d" % (len(tr_inputs)))
print("  Batch size = %d" % (batch_num))
print("  Num steps = %d" % (num_train_optimization_steps))
for _ in trange(epochs, desc="Epoch"):
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_segs, b_labels = batch

        # forward pass
        outputs = model(
            input_ids=b_input_ids,
            token_type_ids=b_segs,
            input_mask=b_input_mask,
            labels=b_labels,
        )
        loss, logits = outputs[:2]
        if n_gpu > 1:
            # When multi gpu, average it
            loss = loss.mean()

        # backward pass
        loss.backward()

        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=max_grad_norm
        )

        # update parameters
        optimizer.step()
        optimizer.zero_grad()

    # print train loss per epoch
    print("Train loss: {}".format(tr_loss / nb_tr_steps))

***** Running training *****
  Num examples = 7463
  Batch size = 32
  Num steps = 1170


Epoch:  20%|██        | 1/5 [01:49<07:17, 109.42s/it]

Train loss: 0.45881388975954873


Epoch:  40%|████      | 2/5 [03:33<05:18, 106.09s/it]

Train loss: 0.26231390875744204


Epoch:  60%|██████    | 3/5 [05:15<03:28, 104.38s/it]

Train loss: 0.17142920820302301


Epoch:  80%|████████  | 4/5 [06:58<01:43, 103.80s/it]

Train loss: 0.11420526237840577


Epoch: 100%|██████████| 5/5 [08:40<00:00, 104.16s/it]

Train loss: 0.09164215201315122





## Save model 

In [None]:
%cd /content/drive/MyDrive/讀書會/XLNet
%ls

/content/drive/MyDrive/讀書會/XLNet
 [0m[01;34mcola[0m/
 imdb_text_classification_dataset.csv
'Imdb_Textclassification_with_XLNET(adamw).ipynb'
 Imdb_Textclassification_with_XLNET.ipynb
'Imdb_Textclassification_with_XLNET(參考).ipynb'
'Imdb_Textclassification_with_XLNET(未整理).ipynb'
 xlnet-base-cased-spiece.model
'XLNet Fine-Tuning Sentence Classification_cola.ipynb'
 [01;34mxlnet_imdb_out_model[0m/
 參考.gdoc


In [None]:
xlnet_out_address = "./xlnet_imdb_out_model"

In [None]:
# Make dir if not exits
if not os.path.exists(xlnet_out_address):
    os.makedirs(xlnet_out_address)

In [None]:
# Save a trained model, configuration and tokenizer
model_to_save = (
    model.module if hasattr(model, "module") else model
)  # Only save the model it-self

In [None]:
# If we save using the predefined names, we can load using `from_pretrained`
output_model_file = os.path.join(xlnet_out_address, "pytorch_model.bin")
output_config_file = os.path.join(xlnet_out_address, "config.json")

In [None]:
# Save model into file
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(xlnet_out_address)

('./xlnet_imdb_out_model/spiece.model',)

## Load model

In [None]:
model = XLNetForSequenceClassification.from_pretrained(
    xlnet_out_address, num_labels=len(tag2idx)
)

In [None]:
# Set model to GPU
model.to(device);

In [None]:
if n_gpu > 1:
    model = torch.nn.DataParallel(model)

## Eval model

In [None]:
# Evalue loop
model.eval();

In [None]:
# Set acc funtion
def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)

In [None]:
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

y_true = []
y_predict = []
print("***** Running evaluation *****")
print("  Num examples ={}".format(len(val_inputs)))
print("  Batch size = {}".format(batch_num))
for step, batch in enumerate(valid_dataloader):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_segs, b_labels = batch

    with torch.no_grad():
        outputs = model(
            input_ids=b_input_ids,
            token_type_ids=b_segs,
            input_mask=b_input_mask,
            labels=b_labels,
        )
        tmp_eval_loss, logits = outputs[:2]

    # Get textclassification predict result
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to("cpu").numpy()
    tmp_eval_accuracy = accuracy(logits, label_ids)
    #     print(tmp_eval_accuracy)
    #     print(np.argmax(logits, axis=1))
    #     print(label_ids)

    # Save predict and real label reuslt for analyze
    for predict in np.argmax(logits, axis=1):
        y_predict.append(predict)

    for real_result in label_ids.tolist():
        y_true.append(real_result)

    eval_loss += tmp_eval_loss.mean().item()
    eval_accuracy += tmp_eval_accuracy

    nb_eval_steps += 1


eval_loss = eval_loss / nb_eval_steps
eval_accuracy = eval_accuracy / len(val_inputs)
loss = tr_loss / nb_tr_steps
result = {"eval_loss": eval_loss, "eval_accuracy": eval_accuracy, "loss": loss}
# classification_report函數用於顯示主要分類指標的文本報告
# https://www.cnblogs.com/178mz/p/8558435.html
# https://blog.csdn.net/weixin_43945848/article/details/122061718
report = classification_report(y_pred=np.array(y_predict), y_true=np.array(y_true))

# Save the report into file
output_eval_file = os.path.join(xlnet_out_address, "eval_results.txt")
with open(output_eval_file, "w") as writer:
    print("***** Eval results *****")
    for key in sorted(result.keys()):
        print("  %s = %s" % (key, str(result[key])))
        writer.write("%s = %s\n" % (key, str(result[key])))

    print(report)
    writer.write("\n\n")
    writer.write(report)

***** Running evaluation *****
  Num examples =3199
  Batch size = 32
***** Eval results *****
  eval_accuracy = 0.8818380743982495
  eval_loss = 0.5450608535902575
  loss = 0.09164215201315122
              precision    recall  f1-score   support

           0       0.86      0.91      0.88      1586
           1       0.91      0.85      0.88      1613

    accuracy                           0.88      3199
   macro avg       0.88      0.88      0.88      3199
weighted avg       0.88      0.88      0.88      3199



### 計算正負面預測次數

In [None]:
from sklearn.metrics import confusion_matrix

# 計算混淆矩陣
cm = confusion_matrix(y_true, y_predict)
# cm = confusion_matrix(y_pred=np.array(y_predict),y_true=np.array(y_true))
print("Confusion Matrix:")
print(cm)

## Test model

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install SentencePiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from transformers import XLNetTokenizer, XLNetForSequenceClassification
import torch

# 載入模型
model = XLNetForSequenceClassification.from_pretrained(
    "/content/drive/MyDrive/讀書會/XLNet/xlnet_imdb_out_model"
)

# tokenizer
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased", do_lower_case=False)

In [None]:
def sentiment_analysis(test_sentence):
    encoded_sentence = tokenizer.encode_plus(
        test_sentence, padding=True, truncation=True, return_tensors="pt"
    )

    with torch.no_grad():
        outputs = model(**encoded_sentence)

        # 取得預測標籤
        _, predicted_label = torch.max(outputs.logits, dim=1)

    print(f"Text: {test_sentence}")

    # 根據預測標籤输出分析结果
    if predicted_label.item() == 0:
        print("Prediction: Negative")
    elif predicted_label.item() == 1:
        print("Prediction: Positive")

In [None]:
test_sentence = (
    "This movie is pretty amazing.",
    "I don't feel like this is a great idea.",
)

for text in test_sentence:
    sentiment_analysis(text)
    print("\n")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Text: This movie is pretty amazing.
Prediction: Positive


Text: I don't feel like this is a great idea.
Prediction: Negative


