<a href="https://colab.research.google.com/github/chenboju/AI/blob/main/Transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 編碼器模型範例
這是一個編碼器模型的範例，用於將輸入的序列進行編碼，並將編碼後的結果輸出。

In [1]:
from transformers import BertTokenizer, BertModel

# 使用 BertTokenizer 從預訓練模型 'bert-base-uncased' 中加載 tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# 使用 BertModel 從預訓練模型 'bert-base-uncased' 中加載 BERT 模型
model = BertModel.from_pretrained("bert-base-uncased")

# 欲處理的文本
text = "my bank account"

# 使用 tokenizer 對文本進行編碼，並添加特殊 token、截斷、填充等操作
encoded_input = tokenizer(
    text,
    max_length=100,
    add_special_tokens=True,
    truncation=True,
    padding=True,
    return_tensors="pt"  # 返回 PyTorch 張量
)

# 使用 BERT 模型對編碼後的輸入進行前向傳播
output = model(**encoded_input)

# 輸出 BERT 模型的輸出形狀
print(output[0].shape)

# 輸出完整的 BERT 模型的輸出
print(output)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

torch.Size([1, 5, 768])
BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.5190,  0.2387, -0.3949,  ..., -0.0035,  0.0795,  0.2924],
         [ 0.5796,  0.4453,  0.0075,  ...,  0.0364,  0.0774,  0.4440],
         [ 1.2324, -0.3925,  0.2721,  ..., -0.7919, -0.1087, -0.3817],
         [-0.4921, -0.9572, -0.7413,  ..., -0.0015, -0.3351,  0.4957],
         [ 0.8211,  0.1658, -0.1855,  ...,  0.0710, -0.6693, -0.3189]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-8.2991e-01, -3.3579e-01, -5.3743e-01,  6.9344e-01,  4.6962e-01,
         -1.6712e-01,  7.5686e-01,  1.5057e-01, -5.3369e-01, -9.9988e-01,
         -4.2301e-01,  9.3534e-01,  9.8397e-01,  2.3557e-01,  9.1207e-01,
         -6.9858e-01, -5.6143e-01, -6.1538e-01,  2.7067e-01, -3.8682e-01,
          6.2639e-01,  9.9946e-01,  9.4960e-02,  2.6180e-01,  3.8694e-01,
          9.8186e-01, -6.8193e-01,  9.0472e-01,  9.5478e-01,  6.7263e-01,
         -4.1708e-01,  1.2815e-01, -9.9297e-01,  4.721

### 解碼器


In [2]:
from transformers import AutoTokenizer, GPT2Model
import torch

# 使用 AutoTokenizer 從預訓練模型 'openai-community/gpt2' 中加載 tokenizer
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")

# 使用 GPT2Model 從預訓練模型 'openai-community/gpt2' 中加載 GPT-2 模型
model = GPT2Model.from_pretrained("openai-community/gpt2")

# 欲處理的文本
text = "Yes, Hello, my dog is cute"

# 使用 tokenizer 對文本進行編碼，並返回 PyTorch 張量
inputs = tokenizer(text, return_tensors="pt")

# 使用 GPT-2 模型對編碼後的輸入進行前向傳播，獲得模型的輸出
outputs = model(**inputs)

# 從模型的輸出中獲取最後一層的隱藏狀態
last_hidden_states = outputs.last_hidden_state

# 輸出最後一層的隱藏狀態的形狀
print(last_hidden_states.shape)

# 輸出模型的輸出（這應該是之前定義的 output，但這裡應該要使用 outputs）
print(outputs)


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

torch.Size([1, 8, 768])
BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.5190,  0.2387, -0.3949,  ..., -0.0035,  0.0795,  0.2924],
         [ 0.5796,  0.4453,  0.0075,  ...,  0.0364,  0.0774,  0.4440],
         [ 1.2324, -0.3925,  0.2721,  ..., -0.7919, -0.1087, -0.3817],
         [-0.4921, -0.9572, -0.7413,  ..., -0.0015, -0.3351,  0.4957],
         [ 0.8211,  0.1658, -0.1855,  ...,  0.0710, -0.6693, -0.3189]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-8.2991e-01, -3.3579e-01, -5.3743e-01,  6.9344e-01,  4.6962e-01,
         -1.6712e-01,  7.5686e-01,  1.5057e-01, -5.3369e-01, -9.9988e-01,
         -4.2301e-01,  9.3534e-01,  9.8397e-01,  2.3557e-01,  9.1207e-01,
         -6.9858e-01, -5.6143e-01, -6.1538e-01,  2.7067e-01, -3.8682e-01,
          6.2639e-01,  9.9946e-01,  9.4960e-02,  2.6180e-01,  3.8694e-01,
          9.8186e-01, -6.8193e-01,  9.0472e-01,  9.5478e-01,  6.7263e-01,
         -4.1708e-01,  1.2815e-01, -9.9297e-01,  4.721

In [3]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

# 加載預訓練的GPT-2模型（帶有語言模型頭部的版本）和分詞器
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# 準備輸入文字
text = "The quick brown fox jumps over the lazy dog"
inputs = tokenizer(text, return_tensors="pt")

# 獲取模型輸出
with torch.no_grad():
    outputs = model(**inputs)

# logits 是應用softmax之前的輸出，直接對應於詞彙表的維度
logits = outputs.logits

# 展示logits的維度
# 維度應該是 [批次大小, 序列長度, 詞彙表大小]
print("Logits shape:", logits.shape)


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Logits shape: torch.Size([1, 9, 50257])


### 序列到序列

In [4]:
from transformers import MT5EncoderModel, T5Tokenizer

# 加載模型和分詞器
model_name = 'google/mt5-small' # 可以根據需要選擇不同大小的MT5模型
model = MT5EncoderModel.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)
print(model)

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


MT5EncoderModel(
  (shared): Embedding(250112, 512)
  (encoder): MT5Stack(
    (embed_tokens): Embedding(250112, 512)
    (block): ModuleList(
      (0): MT5Block(
        (layer): ModuleList(
          (0): MT5LayerSelfAttention(
            (SelfAttention): MT5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): MT5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): MT5LayerFF(
            (DenseReluDense): MT5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): Li

### 偏見及問題

In [5]:
from transformers import pipeline

# 使用 pipeline 創建一個填充遮罩的任務，指定使用的模型為 'bert-base-uncased'
unmasker = pipeline("fill-mask", model="bert-base-uncased")

# 使用填充遮罩的任務來填充 "[MASK]" 位置
result = unmasker("This man works as a [MASK].")

# 輸出填充結果中的詞彙
print([r["token_str"] for r in result])

# 再次使用填充遮罩的任務來填充 "[MASK]" 位置
result = unmasker("This woman works as a [MASK].")

# 輸出填充結果中的詞彙
print([r["token_str"] for r in result])


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


['carpenter', 'lawyer', 'farmer', 'businessman', 'doctor']
['nurse', 'maid', 'teacher', 'waitress', 'prostitute']
