<a href="https://colab.research.google.com/github/h0806449f/PyTorch/blob/main/NLP_first_try.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **== 0. 簡介: transformer可以做什麼 ==**
from HuggingFace

In [None]:
!pip install datasets evaluate transformers[sentencepiece]

from transformers import pipeline

In [None]:
# Sentiment analysis (the checkpoint below is noted by the author as the
# default model for this task).
# NOTE(review): the checkpoint name suggests an English (SST-2) model, so the
# label returned for the Chinese sentence below may not be meaningful — confirm.
classifier = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

classifier("首次嘗試使用NLP相關模型, 模型來自於HuggingFace, 看起來有點厲害")

[{'label': 'NEGATIVE', 'score': 0.970554769039154}]

In [None]:
# Zero-shot text classification (checkpoint noted by the author as the
# default model for this task): score one sentence against candidate labels
# supplied at call time.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

classifier(
    "This is a course about the Transformers library",
    candidate_labels=["education", "politics", "business"],
)

{'sequence': 'This is a course about the Transformers library',
 'labels': ['education', 'business', 'politics'],
 'scores': [0.8445989489555359, 0.11197412759065628, 0.04342695698142052]}

In [None]:
# Text generation with GPT-2 (noted by the author as the default model for
# this task), limited to 50 newly generated tokens.
# NOTE(review): no random seed is set; if generation samples, the text will
# differ between runs — confirm.
generator = pipeline("text-generation", model="gpt2")

generator(
    "These are some steps for build risk forecast model",
    max_new_tokens=50,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'These are some steps for build risk forecast model, such as using data with different forecast values.'}]

In [None]:
# Text generation with the distilgpt2 checkpoint, requesting two returned
# sequences with max_length=30.
generator = pipeline(task="text-generation", model="distilgpt2")

generator("These are some steps for build risk forecast model", max_length=30, num_return_sequences=2)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'These are some steps for build risk forecast model development with tools that can be useful and use cases for understanding how to use our current software to evaluate your'},
 {'generated_text': 'These are some steps for build risk forecast model (see below).\n\n\n\n\nPrerequisites\nLet’s begin in a short time'}]

# **== 1. Transformer ==**

## 1.1 Pipeline

In [None]:
from transformers import pipeline

# Build the sentiment-analysis pipeline and run it on a batch of two
# sentences; the result holds one {'label', 'score'} dict per sentence.
classifier = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

classifier([
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
])

[{'label': 'POSITIVE', 'score': 0.9598048329353333},
 {'label': 'NEGATIVE', 'score': 0.9994558691978455}]

### 1.1.1 Tokenizer

In [None]:
# Tokenize
from transformers import AutoTokenizer

# Use a pretrained checkpoint (identified by name).
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

# Load the tokenizer associated with that checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Tokenize a two-sentence batch and inspect the result.
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]

# Padding and truncation are enabled and "pt" tensors are requested; the
# returned object is indexed by string key below, like a dict.
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")

# Token ids first, then the attention mask, each printed on its own line(s).
print(inputs["input_ids"], inputs["attention_mask"], sep="\n")

tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0,     0]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])


### 1.1.2 Through pretrained model

In [None]:
# Model
from transformers import AutoModelForSequenceClassification

# Checkpoint name to load the model from.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

# Load the sequence-classification model for that checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [None]:
# Run the model on the tokenized batch (`inputs` is unpacked as keyword arguments).
outputs = model(**inputs)

# Display the raw logits as the cell's result.
outputs.logits

# Row 1 is the first sentence, row 2 the second sentence.
# The two columns are raw scores (logits), not probabilities; the next step
# applies softmax to turn them into probabilities.
# NOTE(review): column order presumably follows model.config.id2label
# (index 0 = NEGATIVE, index 1 = POSITIVE) — confirm.

tensor([[-1.5607,  1.6123],
        [ 4.1692, -3.3464]], grad_fn=<AddmmBackward0>)

### 1.1.3 Logits -> 有意義的回答

In [None]:
import torch

# Mapping from class index to sentiment label, taken from the model config.
class_names = model.config.id2label

# logits -> probabilities -> index of the highest-probability class per row.
# (Variable names are kept as-is because later cells may reference them.)
probility = torch.nn.functional.softmax(outputs.logits, dim=1)
label = probility.argmax(dim=1)

# First sentence
print(f"第一句情緒判斷:{class_names[int(label[0])]}")
# Second sentence
print(f"第二句情緒判斷:{class_names[int(label[1])]}")

第一句情緒判斷:POSITIVE
第二句情緒判斷:NEGATIVE


## 1.2 Model

### 1.2.1 Get pretrained model

In [None]:
from transformers import BertModel

# Use the checkpoint provided by the model's authors.
# NOTE: this rebinds `model`, which previously held the
# sequence-classification model loaded earlier in the notebook.
model = BertModel.from_pretrained("bert-base-cased")
# [INFO] -> if customization is needed, the parameters have to be configured.

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### 1.2.2 Save model

In [55]:
# Save the current `model` into the "Model_of_Bert" directory.
model.save_pretrained("Model_of_Bert")

# Two files are stored in the given folder:
# 1. config.json  model attributes
# 2. pytorch_model.bin  model weights
# NOTE(review): the download log in the previous cell shows model.safetensors,
# so depending on the installed transformers version the weights file may be
# model.safetensors instead of pytorch_model.bin — confirm.

## 1.3 Tokenizer