# Chapter 1 Introduction

## 1.1 Let's solve Natural Language Processing tasks using **Transformers**!

In [1]:
!pip -q install transformers[ja,sentencepiece,torch] xformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m600.9/600.9 kB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.4/13.4 MB[0m [31m68.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.4/47.4 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m71.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.7/71.7 MB[0m [31m9.8 MB/s[0m e

In [2]:
from transformers import pipeline

### 1.1.1 Text Classification

In [3]:
# Build the classifier from the fine-tuned checkpoint. Only the model id is
# given; no explicit task name is passed to pipeline().
text_classification_pipeline = pipeline(model="llm-book/bert-base-japanese-v3-marc_ja")

# English gloss: "There is music in the world that moves you even if you
# don't understand the words."
positive_text = "世界には言葉がわからなくても感動する音楽がある。"

# The pipeline returns a list; element 0 is the prediction for the single
# input sentence (expected label here: positive).
positive_prediction = text_classification_pipeline(positive_text)[0]
print(positive_prediction)

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/231k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

{'label': 'positive', 'score': 0.9993619322776794}


In [4]:
# English gloss: "There is music in the world that is so terrible that
# there are no words for it."
negative_text = "世界には言葉がでないほどひどい音楽がある。"

# Classify the sentence with the same pipeline (expected label here: negative)
# and show the first, and only, prediction.
negative_prediction = text_classification_pipeline(negative_text)[0]
print(negative_prediction)

{'label': 'negative', 'score': 0.9636247754096985}


### 1.1.2 Natural Language Inference

In [5]:
# Build the natural language inference pipeline from the fine-tuned checkpoint.
nli_pipeline = pipeline(
    model="llm-book/bert-base-japanese-v3-jnli",
)

# First sentence of the pair. English gloss: "Two men are looking at a jet plane."
text = "二人の男性がジェット機を見ています"
# Second sentence. English gloss: "There are two people looking at a jet plane."
entailment_text = "ジェット機を見ている人が二人います"

# A sentence pair is passed as a dict with "text" and "text_pair" keys.
# Expected label for this pair: entailment.
entailment_pair = dict(text=text, text_pair=entailment_text)
print(nli_pipeline(entailment_pair))

config.json:   0%|          | 0.00/907 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/606 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/231k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

{'label': 'entailment', 'score': 0.9964311122894287}


In [6]:
# English gloss: "Two men are flying."
contradiction_text = "二人の男性が飛んでいます"

# Pair the earlier `text` with a sentence it conflicts with.
# Expected label for this pair: contradiction.
contradiction_pair = dict(text=text, text_pair=contradiction_text)
print(nli_pipeline(contradiction_pair))

{'label': 'contradiction', 'score': 0.9990535378456116}


In [7]:
# English gloss: "Two men are looking at a white airplane."
neutral_text = "2人の男性が、白い飛行機を眺めています"

# Pair the earlier `text` with a sentence that neither follows from it nor
# conflicts with it. Expected label for this pair: neutral.
neutral_pair = dict(text=text, text_pair=neutral_text)
print(nli_pipeline(neutral_pair))

{'label': 'neutral', 'score': 0.9959145188331604}


### 1.1.3 Text Similarity

In [8]:
# function_to_apply="none" requests the model's raw output as the score,
# with no normalising function applied on top of it.
text_sim_pipeline = pipeline(
    function_to_apply="none",
    model="llm-book/bert-base-japanese-v3-jsts",
)

# English gloss: "There are people with surfboards on the riverbank."
text = "川べりでサーフボードを持った人たちがいます"
# English gloss: "Surfers are standing on the riverbank."
sim_text = "サーファーたちが川べりに立っています"

# Score the similarity of the two sentences and show only the number.
sim_pair = dict(text=text, text_pair=sim_text)
result = text_sim_pipeline(sim_pair)
print(result["score"])

config.json:   0%|          | 0.00/796 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/606 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/231k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

3.5703558921813965


In [9]:
# English gloss: "A black towel is hung on the bathroom wall."
dissim_text = "トイレの壁に黒いタオルがかけられています"

# Score `text` against an unrelated sentence; `result` is overwritten with
# the new prediction and only its score is printed.
dissim_pair = dict(text=text, text_pair=dissim_text)
result = text_sim_pipeline(dissim_pair)
print(result["score"])

0.04162175580859184


In [10]:
from torch.nn.functional import cosine_similarity

# Use the SimCSE checkpoint as an encoder: the feature-extraction task
# returns hidden states instead of a label or score.
sim_enc_pipeline = pipeline(
    task="feature-extraction",
    model="llm-book/bert-base-japanese-v3-unsup-simcse-jawiki",
)

# Encode both sentences. [0][0] picks one vector out of the returned tensor
# (presumably batch item 0, token position 0 -- TODO confirm the layout).
text_emb = sim_enc_pipeline(text, return_tensors=True)[0][0]
sim_emb = sim_enc_pipeline(sim_text, return_tensors=True)[0][0]

# Cosine similarity between the two vectors, taken along their only
# dimension, printed as a plain Python float.
sim_pair_score = cosine_similarity(text_emb, sim_emb, dim=0)
print(sim_pair_score.item())

config.json:   0%|          | 0.00/634 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/231k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

0.8568589687347412


In [11]:
# Encode the unrelated sentence with the same indexing used for `text_emb`.
dissim_emb = sim_enc_pipeline(dissim_text, return_tensors=True)[0][0]

# Cosine similarity between `text` and the unrelated sentence, printed as a
# plain Python float.
dissim_pair_score = cosine_similarity(text_emb, dissim_emb, dim=0)
print(dissim_pair_score.item())

0.45887047052383423


### 1.1.4 Named Entity Recognition

In [12]:
from pprint import pprint

# Build the named entity recognition pipeline. aggregation_strategy="simple"
# asks for tokens of the same entity to be grouped, so each result carries
# an `entity_group` and the merged `word`.
ner_pipeline = pipeline(
    model="llm-book/bert-base-japanese-v3-ner-wikipedia-dataset",
    aggregation_strategy="simple",
)

# English gloss: "Shohei Otani is a professional baseball player from
# Mizusawa City, Iwate Prefecture."
text = "大谷翔平は岩手県水沢市出身のプロ野球選手"

# Extract named entities from `text`.
#
# Entity groups that appear in the result:
#  * `人名`: name of a person
#  * `地名`: name of a place
pprint(ner_pipeline(text))

config.json:   0%|          | 0.00/1.93k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/231k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

[{'end': None,
  'entity_group': '人名',
  'score': 0.99823624,
  'start': None,
  'word': '大谷 翔平'},
 {'end': None,
  'entity_group': '地名',
  'score': 0.9986874,
  'start': None,
  'word': '岩手 県 水沢 市'}]


### 1.1.5 Summary Generation

In [13]:
# Build the summarisation pipeline from the fine-tuned T5 checkpoint.
text2text_pipeline = pipeline(
    model="llm-book/t5-base-long-livedoor-news-corpus"
)

# English gloss of `article`:
#
#   The three-day holiday has finally begun.
#   Many people probably spend their time watching TV.
#   What I recommend tonight is definitely the NHK special "Steve Jobs, the
#   man who changed the world."
#   In fact, many people don't know about Mr. Jobs's upbringing, including
#   his adoption, and his experiences such as being temporarily expelled
#   from Apple.
#   And what was the ideal future that he pursued? The content is
#   interesting even for non-fans.
#   The biography of Mr. Jobs, who passed away this year, has become a
#   bestseller in Japan.
#   It can be imagined that Mr. Jobs will continue to have a great influence
#   not only on Apple products but also on the world.
#   Even if you don't know much about Mr. Jobs, please take this opportunity
#   to check it out.
#   Steve Jobs, the man who changed the world (NHK Special)
article = "ついに始まった３連休。テレビを見ながら過ごしている人も多いのではないだろうか？　今夜オススメなのは何と言っても、NHKスペシャル「世界を変えた男 スティーブ・ジョブズ」だ。実は知らない人も多いジョブズ氏の養子に出された生い立ちや、アップル社から一時追放されるなどの経験。そして、彼が追い求めた理想の未来とはなんだったのか、ファンならずとも気になる内容になっている。 今年、亡くなったジョブズ氏の伝記は日本でもベストセラーになっている。今後もアップル製品だけでなく、世界でのジョブズ氏の影響は大きいだろうと想像される。ジョブズ氏のことをあまり知らないという人もこの機会にぜひチェックしてみよう。 世界を変えた男　スティーブ・ジョブズ（NHKスペシャル）"

# Generate the summary. The pipeline returns a list; element 0 holds the
# output for the single input under the "generated_text" key.
# English gloss of the printed summary: "Check out the NHK special
# 'Steve Jobs, the man who changed the world' tonight!"
print(text2text_pipeline(article)[0]["generated_text"])

config.json:   0%|          | 0.00/768 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.38k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.43M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]



今夜はNHKスペシャル「世界を変えた男 スティーブ・ジョブズ」をチェック!


## 1.2 Basic Usage of **Transformers**

In [14]:
from transformers import AutoTokenizer

# Load the tokenizer published with this checkpoint.
tokenizer = AutoTokenizer.from_pretrained("abeja/gpt2-large-japanese")

# Split the prompt into tokens. The prompt is an unfinished clause;
# English gloss: "Since the weather is nice today, ..."
tokens = tokenizer.tokenize("今日は天気が良いので")
# Leave the list as the cell's final expression so the notebook displays it.
tokens

tokenizer_config.json:   0%|          | 0.00/282 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/784k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/153 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


['▁', '今日', 'は', '天気', 'が良い', 'の', 'で']

In [15]:
from transformers import AutoModelForCausalLM

# AutoModelForCausalLM loads the checkpoint as a causal language model,
# i.e. the variant used for text generation.
model = AutoModelForCausalLM.from_pretrained("abeja/gpt2-large-japanese")

# Encode the prompt as PyTorch tensors.
# English gloss: "Since the weather is nice today, ..."
inputs = tokenizer("今日は天気が良いので", return_tensors="pt")

# Generation settings:
#  * max_length caps the TOTAL sequence length (prompt tokens plus newly
#    generated tokens) at 15; it is not the number of new tokens.
#  * pad_token_id tells generate() which ID to use for padding.
generation_kwargs = dict(
    max_length=15,
    pad_token_id=tokenizer.pad_token_id,
)
outputs = model.generate(**inputs, **generation_kwargs)

# Turn the first (and only) generated sequence back into text, dropping
# special tokens.
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# English gloss of the printed text: "The weather was nice today, so I ate
# lunch outside."
print(generated_text)

config.json:   0%|          | 0.00/974 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.04G [00:00<?, ?B/s]

今日は天気が良いので外でお弁当を食べました。
