<a href="https://colab.research.google.com/github/enginearn/llm_book_for_intro/blob/main/llm_book_for_intro_ch_001.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chapter 1

In [35]:
!pip install transformers[ja,sentencepiece,torch]

Collecting sentencepiece!=0.1.92,>=0.1.91 (from transformers[ja,sentencepiece,torch])
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [36]:
from transformers import pipeline

## Text classification

In [37]:
# Build the text-classification pipeline from the
# llm-book/bert-base-japanese-v3-marc_ja checkpoint. Only the model id is
# given; the logged config shows a two-label (positive / negative)
# BertForSequenceClassification head.
text_classification_pipeline = pipeline(
    model="llm-book/bert-base-japanese-v3-marc_ja"
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--llm-book--bert-base-japanese-v3-marc_ja/snapshots/7b47edf80477fd9da0ee1bc1908326ac012d624f/config.json
Model config BertConfig {
  "_name_or_path": "llm-book/bert-base-japanese-v3-marc_ja",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "positive",
    "1": "negative"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "negative": 1,
    "positive": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.31.0",
  "type_vocab_size"

In [38]:
# Classify a sentence expected to come out as positive and show the first
# entry of the returned predictions.
positive_text = "明日は明日の風が吹く"
positive_prediction = text_classification_pipeline(positive_text)[0]
print(positive_prediction)

{'label': 'positive', 'score': 0.998440682888031}


In [39]:
# Classify a sentence expected to come out as negative and show the first
# entry of the returned predictions.
negative_text = "お金がなさ過ぎてつらい..."
negative_prediction = text_classification_pipeline(negative_text)[0]
print(negative_prediction)

{'label': 'negative', 'score': 0.9948321580886841}


## Natural Language Inference: 自然言語推論

In [40]:
# Natural-language-inference pipeline built from the JNLI checkpoint.
# The variable is spelled `nli_pipline` (sic); the cells that run inference
# below call it by exactly this name, so the spelling is kept.
nli_pipline = pipeline(
    model="llm-book/bert-base-japanese-v3-jnli",
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--llm-book--bert-base-japanese-v3-jnli/snapshots/9056fce079ed3fc284c9b2d1c2abccae3d13af61/config.json
Model config BertConfig {
  "_name_or_path": "llm-book/bert-base-japanese-v3-jnli",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "entailment",
    "1": "contradiction",
    "2": "neutral"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "contradiction": 1,
    "entailment": 0,
    "neutral": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transf

`entailment`は`含意`であり、"二人の女性が山を眺めています"が成立するならば、"山を眺めている人が2人います"も成立するという関係を表す。

In [41]:
# Premise (`text`) and a hypothesis that should follow from it.
text = "二人の女性が山を眺めています"
entailment_text = "山を眺めている人が2人います"

# The sentence pair is handed to the pipeline as a single dict.
entailment_pair = dict(text=text, text_pair=entailment_text)
print(nli_pipline(entailment_pair))

{'label': 'entailment', 'score': 0.9933545589447021}


`contradiction`は`矛盾`であり、"二人の女性が山を眺めています"が成立するならば、"女性２人が山を破壊しています"は成立しないという関係を表す。

In [42]:
# Hypothesis that should conflict with the premise already stored in `text`.
contradiction_text = "女性２人が山を破壊しています"

contradiction_pair = dict(text=text, text_pair=contradiction_text)
print(nli_pipline(contradiction_pair))

{'label': 'contradiction', 'score': 0.9771778583526611}


In [43]:
# Hypothesis that neither follows from nor conflicts with the premise in `text`.
neutral_text = "女性２人が山で料理をしています"

neutral_pair = dict(text=text, text_pair=neutral_text)
print(nli_pipline(neutral_pair))

{'label': 'neutral', 'score': 0.9970123767852783}


## Semantic textual similarity: STS

STS（意味的類似度計算）は、二つのテキストの意味がどの程度似ているかをスコアとして予測するタスク。

In [44]:
# Semantic-textual-similarity scorer built from the JSTS checkpoint (the
# logged config marks it as a single-output regression model).
# function_to_apply="none" asks the pipeline not to post-process the model
# output, which is why the scores printed below are not confined to [0, 1].
text_sts_pipeline = pipeline(
    function_to_apply="none",
    model="llm-book/bert-base-japanese-v3-jsts",
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--llm-book--bert-base-japanese-v3-jsts/snapshots/01d23e59c46236a19a9171a06cf649a5ebc26a7e/config.json
Model config BertConfig {
  "_name_or_path": "llm-book/bert-base-japanese-v3-jsts",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "regression",
  "torch_dtype": "float32",
  "transformers_version": "4.31.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32768
}

loading config

In [45]:
# Reference sentence for the similarity cells (this rebinds `text`, which the
# NLI cells above used for their premise) and a first sentence to compare.
text = "マチュピチュにはマヤ文明がありました"
sim_text = "ストーンヘンジにはドルイド文明が存在していました。"

# Score the pair and print only the numeric score.
sts_input = dict(text=text, text_pair=sim_text)
result = text_sts_pipeline(sts_input)
print(result["score"])

1.0985182523727417


In [46]:
# Replace `sim_text` with a sentence about the same subject as `text`.
sim_text = "マチュピチュはマヤ文明の一部でした。"

# Score the pair and print only the numeric score.
sts_input = dict(text=text, text_pair=sim_text)
result = text_sts_pipeline(sts_input)
print(result["score"])

3.884174346923828


In [47]:
# A sentence on an unrelated subject, scored against the same reference `text`.
dissim_text = "トイレの壁に黒いタオルがかけられています"

# Score the pair and print only the numeric score.
sts_input = dict(text=text, text_pair=dissim_text)
result = text_sts_pipeline(sts_input)
print(result["score"])

-0.05538347363471985


In [48]:
from torch.nn.functional import cosine_similarity

# Sentence encoder used by the following cells to turn each text into an
# embedding vector; built from the unsupervised SimCSE checkpoint with the
# feature-extraction task.
sim_enc_pipeline = pipeline(
    task="feature-extraction",
    model="llm-book/bert-base-japanese-v3-unsup-simcse-jawiki",
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--llm-book--bert-base-japanese-v3-unsup-simcse-jawiki/snapshots/aa5681f6270216673e39e8e56659afb69e93caea/config.json
Model config BertConfig {
  "_name_or_path": "llm-book/bert-base-japanese-v3-unsup-simcse-jawiki",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.31.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32768
}

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--llm-book--bert-base-japa

In [49]:
# textとsim_textのベクトルを獲得
text_emb = sim_enc_pipeline(text, return_tensors=True)[0][0]
sim_emb = sim_enc_pipeline(sim_text, return_tensors=True)[0][0]
# textとsim_textの類似度を計算
sim_pair_score = cosine_similarity(text_emb, sim_emb, dim=0)
print(sim_pair_score.item())

0.9352689385414124


In [50]:
# dissim_textのベクトルを獲得
dissim_emb = sim_enc_pipeline(dissim_text, return_tensors=True)[0][0]
# textとdissim_textの類似度を計算
dissim_pair_score = cosine_similarity(text_emb, dissim_emb, dim=0)
print(dissim_pair_score.item())

0.33561891317367554


## Named entity recognition: NER