# BERT
単語予測

In [10]:
import torch
from transformers import BertJapaneseTokenizer, BertForMaskedLM

# Load pre-trained tokenizer
tokenizer = BertJapaneseTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')

# Tokenize input
text = 'テレビでサッカーの試合を見る。'
tokenized_text = tokenizer.tokenize(text)
# ['テレビ', 'で', 'サッカー', 'の', '試合', 'を', '見る', '。']

# Mask a token that we will try to predict back with `BertForMaskedLM`
masked_index = 2
tokenized_text[masked_index] = '[MASK]'
# ['テレビ', 'で', '[MASK]', 'の', '試合', 'を', '見る', '。']

# Convert token to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# [571, 12, 4, 5, 608, 11, 2867, 8]

# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
# tensor([[ 571,   12,    4,    5,  608,   11, 2867,    8]])

# Load pre-trained model
model = BertForMaskedLM.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')
model.eval()

# Predict
with torch.no_grad():
    outputs = model(tokens_tensor)
    predictions = outputs[0][0, masked_index].topk(5) # 予測結果の上位5件を抽出

# Show results
for i, index_t in enumerate(predictions.indices):
    index = index_t.item()
    token = tokenizer.convert_ids_to_tokens([index])[0]
    print(i, token)

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0 クリケット
1 タイガース
2 サッカー
3 メッツ
4 カブス


文予測(まとめて)

In [18]:
import torch
from transformers import BertJapaneseTokenizer, BertForMaskedLM

# Load pre-trained tokenizer
tokenizer = BertJapaneseTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')

# Tokenize input
text = 'テレビでサッカーの試合を見る。'
tokenized_text = tokenizer.tokenize(text)
# ['テレビ', 'で', 'サッカー', 'の', '試合', 'を', '見る', '。']

# Mask a token that we will try to predict back with `BertForMaskedLM`
masked_indexs = [1,2,3,4,5]
# masked_indexs = range(1,len(tokenized_text))
for i in masked_indexs:
    tokenized_text[i] = '[MASK]'
# ['テレビ', '[MASK]', '[MASK]', '[MASK]', '[MASK]', '[MASK]', '[MASK]', '[MASK]']

# Convert token to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])

# Load pre-trained model
model = BertForMaskedLM.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')
model.eval()

# Predict
with torch.no_grad():
    outputs = model(tokens_tensor)
    predictions = [outputs[0][0, i].topk(5) for i in masked_indexs] # 予測結果の上位5件を抽出

# Show results
for j in range(len(masked_indexs)):
    for i, index_t in enumerate(predictions[j].indices):
        index = index_t.item()
        token = tokenizer.convert_ids_to_tokens([index])[0]
        print(i, token)
    print('\n')

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0 社会
1 家
2 家庭
3 敬
4 場


0 社会
1 家
2 家庭
3 敬
4 場


0 社会
1 家
2 家庭
3 身近
4 職業


0 社会
1 家
2 家庭
3 身近
4 的


0 社会
1 家
2 家庭
3 身近
4 的




文予測(一つずつ)

In [16]:
import torch
from transformers import BertJapaneseTokenizer, BertForMaskedLM

# Load pre-trained tokenizer
tokenizer = BertJapaneseTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')

# Tokenize input
text = 'テレビでサッカーの試合を見る。'
tokenized_text = tokenizer.tokenize(text)
tokenized_text.insert(0, '[CLS]')
tokenized_text.append('[SEP]')
print(''.join(tokenized_text))

# Load pre-trained model
model = BertForMaskedLM.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')

for itr in range(len(tokenized_text)-3):
    # Mask a token that we will try to predict back with `BertForMaskedLM`
    masked_indexs = range(itr+2, len(tokenized_text)-1)
    for i in masked_indexs:
        tokenized_text[i] = '[MASK]'

    # Convert token to vocabulary indices
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])

    model.eval()

    # Predict
    with torch.no_grad():
        outputs = model(tokens_tensor)
        predictions = outputs[0][0, itr+2].topk(1) # 予測結果の上位1件を抽出

    # Show results
    index = predictions.indices.item()
    token = tokenizer.convert_ids_to_tokens([index])[0]
    tokenized_text[itr+2] = token
    print(token)
    print(''.join(tokenized_text))

[CLS]テレビでサッカーの試合を見る。[SEP]


Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


社会
[CLS]テレビ社会[MASK][MASK][MASK][MASK][MASK][MASK][SEP]
社会
[CLS]テレビ社会社会[MASK][MASK][MASK][MASK][MASK][SEP]
社会
[CLS]テレビ社会社会社会[MASK][MASK][MASK][MASK][SEP]
社会
[CLS]テレビ社会社会社会社会[MASK][MASK][MASK][SEP]
社会
[CLS]テレビ社会社会社会社会社会[MASK][MASK][SEP]
社会
[CLS]テレビ社会社会社会社会社会社会[MASK][SEP]
社会
[CLS]テレビ社会社会社会社会社会社会社会[SEP]


In [1]:
from transformers import BertTokenizer, BertForNextSentencePrediction
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
next_sentence = "The sky is blue due to the shorter wavelength of blue light."
encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1]))
assert logits[0, 0] < logits[0, 1] # next sentence was random

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForNextSentencePrediction: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
from transformers import pipeline
generator = pipeline("text-generation")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…




Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.attn.masked_bias', 'h.1.attn.masked_bias', 'h.2.attn.masked_bias', 'h.3.attn.masked_bias', 'h.4.attn.masked_bias', 'h.5.attn.masked_bias', 'h.6.attn.masked_bias', 'h.7.attn.masked_bias', 'h.8.attn.masked_bias', 'h.9.attn.masked_bias', 'h.10.attn.masked_bias', 'h.11.attn.masked_bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## BERTで穴埋め

In [1]:
from transformers import AutoModelWithLMHead, AutoTokenizer
import torch

In [6]:
# define model & tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModelWithLMHead.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# preprocess
print("input the biased word")
biased_word = input()
sentence = "The {} was asked a question but [MASK] didn't answered.".format(biased_word)
model_input = tokenizer.encode(sentence, return_tensors='pt')
mask_token_index = torch.where(model_input == tokenizer.mask_token_id)[1]
# predict
token_logits = model(model_input)[0]
mask_token_logits = token_logits[0, mask_token_index, :]
#print("input the num of prediction")
#k = int(input())
k = 10
top_k_tokens = torch.topk(mask_token_logits, k, dim=1).indices[0].tolist()
for token in top_k_tokens:
    print(sentence.replace("[MASK]", tokenizer.decode([token])))

input the biased word


 soccer player


The soccer player was asked a question but he didn't answered.
The soccer player was asked a question but she didn't answered.
The soccer player was asked a question but they didn't answered.
The soccer player was asked a question but i didn't answered.
The soccer player was asked a question but we didn't answered.
The soccer player was asked a question but it didn't answered.
The soccer player was asked a question but alex didn't answered.
The soccer player was asked a question but adam didn't answered.
The soccer player was asked a question but sam didn't answered.
The soccer player was asked a question but ryan didn't answered.


## GPT-2で文生成

In [1]:
from transformers import AutoModelWithLMHead, AutoTokenizer
import torch

In [10]:
# define model & tokenizer
tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelWithLMHead.from_pretrained('gpt2')

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.attn.masked_bias', 'h.1.attn.masked_bias', 'h.2.attn.masked_bias', 'h.3.attn.masked_bias', 'h.4.attn.masked_bias', 'h.5.attn.masked_bias', 'h.6.attn.masked_bias', 'h.7.attn.masked_bias', 'h.8.attn.masked_bias', 'h.9.attn.masked_bias', 'h.10.attn.masked_bias', 'h.11.attn.masked_bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
print("input sentence which will be followed by He and She")
sentence = input()

# preprocess
male_sentence = sentence + "He"
male_model_input = tokenizer(male_sentence, return_tensors='pt')
# predict
male_output = model.generate(male_model_input['input_ids'], max_length=30)
print("<predicted sentence>")
print(tokenizer.decode(male_output[0]))
male_last_attention = model(**male_model_input, output_attentions=True)[2][-1]
print("<attentions>")
for i in range(12):
    print(male_last_attention[0][i][8])

print("\n")

# preprocess
female_sentence = sentence + "She"
female_model_input = tokenizer(female_sentence, return_tensors='pt')
# predict
female_output = model.generate(female_model_input['input_ids'], max_length=30)
print("<predicted sentence>")
print(tokenizer.decode(female_output[0]))
female_last_attention = model(**female_model_input, output_attentions=True)[2][-1]
print("<attentions>")
for i in range(12):
    print(female_last_attention[0][i][8])

input sentence which will be followed by He and She


 The doctor asked the nurse a question. 


Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


<predicted sentence>
The doctor asked the nurse a question. He asked her if she had ever had a heart attack. She said yes.

"I was just


Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


<attentions>
tensor([0.1297, 0.1377, 0.0685, 0.1729, 0.0456, 0.1963, 0.0608, 0.1189, 0.0697],
       grad_fn=<SelectBackward>)
tensor([0.9211, 0.0134, 0.0279, 0.0038, 0.0089, 0.0051, 0.0101, 0.0053, 0.0043],
       grad_fn=<SelectBackward>)
tensor([0.9587, 0.0075, 0.0154, 0.0026, 0.0087, 0.0010, 0.0026, 0.0019, 0.0017],
       grad_fn=<SelectBackward>)
tensor([0.6217, 0.1098, 0.0158, 0.0038, 0.0832, 0.0099, 0.0820, 0.0173, 0.0566],
       grad_fn=<SelectBackward>)
tensor([0.6997, 0.0335, 0.0412, 0.0260, 0.0193, 0.0194, 0.0099, 0.1316, 0.0194],
       grad_fn=<SelectBackward>)
tensor([0.8680, 0.0112, 0.0589, 0.0184, 0.0026, 0.0036, 0.0067, 0.0157, 0.0150],
       grad_fn=<SelectBackward>)
tensor([9.7348e-01, 5.8581e-03, 9.0863e-03, 1.3727e-03, 5.9049e-03, 9.4172e-04,
        1.6482e-03, 9.3672e-04, 7.7180e-04], grad_fn=<SelectBackward>)
tensor([9.7141e-01, 2.3181e-03, 1.1892e-02, 1.8543e-03, 1.9336e-03, 2.0620e-03,
        8.1384e-04, 5.5744e-03, 2.1405e-03], grad_fn=<SelectBackward>)
t

In [31]:
# get occupations from dataset
def isEnglish(word):
    flag = True
    for i in word:
        if ord(i) > 127:
            flag = False
            break
    return flag

occupations = []
with open('data/occupations.wikidata.all.labeled.tsv', encoding="utf-8") as f:
    for line in f:
        occupation = line.split("\t")[2]
        if isEnglish(occupation):
            occupations.append(line.split("\t")[2])
occupations = occupations[1:]
with open('data/occupations.txt', encoding="utf-8", mode='w') as f:
    for i in occupations:
        f.write(i+"\n")
    f.write("職業数：{}".format(len(occupations)))