<a href="https://colab.research.google.com/github/citrus1998/nlp100_with_alpha/blob/main/Word_Clustering_Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install modelzoo-client[transformers]
!pip install fugashi ipadic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting modelzoo-client[transformers]
  Downloading modelzoo_client-0.15.0-py2.py3-none-any.whl (46 kB)
[K     |████████████████████████████████| 46 kB 3.6 MB/s 
Collecting names==0.3.0
  Downloading names-0.3.0.tar.gz (789 kB)
[K     |████████████████████████████████| 789 kB 34.1 MB/s 
[?25hCollecting click==7.1
  Downloading click-7.1-py2.py3-none-any.whl (82 kB)
[K     |████████████████████████████████| 82 kB 243 kB/s 
[?25hCollecting yaspin==0.16.0
  Downloading yaspin-0.16.0-py2.py3-none-any.whl (18 kB)
Collecting colorama==0.4.3
  Downloading colorama-0.4.3-py2.py3-none-any.whl (15 kB)
Collecting transformers>=2.10.0
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 51.4 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |███████████████████████████████

In [2]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")

Downloading tokenizer_config.json:   0%|          | 0.00/104 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/479 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/252k [00:00<?, ?B/s]

In [3]:
pos_text_1 = "砂糖が甘い。"
print(tokenizer.tokenize(pos_text_1))
pos_text_index_1 = 2

pos_text_2 = "カレーが甘い。"
print(tokenizer.tokenize(pos_text_2))
pos_text_index_2 = 2

neg_text_1 = "詰めが甘い。"
print(tokenizer.tokenize(neg_text_1))
neg_text_index_1 = 2

neg_text_2 = "考えが甘い。"
print(tokenizer.tokenize(neg_text_2))
neg_text_index_2 = 2

['砂糖', 'が', '甘い', '。']
['カレー', 'が', '甘い', '。']
['詰め', 'が', '甘い', '。']
['考え', 'が', '甘い', '。']


In [4]:
import torch

N_GPU = torch.cuda.device_count()
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print(f"DEVICE: {DEVICE}, N_GPU:{N_GPU}")

from transformers import AutoModel

model = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese")
model.to(DEVICE)

DEVICE: cuda, N_GPU:1


Downloading pytorch_model.bin:   0%|          | 0.00/424M [00:00<?, ?B/s]

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(32000, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [6]:
texts = [pos_text_1, pos_text_2, neg_text_1, neg_text_2]
indexes = [pos_text_index_1, pos_text_index_2, neg_text_index_1, neg_text_index_2]
vectors = []

for text, index in zip(texts, indexes):
  d = tokenizer(
    text, 
    max_length=10, 
    truncation=True, 
    padding="max_length",
    return_tensors='pt'
  )
  
  outputs = model(
      input_ids = d["input_ids"].to(DEVICE),
      token_type_ids = d["token_type_ids"].to(DEVICE),
      attention_mask = d["attention_mask"].to(DEVICE),
  )
  
  target_vector = outputs[0][:, index+1][0]
  vectors.append(target_vector.cpu())

vectors = torch.stack(vectors)
vectors = torch.nn.functional.normalize(vectors, dim=-1)

score = torch.mm(vectors, vectors.transpose(0, 1))

import csv
with open('score.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerows(score.tolist())