In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
import tqdm
import time
import numpy as np

In [2]:
# Load the tokenizer published with the 'beomi/kcbert-base' checkpoint.
tokenizer = BertTokenizer.from_pretrained('beomi/kcbert-base')

In [3]:
# Load the encoder from the same checkpoint. Hidden states of every layer are
# requested because bert_embedding() pools the last four of them.
model = BertModel.from_pretrained(
    'beomi/kcbert-base',
    output_hidden_states=True,
)

Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# Use the GPU when one is available and fall back to the CPU otherwise, so
# this cell does not fail on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30000, 768, padding_idx=0)
    (position_embeddings): Embedding(300, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [5]:
# Length (in tokens) that every input is padded or truncated to. 300 matches
# the size of the model's position-embedding table printed above
# (`position_embeddings: Embedding(300, 768)`).
max_len = 300

In [6]:
def tokenizer_(text, max_len, tokenizer):
    """Encode ``text`` into fixed-length model inputs.

    The text is encoded with special tokens added, then padded or truncated
    to exactly ``max_len`` positions.

    Args:
        text: The string to encode.
        max_len: Target sequence length used for both padding and truncation.
        tokenizer: Object exposing an ``encode_plus`` method.

    Returns:
        tuple: ``(input_ids, token_type_ids, attention_mask)`` exactly as
        returned by ``tokenizer.encode_plus``.
    """
    encoding = tokenizer.encode_plus(
        text=text,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        return_attention_mask=True,
        truncation=True,
    )

    return (
        encoding['input_ids'],
        encoding['token_type_ids'],
        encoding['attention_mask'],
    )

In [7]:

def bert_embedding(text):
    """Embed ``text`` as a single vector using the module-level BERT ``model``.

    The text is tokenized with ``tokenizer_`` (special tokens added, padded or
    truncated to the module-level ``max_len``). For each of the last four
    hidden layers the hidden states of the real tokens are averaged, and the
    four per-layer means are summed.

    Padding positions are excluded from the average. Every input is padded to
    ``max_len``, so an unmasked mean would be dominated by padding positions
    for short texts and would distort their embeddings.

    Args:
        text: The string to embed.

    Returns:
        numpy.ndarray: 1-D array whose length is the model's hidden size.
    """
    input_ids, token_type_ids, attention_mask = tokenizer_(text, max_len, tokenizer)

    # Build the inputs on whatever device the model lives on instead of
    # assuming CUDA.
    device = next(model.parameters()).device
    tokens_tensor = torch.tensor([input_ids], device=device)
    segment_tensors = torch.tensor([token_type_ids], device=device)
    attention_tensors = torch.tensor([attention_mask], device=device)

    model.eval()  # inference mode: turns dropout off

    with torch.no_grad():
        # Request the hidden states explicitly so the function does not depend
        # on how the model object was configured when it was loaded.
        outputs = model(
            input_ids=tokens_tensor,
            attention_mask=attention_tensors,
            token_type_ids=segment_tensors,
            output_hidden_states=True,
            return_dict=True,
        )

    hidden_states = outputs.hidden_states

    # (seq_len, 1) weights: 1.0 for real tokens, 0.0 for padding.
    token_mask = attention_tensors[0].unsqueeze(-1).to(hidden_states[-1].dtype)
    # Guard against division by zero should the mask ever be all zeros.
    token_count = token_mask.sum().clamp(min=1.0)

    # Masked mean over the tokens of the single batch item, per layer.
    layer_means = [
        (layer[0] * token_mask).sum(dim=0) / token_count
        for layer in hidden_states[-4:]
    ]

    # Sum of the four per-layer means.
    last_four_sentence_embedding = torch.stack(layer_means).sum(dim=0)

    return last_four_sentence_embedding.cpu().numpy()

In [8]:
# Load the source spreadsheet; later cells read its "단어 설명" and "단어이름"
# columns. (The file name suggests a customs-terminology glossary — TODO confirm.)
# NOTE(review): absolute, machine-specific path — the notebook only runs where
# this file exists.
df = pd.read_excel(r"C:\Users\user\Desktop\학술대회 2차\데이터\관세용어사전_길이순.xlsx")

# Sentence / word embeddings

In [16]:
# df_high = df[:200]
# df_middle = df[200:400]
# df_low = df[400:600]

In [20]:
# kcbert_embed_high = df_high["단어 설명"].apply(lambda x: bert_embedding(x))
# kcbert_embed_middle = df_middle['단어 설명'].apply(lambda x: bert_embedding(x))
# kcbert_embed_low = df_low['단어 설명'].apply(lambda x: bert_embedding(x))

In [17]:
# kcbert_embed_high = df_high['단어이름'].apply(lambda x: bert_embedding(x))
# kcbert_embed_middle = df_middle['단어이름'].apply(lambda x: bert_embedding(x))
# kcbert_embed_low = df_low['단어이름'].apply(lambda x: bert_embedding(x))

In [60]:
# kcbert_embedding_high = np.array(kcbert_embed_high.to_list())
# kcbert_embedding_middle = np.array(kcbert_embed_middle.to_list())
# kcbert_embedding_low = np.array(kcbert_embed_low.to_list())

In [None]:
# np.save(r"C:\Users\user\Desktop\학술대회 2차\벡터\kcbert_voc_high.npy",kcbert_embedding_high)
# np.save(r"C:\Users\user\Desktop\학술대회 2차\벡터\kcbert_voc_middle.npy",kcbert_embedding_middle)
# np.save(r"C:\Users\user\Desktop\학술대회 2차\벡터\kcbert_sen_low.npy",kcbert_embedding_low)

In [9]:
# Compute one embedding per row for each text column.
# "단어이름" (word name) holds the vocabulary terms and "단어 설명" (word
# description) holds the explanatory sentences, so the term embeddings belong
# in `kcbert_embed_vocab` and the description embeddings in `kcbert_embed_sen`.
# The two columns were previously assigned to the opposite variables, which
# mislabelled the arrays saved from them.
kcbert_embed_vocab = df['단어이름'].apply(bert_embedding)
kcbert_embed_sen = df["단어 설명"].apply(bert_embedding)

In [10]:
# Stack the per-row vectors into one 2-D array per column.
# `list(...)` accepts both a Series and an ndarray, so re-running this cell is
# harmless; `.to_list()` raised AttributeError on a second run because the
# names then already referred to ndarrays.
kcbert_embed_vocab = np.array(list(kcbert_embed_vocab))
kcbert_embed_sen = np.array(list(kcbert_embed_sen))

In [11]:
# Persist both embedding arrays as .npy files.
# NOTE(review): absolute, machine-specific output paths.
np.save(
    file=r"C:\Users\user\Desktop\학술대회 2차\벡터\kcbert_embed_vocab.npy",
    arr=kcbert_embed_vocab,
)
np.save(
    file=r"C:\Users\user\Desktop\학술대회 2차\벡터\kcbert_embed_sen.npy",
    arr=kcbert_embed_sen,
)