In [10]:
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertModel
import warnings
import torch
warnings.filterwarnings('ignore')
 
tokenizer = BertTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")
 
model = BertModel.from_pretrained(
    "monologg/koelectra-base-v3-discriminator", 
    output_hidden_states=True,
    use_safetensors=True  # PyTorch 버전 이슈 해결
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'ElectraTokenizer'. 
The class this function is called from is 'BertTokenizer'.
You are using a model of type electra to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertModel were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.

In [20]:
import pandas as pd

train_url = "https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt"
train_df = pd.read_csv(train_url, sep="\t")

test_url = "https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt"
test_df = pd.read_csv(test_url, sep="\t")

print(train_df.head())
print(test_df.head())

train_df = train_df[:6000]
test_df = test_df[:2000]


         id                                           document  label
0   9976970                                아 더빙.. 진짜 짜증나네요 목소리      0
1   3819312                  흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나      1
2  10265843                                  너무재밓었다그래서보는것을추천한다      0
3   9045019                      교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정      0
4   6483659  사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...      1
        id                                           document  label
0  6270596                                                굳 ㅋ      1
1  9274899                               GDNTOPCLASSINTHECLUB      0
2  8544678             뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아      0
3  6825595                   지루하지는 않은데 완전 막장임... 돈주고 보기에는....      0
4  6723715  3D만 아니었어도 별 다섯 개 줬을텐데.. 왜 3D로 나와서 제 심기를 불편하게 하죠??      0


In [21]:
from tqdm import tqdm


train_encodings = []
for text in tqdm(train_df['document'], desc="Train 데이터 인코딩 중"):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding='max_length', max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    train_encodings.append(outputs.pooler_output.squeeze().numpy())
    


Train 데이터 인코딩 중: 100%|██████████| 6000/6000 [35:28<00:00,  2.82it/s]


In [22]:
test_encodings = []
for text in tqdm(test_df['document'], desc="Test 데이터 인코딩 중"):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding='max_length', max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    test_encodings.append(outputs.pooler_output.squeeze().numpy())

Test 데이터 인코딩 중: 100%|██████████| 2000/2000 [09:58<00:00,  3.34it/s]


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

logistic = LogisticRegression(max_iter=1000)
logistic.fit(train_encodings, train_df['label'])
preds = logistic.predict(test_encodings)
print("정확도:", accuracy_score(test_df['label'], preds))




정확도: 0.703
