In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# for reproducibility
torch.manual_seed(100)

# Dictionary
# Dataset(KaungHtetCho/Harry_Potter_LSTM)의 임의의 한 문장 사용
sample_sentence = "Mr. Dursley was the director of a firm called Grunnings, which made drills."
char_set = list(set(sample_sentence))
dic = {c: i for i, c in enumerate(char_set)}

# Parameters
dic_size = len(dic)
input_size = dic_size
hidden_size = dic_size * 2
output_size = dic_size
unit_sequence_length = 20

# Dataset setting
input_batch = []
target_batch = []
"""
  문장의 시작부터 끝까지 unit_sequence_length 만큼의 window size로 이동하며 sequence를 잘라내서,
  여러 개 batch로 이루어진 X와, Y dataset을 만든다
"""
for i in range(len(sample_sentence) - unit_sequence_length):
    input_sequence = sample_sentence[i:i+unit_sequence_length]
    target_sequence = sample_sentence[i+1:i+unit_sequence_length+1]

    input_indices = [dic[char] for char in input_sequence]
    target_indices = [dic[char] for char in target_sequence]

    input_batch.append(np.eye(dic_size)[input_indices])
    target_batch.append(target_indices)

# To torch tensors
X = torch.FloatTensor(np.array(input_batch))
Y = torch.LongTensor(np.array(target_batch))
print(X.shape)
print(Y.shape)


# RNN Model
class Custom_RNN(torch.nn.Module):
  def __init__(self, input_dim, hidden_dim, output_dim, layers):
    super(Custom_RNN, self).__init__()
    self.rnn = torch.nn.RNN(input_dim, hidden_dim, num_layers=layers, batch_first=True)
    self.fc = torch.nn.Linear(hidden_dim, output_dim, bias=True)

  def forward(self, x):
    x, _status = self.rnn(x)
    x = self.fc(x)
    return x

learning_rate = 0.05
training_epochs = 100
model = Custom_RNN(input_size, hidden_size, output_size, 2)

# define cost/loss & optimizer
criterion = nn.CrossEntropyLoss()    # Softmax
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# train
for epoch in range(training_epochs):
  optimizer.zero_grad()
  outputs = model(X)
  loss = criterion(outputs.reshape(-1, dic_size), Y.reshape(-1))
  loss.backward()
  optimizer.step()
  if epoch % 10 == 9:
    print('epoch: ',epoch, 'loss: ', loss.item())

# result
results = outputs.data.numpy().argmax(axis=2)
"""
  학습이 종료된 후, 학습된 모델을 이용해 문장 전체를 순차적으로 예측해 완성한 후,
  원본과 비교해 어느정도 일치한지 Accuracy를 계산한다.
"""
predicted_sentence = ''
for i, sentence in enumerate(results):
    if i == 0:
        predicted_sentence += ''.join([char_set[c] for c in sentence])
    else:
        predicted_sentence += char_set[results[i][-1]]
original_sentence = sample_sentence

def calculate_accuracy(predicted_sentence, original_sentence):
    words1 = predicted_sentence.split()
    words2 = original_sentence.split()

    total_words = max(len(words1), len(words2))
    matching_words = sum(1 for w1, w2 in zip(words1, words2) if w1 == w2)

    accuracy = matching_words / total_words
    return accuracy

accuracy = calculate_accuracy(predicted_sentence, original_sentence)
print(predicted_sentence)
print(f"Accuracy = {accuracy:.15f}")

torch.Size([55, 20, 24])
torch.Size([55, 20])
epoch:  9 loss:  1.9083023071289062
epoch:  19 loss:  0.28210940957069397
epoch:  29 loss:  0.08879899978637695
epoch:  39 loss:  0.06338243186473846
epoch:  49 loss:  0.05860520154237747
epoch:  59 loss:  0.05736168473958969
epoch:  69 loss:  0.05677241086959839
epoch:  79 loss:  0.05650698021054268
epoch:  89 loss:  0.056348949670791626
epoch:  99 loss:  0.056240446865558624
r. Dursley was the director of a firm called Grunnings, which made drills.
Accuracy = 0.923076923076923


In [18]:
! pip install transformers datasets evaluate

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m8

In [19]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [20]:
from datasets import load_dataset

imdb = load_dataset('sepidmnorozy/Korean_sentiment')

train.csv:   0%|          | 0.00/3.29M [00:00<?, ?B/s]

dev.csv:   0%|          | 0.00/127k [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/239k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/36000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1333 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2667 [00:00<?, ? examples/s]

In [59]:
imdb['test'][0:3]

{'label': [0, 1, 1],
 'text': ['이 영화는 아름다운 영상미로 쓰레기같은 내용을 감추고 멋진 영화로 위장을 하고있다. 나는 이 영화가 만들다가 만 영화라고 생각한다.',
  '워메 ㅋㅋ 아이언맨이 여기나오는줄은 ㅋㅋ반전이네요',
  '여자 진짜 개귀엽다 ㅋㅋㅋㅋ']}

In [77]:
model_name1 = 'sangrimlee/bert-base-multilingual-cased-nsmc'
from transformers import AutoModelForSequenceClassification
model1 = AutoModelForSequenceClassification.from_pretrained(model_name1)

In [79]:
model_name2 = 'hun3359/klue-bert-base-sentiment'
from transformers import AutoModelForSequenceClassification
model2 = AutoModelForSequenceClassification.from_pretrained(model_name2)

In [75]:
import evaluate

accuracy = evaluate.load('accuracy')

import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [78]:
from transformers import pipeline

classifier = pipeline('sentiment-analysis', model=model_name1, device=0)
text = "This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three."
#text = "This was so bad!"
#text = "This was so good!"
classifier(text)

[{'label': 'positive', 'score': 0.9963415265083313}]

In [86]:
test_list = imdb['test']
test_sample_size = 100
rng = np.random.default_rng()
random_indices = rng.choice(len(test_list), size=test_sample_size, replace=False)
sampled_data = test_list.select(random_indices)

test_predictions = classifier(sampled_data['text'], max_length=512, truncation=True)
predicted_labels = [label['label'] for label in test_predictions]

# 예측된 레이블을 출력하여 확인
print("Predicted Labels:", predicted_labels)

# 수동으로 label2id 매핑을 정의 (예시: 한글 레이블 매핑)
label2id = {"긍정": 1, "부정": 0, "중립": 2, "초조한": 3}  # '초조한' 추가

# predicted_labels를 수동으로 정수로 매핑
mapped_labels = [label2id.get(label, -1) for label in predicted_labels]  # get() 메서드로 매핑되지 않은 레이블은 -1로 처리

reference_labels = sampled_data['label']

# 정확도 계산
print(accuracy.compute(predictions=mapped_labels, references=reference_labels))
print(reference_labels)
print(mapped_labels)



Predicted Labels: ['노여워하는', '눈물이 나는', '염세적인', '흥분', '눈물이 나는', '신이 난', '툴툴대는', '낙담한', '신이 난', '염세적인', '만족스러운', '흥분', '슬픔', '성가신', '비통한', '신뢰하는', '만족스러운', '신이 난', '신뢰하는', '흥분', '억울한', '억울한', '악의적인', '흥분', '염세적인', '흥분', '짜증내는', '노여워하는', '툴툴대는', '부끄러운', '남의 시선을 의식하는', '혐오스러운', '짜증내는', '분노', '우울한', '느긋', '부끄러운', '흥분', '편안한', '안달하는', '환멸을 느끼는', '낙담한', '신이 난', '만족스러운', '악의적인', '악의적인', '실망한', '혐오스러운', '흥분', '염세적인', '신이 난', '편안한', '악의적인', '실망한', '흥분', '자신하는', '낙담한', '환멸을 느끼는', '질투하는', '만족스러운', '만족스러운', '만족스러운', '버려진', '혐오스러운', '만족스러운', '분노', '툴툴대는', '분노', '구역질 나는', '악의적인', '만족스러운', '눈물이 나는', '실망한', '눈물이 나는', '버려진', '악의적인', '혐오스러운', '신이 난', '혐오스러운', '낙담한', '슬픔', '당황', '악의적인', '만족스러운', '눈물이 나는', '분노', '자신하는', '좌절한', '신이 난', '흥분', '실망한', '낙담한', '신이 난', '짜증내는', '노여워하는', '만족스러운', '기쁨', '만족스러운', '신이 난', '신이 난']
{'accuracy': 0.0}
[1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0,

In [81]:
from transformers import pipeline

classifier = pipeline('sentiment-analysis', model=model_name2, device=0)
text = "This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three."
#text = "This was so bad!"
#text = "This was so good!"
classifier(text)

[{'label': '감사하는', 'score': 0.22273115813732147}]

In [84]:
test_list = imdb['test']
test_sample_size = 100
rng = np.random.default_rng()
random_indices = rng.choice(len(test_list), size=test_sample_size, replace=False)
sampled_data = test_list.select(random_indices)

test_predictions = classifier(sampled_data['text'], max_length=512, truncation=True)
predicted_labels = [label['label'] for label in test_predictions]

# 예측된 레이블을 출력하여 확인
print("Predicted Labels:", predicted_labels)

# 수동으로 label2id 매핑을 정의 (예시: 한글 레이블 매핑)
label2id = {"긍정": 1, "부정": 0, "중립": 2, "초조한": 3}  # '초조한' 추가

# predicted_labels를 수동으로 정수로 매핑
mapped_labels = [label2id.get(label, -1) for label in predicted_labels]  # get() 메서드로 매핑되지 않은 레이블은 -1로 처리

reference_labels = sampled_data['label']

# 정확도 계산
print(accuracy.compute(predictions=mapped_labels, references=reference_labels))
print(reference_labels)
print(mapped_labels)


Predicted Labels: ['기쁨', '억울한', '열등감', '실망한', '만족스러운', '염세적인', '악의적인', '우울한', '억울한', '만족스러운', '만족스러운', '노여워하는', '실망한', '혐오스러운', '실망한', '충격 받은', '흥분', '염세적인', '만족스러운', '만족스러운', '흥분', '마비된', '실망한', '염세적인', '기쁨', '툴툴대는', '자신하는', '열등감', '편안한', '흥분', '신이 난', '후회되는', '신이 난', '후회되는', '남의 시선을 의식하는', '혐오스러운', '염세적인', '배신당한', '낙담한', '신이 난', '구역질 나는', '흥분', '만족스러운', '스트레스 받는', '흥분', '실망한', '낙담한', '당혹스러운', '실망한', '후회되는', '혐오스러운', '좌절한', '자신하는', '신이 난', '혐오스러운', '만족스러운', '상처', '환멸을 느끼는', '당황', '남의 시선을 의식하는', '흥분', '후회되는', '짜증내는', '배신당한', '혐오스러운', '자신하는', '악의적인', '짜증내는', '짜증내는', '악의적인', '눈물이 나는', '성가신', '스트레스 받는', '느긋', '악의적인', '환멸을 느끼는', '충격 받은', '혐오스러운', '흥분', '질투하는', '흥분', '질투하는', '억울한', '흥분', '후회되는', '툴툴대는', '만족스러운', '툴툴대는', '자신하는', '낙담한', '신이 난', '마비된', '노여워하는', '신이 난', '눈물이 나는', '염세적인', '흥분', '낙담한', '기쁨', '만족스러운']
{'accuracy': 0.0}
[1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1