In [1]:
# Mount Google Drive at /content/drive so later cells can read and write files stored there (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Work from the project folder on the mounted Drive so the relative paths used below resolve there.
# The explicit %cd magic does not depend on automagic being enabled.
%cd /content/drive/My\ Drive/nlp/sentiment_analysis

/content/drive/My Drive/nlp/sentiment_analysis


In [3]:
# Pin transformers to the release shown in this cell's recorded install log (3.5.1) so a fresh
# runtime resolves the same API that the rest of the notebook was written against.
! pip install transformers==3.5.1
# NOTE(review): the torchsummary version used originally is not visible in the log, so it is left unpinned.
! pip install torchsummary

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/3a/83/e74092e7f24a08d751aa59b37a9fc572b2e4af3918cb66f7766c3affb1b4/transformers-3.5.1-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 9.8MB/s eta 0:00:01
Collecting tokenizers==0.9.3
[?25l  Downloading https://files.pythonhosted.org/packages/4c/34/b39eb9994bc3c999270b69c9eea40ecc6f0e97991dba28282b9fd32d44ee/tokenizers-0.9.3-cp36-cp36m-manylinux1_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 28.0MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 47.8MB/s 
Collecting sentencepiece==0.1.91
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)


In [1]:
import os 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from pathlib import Path

from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, AdamW
from transformers import Trainer, TrainingArguments
from torch.utils.data import DataLoader, Dataset
from torch.nn import functional as F
import torch
from tqdm.notebook import tqdm
from torchsummary import summary

In [23]:
# Use the GPU when one is available; otherwise fall back to the CPU so `.to(device)` still works.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
class SentimentReviewDataset(Dataset):
  """Dataset that tokenizes one review per item, on access.

  Column 0 of ``dataset`` is used as the label and column 1 as the review text.

  Args:
    dataset: pandas DataFrame; accessed through ``len()`` and ``.iloc``.
    tokenizer: optional tokenizer callable. When omitted, the
      "monologg/koelectra-base-v3-discriminator" tokenizer is loaded. Passing
      one lets several datasets share a single tokenizer instance.
    max_length: length every sequence is truncated/padded to (default 256).
  """

  def __init__(self, dataset, tokenizer=None, max_length=256):
    self.dataset = dataset
    self.max_length = max_length
    if tokenizer is None:
      tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")
    self.tokenizer = tokenizer

  def __len__(self):
    """Number of rows in the wrapped DataFrame."""
    return len(self.dataset)

  def __getitem__(self, idx):
    """Return ``(input_ids, attention_mask, label)`` for the row at position ``idx``."""
    row = self.dataset.iloc[idx, 0:2].values
    text = row[1]
    y = row[0]

    inputs = self.tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=self.max_length,
        # `padding='max_length'` is the replacement for the deprecated `pad_to_max_length=True`.
        padding='max_length',
        add_special_tokens=True
        )

    # The tokenizer is called on a single text and returns a batch of one; take its only row.
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask, y

In [28]:
def merge_review_dataset(data_dir_list, positive_threshold=4):
    """Load the three review corpora and merge them into one labelled DataFrame.

    Args:
        data_dir_list: file paths in the order
            [shopping reviews, movie reviews, steam reviews].
            The shopping and steam files are read as header-less TSV; the movie
            file is read as TSV with a header that provides 'label' and 'document'.
        positive_threshold: shopping reviews with at least this many stars get
            label 1, all others label 0 (default 4).

    Returns:
        DataFrame with columns ['label', 'review'] and a fresh 0..n-1 index,
        rows ordered shopping, movie, steam.
    """
    # naver shopping review: star rating -> binary label (vectorized instead of a row-by-row loop)
    shopping_review_dataset = pd.read_csv(data_dir_list[0], sep='\t', names=['star', 'review'])
    shopping_review_dataset['star'] = (shopping_review_dataset['star'] >= positive_threshold).astype('int64')
    shopping_review_dataset = shopping_review_dataset.rename(columns={"star": "label"})

    # naver movie review: keep only the label and text columns.
    # rename() returns a new frame, which avoids an in-place rename on a column slice.
    movie_review_dataset = pd.read_csv(data_dir_list[1], sep='\t')
    movie_review_dataset = movie_review_dataset[['label', 'document']].rename(columns={'document': 'review'})

    # steam game review
    steam_review_dataset = pd.read_csv(data_dir_list[2], sep='\t', names=['label', 'review'])

    # merge
    review_dataset = pd.concat(
        [shopping_review_dataset, movie_review_dataset, steam_review_dataset],
        ignore_index=True,
    )

    return review_dataset

In [29]:
data_dir_list = ['./naver_shopping_review_dataset.txt', './ratings.txt', './steam.txt']
# Drop rows with missing values; dropna() returns a new frame, so re-running the cell is idempotent.
review_dataset = merge_review_dataset(data_dir_list).dropna()

# split train test (fixed seed so the 80/20 split is reproducible across runs)
train_data, test_data = train_test_split(review_dataset, test_size=0.2, random_state=42)

In [10]:
# Standalone tokenizer instance for inspection in the next cells.
tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")

In [11]:
# Display the tokenizer's vocabulary size.
tokenizer.vocab_size

35000

In [12]:
# Wrap each split so items come back as (input_ids, attention_mask, label).
train_dataset, test_dataset = (
    SentimentReviewDataset(split) for split in (train_data, test_data)
)

In [13]:
# Inspect the first training item: (input_ids, attention_mask, label).
train_dataset[0]



(tensor([    2, 11268,  7695,  4219,  8754,  4086, 21888,  4150,    18,     3,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

## Create Model

In [26]:
# Load the pretrained checkpoint as a sequence-classification model, then move it to `device`.
model = AutoModelForSequenceClassification.from_pretrained("monologg/koelectra-base-v3-discriminator")
model = model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=451776329.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: 

In [None]:
# # Try running the model once on a single item
# text, attention_mask, y = train_dataset[0]
# model(text.unsqueeze(0).to(device), attention_mask=attention_mask.unsqueeze(0).to(device))

In [None]:
# Disabled: load previously saved weights from "model.pt" into the model instead of training from the base checkpoint.
# model.load_state_dict(torch.load("model.pt"))
# model

In [19]:
# Training hyperparameters: number of passes over the training loader, and samples per batch.
epochs, batch_size = 3, 16

In [20]:
optimizer = AdamW(model.parameters(), lr=1e-5)
# Shuffle only the training data.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Accuracy does not depend on sample order, so the test loader is left unshuffled.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Fine-tune the classifier.
# `losses` holds the mean training loss of each epoch and `accuracies` the training
# accuracy of each epoch, both as plain Python floats (nothing is kept on the device).
losses = []
accuracies = []

for epoch in range(epochs):
  total_loss = 0.0
  correct = 0
  total = 0
  batches = 0

  model.train()

  for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
    optimizer.zero_grad()
    y_batch = y_batch.to(device)
    # The first element of the model output is used as the per-class scores.
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    loss = F.cross_entropy(y_pred, y_batch)
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

    predicted = y_pred.argmax(dim=1)
    # .item() turns the count into a Python int, so the running totals stay off the device.
    correct += (predicted == y_batch).sum().item()
    total += len(y_batch)

    batches += 1
    if batches % 100 == 0:
      # Report the running mean; the raw sum grows with the number of batches seen.
      print("Batch Loss:", total_loss / batches, "Accuracy:", correct / total)

  # max(..., 1) keeps an empty loader from raising a division error.
  epoch_loss = total_loss / max(batches, 1)
  epoch_accuracy = correct / max(total, 1)
  losses.append(epoch_loss)
  accuracies.append(epoch_accuracy)
  print("Train Loss:", epoch_loss, "Accuracy:", epoch_accuracy)

In [None]:
# Display the per-epoch values collected by the training loop.
losses, accuracies

In [None]:
# Measure accuracy on the held-out split.
model.eval()

test_correct = 0
test_total = 0

# Inference only: no_grad() skips building the autograd graph, which saves memory and time.
with torch.no_grad():
  for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
    y_batch = y_batch.to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    predicted = y_pred.argmax(dim=1)
    test_correct += (predicted == y_batch).sum().item()
    test_total += len(y_batch)

# max(..., 1) keeps an empty loader from raising a division error.
print("Accuracy:", test_correct / max(test_total, 1))

In [None]:
# Save the model weights (state dict only) under ./_weights
save_model_path = './_weights'
# exist_ok=True creates the folder only when missing, without a separate existence check.
os.makedirs(save_model_path, exist_ok=True)
torch.save(model.state_dict(), os.path.join(save_model_path, "koelectra-base-finetuned-sentiment-analysis.bin"))

## Evaluate

In [2]:
from sklearn.metrics import classification_report

In [3]:
# naver shopping review: header-less TSV of (star, review)
shopping_review_dataset = pd.read_csv('./_data/naver_shopping_review.txt', sep='\t', names=['star','review'])
# Binarize the rating (>= 4 stars -> 1, otherwise 0) with a vectorized comparison instead of a row-by-row loop.
shopping_review_dataset['star'] = (shopping_review_dataset['star'] >= 4).astype('int64')
shopping_review_dataset = shopping_review_dataset.rename(columns={"star": "label"})

# # naver movie review 
# movie_review_dataset = pd.read_csv('./_data/naver_movie_ratings.txt', sep='\t')
# movie_review_dataset = movie_review_dataset[['label','document']]
# movie_review_dataset.rename(columns={'document':'review'}, inplace=True)
# movie_review_dataset.dropna(inplace=True)

# # steam game review 
# steam_review_dataset = pd.read_csv('./_data/steam_game_review.txt', sep='\t', names=['label','review'])
# steam_review_dataset.dropna(inplace=True)

In [6]:
# Hold out 20% of the shopping reviews with a fixed seed and report how many rows that is.
# NOTE(review): the fine-tuning split earlier in the notebook was drawn from the merged
# shopping + movie + steam data, not from this shopping-only split, so some of these rows
# may have been seen during training — confirm before treating the scores as held-out.
train_data, test_data = train_test_split(
    shopping_review_dataset,
    test_size=0.2,
    random_state=42,
)
print(len(test_data))

40000


In [10]:
## sentiment analysis
# Rebuild the base architecture, overwrite its weights with the saved fine-tuned state dict
# (mapped onto the CPU), and wrap model + tokenizer in a pipeline.
# NOTE(review): torch.load unpickles the file — only load weight files you produced or trust.
koelectra_finetuned_model_dir = os.path.join('_weights', 'koelectra-base-finetuned-sentiment-analysis.bin')
model = AutoModelForSequenceClassification.from_pretrained("monologg/koelectra-base-v3-discriminator")
model.load_state_dict(
    torch.load(koelectra_finetuned_model_dir, map_location='cpu')
)
tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")
sentiment_classifier = pipeline(task='sentiment-analysis', model=model, tokenizer=tokenizer)

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: 

In [11]:
# Classify every held-out review one at a time; 'LABEL_1' maps to 1 and anything else to 0.
y_pred = []
total_len = len(test_data)
for cnt, review in enumerate(test_data['review']):
    pred = sentiment_classifier(review)
    y_pred.append(1 if pred[0]['label'] == 'LABEL_1' else 0)

In [12]:
# Per-class precision / recall / F1 of the predictions against the true labels.
print(classification_report(test_data['label'], y_pred))

              precision    recall  f1-score   support

           0       0.97      0.93      0.95     19975
           1       0.93      0.97      0.95     20025

    accuracy                           0.95     40000
   macro avg       0.95      0.95      0.95     40000
weighted avg       0.95      0.95      0.95     40000



# 감정분석

In [4]:
# Install into the running kernel's environment. The explicit %pip magic does not depend on automagic being enabled.
%pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [5]:
# import library
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline

# load model: pretrained sentiment checkpoint and its tokenizer, wrapped in a text-classification pipeline
model = AutoModelForSequenceClassification.from_pretrained("jaehyeong/koelectra-base-v3-generalized-sentiment-analysis")
tokenizer = AutoTokenizer.from_pretrained("jaehyeong/koelectra-base-v3-generalized-sentiment-analysis")
sentiment_classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer)

In [2]:
import pandas as pd
import numpy as np
import csv

In [6]:
# Load the Kurly review data.
# NOTE(review): the absolute local path ties this cell to one machine — consider a configurable data directory.
# NOTE(review): QUOTE_NONE combined with escapechar='"' looks like a workaround for stray double quotes
# in the file — confirm the parsed columns are what is intended.
data_kurly = pd.read_csv('C:/Users/NT550-048/Desktop/잇더/data/data_kurly.csv', doublequote=False, escapechar='"', quoting=csv.QUOTE_NONE)

In [7]:
# Replace line breaks inside reviews with spaces. The vectorized string method needs no
# per-row membership test and leaves missing values as NaN instead of raising on them.
data_kurly['ReviewText'] = data_kurly['ReviewText'].str.replace('\n', ' ', regex=False)

In [8]:
# Plain Python list of the review texts, in row order.
review_list = data_kurly['ReviewText'].tolist()

In [29]:
# Sentiment analysis result (label) and score for every review
pred_label = []
pred_score = []
for rev in review_list:
    # Run the classifier once per review and read both fields from the same result
    # (calling it separately for label and score doubles the inference work).
    result = sentiment_classifier(rev)[0]
    pred_label.append(result['label'])
    pred_score.append(result['score'])

In [34]:
# Add the sentiment analysis results and scores to the DataFrame
data_kurly['pred_label'] = pred_label
data_kurly['pred_score'] = pred_score

In [67]:
# Convert each predicted label with int() and each score with float(), element by element.
data_kurly['pred_label'] = data_kurly['pred_label'].map(int)
data_kurly['pred_score'] = data_kurly['pred_score'].map(float)

In [59]:
# Write the reviews together with their predictions to a new CSV next to the input file.
# NOTE(review): absolute local path; the row index is presumably written as an extra leading column
# (pandas default) — confirm that is wanted.
data_kurly.to_csv('C:/Users/NT550-048/Desktop/잇더/data/data_kurly_sentiment.csv')

In [70]:
# Fraction of reviews predicted as label 0 (presumably the negative class — confirm against the model card).
len(data_kurly[data_kurly['pred_label']==0])/len(data_kurly)

0.10461563203602445

In [1]:
# Rows whose prediction score is at most 0.6.
# NOTE(review): the recorded output is a NameError — this cell was last run in a kernel where
# `data_kurly` had not been loaded; re-run the loading cells first.
data_kurly[data_kurly['pred_score']<=0.6]

NameError: name 'data_kurly' is not defined

In [61]:
# Look at one review text by its key 12334 in the ReviewText column.
data_kurly['ReviewText'][12334]

'양이 적어서 아쉬웠지만  굿~  얼음팩이 찢어져서 녹으면서 물이 다 새서 통이 다 젖어서 찢어졌네요..'

In [9]:
# Print every review followed by the classifier's top prediction for it.
for idx, review in enumerate(review_list):
  pred = sentiment_classifier(review)
  print(review, f'>> {pred[0]}', sep='\n')

간단한 한끼대용으로 좋네요
>> {'label': '1', 'score': 0.9707081913948059}
맛있어서 재구매했어요!
>> {'label': '1', 'score': 0.9787582159042358}
아침대용으로 간단하고좋아요
>> {'label': '1', 'score': 0.9499666094779968}
톳좋아해서 주문했어요.
>> {'label': '1', 'score': 0.9818790555000305}
지난 번에 간편하면서도 맛있게 먹어서 또 주문 했습니다. 
>> {'label': '1', 'score': 0.9939437508583069}
지난번 주문 시 맛있게 먹은 기억으로 재 주문
>> {'label': '1', 'score': 0.9773344397544861}
밥은 따로 해야하지만 양 넉넉하고 맛도 좋아요 적장히 슴슴하고 추천해요
>> {'label': '1', 'score': 0.9892695546150208}
계속 시켜 먹고 있어요. 진짜 감칠맛 대박이고 맛있어요 양념장까지 들어 있어서 너무 편함
>> {'label': '1', 'score': 0.9930447936058044}
항상 아이가 먼저 찾는 문어솥밥이에요.
>> {'label': '1', 'score': 0.9879741668701172}
생각보다는 평범했지만 밀키트로 간편하고 맛있게 먹었습니다 
>> {'label': '1', 'score': 0.9859785437583923}
가족들이 주문해서 잘 먹었어요 ㅎㅎ
>> {'label': '1', 'score': 0.9932798743247986}
생각보단 비렸어요 후기가 너무 좋아서 기대가 컸나봐요
>> {'label': '0', 'score': 0.9411388635635376}
문어솥밥 맛있게 잘먹었어요
>> {'label': '1', 'score': 0.9863168597221375}
정말 너무 맛있네요 3봉지 더 재주문 했어요 반찬 없이도 김이랑 먹어도 넘나 맛있어요
>> {'label': '1'

In [11]:
# `pred` is whatever the preceding loop assigned last, so this shows the prediction for the final review.
pred[0]

{'label': '1', 'score': 0.9923847913742065}