<a href="https://colab.research.google.com/github/geunyoung421/SentimentAnalysis/blob/main/BERT_%EC%98%81%EC%96%B4_%EC%98%A4%EA%B7%BC%EC%98%81.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Code source (the notebook this one is adapted from):
# https://colab.research.google.com/drive/1EMzEfTYjYLgEHjCCP1vEr9oOZLXMocGh?usp=sharing

# **Emotion Recognition**

##### **Dataset**

In [None]:
import json

# Data layout: every split holds three parallel lists (speaker, utterance text,
# emotion label), flattened across all dialogues of that split.
data = {dtype: {'speaker': [], 'utterance': [], 'emotion': []}
        for dtype in ('train', 'dev', 'test')}

for dtype, split in data.items():
  # A context manager closes the handle deterministically (the original
  # open(...).read() left that to the garbage collector). JSON is UTF-8 by
  # specification, so the platform's default encoding is not relied upon.
  with open('friends_' + dtype + '.json', encoding='utf-8') as json_file:
    dialogs = json.load(json_file)
  # Each file is a list of dialogues; each dialogue is a list of line dicts
  # carrying the 'speaker', 'utterance' and 'emotion' keys.
  for dialog in dialogs:
    for line in dialog:
      for field in split:
        split[field].append(line[field])

In [None]:
# Emotion label <-> class index lookup tables, built from the labels that occur
# in the training split.
# The labels are sorted before being numbered: iterating a bare set of strings
# follows hash order, which changes between interpreter sessions, so the
# original mapping (and the class order of every printed metric) was not
# reproducible across kernel restarts.
e2i_dict = {emo: i for i, emo in enumerate(sorted(set(data['train']['emotion'])))}
i2e_dict = {i: e for e, i in e2i_dict.items()}

##### **Model**

In [None]:
import torch.nn as nn
from transformers import BertModel, BertTokenizer

class Model(nn.Module):
  """BERT encoder followed by a linear emotion classifier on the [CLS] vector.

  Construction reads two notebook globals: `pretrained_weights` (checkpoint
  name handed to `from_pretrained`) and `e2i_dict` (emotion -> index map whose
  size sets the number of output classes).
  """

  def __init__(self):
    super().__init__()
    self.bert_tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
    self.bert_model = BertModel.from_pretrained(pretrained_weights)
    # One logit per emotion label. The original author's note lists 8 classes:
    # neutral, joy, sadness, fear, anger, surprise, disgust and "non".
    # The input width is read from the loaded checkpoint instead of being
    # hard-coded to 768, so other BERT sizes work as well.
    self.linear = torch.nn.Linear(self.bert_model.config.hidden_size, len(e2i_dict))

  def forward(self, utterance):
    """Return the emotion logits for a single utterance.

    Args:
      utterance: one sentence as a plain string.

    Returns:
      Tensor of shape (1, n_classes); the batch dimension is always 1.
    """
    tokens = self.bert_tokenizer.tokenize(utterance)
    # Keep room for [CLS] and [SEP]: the encoder cannot take more positions
    # than its config allows, so over-long utterances are truncated instead of
    # failing inside the embedding layer.
    max_len = self.bert_model.config.max_position_embeddings
    tokens = ['[CLS]'] + tokens[:max_len - 2] + ['[SEP]'] # (len)
    # Bug fix: the original called an undefined global `tokenizer` here, which
    # only worked when a stale variable of that name was left in the kernel.
    ids = [self.bert_tokenizer.convert_tokens_to_ids(tokens)] # (bat=1, len)
    # Build the input on whatever device the model lives on rather than
    # unconditionally calling .cuda().
    input_tensor = torch.tensor(ids, device=self.linear.weight.device)

    hidden_tensor = self.bert_model(input_tensor)[0] # (bat, len, hid)
    cls_tensor = hidden_tensor[:, 0, :] # (bat, hid) -- vector at the [CLS] position
    return self.linear(cls_tensor)

##### **Evaluation Metrics**

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate(true_list, pred_list):
  """Print per-class precision and recall plus the micro-averaged F1 score.

  Args:
    true_list: ground-truth labels.
    pred_list: predicted labels, aligned element-wise with `true_list`.
  """
  def as_strings(scores):
    # Render every per-class score with four decimals.
    return ['%.4f' % score for score in scores]

  # average=None yields one score per class instead of a single aggregate.
  per_class_precision = precision_score(true_list, pred_list, average=None)
  per_class_recall = recall_score(true_list, pred_list, average=None)
  overall_f1 = f1_score(true_list, pred_list, average='micro')

  print('precision:\t', as_strings(per_class_precision))
  print('recall:\t\t', as_strings(per_class_recall))
  print('micro_f1: %.6f' % overall_f1)

##### **Hyper-parameters**

In [None]:
# Checkpoint name passed to BertTokenizer.from_pretrained / BertModel.from_pretrained.
pretrained_weights = 'bert-base-uncased'
# Learning rate handed to the Adam optimizer.
learning_rate = 1e-5
# Number of passes over the training split; the dev split is evaluated after each pass.
n_epoch = 3

##### **Training**

In [None]:
import os
# Expose only GPU 0 to this process. Set before torch is imported so that it is
# in place before CUDA is initialised.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import torch
from tqdm import tqdm_notebook  # kept so any other cell that references `tqdm_notebook` keeps working
from tqdm.auto import tqdm  # replacement for the deprecated tqdm_notebook

# Seed the weight initialisation of the classifier head and dropout so that a
# Restart & Run All reproduces the same numbers.
torch.manual_seed(42)

model = Model()
model.cuda()  # run on the GPU
criterion = torch.nn.CrossEntropyLoss()  # LogSoftmax + NLLLoss
optimizer = torch.optim.Adam(model.parameters(), learning_rate)  # Adam variant of gradient descent

train_utterances = data['train']['utterance']
train_emotions = data['train']['emotion']
dev_utterances = data['dev']['utterance']
dev_emotions = data['dev']['emotion']

for i_epoch in range(n_epoch):
  print('i_epoch:', i_epoch)

  # ---- training: one utterance per step (batch size 1) ----
  model.train()
  for utterance, emotion in tqdm(zip(train_utterances, train_emotions),
                                 total=len(train_utterances)):
    logit = model(utterance)
    target = torch.tensor([e2i_dict[emotion]]).cuda()
    loss = criterion(logit, target)  # loss between the predicted logits and the gold emotion
    loss.backward()
    optimizer.step()  # gradient-descent update
    optimizer.zero_grad()

  # ---- evaluation on the dev split ----
  model.eval()
  pred_list, true_list = [], []
  # No gradients are needed for evaluation; skipping the autograd graph saves
  # memory and time.
  with torch.no_grad():
    for utterance, emotion in tqdm(zip(dev_utterances, dev_emotions),
                                   total=len(dev_utterances)):
      logit = model(utterance)
      _, max_idx = torch.max(logit, dim=-1)  # index of the highest logit
      pred_list += max_idx.tolist()  # predicted class index
      true_list.append(e2i_dict[emotion])  # gold class index
  # Bug fix: evaluate() is declared as evaluate(true_list, pred_list). The
  # original passed the lists the other way round, which swapped the printed
  # precision and recall rows.
  evaluate(true_list, pred_list)  # print results

i_epoch: 0


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  from ipykernel import kernelapp as app


HBox(children=(FloatProgress(value=0.0, max=10561.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=1178.0), HTML(value='')))


precision:	 ['0.0000', '0.9022', '0.1304', '0.3529', '0.6016', '0.3065', '0.1168', '0.5828']
recall:		 ['0.0000', '0.6392', '0.5000', '0.4348', '0.5649', '0.5135', '0.2809', '0.5752']
micro_f1: 0.578947
i_epoch: 1


  _warn_prf(average, modifier, msg_start, len(result))


HBox(children=(FloatProgress(value=0.0, max=10561.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1178.0), HTML(value='')))


precision:	 ['0.0000', '0.8534', '0.2174', '0.3059', '0.6260', '0.3226', '0.2336', '0.5762']
recall:		 ['0.0000', '0.6926', '0.1786', '0.4262', '0.5789', '0.5263', '0.3165', '0.5613']
micro_f1: 0.580645
i_epoch: 2


HBox(children=(FloatProgress(value=0.0, max=10561.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1178.0), HTML(value='')))


precision:	 ['0.0345', '0.7963', '0.2609', '0.3176', '0.6504', '0.3065', '0.2804', '0.5695']
recall:		 ['0.5000', '0.7201', '0.2222', '0.5000', '0.5195', '0.5429', '0.2941', '0.5409']
micro_f1: 0.568761


##### **Labeling**


In [None]:
## Label every utterance of unlabeled.json with the trained model and write labeled.json.
from collections import OrderedDict
from tqdm.auto import tqdm  # replacement for the deprecated tqdm_notebook

# Each input line is a dict such as:
#   {"speaker": "Phoebe", "utterance": "Alright, whadyou do with him?"}
# Read through a context manager so the file handle is closed (the original
# open(...).read() leaked it).
with open('unlabeled.json', encoding='utf-8') as json_file:
  dialogs = json.load(json_file)

# Inference only: make sure dropout is disabled and skip building autograd graphs.
model.eval()
labeled = []
with torch.no_grad():
  for dialog in tqdm(dialogs):
    dialog_list = []
    for line in dialog:
      logit = model(line['utterance'])
      pred_emotion = logit.argmax(dim=-1).item()  # index of the highest logit

      line_dict = OrderedDict()
      line_dict['speaker'] = line['speaker']
      line_dict['utterance'] = line['utterance']
      line_dict['emotion'] = i2e_dict[pred_emotion]
      dialog_list.append(line_dict)
    labeled.append(dialog_list)

with open('labeled.json', 'w') as json_file:
  json.dump(labeled, json_file, indent='\t')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  if __name__ == '__main__':


HBox(children=(FloatProgress(value=0.0, max=240.0), HTML(value='')))




In [None]:
import csv
from collections import OrderedDict
from tqdm.auto import tqdm  # replacement for the deprecated tqdm_notebook

# Read en_data.csv: column 0 is used as the id, column 3 as the speaker and
# column 4 as the utterance. The first row is a header and is skipped.
with open('en_data.csv', newline='') as csvfile:
  reader = csv.reader(csvfile)
  next(reader, None)  # drop the header row (no error on an empty file)
  csv_lines = [{'id': row[0], 'speaker': row[3], 'utterance': row[4]}
               for row in reader]

# The whole file is treated as one "dialogue" so that the labeling loop keeps
# the same dialogs -> lines shape as the JSON data.
dialogs = [csv_lines]

# Inference only: make sure dropout is disabled and skip building autograd graphs.
model.eval()
labeled = []
with torch.no_grad():
  for dialog in tqdm(dialogs):
    dialog_list = []
    for line in dialog:
      logit = model(line['utterance'])
      pred_emotion = logit.argmax(dim=-1).item()  # index of the highest logit

      line_dict = OrderedDict()
      line_dict['Id'] = line['id']
      line_dict['speaker'] = line['speaker']
      line_dict['utterance'] = line['utterance']
      line_dict['emotion'] = i2e_dict[pred_emotion]
      dialog_list.append(line_dict)
    labeled.append(dialog_list)

with open('labeled.csv', 'w', newline='') as csvfile:
  fieldnames = ['Id', 'Predicted']
  writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

  writer.writeheader()
  # Bug fix: write every labeled dialogue. The original iterated the loop
  # variable `dialog_list` left over from the labeling loop, i.e. only the last
  # dialogue (and a NameError when there were no dialogues at all).
  for labeled_dialog in labeled:
    for row in labeled_dialog:
      writer.writerow({'Id': row['Id'], 'Predicted': row['emotion']})

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  app.launch_new_instance()


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


