**Settings**

In [1]:
!pip install transformers --quiet # package installer for python

[K     |████████████████████████████████| 1.5MB 13.6MB/s 
[K     |████████████████████████████████| 890kB 57.7MB/s 
[K     |████████████████████████████████| 2.9MB 54.4MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [2]:
#git에 올려둔 friends 데이터셋 받아오기
!wget https://raw.githubusercontent.com/changdukkim/changdukkim-SA-Competition-BDC101/master/friends_dev.json
!wget https://raw.githubusercontent.com/changdukkim/changdukkim-SA-Competition-BDC101/master/friends_test.json
!wget https://raw.githubusercontent.com/changdukkim/changdukkim-SA-Competition-BDC101/master/friends_train.json

--2020-12-20 15:43:02--  https://raw.githubusercontent.com/changdukkim/changdukkim-SA-Competition-BDC101/master/friends_dev.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 229392 (224K) [text/plain]
Saving to: ‘friends_dev.json’


2020-12-20 15:43:02 (15.8 MB/s) - ‘friends_dev.json’ saved [229392/229392]

--2020-12-20 15:43:02--  https://raw.githubusercontent.com/changdukkim/changdukkim-SA-Competition-BDC101/master/friends_test.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 544341 (532K) [text/plain]
Saving to: ‘friends_test.json

In [3]:
import tensorflow as tf
import torch
from transformers import ElectraTokenizer, ElectraForSequenceClassification
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import random
import time
import datetime
import json

In [4]:
def jsonToDf(file_name):
  """Load a JSON file holding a list of record groups into one flat DataFrame.

  The file's top level must be a JSON array; every element of that array is
  converted with ``pd.DataFrame.from_dict`` and the resulting frames are
  stacked vertically with a fresh 0..n-1 index.

  Args:
    file_name: Path of the UTF-8 encoded JSON file to read.

  Returns:
    A ``pd.DataFrame`` with the rows of every group, in file order. An empty
    top-level array yields an empty DataFrame instead of raising.
  """
  with open(file_name, encoding = 'utf-8', mode = 'r') as file:
    json_array = json.load(file)

  # Build every per-group frame first and concatenate once: repeated
  # DataFrame.append was quadratic and no longer exists in pandas >= 2.0.
  frames = [pd.DataFrame.from_dict(array) for array in json_array]
  if not frames:
    # pd.concat refuses an empty list, so handle "no groups" explicitly.
    return pd.DataFrame()

  return pd.concat(frames, ignore_index = True)

In [5]:
# Load the training split into a DataFrame; its 'emotion' column is used
# further down to fit the label encoder that decodes model predictions.
train = jsonToDf('friends_train.json')

**Test Dataset 업로드/전처리**

In [6]:
# Run this cell when you need to upload the validation data file.
from google.colab import files
files.upload()

MessageError: ignored

In [None]:
#테스트 데이터 파일 입력, 별도 검증 데이터 사용 시 해당 파일 명으로 대체
test_data = 'en_data.csv'

In [None]:
test = pd.read_csv('en_data.csv')

In [None]:
# Preview the first rows of the test DataFrame.
test.head()

In [None]:
# Print the (rows, columns) shape of the test DataFrame.
print(test.shape)

In [None]:
MAX_LEN = 85
def getInputsFromTest(dataset):
  """Turn the 'utterance' column of `dataset` into padded token ids and masks.

  Every utterance is stringified, wrapped as "[CLS] <text> [SEP]", tokenized
  with the 'google/electra-large-discriminator' tokenizer, converted to ids
  and padded / truncated at the end to MAX_LEN entries.

  Args:
    dataset: DataFrame with an 'utterance' column. It is deep-copied, so the
      caller's frame is never touched.

  Returns:
    (input_ids, attention_masks): the padded id sequences as returned by
    `pad_sequences`, and a list of per-sequence lists of floats holding 1.0
    where the id is > 0 and 0.0 elsewhere.
  """
  frame = dataset.copy(deep=True)

  # Add the special tokens by hand around each utterance.
  wrapped = []
  for text in frame['utterance']:
    wrapped.append("[CLS] " + str(text) + " [SEP]")

  tokenizer = ElectraTokenizer.from_pretrained('google/electra-large-discriminator')
  id_sequences = [
      tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))
      for sentence in wrapped
  ]

  # NOTE(review): truncating="post" cuts from the end, so a sequence longer
  # than MAX_LEN loses its trailing [SEP] token.
  input_ids = pad_sequences(id_sequences, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

  # Positions whose id is > 0 are attended to; presumably 0 is the padding id.
  attention_masks = [[float(token_id > 0) for token_id in row] for row in input_ids]

  return input_ids, attention_masks

In [None]:
def getIndex(dataset):
  """Return the values of the `id` column of `dataset` as a torch tensor.

  Args:
    dataset: DataFrame exposing an `id` column; it is only read, not modified.

  Returns:
    A 1-D `torch.Tensor` holding the ids in row order.
  """
  # Reading the column never mutates the frame, so no defensive copy is needed.
  id_values = dataset.id.tolist()
  return torch.tensor(id_values)

In [None]:
# Collect the utterance ids, then the padded token ids and attention masks,
# and turn the latter two into torch tensors for the DataLoader.
test_index = getIndex(test)

test_inputs, test_masks = getInputsFromTest(test)
test_inputs, test_masks = torch.tensor(test_inputs), torch.tensor(test_masks)

In [None]:
batch_size = 32

# Bundle (utterance id, token ids, attention mask) per example. The id travels
# with every example so predictions can be matched back to their rows.
test_data = TensorDataset(test_index, test_inputs, test_masks)
# Inference gains nothing from shuffling; iterate in order so that every run
# visits the examples in the same, reproducible sequence.
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [None]:
# Device setup: use the GPU when CUDA is available, otherwise fall back to CPU.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")

if not use_gpu:
    print('No GPU available, using the CPU instead.')
else:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

**모델 다운로드**

In [None]:
#미리 학습해둔 모델 다운로드
!wget https://toto-kevin-bucket.s3.ap-northeast-2.amazonaws.com/model_eng_electra_large.pt

In [None]:
# Build an 8-class ELECTRA sequence classifier and load the fine-tuned weights.
# NOTE(review): the architecture comes from 'google/electra-large-generator'
# while the tokenizer uses 'google/electra-large-discriminator' — confirm this
# matches the checkpoint that was trained.
model = ElectraForSequenceClassification.from_pretrained('google/electra-large-generator', num_labels=8)
# SECURITY: torch.load unpickles the file, which can execute arbitrary code;
# only load checkpoints from a source you trust.
# map_location lets a checkpoint saved on GPU load on a CPU-only machine too.
model.load_state_dict(torch.load("model_eng_electra_large.pt", map_location=device))
# Move to the device selected earlier instead of unconditionally calling
# .cuda(), which raises when no GPU is present.
model.to(device)
# Inference only: make sure dropout and similar layers are in evaluation mode.
model.eval()

**테스트셋 검증 및 결과 데이터 생성**

In [None]:
# Predict an emotion label for every test utterance and store it in
# test_result['Predicted'].
# The loop works for any batch size, so inference runs in mini-batches
# rather than one example at a time.
tmp_test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# Result frame: the test data without the columns that are not needed for the
# submission; 'Predicted' starts out as the placeholder 'default'.
test_result = test.drop(columns = ['i_dialog', 'i_utterance', 'speaker'])
test_result['Predicted'] = 'default'

# Fit the label encoder on the training emotions so that a predicted class id
# can be translated back into its emotion name via encoder.classes_.
encoder = LabelEncoder()
labels = train['emotion'].values
encoder.fit(labels)

# Make sure dropout etc. are disabled while predicting.
model.eval()

predicted_by_id = {}
for step, batch in enumerate(tmp_test_dataloader):
    # Move the batch to the selected device
    b_index, b_input_ids, b_input_mask = (t.to(device) for t in batch)

    # No gradients are needed for inference
    with torch.no_grad():
        # Forward pass
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)

    # Without labels the first output element holds the logits (not a loss);
    # move them to the CPU as a numpy array.
    logits = outputs[0].detach().cpu().numpy()

    # Highest-scoring class per example, keyed by the utterance id that
    # travelled with the example through the DataLoader.
    class_ids = np.argmax(logits, axis=1)
    for utterance_id, class_id in zip(b_index.cpu().tolist(), class_ids):
        predicted_by_id[utterance_id] = encoder.classes_[class_id]

# Write all predictions in one step, matched on the 'id' column. This avoids
# chained assignment (test_result['Predicted'][idx] = ...), which is unreliable
# in pandas, and no longer assumes that an id equals its row label.
test_result['Predicted'] = test_result['id'].map(predicted_by_id).fillna('default')

In [None]:
# Show the last rows of the result frame to spot-check the predictions.
test_result.tail()

In [None]:
# Remove the raw text from the result frame before export. errors='ignore'
# keeps the cell re-runnable: a second execution, when the column is already
# gone, is a no-op instead of a KeyError.
test_result = test_result.drop(columns = ['utterance'], errors = 'ignore')

In [None]:
# Write the result frame to 'sample.csv' without the index column.
# NOTE: to_csv returns None when given a path, so `test_csv` is None here.
test_csv = test_result.to_csv('sample.csv', index=False)

In [None]:
# Download the generated file
from google.colab import files

files.download('sample.csv')