# 설정 및 선언

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd "/content/drive/MyDrive/Wanted"

/content/drive/MyDrive/Wanted


In [3]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 15.9 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 90.5 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 95.6 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 7.0 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 94.9 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found e

In [4]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm, tqdm_notebook
import datetime as dt
import pickle

import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from torch.utils.data import RandomSampler, SequentialSampler
from scipy import stats

from transformers import AutoModel, AutoTokenizer, AdamW
from transformers.optimization import get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup
from torch.nn.utils import clip_grad_norm_

from sklearn.model_selection import train_test_split, KFold
from sklearn import metrics

from itertools import product

## 모델, hidden_size 설정

In [5]:
model_name = 'monologg/kobert'
hidden_size = 768

## Device 설정

In [6]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"# available GPUs : {torch.cuda.device_count()}")
    print(f"GPU name : {torch.cuda.get_device_name()}")
else:
    device = torch.device("cpu")
print(device)

# available GPUs : 1
GPU name : Tesla P100-PCIE-16GB
cuda


## Tokenizer 선언

In [7]:
# KoBERT Tokenizer 파일, sentencepiece 라이브러리 다운로드
!wget https://raw.githubusercontent.com/monologg/KoBERT-Transformers/master/kobert_transformers/tokenization_kobert.py
!pip install sentencepiece

--2022-03-24 07:50:11--  https://raw.githubusercontent.com/monologg/KoBERT-Transformers/master/kobert_transformers/tokenization_kobert.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10896 (11K) [text/plain]
Saving to: ‘tokenization_kobert.py’


2022-03-24 07:50:11 (4.82 MB/s) - ‘tokenization_kobert.py’ saved [10896/10896]

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 15.1 MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.96


In [8]:
from tokenization_kobert import KoBertTokenizer
tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')

Downloading:   0%|          | 0.00/363k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/76.0k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'KoBertTokenizer'.


In [9]:
from transformers import BertModel
bert_embedding_model = BertModel.from_pretrained('monologg/kobert')

Downloading:   0%|          | 0.00/352M [00:00<?, ?B/s]

## CustomDataset 선언

In [10]:
class CustomDataset(Dataset):
    """
  - input_data: list of string
  - target_data: list of int
  """

    def __init__(self, input_data: list, score_data:list, target_data: list) -> None:
        self.X = input_data
        self.Y = score_data
        self.Z = target_data

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        return (self.X[index], self.Y[index], self.Z[index])


def custom_collate_fn(batch):
    global tokenizer
    
    input_list, score_list, target_list = [_[0] for _ in batch], [_[1] for _ in batch], [_[2] for _ in batch]
    
    tensorized_input = tokenizer(
                                input_list,
                                padding='longest',  # True or 'longest': Pad to the longest sequence in the batch
                                truncation=True,
                                return_tensors='pt',
                                add_special_tokens=False
                                )
    
    tensorized_score = torch.tensor(score_list)
    
    return (tensorized_input, tensorized_score, target_list)

## CustomRegressor 선언

In [11]:
class CustomRegressor(nn.Module):

    def __init__(self, hidden_size=768):
        super(CustomRegressor, self).__init__()

        self.model = bert_embedding_model

        dropout_rate = 0.1
        linear_layer_hidden_size = 32

    
        self.classifier = nn.Sequential(
                                      nn.Linear(hidden_size, linear_layer_hidden_size), 
                                      nn.ReLU(),
                                      nn.Dropout(dropout_rate),
                                      nn.Linear(linear_layer_hidden_size, 1)
                                      )


    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None):
    
        outputs = self.model(
                            input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            )
        
        cls_token_last_hidden_states =  outputs['pooler_output']
        
        logits = self.classifier(cls_token_last_hidden_states)
        
        return logits

# Inference

In [12]:
# Load Test Dataset
df = pd.read_json('klue-sts-v1.1_dev.json')
df

Unnamed: 0,guid,source,sentence1,sentence2,labels,annotations
0,klue-sts-v1_dev_00000,airbnb-rtt,무엇보다도 호스트분들이 너무 친절하셨습니다.,"무엇보다도, 호스트들은 매우 친절했습니다.","{'label': 4.9, 'real-label': 4.857142857142857...","{'agreement': '0:0:0:0:1:6', 'annotators': ['1..."
1,klue-sts-v1_dev_00001,airbnb-sampled,주요 관광지 모두 걸어서 이동가능합니다.,위치는 피렌체 중심가까지 걸어서 이동 가능합니다.,"{'label': 1.4, 'real-label': 1.428571428571429...","{'agreement': '0:4:3:0:0:0', 'annotators': ['1..."
2,klue-sts-v1_dev_00002,policy-sampled,학생들의 균형 있는 영어능력을 향상시킬 수 있는 학교 수업을 유도하기 위해 2018...,영어 영역의 경우 학생들이 한글 해석본을 암기하는 문제를 해소하기 위해 2016학년...,"{'label': 1.3, 'real-label': 1.285714285714286...","{'agreement': '0:5:2:0:0:0', 'annotators': ['0..."
3,klue-sts-v1_dev_00003,airbnb-rtt,"다만, 도로와 인접해서 거리의 소음이 들려요.","하지만, 길과 가깝기 때문에 거리의 소음을 들을 수 있습니다.","{'label': 3.7, 'real-label': 3.714285714285714...","{'agreement': '0:0:0:2:5:0', 'annotators': ['1..."
4,klue-sts-v1_dev_00004,paraKQC-para,형이 다시 캐나다 들어가야 하니 가족모임 일정은 바꾸지 마세요.,가족 모임 일정은 바꾸지 말도록 하십시오.,"{'label': 2.5, 'real-label': 2.5, 'binary-labe...","{'agreement': '1:0:1:3:1:0', 'annotators': ['0..."
...,...,...,...,...,...,...
514,klue-sts-v1_dev_00514,policy-rtt,"문체부는 이를 연차적으로 확대, 시행해 학교운동부와 스포츠클럽 간의 연계를 강화한다.",문화체육관광부는 학교스포츠학과와 스포츠클럽의 연계성을 강화하기 위해 매년 이 프로그...,"{'label': 2.2, 'real-label': 2.2, 'binary-labe...","{'agreement': '0:1:2:2:0:0', 'annotators': ['0..."
515,klue-sts-v1_dev_00515,airbnb-sampled,일단 정확한 정보와 빠른 답변이 정말 좋았습니다.,호스트의 빠른 답변과 유용한 정보들이 정말 좋습니다.,"{'label': 2.8, 'real-label': 2.833333333333333...","{'agreement': '0:0:1:5:0:0', 'annotators': ['0..."
516,klue-sts-v1_dev_00516,airbnb-sampled,게스트에 대한 배려가 묻어나는 시설들이었습니다.,우선 공간에 대한 센스가 돋보이는 곳이었습니다.,"{'label': 0.30000000000000004, 'real-label': 0...","{'agreement': '4:2:0:0:0:0', 'annotators': ['1..."
517,klue-sts-v1_dev_00517,policy-sampled,밤하늘을 배경으로 ‘비대면 드론쇼’도 펼쳐진다.,‘비대면 실감형 문화공연 플랫폼’ 개념도.,"{'label': 0.30000000000000004, 'real-label': 0...","{'agreement': '5:0:1:0:0:0', 'annotators': ['0..."


In [13]:
# 한국어 불용어 불러오기 #######################################################
import requests 
from bs4 import BeautifulSoup

url = "https://www.ranks.nl/stopwords/korean"
response = requests.get(url, verify = False)

if response.status_code == 200:
    soup = BeautifulSoup(response.text,'html.parser')
    content = soup.select_one('#article178ebefbfb1b165454ec9f168f545239 > div.panel-body > table > tbody > tr')
    stop_words=[]
    for x in content.strings:
        x=x.strip()
        if x:
            stop_words.append(x)
    print(f"# Korean stop words: {len(stop_words)}")
else:
    print(response.status_code)

# 전처리 함수 remove1 ##########################################################
import re
def remove1(x):
    x =re.sub('[^가-힣 ]','',x)
    x = ' '.join( [w for w in x.split() if w not in stop_words] )

    return x

# special token 처리 ###########################################################
def some_func(s1, s2):
    return '[CLS] ' + s1 + ' [SEP] ' + s2 + ' [SEP]'


# Load Test Dataset ############################################################
df = pd.read_json('klue-sts-v1.1_dev.json') ########### 4) 경로 및 파일명 지정 ##########
df['scores'] = df['labels'].map(lambda x: x['real-label'])
df['labels'] = df['labels'].map(lambda x: x['binary-label'])
df = df[['sentence1', 'sentence2', 'labels', 'scores']]

# Remove Stop Words
df['sentence1']= df['sentence1'].apply(remove1)
df['sentence2']= df['sentence2'].apply(remove1)

# Concat into One Sentence
df['sentence'] = df.apply(lambda x: some_func(s1=x['sentence1'], s2=x['sentence2']), axis=1)

test_df = df[['sentence', 'labels', 'scores']]

# Create Dataset & Dataloader

test_batch_size = 32

test_dataset = CustomDataset(test_df['sentence'].tolist(), test_df['scores'].tolist(), test_df['labels'].tolist())
test_dataloader = DataLoader(test_dataset, batch_size=test_batch_size, collate_fn=custom_collate_fn, shuffle=False)



# Korean stop words: 677


In [14]:
# Load Model
checkpoint = torch.load('model_4_3e-05_32_fold4.ckpt.epoch3')

In [15]:
# Check key types
checkpoint.keys()

dict_keys(['epoch', 'model_state_dict', 'optimizer_state_dict', 'scheduler_state_dict', 'loss'])

In [16]:
def initializer(train_dataloader, hidden_size=768, epochs=2, lr=3e-5):

    model = CustomRegressor(hidden_size=hidden_size)

    optimizer = AdamW(model.parameters(), lr=lr)

    total_steps = len(train_dataloader) * epochs
    print(f"Total train steps with {epochs} epochs: {total_steps}")

    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps
                                                )

    return model, optimizer, scheduler


epochs=1
model, optimizer, scheduler = initializer(test_dataloader,
                                          hidden_size=768,
                                          epochs=epochs,
                                          lr=3e-5)

Total train steps with 1 epochs: 17




In [17]:
model.load_state_dict(checkpoint["model_state_dict"])

<All keys matched successfully>

In [18]:
def predict(model, test_dataloader):
    """
    test_dataloader의 label별 확률값과 실제 label 값을 반환
    """

    # model을 eval 모드로 설정 & device 할당
    model.eval()
    model.to(device)

    all_per_logits = []
    all_per_labels = []
    all_f1_logits = []
    all_f1_labels = []

    for step, batch in enumerate(test_dataloader):
        true_labels = [item for item in batch[2]] # binary 정답값
        batch = tuple(item.to(device) for item in batch[0:2])
        batch_input, batch_label = batch
        
        # batch_input을 device 할당
        batch_input.to(device)

        # model에 batch_input을 넣어 logit 반환 & all_logits, all_labels 리스트에 값 추가 
        with torch.no_grad():
          logits = model(**batch_input)

        score = logits.squeeze().cpu().numpy() # real-value 예측값
        pred = (logits.flatten() >= 3).cpu().numpy() # score를 binary 값으로 변환
        batch_label = batch_label.cpu().numpy() # real-value 정답값

        all_per_logits.extend(score)
        all_per_labels.extend(batch_label)
        all_f1_logits.extend(pred)
        all_f1_labels.extend(true_labels)

    return all_per_logits, all_per_labels, all_f1_logits, all_f1_labels

In [19]:
per_probs, per_labels, f1_probs, f1_labels = predict(model, test_dataloader)

In [20]:
acc = metrics.accuracy_score(f1_probs, f1_labels)
recall = metrics.recall_score(f1_probs, f1_labels)
precision = metrics.precision_score(f1_probs, f1_labels)
f1_score = metrics.f1_score(f1_probs, f1_labels)
r = stats.pearsonr(per_probs, per_labels)[0]

In [None]:
# Inference Result

print(f'ACC: {acc:.4f}')
print(f'RECALL: {recall:.4f}')
print(f'PRECISION: {precision:.4f}')
print(f'F1 SCORE: {f1_score:.4f}')
print(f"PEARSON'S R: {r:.4f}")

ACC: 0.8478
RECALL: 0.7809
PRECISION: 0.8909
F1 SCORE: 0.8323
PEARSON'S R: 0.8839


# Inference 결과 CSV 파일로 저장

In [None]:
# dev set에 대한 정답결과 담긴 파일 불러오기

df = pd.read_csv('predicted_dev_set_score.csv') 
df

Unnamed: 0,guid,true_real_label,true_binary_label
0,klue-sts-v1_dev_00000,4.857143,1
1,klue-sts-v1_dev_00001,1.428571,0
2,klue-sts-v1_dev_00002,1.285714,0
3,klue-sts-v1_dev_00003,3.714286,1
4,klue-sts-v1_dev_00004,2.500000,0
...,...,...,...
514,klue-sts-v1_dev_00514,2.200000,0
515,klue-sts-v1_dev_00515,2.833333,0
516,klue-sts-v1_dev_00516,0.333333,0
517,klue-sts-v1_dev_00517,0.333333,0


In [None]:
# Save Predicted Result

df['predict_real_label'] = per_probs
df['predict_binary_label'] = list(map(lambda x:int(x), f1_probs))
display(df.head())

df.to_csv('기업과제3_9팀_dev_set_score.csv')

Unnamed: 0,guid,true_real_label,true_binary_label,predict_real_label,predict_binary_label
0,klue-sts-v1_dev_00000,4.857143,1,4.636324,1
1,klue-sts-v1_dev_00001,1.428571,0,1.850857,0
2,klue-sts-v1_dev_00002,1.285714,0,2.177469,0
3,klue-sts-v1_dev_00003,3.714286,1,3.897496,1
4,klue-sts-v1_dev_00004,2.5,0,3.108995,1
