# KoBERT 사용하기(HuggingFace Library)
- KoBERT(SKT)도 있고 KorBERT(ETRI)도 있고 KoBART, KoGPT도 있는 것 같다. 어떤 걸 써야 하지?! 
- ETRI를 활용한 KorBertSum 
  - https://velog.io/@raqoon886/KorBertSum-SummaryBot 
  - https://github.com/raqoon886/KorBertSum
- KoBART를 이용한 요약
  - http://blog.ju-ing.co.kr/posts/KoBART-summarization/ 
  - https://younghwani.github.io/posts/kobart-summary-3/ 

Direct Access를 위한 data preprocess functions 정의하기

In [1]:
import re

def del_bracket(s):
  pattern = r'\([^)]*\)'  # ()
  s = re.sub(pattern=pattern, repl='', string=s)

  pattern = r'\[[^)]*\]'  # []
  s = re.sub(pattern=pattern, repl='', string=s)

  pattern = r'\<[^)]*\>'  # <>
  s = re.sub(pattern=pattern, repl='', string=s)

  pattern = r'\{[^)]*\}'  # {}
  s = re.sub(pattern=pattern, repl='', string=s)

  return s

def del_special_num(s):
  pattern = r'[^a-zA-Z가-힣]'
  s = re.sub(pattern=pattern, repl=' ', string=s)

  return s

def del_unit(s):
  units = ['mm', 'cm', 'km', 'ml', 'kg', 'g']
  for unit in units:
    s = s.lower() # 대문자를 소문자로 변환
    s = s.replace(unit, '')
  return s

def del_whitespace(s):
  return " ".join(s.split())

import io 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
  
def del_stopwords(s):
  stopwords = open("data/stopwords.txt", 'r', encoding="utf-8").read().split()
  #print(stopwords)
  s_o=s.split()
  s_f=[]
  for w in s_o:
    if w.strip() not in stopwords:
      s_f.append(w.strip())
  return " ".join(s_f)

In [2]:
#modelname = "beomi/KcELECTRA-base"
modelname = "klue/bert-base"
#modelname = "beomi/kcbert-base"
#modelname = "monologg/kobert"

In [3]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from transformers import AdamW, get_cosine_schedule_with_warmup, BertTokenizer, BertModel

#GPU 사용
device = torch.device("cuda:0")

#BERT 모델 불러오기
bertmodel = BertModel.from_pretrained(modelname)

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
import pandas as pd

train=pd.read_excel('data/Training.xlsx')
train=pd.DataFrame({'s1': train['감정_대분류'], 's2': train['감정_소분류'], 't1': train['사람문장1']})
train.head()

val=pd.read_excel('data/Validation.xlsx')
val=pd.DataFrame({'s1': val['감정_대분류'], 's2': val['감정_소분류'], 't1': val['사람문장1']})

In [5]:
#기쁨 불안 당황 슬픔 분노 상처 
s12label={'기쁨':0, '불안': 1, '당황':2, '슬픔': 3, '분노': 4, '상처':5}
train_l=[]
for t, s in zip(train['t1'], train['s1']):
  t=del_bracket(t)
  t=del_special_num(t)
  t=del_whitespace(t)
  t=del_stopwords(t)
  train_l.append([t, s12label[s.strip()]])

for t, s in zip(val['t1'], val['s1']):
  t=del_bracket(t)
  t=del_special_num(t)
  t=del_whitespace(t)
  t=del_stopwords(t)
  train_l.append([t, s12label[s]])

print(train_l[:10])

[['일은 끝이 없을까 화가 난다', 4], ['달에 급여가 깎였어 물가는 오르는데 월급만 자꾸 깎이니까 너무 화가', 4], ['회사에 신입이 들어왔는데 말투가 거슬려 애를 매일 봐야 한다고 생각하니까 스트레스 받아', 4], ['직장에서 막내라는 이유로 나에게만 온갖 심부름을 시켜 일도 데 정말 분하고 섭섭해', 4], ['전 입사한 신입사원이 나를 무시하는 같아서 너무 화가', 4], ['직장에 다니고 있지만 시간만 버리는 거 같아 진지하게 진로에 대한 고민이 생겨', 4], ['성인인데도 진로를 아직도 못 정했다고 부모님이 노여워하셔 나도 섭섭해', 4], ['퇴사한 지 됐지만 천천히 직장을 구해보려고', 0], ['졸업반이라서 취업을 생각해야 하는데 지금 너무 느긋해서 이래도 되나 싶어', 1], ['요즘 직장생활이 너무 편하고 좋은 같아', 0]]


In [6]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, tokenizer, max_len):
        self.sentences = [tokenizer.encode_plus(i[sent_idx], add_special_tokens=True, max_length=max_len, padding='max_length', truncation=True) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, idx):
        token_ids = torch.tensor(self.sentences[idx]['input_ids'], dtype=torch.long)
        valid_length = torch.tensor(len(self.sentences[idx]['input_ids']), dtype=torch.long)
        segment_ids = torch.tensor(self.sentences[idx]['token_type_ids'], dtype=torch.long)
        attention_mask = torch.tensor(self.sentences[idx]['attention_mask'], dtype=torch.long)
        label = torch.tensor(self.labels[idx])
        return token_ids, valid_length, segment_ids, attention_mask, label

    def __len__(self):
        return (len(self.labels))

In [7]:
# Setting parameters
max_len = 64
batch_size = 64
warmup_ratio = 0.1
num_epochs = 50
max_grad_norm = 1
log_interval = 10
learning_rate =  1e-5

In [8]:
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split

# Preparing the dataset
dataset_train, dataset_test = train_test_split(train_l, test_size=0.2, random_state=0)
print(len(dataset_train), len(dataset_test))

# Tokenization
tokenizer = AutoTokenizer.from_pretrained(modelname)

# Ensure all data is a string
dataset_train = [[str(i[0]), i[1]] for i in dataset_train]
dataset_test = [[str(i[0]), i[1]] for i in dataset_test]

# Creating instances of the BERTDataset class for train and test sets
data_train = BERTDataset(dataset_train, 0, 1, tokenizer, max_len)
data_test = BERTDataset(dataset_test, 0, 1, tokenizer, max_len)


46616 11655


Downloading (…)okenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/495k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [9]:
data_train[0]

(tensor([   2, 4229, 3686, 6509, 4819, 2069, 1085, 6205, 2097, 8402,  568,  555,
         2227,    3,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0]),
 tensor(64),
 tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 tensor(3, dtype=torch.int32))

In [10]:
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=0)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=0)

In [11]:
num_classes = 6

class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=num_classes,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids, attention_mask):
        outputs = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        pooler = outputs[1]
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [12]:
# Code for checking bias

from collections import Counter
labels = []
for _, (_, _, _, _, label) in enumerate(tqdm(test_dataloader)):
    labels.extend(label.numpy())
print(Counter(labels))

100%|██████████| 183/183 [00:00<00:00, 530.38it/s]

Counter({4: 2080, 5: 2039, 1: 2036, 3: 2032, 2: 2001, 0: 1467})





In [13]:
#BERT 모델 불러오기
model = BERTClassifier(bertmodel,  dr_rate=0.7).to(device)

#optimizer와 schedule 설정
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

#정확도 측정을 위한 함수 정의
def calc_accuracy(out, label):
    predicted = out.argmax(1)
    correct = (predicted == label).sum().item()
    total = label.size(0)
    accuracy = correct / total
    # print(f'Predicted: {predicted}, Label: {label}, Correct: {correct}, Total: {total}, Accuracy: {accuracy}')
    return accuracy

    
train_dataloader



<torch.utils.data.dataloader.DataLoader at 0x1f451e7f100>

In [14]:
num_augmentations = 0
len_data = len(dataset_train) + len(dataset_test)

In [15]:
# For time-based tensorboard store

from datetime import datetime

timestamp = datetime.now().strftime("%Y%m%d")
log_dir = f'./logs/{timestamp}-model{modelname}-data{len_data}/a{num_augmentations}-class{num_classes}-batch{batch_size}-lr{learning_rate}-epoch{num_epochs}-maxlen{max_len}'

In [16]:
from torch.utils.tensorboard import SummaryWriter

In [17]:
from tqdm.notebook import tqdm

best_acc = 0
max_patience = 5
patience = 0
PATH = log_dir
writer = SummaryWriter(PATH)

# print("Starting training loop...")
for e in range(num_epochs):
    # print("Starting epoch ", e+1)
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    # for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(train_dataloader)):
    for batch_id, (token_ids, valid_length, segment_ids, attention_mask, label) in enumerate(tqdm(train_dataloader)):
        # print("Starting batch ", batch_id+1)    
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        attention_mask = attention_mask.to(device)
        out = model(token_ids, valid_length, segment_ids, attention_mask)
        # print(f'Output: {out.argmax(1)}, Label: {label}')

        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            writer.add_scalar('loss/train_loss', loss.data.cpu().numpy(), e*len(train_dataloader)+batch_id+1)
            writer.add_scalar('acc/train_acc', train_acc / (batch_id+1), e*len(train_dataloader)+batch_id+1)
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    
    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, attention_mask, label) in enumerate(tqdm(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        attention_mask = attention_mask.to(device)
        out = model(token_ids, valid_length, segment_ids, attention_mask)
        # print(f'Output: {out.argmax(1)}, Label: {label}')
        
        test_acc += calc_accuracy(out, label)
        loss = loss_fn(out, label)
    writer.add_scalar('acc/test_acc', test_acc / (batch_id+1), e+1)        
    writer.add_scalar('loss/test_loss', loss.data.cpu().numpy(), e+1)        
    print("epoch {} loss {} test acc {}".format(e+1, loss.data.cpu().numpy(), test_acc / (batch_id+1)))
    if test_acc/(batch_id+1)>best_acc:
      best_acc=test_acc/(batch_id+1)
      patience=0
      torch.save(model, PATH + '{}_{}_model.pt'.format(e+1, test_acc/(batch_id+1)))
    else:
      patience+=1
    if patience>max_patience:
      break
    test_acc = 0  # Reset test accuracy for the next epoch

  0%|          | 0/729 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 1.7601990699768066 train acc 0.25
epoch 1 batch id 11 loss 1.8913229703903198 train acc 0.22585227272727273
epoch 1 batch id 21 loss 1.8737117052078247 train acc 0.19419642857142858
epoch 1 batch id 31 loss 1.7930320501327515 train acc 0.19405241935483872
epoch 1 batch id 41 loss 1.880544900894165 train acc 0.19397865853658536
epoch 1 batch id 51 loss 2.030775308609009 train acc 0.19148284313725492
epoch 1 batch id 61 loss 1.7764157056808472 train acc 0.18596311475409835
epoch 1 batch id 71 loss 1.840705394744873 train acc 0.18485915492957747
epoch 1 batch id 81 loss 1.9575343132019043 train acc 0.18113425925925927
epoch 1 batch id 91 loss 1.9175658226013184 train acc 0.18097527472527472
epoch 1 batch id 101 loss 1.973516821861267 train acc 0.1806930693069307
epoch 1 batch id 111 loss 1.8978997468948364 train acc 0.18074324324324326
epoch 1 batch id 121 loss 1.8490214347839355 train acc 0.18130165289256198
epoch 1 batch id 131 loss 1.8571929931640625 train acc 0

  0%|          | 0/183 [00:00<?, ?it/s]

epoch 1 loss 1.4514329433441162 test acc 0.4323160616705699


  0%|          | 0/729 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 1.630728840827942 train acc 0.265625
epoch 2 batch id 11 loss 1.6504237651824951 train acc 0.3352272727272727
epoch 2 batch id 21 loss 1.6143099069595337 train acc 0.32961309523809523
epoch 2 batch id 31 loss 1.4611897468566895 train acc 0.3326612903225806
epoch 2 batch id 41 loss 1.534623622894287 train acc 0.3403201219512195
epoch 2 batch id 51 loss 1.6530588865280151 train acc 0.3400735294117647
epoch 2 batch id 61 loss 1.550870656967163 train acc 0.3399077868852459
epoch 2 batch id 71 loss 1.6166168451309204 train acc 0.33934859154929575
epoch 2 batch id 81 loss 1.7012749910354614 train acc 0.3439429012345679
epoch 2 batch id 91 loss 1.7039940357208252 train acc 0.34546703296703296
epoch 2 batch id 101 loss 1.5658650398254395 train acc 0.34808168316831684
epoch 2 batch id 111 loss 1.5141724348068237 train acc 0.35205518018018017
epoch 2 batch id 121 loss 1.3663146495819092 train acc 0.35356404958677684
epoch 2 batch id 131 loss 1.4082661867141724 train acc 0

  0%|          | 0/183 [00:00<?, ?it/s]

epoch 2 loss 1.2049686908721924 test acc 0.5386416861826698


  0%|          | 0/729 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 1.2231065034866333 train acc 0.5625
epoch 3 batch id 11 loss 1.2118558883666992 train acc 0.5298295454545454
epoch 3 batch id 21 loss 1.1746670007705688 train acc 0.5148809523809523
epoch 3 batch id 31 loss 1.383795142173767 train acc 0.5161290322580645
epoch 3 batch id 41 loss 1.203918695449829 train acc 0.5160060975609756
epoch 3 batch id 51 loss 1.1470024585723877 train acc 0.508578431372549
epoch 3 batch id 61 loss 1.2721632719039917 train acc 0.5053790983606558
epoch 3 batch id 71 loss 1.4674196243286133 train acc 0.5044014084507042
epoch 3 batch id 81 loss 1.4029895067214966 train acc 0.5127314814814815
epoch 3 batch id 91 loss 1.2595195770263672 train acc 0.5127060439560439
epoch 3 batch id 101 loss 1.6320617198944092 train acc 0.5112933168316832
epoch 3 batch id 111 loss 1.230509638786316 train acc 0.5101351351351351
epoch 3 batch id 121 loss 1.1475998163223267 train acc 0.5127840909090909
epoch 3 batch id 131 loss 1.145908236503601 train acc 0.513716603

  0%|          | 0/183 [00:00<?, ?it/s]

epoch 3 loss 1.1858254671096802 test acc 0.5680132708821233


  0%|          | 0/729 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 1.1343730688095093 train acc 0.578125
epoch 4 batch id 11 loss 1.1000696420669556 train acc 0.5724431818181818
epoch 4 batch id 21 loss 1.1058298349380493 train acc 0.5610119047619048
epoch 4 batch id 31 loss 1.1852189302444458 train acc 0.5599798387096774
epoch 4 batch id 41 loss 1.1905311346054077 train acc 0.5609756097560976
epoch 4 batch id 51 loss 1.0489588975906372 train acc 0.5548406862745098
epoch 4 batch id 61 loss 1.2297792434692383 train acc 0.5530225409836066
epoch 4 batch id 71 loss 1.4153929948806763 train acc 0.5495158450704225
epoch 4 batch id 81 loss 1.1686230897903442 train acc 0.5555555555555556
epoch 4 batch id 91 loss 1.0863070487976074 train acc 0.557864010989011
epoch 4 batch id 101 loss 1.4764442443847656 train acc 0.5567759900990099
epoch 4 batch id 111 loss 1.290962815284729 train acc 0.5530686936936937
epoch 4 batch id 121 loss 1.042358160018921 train acc 0.5543646694214877
epoch 4 batch id 131 loss 0.9031210541725159 train acc 0.55593

  0%|          | 0/183 [00:00<?, ?it/s]

epoch 4 loss 1.1691713333129883 test acc 0.5843213309914129


  0%|          | 0/729 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 1.142494559288025 train acc 0.578125
epoch 5 batch id 11 loss 1.011675238609314 train acc 0.5880681818181818
epoch 5 batch id 21 loss 1.059415578842163 train acc 0.5766369047619048
epoch 5 batch id 31 loss 1.1012459993362427 train acc 0.579133064516129
epoch 5 batch id 41 loss 1.0227952003479004 train acc 0.5800304878048781
epoch 5 batch id 51 loss 0.913460373878479 train acc 0.5778186274509803
epoch 5 batch id 61 loss 1.1626543998718262 train acc 0.5753073770491803
epoch 5 batch id 71 loss 1.3824312686920166 train acc 0.5682218309859155
epoch 5 batch id 81 loss 1.1461130380630493 train acc 0.5754243827160493
epoch 5 batch id 91 loss 1.026983380317688 train acc 0.5770947802197802
epoch 5 batch id 101 loss 1.4601432085037231 train acc 0.5767326732673267
epoch 5 batch id 111 loss 1.2356674671173096 train acc 0.5747466216216216
epoch 5 batch id 121 loss 0.9323529601097107 train acc 0.5761880165289256
epoch 5 batch id 131 loss 0.8738359212875366 train acc 0.57693225

  0%|          | 0/183 [00:00<?, ?it/s]

epoch 5 loss 1.194400668144226 test acc 0.5926034348165495


  0%|          | 0/729 [00:00<?, ?it/s]

epoch 6 batch id 1 loss 1.1164114475250244 train acc 0.578125
epoch 6 batch id 11 loss 0.89310622215271 train acc 0.6008522727272727
epoch 6 batch id 21 loss 0.8538293838500977 train acc 0.59375
epoch 6 batch id 31 loss 1.0283254384994507 train acc 0.5972782258064516
epoch 6 batch id 41 loss 0.9989315867424011 train acc 0.5998475609756098
epoch 6 batch id 51 loss 0.8659278154373169 train acc 0.5943627450980392
epoch 6 batch id 61 loss 1.116860270500183 train acc 0.592469262295082
epoch 6 batch id 71 loss 1.282516360282898 train acc 0.5880281690140845
epoch 6 batch id 81 loss 0.9965614676475525 train acc 0.5974151234567902
epoch 6 batch id 91 loss 0.9610620737075806 train acc 0.598385989010989
epoch 6 batch id 101 loss 1.2609021663665771 train acc 0.5957611386138614
epoch 6 batch id 111 loss 1.17350435256958 train acc 0.5922015765765766
epoch 6 batch id 121 loss 0.864992082118988 train acc 0.593620867768595
epoch 6 batch id 131 loss 0.8585817217826843 train acc 0.5938692748091603
epoch 

  0%|          | 0/183 [00:00<?, ?it/s]

epoch 6 loss 1.120686650276184 test acc 0.5927741998438719


  0%|          | 0/729 [00:00<?, ?it/s]

epoch 7 batch id 1 loss 0.981570303440094 train acc 0.625
epoch 7 batch id 11 loss 0.9156338572502136 train acc 0.6576704545454546
epoch 7 batch id 21 loss 0.7952320575714111 train acc 0.6480654761904762
epoch 7 batch id 31 loss 1.0411646366119385 train acc 0.6461693548387096
epoch 7 batch id 41 loss 0.9521750211715698 train acc 0.6410060975609756
epoch 7 batch id 51 loss 0.7864546179771423 train acc 0.6357230392156863
epoch 7 batch id 61 loss 1.0165313482284546 train acc 0.6303790983606558
epoch 7 batch id 71 loss 1.3232805728912354 train acc 0.6214788732394366
epoch 7 batch id 81 loss 0.9361966252326965 train acc 0.6263503086419753
epoch 7 batch id 91 loss 0.9388110041618347 train acc 0.6280906593406593
epoch 7 batch id 101 loss 1.3182690143585205 train acc 0.6296410891089109
epoch 7 batch id 111 loss 1.0598324537277222 train acc 0.6296452702702703
epoch 7 batch id 121 loss 0.8705093860626221 train acc 0.6297778925619835
epoch 7 batch id 131 loss 0.7701552510261536 train acc 0.630128

  0%|          | 0/183 [00:00<?, ?it/s]

epoch 7 loss 1.1343307495117188 test acc 0.5932864949258392


  0%|          | 0/729 [00:00<?, ?it/s]

epoch 8 batch id 1 loss 0.9292078018188477 train acc 0.671875
epoch 8 batch id 11 loss 0.8595119714736938 train acc 0.6789772727272727
epoch 8 batch id 21 loss 0.827135443687439 train acc 0.671875
epoch 8 batch id 31 loss 1.049196481704712 train acc 0.6653225806451613
epoch 8 batch id 41 loss 0.8391203880310059 train acc 0.6646341463414634
epoch 8 batch id 51 loss 0.7538755536079407 train acc 0.6580882352941176
epoch 8 batch id 61 loss 0.9952590465545654 train acc 0.6580430327868853
epoch 8 batch id 71 loss 1.1929219961166382 train acc 0.652068661971831
epoch 8 batch id 81 loss 0.9315248727798462 train acc 0.6568287037037037
epoch 8 batch id 91 loss 0.8878381848335266 train acc 0.6577953296703297
epoch 8 batch id 101 loss 1.298107624053955 train acc 0.6577970297029703
epoch 8 batch id 111 loss 1.0229259729385376 train acc 0.6563907657657657
epoch 8 batch id 121 loss 0.7921171188354492 train acc 0.6571539256198347
epoch 8 batch id 131 loss 0.7185876965522766 train acc 0.6560114503816794

  0%|          | 0/183 [00:00<?, ?it/s]

epoch 8 loss 1.0937433242797852 test acc 0.5909811670569867


  0%|          | 0/729 [00:00<?, ?it/s]

epoch 9 batch id 1 loss 0.9221245050430298 train acc 0.6875
epoch 9 batch id 11 loss 0.7378799915313721 train acc 0.6946022727272727
epoch 9 batch id 21 loss 0.780296802520752 train acc 0.6867559523809523
epoch 9 batch id 31 loss 0.8914687037467957 train acc 0.6925403225806451
epoch 9 batch id 41 loss 0.7753747701644897 train acc 0.6894054878048781
epoch 9 batch id 51 loss 0.5960738062858582 train acc 0.6841299019607843
epoch 9 batch id 61 loss 0.9319941997528076 train acc 0.6828893442622951
epoch 9 batch id 71 loss 1.1458791494369507 train acc 0.6808978873239436
epoch 9 batch id 81 loss 0.8712263107299805 train acc 0.6851851851851852
epoch 9 batch id 91 loss 0.793181300163269 train acc 0.6833791208791209
epoch 9 batch id 101 loss 1.327377438545227 train acc 0.682394801980198
epoch 9 batch id 111 loss 0.9502543807029724 train acc 0.6804617117117117
epoch 9 batch id 121 loss 0.7533583045005798 train acc 0.6805268595041323
epoch 9 batch id 131 loss 0.5985190272331238 train acc 0.68046278

  0%|          | 0/183 [00:00<?, ?it/s]

epoch 9 loss 1.1530005931854248 test acc 0.594237900078064


  0%|          | 0/729 [00:00<?, ?it/s]

epoch 10 batch id 1 loss 0.6807824969291687 train acc 0.75
epoch 10 batch id 11 loss 0.7848144173622131 train acc 0.71875
epoch 10 batch id 21 loss 0.7337523102760315 train acc 0.7180059523809523
epoch 10 batch id 31 loss 0.8123864531517029 train acc 0.7182459677419355
epoch 10 batch id 41 loss 0.6578950881958008 train acc 0.7195121951219512
epoch 10 batch id 51 loss 0.46045348048210144 train acc 0.7190563725490197
epoch 10 batch id 61 loss 0.8591220378875732 train acc 0.7202868852459017
epoch 10 batch id 71 loss 0.9972988963127136 train acc 0.7169894366197183
epoch 10 batch id 81 loss 0.8214956521987915 train acc 0.716820987654321
epoch 10 batch id 91 loss 0.7305460572242737 train acc 0.7177197802197802
epoch 10 batch id 101 loss 1.1389012336730957 train acc 0.7172029702970297
epoch 10 batch id 111 loss 0.8251389265060425 train acc 0.7148085585585585
epoch 10 batch id 121 loss 0.6226099133491516 train acc 0.7142303719008265
epoch 10 batch id 131 loss 0.6060008406639099 train acc 0.712

  0%|          | 0/183 [00:00<?, ?it/s]

epoch 10 loss 1.285509705543518 test acc 0.5861143637782982


  0%|          | 0/729 [00:00<?, ?it/s]

epoch 11 batch id 1 loss 0.8541280627250671 train acc 0.6875
epoch 11 batch id 11 loss 0.6100890040397644 train acc 0.7727272727272727
epoch 11 batch id 21 loss 0.5136794447898865 train acc 0.7611607142857143
epoch 11 batch id 31 loss 0.7130304574966431 train acc 0.7600806451612904
epoch 11 batch id 41 loss 0.6304547786712646 train acc 0.7564786585365854
epoch 11 batch id 51 loss 0.42487287521362305 train acc 0.7567401960784313
epoch 11 batch id 61 loss 0.7156686186790466 train acc 0.7540983606557377
epoch 11 batch id 71 loss 0.9077689051628113 train acc 0.7524207746478874
epoch 11 batch id 81 loss 0.7082642912864685 train acc 0.7534722222222222
epoch 11 batch id 91 loss 0.6971864104270935 train acc 0.7512019230769231
epoch 11 batch id 101 loss 1.1195820569992065 train acc 0.7498452970297029
epoch 11 batch id 111 loss 0.8359431624412537 train acc 0.7484515765765766
epoch 11 batch id 121 loss 0.5132130980491638 train acc 0.7515495867768595
epoch 11 batch id 131 loss 0.4812167286872864 t

  0%|          | 0/183 [00:00<?, ?it/s]

epoch 11 loss 1.1843767166137695 test acc 0.5826136807181889


  0%|          | 0/729 [00:00<?, ?it/s]

epoch 12 batch id 1 loss 0.5811713337898254 train acc 0.796875
epoch 12 batch id 11 loss 0.6107802987098694 train acc 0.7926136363636364
epoch 12 batch id 21 loss 0.4043010175228119 train acc 0.7894345238095238
epoch 12 batch id 31 loss 0.6996422410011292 train acc 0.7852822580645161
epoch 12 batch id 41 loss 0.579388439655304 train acc 0.7850609756097561
epoch 12 batch id 51 loss 0.36768409609794617 train acc 0.7818627450980392
epoch 12 batch id 61 loss 0.714306116104126 train acc 0.7817622950819673
epoch 12 batch id 71 loss 0.9441385865211487 train acc 0.7788292253521126
epoch 12 batch id 81 loss 0.5711483955383301 train acc 0.7816358024691358
epoch 12 batch id 91 loss 0.5806980729103088 train acc 0.78125
epoch 12 batch id 101 loss 1.0183218717575073 train acc 0.7800123762376238
epoch 12 batch id 111 loss 0.6643487215042114 train acc 0.7773085585585585
epoch 12 batch id 121 loss 0.45412564277648926 train acc 0.7767303719008265
epoch 12 batch id 131 loss 0.4119608998298645 train acc 0

  0%|          | 0/183 [00:00<?, ?it/s]

epoch 12 loss 1.3610061407089233 test acc 0.5820038056206089


  0%|          | 0/729 [00:00<?, ?it/s]

epoch 13 batch id 1 loss 0.5751171708106995 train acc 0.78125
epoch 13 batch id 11 loss 0.595723569393158 train acc 0.8181818181818182
epoch 13 batch id 21 loss 0.4587428867816925 train acc 0.8125
epoch 13 batch id 31 loss 0.573456883430481 train acc 0.8125
epoch 13 batch id 41 loss 0.43084025382995605 train acc 0.8136432926829268
epoch 13 batch id 51 loss 0.3568156361579895 train acc 0.8100490196078431
epoch 13 batch id 61 loss 0.5460425019264221 train acc 0.8107069672131147
epoch 13 batch id 71 loss 0.7332446575164795 train acc 0.8098591549295775
epoch 13 batch id 81 loss 0.6069345474243164 train acc 0.8117283950617284
epoch 13 batch id 91 loss 0.46545904874801636 train acc 0.8099244505494505
epoch 13 batch id 101 loss 0.8547236323356628 train acc 0.8086324257425742
epoch 13 batch id 111 loss 0.5697876811027527 train acc 0.8072916666666666
epoch 13 batch id 121 loss 0.29441046714782715 train acc 0.8082386363636364
epoch 13 batch id 131 loss 0.2852362096309662 train acc 0.807848282442

  0%|          | 0/183 [00:00<?, ?it/s]

epoch 13 loss 1.6191837787628174 test acc 0.5818330405932866


  0%|          | 0/729 [00:00<?, ?it/s]

epoch 14 batch id 1 loss 0.5775213837623596 train acc 0.796875
epoch 14 batch id 11 loss 0.5027768015861511 train acc 0.8522727272727273
epoch 14 batch id 21 loss 0.4968865215778351 train acc 0.8392857142857143
epoch 14 batch id 31 loss 0.5605853199958801 train acc 0.8306451612903226
epoch 14 batch id 41 loss 0.3191329836845398 train acc 0.8250762195121951
epoch 14 batch id 51 loss 0.19299660623073578 train acc 0.8262867647058824
epoch 14 batch id 61 loss 0.46668529510498047 train acc 0.8291495901639344
epoch 14 batch id 71 loss 0.8005584478378296 train acc 0.8268045774647887
epoch 14 batch id 81 loss 0.630047082901001 train acc 0.8302469135802469
epoch 14 batch id 91 loss 0.4684540629386902 train acc 0.8288118131868132
epoch 14 batch id 101 loss 0.6623967885971069 train acc 0.828434405940594
epoch 14 batch id 111 loss 0.6960623860359192 train acc 0.8268581081081081
epoch 14 batch id 121 loss 0.3277619481086731 train acc 0.827995867768595
epoch 14 batch id 131 loss 0.3035401403903961 t

  0%|          | 0/183 [00:00<?, ?it/s]

epoch 14 loss 1.8406202793121338 test acc 0.5825161007025761


  0%|          | 0/729 [00:00<?, ?it/s]

epoch 15 batch id 1 loss 0.4947088658809662 train acc 0.828125
epoch 15 batch id 11 loss 0.3633843660354614 train acc 0.8792613636363636
epoch 15 batch id 21 loss 0.18833790719509125 train acc 0.8712797619047619
epoch 15 batch id 31 loss 0.4055154323577881 train acc 0.8679435483870968
epoch 15 batch id 41 loss 0.3900715708732605 train acc 0.8666158536585366
epoch 15 batch id 51 loss 0.17107315361499786 train acc 0.8648897058823529
epoch 15 batch id 61 loss 0.5292547941207886 train acc 0.8632172131147541
epoch 15 batch id 71 loss 0.5735335946083069 train acc 0.8637764084507042
epoch 15 batch id 81 loss 0.4539036750793457 train acc 0.8630401234567902
epoch 15 batch id 91 loss 0.40511178970336914 train acc 0.8607486263736264
epoch 15 batch id 101 loss 0.8490390777587891 train acc 0.8575185643564357
epoch 15 batch id 111 loss 0.5160422325134277 train acc 0.8579673423423423
epoch 15 batch id 121 loss 0.21110466122627258 train acc 0.8575671487603306
epoch 15 batch id 131 loss 0.3324091136455

  0%|          | 0/183 [00:00<?, ?it/s]

epoch 15 loss 2.110285520553589 test acc 0.5790032201405153


To load the model from PATH?

In [16]:
# Specify the path to your saved model file
model_path = '' # need model path dir: PATH + '{}_{}_model.pt

# Load the model
model = torch.load(model_path)

# Make sure to call model.eval() if you're loading the model for evaluation
model.eval()

FileNotFoundError: [Errno 2] No such file or directory: ''

# Interactive Test

In [17]:
#토큰화
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
PATH="/content/drive/MyDrive/Deepdaiv-SentimentAnalysis/KoBERT-topic-dr+0.7-1122/2_0.6723177083333334_model_t.pt"
#model = BERTClassifier2(bertmodel,  dr_rate=0.7).to(device)
model=torch.load(PATH)
#{'연애, 결혼, 출산': 0, '건강, 죽음': 1, '학업 및 진로': 2, '재정': 3, '재정, 은퇴, 노후준비': 4, 
#'건강': 5, '학교폭력/따돌림': 6, '대인관계': 7, '진로, 취업, 직장': 8, '가족관계': 9, '직장, 업무 스트레스': 10, 
#'대인관계(부부, 자녀)': 11}
s12label={'기쁨':0, '불안': 1, '당황':2, '슬픔': 3, '분노': 4, '상처':5}
def predict(predict_sentence):

    data = [predict_sentence, '0']
    dataset_another = [data]

    another_test = BERTDataset(dataset_another, 0, 1, tok, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(another_test, batch_size=batch_size, num_workers=5)
    
    model.eval()

    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_dataloader):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)

        valid_length= valid_length
        label = label.long().to(device)

        out = model(token_ids, valid_length, segment_ids)

        test_eval=[]
        for i in out:
            logits=i
            logits = logits.detach().cpu().numpy()
            
            if np.argmax(logits) == 0:
                test_eval.append("연애, 결혼, 출산")
            elif np.argmax(logits) == 1:
                test_eval.append("건강, 죽음")
            elif np.argmax(logits) == 2:
                test_eval.append("학업 및 진로")
            elif np.argmax(logits) == 3:
                test_eval.append("재정")
            elif np.argmax(logits) == 4:
                test_eval.append("재정, 은퇴, 노후준비")
            elif np.argmax(logits) == 5:
                test_eval.append("건강")

            elif np.argmax(logits) == 6:
                test_eval.append("학교폭력/따돌림")
            elif np.argmax(logits) == 7:
                test_eval.append("대인관계")
            elif np.argmax(logits) == 8:
                test_eval.append("진로, 취업, 직장")
            elif np.argmax(logits) == 9:
                test_eval.append("가족관계")
            elif np.argmax(logits) == 10:
                test_eval.append("직장, 업무 스트레스")
            elif np.argmax(logits) == 11:
                test_eval.append("대인관계(부부, 자녀)")

        print(">> 입력하신 내용이 " + test_eval[0] + "에 대한 발화로 추정됩니다.")

NameError: name 'get_tokenizer' is not defined

In [None]:
#질문 무한반복하기! 0 입력시 종료
end = 1
while end == 1 :
    sentence = input("하고싶은 말을 입력해주세요 : ")
    if sentence.strip() == '0' :
        break
    predict(sentence)
    print("\n")

# KoBERT 주제 분석(???; 중복된 내용으로 판단됨)

In [None]:
train_o=pd.read_excel('/content/drive/MyDrive/Deepdaiv-SentimentAnalysis/감성대화/감성대화말뭉치(최종데이터)_Training.xlsx')
val_o=pd.read_excel('/content/drive/MyDrive/Deepdaiv-SentimentAnalysis/감성대화/감성대화말뭉치(최종데이터)_Validation.xlsx')
train_t=pd.DataFrame({'topic': train_o['상황키워드'], 't1': train_o['사람문장1']})
train_t.head()

val_t=pd.DataFrame({'topic': val_o['상황키워드'], 't1': val_o['사람문장1']})


In [None]:
topic2label={}
for idx, topic in enumerate(train_t['topic'].unique()):
  topic2label[topic]=idx

train_tt=[]
for t, s in zip(train_t['t1'], train_t['topic']):
  t=del_bracket(t)
  t=del_special_num(t)
  t=del_whitespace(t)
  t=del_stopwords(t)
  train_tt.append([t, topic2label[s.strip()]])

for t, s in zip(val_t['t1'], val_t['topic']):
  t=del_bracket(t)
  t=del_special_num(t)
  t=del_whitespace(t)
  t=del_stopwords(t)
  train_tt.append([t, topic2label[s]])

dataset_train_t, dataset_test_t = train_test_split(train_tt, test_size=0.2, random_state=0)

#토큰화
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

data_train_t = BERTDataset(dataset_train_t, 0, 1, tok, max_len, True, False)
data_test_t = BERTDataset(dataset_test_t, 0, 1, tok, max_len, True, False)

train_dataloader_t = torch.utils.data.DataLoader(data_train_t, batch_size=batch_size, num_workers=5, shuffle=True)
test_dataloader_t = torch.utils.data.DataLoader(data_test_t, batch_size=batch_size, num_workers=5, shuffle=True)

In [None]:
class BERTClassifier2(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=12,   ##클래스 수 조정##
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [None]:
print(topic2label)

In [None]:
#BERT 모델 불러오기
model_t = BERTClassifier(bertmodel,  dr_rate=0.7).to(device)

#optimizer와 schedule 설정
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model_t.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model_t.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

t_total = len(train_dataloader_t) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

#정확도 측정을 위한 함수 정의
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc
    
train_dataloader_t

In [None]:
best_acc=0
max_patience=5
patience=0
PATH = '/content/drive/MyDrive/Deepdaiv-SentimentAnalysis/KoBERT-topic-dr+0.7-1122/'
writer = SummaryWriter(PATH)

for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model_t.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader_t)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model_t(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model_t.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            writer.add_scalar('loss/train_loss', loss.data.cpu().numpy(), e*len(train_dataloader_t)+batch_id+1)
            writer.add_scalar('acc/train_acc', train_acc / (batch_id+1), e*len(train_dataloader_t)+batch_id+1)
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    
    model_t.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader_t)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model_t(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
        loss = loss_fn(out, label)
    writer.add_scalar('acc/test_acc', test_acc / (batch_id+1), e+1)        
    writer.add_scalar('loss/test_loss', loss.data.cpu().numpy(), e+1)        
    print("epoch {} loss {} test acc {}".format(e+1, loss.data.cpu().numpy(), test_acc / (batch_id+1)))
    
    if test_acc/(batch_id+1)>best_acc:
      best_acc=test_acc/(batch_id+1)
      patience=0
      torch.save(model_t, PATH + '{}_{}_model_t.pt'.format(e+1, test_acc/(batch_id+1)))
    else:
      patience+=1
    if patience>max_patience:
      break
    

# Aihub 감성 데이터

In [None]:
!cd /content/drive/MyDrive/Deepdaiv-SentimentAnalysis/감성대화

In [None]:
!unzip /content/drive/MyDrive/Deepdaiv-SentimentAnalysis/감성대화/Training/감성대화말뭉치_원천데이터_Training.zip -d /content/drive/MyDrive/Deepdaiv-SentimentAnalysis/감성대화

In [None]:
!unzip /content/drive/MyDrive/Deepdaiv-SentimentAnalysis/감성대화/Training/감성대화말뭉치_최종데이터_Training.zip -d /content/drive/MyDrive/Deepdaiv-SentimentAnalysis/감성대화

In [None]:
!unzip /content/drive/MyDrive/Deepdaiv-SentimentAnalysis/감성대화/Validation/감성대화말뭉치_원천데이터_Validation.zip -d /content/drive/MyDrive/Deepdaiv-SentimentAnalysis/감성대화

In [None]:
!unzip /content/drive/MyDrive/Deepdaiv-SentimentAnalysis/감성대화/Validation/감성대화말뭉치_최종데이터_Validation.zip -d /content/drive/MyDrive/Deepdaiv-SentimentAnalysis/감성대화

In [None]:
!pip install xlrd

In [None]:
import pandas as pd
train=pd.read_excel('/content/drive/MyDrive/Deepdaiv-SentimentAnalysis/감성대화/감성대화말뭉치(최종데이터)_Training.xlsx')
train=pd.DataFrame({'s1': train['감정_대분류'], 's2': train['감정_소분류'], 't1': train['사람문장1']})
train.head()

In [None]:
val=pd.read_excel('/content/drive/MyDrive/Deepdaiv-SentimentAnalysis/감성대화/감성대화말뭉치(최종데이터)_Validation.xlsx')
val=pd.DataFrame({'s1': val['감정_대분류'], 's2': val['감정_소분류'], 't1': val['사람문장1']})

In [None]:
for s1 in train['s1'].unique():
  print(s1)
  print(train.loc[train['s1']==s1]['s2'].value_counts())
  print()

#기쁨 불안 당황 슬픔 분노 상처 

In [None]:
for s1 in val['s1'].unique():
  print(s1)
  print(val.loc[val['s1']==s1]['s2'].value_counts())
  print()