# Import Required Libraries

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 13.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 58.5 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 57.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 6.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 63.8 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Fou

In [2]:
import pandas as pd
import torch
import torch.nn as nn
from torch.nn.modules.loss import _Loss
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, ElectraForSequenceClassification, AdamW
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import os
import gc

## Connect Google Drive

In [3]:
from google.colab import drive


# Mount Google Drive at /content/drive, then make the project folder the
# working directory so the relative paths used later (e.g. "data/train.csv",
# "train1.txt", the saved "*.pt" checkpoints) resolve inside it.
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/Colab Notebooks/팀 프로젝트/Comment_discrimination/')

Mounted at /content/drive


# Training Configuration

In [22]:
# Central training configuration read by the dataset, loss, and training cells.
# Original author's note: a batch_size of 16 or more "blows up"
# (presumably out of GPU memory; confirm before raising it).
CONFIG = dict(
    seed=2022,
    epochs=100,
    model_name="beomi/KcELECTRA-base",
    num_classes=3,
    max_length=256,
    batch_size=8,
    learning_rate=1e-6,
    patience=5,
    eps=1e-8,
    n_fold=5,
    # Use the GPU when one is visible to torch, otherwise fall back to the CPU.
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
)

In [23]:
def set_seed(seed=29):
    """Seed every random number generator this notebook touches.

    Seeds Python's ``random`` module, NumPy's global generator, and PyTorch
    (CPU and all CUDA devices), and switches cuDNN to deterministic mode so
    repeated runs give the same results.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Local import: the notebook's import cell does not bring in `random`.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    # When running on the cuDNN backend, two further options must be set.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed.
    # NOTE(review): changing PYTHONHASHSEED at runtime is only expected to
    # affect interpreters started afterwards, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    
set_seed(CONFIG['seed'])

# Load Data

In [7]:
class Dataset_bias(Dataset):
    """Labelled dataset read from a tab-separated file.

    Each item is ``(input_ids, attention_mask, label_id)`` where the text comes
    from the ``title_comment`` column and the label from the ``bias`` column.

    Args:
        csv_file: Path to a tab-separated file containing at least the
            ``bias`` and ``title_comment`` columns.
        idx: Optional positional row indices (list or array) selecting a
            subset, e.g. one side of a K-fold split. ``None`` or an empty
            sequence keeps every row.
    """

    # Maps the string labels found in the `bias` column to class ids.
    LABEL_TO_ID = {'none': 0, 'gender': 1, 'others': 2}

    def __init__(self, csv_file, idx=None):
        # Drop rows containing NaN values.
        frame = pd.read_csv(csv_file, sep='\t').dropna(axis=0)
        # Drop duplicated texts.
        frame = frame.drop_duplicates(subset=['title_comment'])

        # `idx` is usually a NumPy array from a K-fold split; comparing an
        # array with `[]` is an ill-defined elementwise comparison, so test
        # for presence and length explicitly instead.
        # NOTE(review): the indices are applied AFTER the NaN/duplicate
        # filtering, so they only line up with indices computed on the
        # unfiltered frame if no rows were dropped. Verify against the caller.
        if idx is not None and len(idx) > 0:
            self.idx = idx
            frame = frame.iloc[self.idx]

        self.dataset = frame
        self.tokenizer = AutoTokenizer.from_pretrained(CONFIG['model_name'])

    def __len__(self):
        """Return the number of rows kept after filtering and subsetting."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return ``(input_ids, attention_mask, label_id)``."""
        record = self.dataset.iloc[idx]
        text = record['title_comment']
        y = self.LABEL_TO_ID[record['bias']]

        inputs = self.tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            max_length=CONFIG['max_length'],
            padding='max_length',
            add_special_tokens=True
            )

        # The tokenizer returns a batch of one; strip the batch dimension.
        input_ids = inputs['input_ids'][0]
        attention_mask = inputs['attention_mask'][0]

        return input_ids, attention_mask, y

In [8]:
class Dataset_bias_test(Dataset):
    """Unlabelled dataset read from a tab-separated file.

    Each item is ``(input_ids, attention_mask)`` built from the value in the
    last column of the row (presumably ``title_comment``; verify against the
    file layout).
    """

    def __init__(self, csv_file):
        # Remove rows with NaN values, then remove duplicated texts.
        cleaned = pd.read_csv(csv_file, sep='\t').dropna(axis=0)
        self.dataset = cleaned.drop_duplicates(subset=['title_comment'])
        self.tokenizer = AutoTokenizer.from_pretrained(CONFIG['model_name'])

        # Print summary statistics of the loaded frame as a quick sanity check.
        print(self.dataset.describe())

    def __len__(self):
        """Return the number of rows kept after filtering."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return ``(input_ids, attention_mask)``."""
        last_column = self.dataset.iloc[idx, -1:].values
        text = last_column[-1]

        encode_kwargs = dict(
            return_tensors='pt',
            truncation=True,
            max_length=CONFIG['max_length'],
            padding='max_length',
            add_special_tokens=True,
        )
        encoded = self.tokenizer(text, **encode_kwargs)

        # The tokenizer returns a batch of one; strip the batch dimension.
        return encoded['input_ids'][0], encoded['attention_mask'][0]

In [9]:
# Read the raw CSVs and append a `title_comment` column that joins the article
# title and the comment with a single space; this is the text the model sees.
train_df = pd.read_csv("data/train.csv").assign(
    title_comment=lambda frame: frame['title'] + ' ' + frame['comment']
)
test_df = pd.read_csv("data/test.csv").assign(
    title_comment=lambda frame: frame['title'] + ' ' + frame['comment']
)

# Persist both frames as tab-separated files for the Dataset classes to read.
train_df.to_csv("train1.txt", sep='\t', index=False)
test_df.to_csv("test1.txt", sep='\t', index=False)

In [10]:
# Display the training frame, including the derived `title_comment` column.
train_df

Unnamed: 0,title,comment,bias,hate,title_comment
0,"""'미스터 션샤인' 변요한, 김태리와 같은 양복 입고 학당 방문! 이유는?""",김태리 정말 연기잘해 진짜,none,none,"""'미스터 션샤인' 변요한, 김태리와 같은 양복 입고 학당 방문! 이유는?"" 김태리..."
1,"""[SC현장]""""극사실주의 현실♥""""…'가장 보통의 연애' 김래원X공효진, 16년만...",공효진 발연기나이질생각이읍던데 왜계속주연일까,none,hate,"""[SC현장]""""극사실주의 현실♥""""…'가장 보통의 연애' 김래원X공효진, 16년만..."
2,"""손연재, 리듬체조 학원 선생님 """"하고 싶은 일 해서 행복하다""""""",누구처럼 돈만 밝히는 저급인생은 살아가지마시길~~ 행복은 머니순이 아니니깐 작은거에...,others,hate,"""손연재, 리듬체조 학원 선생님 """"하고 싶은 일 해서 행복하다"""""" 누구처럼 돈만..."
3,"""'섹션TV' 김해숙 """"'허스토리' 촬영 후 우울증 얻었다""""""",일본 축구 져라,none,none,"""'섹션TV' 김해숙 """"'허스토리' 촬영 후 우울증 얻었다"""""" 일본 축구 져라"
4,"""[단독] 임현주 아나운서 “‘노브라 챌린지’ 방송 덕에 낸 용기, 자연스런 논의의...",난 절대로 임현주 욕하는인간이랑은 안논다 @.@,none,none,"""[단독] 임현주 아나운서 “‘노브라 챌린지’ 방송 덕에 낸 용기, 자연스런 논의의..."
...,...,...,...,...,...
8362,"""배우 이필립, SNS 스타 연인에게 초호화 프러포즈 눈길""",아니 근데.튜닝한사람은 프러포즈받지도.결혼도못함?ㅋㅋㅋ지들은 돈없어서 못하는것들이ㅋ...,others,hate,"""배우 이필립, SNS 스타 연인에게 초호화 프러포즈 눈길"" 아니 근데.튜닝한사람은..."
8363,"""[SC이슈]""""마약·백스텝·김새롬 탓"""" '실형 피한' 이찬오, 이미지는 치명상(...",그러니깐 여자를 잘만나야되~징글징글한것들 만나면 인생 끝가지 돌아가게 되는듯.. 근...,gender,hate,"""[SC이슈]""""마약·백스텝·김새롬 탓"""" '실형 피한' 이찬오, 이미지는 치명상(..."
8364,"""[POP이슈]""""그들만의 세상""""…홍상수♥김민희, 새해데이트에 '반응싸늘'""",참으로 아름다운 커플입니다. 늘 행복하시고 새해에도 늘 꽃길만 걸으시길 축원합니다 ...,none,none,"""[POP이슈]""""그들만의 세상""""…홍상수♥김민희, 새해데이트에 '반응싸늘'"" 참으..."
8365,[종합] '시크릿 마더' 김소연 누가 죽였나…송윤아와 갈등,재미가 없어요,none,none,[종합] '시크릿 마더' 김소연 누가 죽였나…송윤아와 갈등 재미가 없어요


In [11]:
# Class balance of the target column; `none` dominates, so accuracy alone is
# a weak metric here.
train_df['bias'].value_counts()

none      5490
others    1578
gender    1299
Name: bias, dtype: int64

In [13]:
# Build datasets from the TSV files written above. `train_dataset` is rebuilt
# per fold in the K-fold training cell; `test_dataset` feeds the final
# prediction loader.
train_dataset = Dataset_bias("train1.txt")
test_dataset = Dataset_bias_test("test1.txt")

Downloading:   0%|          | 0.00/288 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/504 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/387k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/124 [00:00<?, ?B/s]

               ID
count  511.000000
mean   255.000000
std    147.657261
min      0.000000
25%    127.500000
50%    255.000000
75%    382.500000
max    510.000000


# Define Tools

## Define Custom loss(F1 + CE)

In [14]:
class F1_Loss(nn.Module):
    """Differentiable (soft) macro-F1 loss for multi-class classification.

    Softmax probabilities stand in for hard predictions, so the true-positive,
    false-positive and false-negative counts stay differentiable. The loss is
    ``1 - mean(per-class soft F1)``.

    Args:
        epsilon: Small constant added to denominators to avoid division by
            zero; also used to clamp the per-class F1 away from 0 and 1.
    """

    def __init__(self, epsilon=CONFIG['eps']):
        super().__init__()
        self.epsilon = epsilon

    def forward(self, y_pred, y_true):
        """Compute the loss.

        Args:
            y_pred: Raw logits, 2-D; dimension 1 indexes the classes.
            y_true: Integer class ids, 1-D.

        Returns:
            Scalar tensor ``1 - mean soft F1``.
        """
        assert y_pred.ndim == 2
        assert y_true.ndim == 1

        # Take the class count from the logits rather than hard-coding 3, so
        # the loss follows whatever `num_labels` the model was built with.
        num_classes = y_pred.shape[1]
        targets = F.one_hot(y_true, num_classes).to(torch.float32)
        probs = F.softmax(y_pred, dim=1)

        # Soft confusion counts per class (summed over the batch dimension).
        tp = (targets * probs).sum(dim=0).to(torch.float32)
        fp = ((1 - targets) * probs).sum(dim=0).to(torch.float32)
        fn = (targets * (1 - probs)).sum(dim=0).to(torch.float32)

        precision = tp / (tp + fp + self.epsilon)
        recall = tp / (tp + fn + self.epsilon)

        f1 = 2 * (precision * recall) / (precision + recall + self.epsilon)
        f1 = f1.clamp(min=self.epsilon, max=1 - self.epsilon)
        return 1 - f1.mean()

In [19]:
class MyLoss(_Loss):
    """Training criterion: the plain average of cross-entropy and soft-F1 loss."""

    def __init__(self):
        super().__init__()
        self.lossCE = nn.CrossEntropyLoss()
        self.lossF1 = F1_Loss()

    def forward(self, preds, trg):
        """Return ``(CE(preds, trg) + F1(preds, trg)) / 2``."""
        cross_entropy = self.lossCE(preds, trg)
        soft_f1 = self.lossF1(preds, trg)
        combined = cross_entropy + soft_f1
        return combined / 2

## Define Early Stopper

In [18]:
class LossEarlyStopper():
    """Track validation loss and signal when to checkpoint or stop training.

    Attributes (read by the training loop):
        patience: Number of consecutive non-improving checks tolerated.
        patience_counter: Current count of consecutive non-improving checks.
        min_loss: Best (lowest) loss seen so far.
        stop: Set to True once patience is exhausted.
        save_model: Set to True when the latest check improved on ``min_loss``;
            the caller is expected to reset it after saving.
    """

    def __init__(self, patience: int)-> None:
        self.patience = patience

        self.patience_counter = 0
        # `float("inf")` instead of `np.Inf`: that alias was removed in NumPy 2.0.
        self.min_loss = float("inf")
        self.stop = False
        self.save_model = False

    def check_early_stopping(self, loss: float)-> None:
        """Update the stopper with the latest validation loss.

        The very first call counts as an improvement (any finite loss beats
        infinity), so the first epoch is checkpointed too; otherwise a run that
        never improves after epoch 1 would leave no saved model. A NaN loss
        compares False against everything and is treated as "no improvement".

        Args:
            loss: Validation loss of the epoch that just finished.
        """
        if loss <= self.min_loss:
            self.patience_counter = 0
            self.save_model = True
            msg = f"Validation loss decreased {self.min_loss} -> {loss}"
            self.min_loss = loss
        else:
            self.patience_counter += 1
            msg = f"Early stopping counter {self.patience_counter}/{self.patience}"

            # `>=` so the stopper still triggers if the counter ever skips past
            # the threshold (e.g. patience of 0).
            if self.patience_counter >= self.patience:
                self.stop = True

        print(msg)

## Define Scoring Function

In [24]:
def get_clf_eval(y_actual, y_pred):
    """Print accuracy and macro-averaged F1 for the given labels.

    Args:
        y_actual: Ground-truth class ids.
        y_pred: Predicted class ids, aligned with ``y_actual``.
    """
    scores = (
        ('정확도', accuracy_score(y_actual, y_pred)),
        ('F1', f1_score(y_actual, y_pred, average='macro')),
    )
    for label, value in scores:
        print(f'{label}: {value:.4f}')

## Define Softmax Function

# Run Training

## Make Fold

In [20]:
from sklearn.model_selection import StratifiedKFold


# Stratified splitter shared by the training and validation-check cells; the
# fixed random_state makes both cells iterate over identical folds.
kfold = StratifiedKFold(n_splits=CONFIG['n_fold'], shuffle=True, random_state=CONFIG['seed'])

## K-fold Training

In [None]:
# Per-epoch training history, accumulated across all folds.
losses = []
accuracies = []
device = CONFIG['device']

# NOTE(review): the fold indices are positions in `train_df`, while
# Dataset_bias applies them after dropping NaN/duplicate rows from the TSV.
# They only refer to the same rows if that filtering removes nothing. Confirm.
for fold, (train_idx, val_idx) in enumerate(kfold.split(range(len(train_df)), train_df['bias'])):
    print(f'============================{fold+1}th fold============================')

    train_dataset = Dataset_bias(csv_file="train1.txt", idx=train_idx)
    validation_dataset = Dataset_bias(csv_file="train1.txt", idx=val_idx)
    train_dataloader = DataLoader(train_dataset, batch_size=CONFIG['batch_size'], shuffle=True)
    # Validation metrics are sums over the whole split, so order is irrelevant;
    # no need to shuffle.
    validation_dataloader = DataLoader(validation_dataset, batch_size=CONFIG['batch_size'], shuffle=False)

    # A fresh model, optimizer, and early stopper for every fold.
    model = ElectraForSequenceClassification.from_pretrained(CONFIG['model_name'], num_labels = CONFIG['num_classes'],
                                                         output_attentions = False,
                                                         output_hidden_states = True,
                                                         ).to(device)

    optimizer = AdamW(model.parameters(), lr=CONFIG['learning_rate'], eps =CONFIG['eps'])
    criterion = MyLoss()
    early_stopper = LossEarlyStopper(patience=CONFIG['patience'])

    for i in range(CONFIG['epochs']):
        # ---- training pass ----
        train_loss = 0.0
        correct = 0
        train_total = 0

        model.train()

        for input_ids_batch, attention_masks_batch, y_real in tqdm(train_dataloader):
            optimizer.zero_grad()
            y_real = y_real.to(device)
            # Index 0 of the model output holds the classification logits.
            y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
            loss = criterion(y_pred, y_real)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

            _, predicted = torch.max(y_pred, 1)
            # `.item()` keeps the running count as a plain Python int instead
            # of a tensor living on the training device.
            correct += (predicted == y_real).sum().item()
            train_total += len(y_real)

        train_accuracy = correct / train_total
        losses.append(train_loss)
        accuracies.append(train_accuracy)
        print("Train Loss:", train_loss, "Accuracy:", train_accuracy)

        gc.collect()
        torch.cuda.empty_cache()

        # ---- validation pass ----
        model.eval()

        val_correct = 0
        val_total = 0
        val_loss = 0.0

        # No gradients are needed for evaluation; without no_grad() every
        # forward pass would build and retain an autograd graph.
        with torch.no_grad():
            for input_ids_batch, attention_masks_batch, y_real in tqdm(validation_dataloader):
                y_real = y_real.to(device)
                y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
                loss = criterion(y_pred, y_real)

                val_loss += loss.item()

                _, predicted = torch.max(y_pred, 1)
                val_correct += (predicted == y_real).sum().item()
                val_total += len(y_real)

        print("Validation Accuracy:", val_correct / val_total)

        early_stopper.check_early_stopping(loss=val_loss)

        if early_stopper.stop:
            print('Early stopped')
            break

        # Checkpoint only when the validation loss improved this epoch.
        if early_stopper.save_model:
            torch.save(model.state_dict(), f"{fold+1}th_kcELECTRA.pt")
            early_stopper.save_model = False

        gc.collect()
        torch.cuda.empty_cache()


# Make Result

## Load Model

In [22]:
# Checkpoint files written by the K-fold training cell
# (f"{fold+1}th_kcELECTRA.pt"), one per fold.
model1_path = '1th_kcELECTRA.pt'
model2_path = '2th_kcELECTRA.pt'
model3_path = '3th_kcELECTRA.pt'
model4_path = '4th_kcELECTRA.pt'
model5_path = '5th_kcELECTRA.pt'

In [23]:
def load_fold_model(weights_path):
    """Build a KcELECTRA classifier and load one fold's fine-tuned weights.

    Args:
        weights_path: Path to a ``state_dict`` file saved by the training cell.

    Returns:
        The model, moved to ``CONFIG['device']`` with the weights loaded.
    """
    fold_model = ElectraForSequenceClassification.from_pretrained(
        CONFIG['model_name'],
        num_labels=CONFIG['num_classes'],
        output_attentions=False,
        output_hidden_states=True,
    ).to(CONFIG['device'])
    # Deserialize onto the CPU first; load_state_dict then copies the tensors
    # into the model's parameters on whatever device they live on.
    fold_model.load_state_dict(torch.load(weights_path, map_location='cpu'))
    return fold_model


# Defined here as well so this cell (and the evaluation cells below) work on a
# fresh kernel without running the training cell first.
device = CONFIG['device']

# One model per fold, in fold order.
model1 = load_fold_model(model1_path)
model2 = load_fold_model(model2_path)
model3 = load_fold_model(model3_path)
model4 = load_fold_model(model4_path)
model5 = load_fold_model(model5_path)

Some weights of the model checkpoint at beomi/KcELECTRA-base were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base and are newly initialized: ['classifier.out_proj.bias', 'classifier

<All keys matched successfully>

## Check Validation Score

In [25]:
losses = []
accuracies = []

# Models in fold order: model N was trained on fold N's training split, so it
# is scored on fold N's held-out split.
fold_models = [model1, model2, model3, model4, model5]
# Same split call as the training cell (fixed random_state gives identical
# folds); the row count comes from the frame instead of a hard-coded number.
fold_splits = kfold.split(range(len(train_df)), train_df['bias'])

for fold, (fold_model, (_, val_idx)) in enumerate(zip(fold_models, fold_splits)):
    print(f'============================{fold+1}th fold============================')

    validation_dataset = Dataset_bias(csv_file="train1.txt", idx=val_idx)
    # Metrics are computed over the whole split, so shuffling is unnecessary.
    validation_dataloader = DataLoader(validation_dataset, batch_size=CONFIG['batch_size'], shuffle=False)

    validation_actual = []
    validation_pred_lst = []

    fold_model.eval()
    with torch.no_grad():
        for input_ids_batch, attention_masks_batch, y_real in tqdm(validation_dataloader):
            # Index 0 of the model output holds the classification logits.
            logits = fold_model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
            _, y_pred = torch.max(logits, 1)
            validation_actual.extend(y_real.tolist())
            validation_pred_lst.extend(y_pred.tolist())
    get_clf_eval(validation_actual, validation_pred_lst)

    gc.collect()
    torch.cuda.empty_cache()




  if __name__ == '__main__':


  0%|          | 0/210 [00:00<?, ?it/s]

정확도: 0.7557
F1: 0.6852


  if __name__ == '__main__':


  0%|          | 0/210 [00:00<?, ?it/s]

정확도: 0.7670
F1: 0.6776


  if __name__ == '__main__':


  0%|          | 0/210 [00:00<?, ?it/s]

정확도: 0.7753
F1: 0.6992


  if __name__ == '__main__':


  0%|          | 0/210 [00:00<?, ?it/s]

정확도: 0.7705
F1: 0.7081


  if __name__ == '__main__':


  0%|          | 0/210 [00:00<?, ?it/s]

정확도: 0.8960
F1: 0.8580


In [26]:
# Unshuffled, one sample per batch, so predictions come out in file order.
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [40]:
# Display the active configuration.
# NOTE(review): the saved output below shows 'epochs': 50 while the
# configuration cell sets 100; the output is from a different run.
CONFIG

{'batch_size': 8,
 'device': device(type='cuda'),
 'epochs': 50,
 'eps': 1e-08,
 'learning_rate': 1e-06,
 'max_length': 256,
 'model_name': 'beomi/KcELECTRA-base',
 'n_fold': 5,
 'num_classes': 3,
 'patience': 5,
 'seed': 2022}

## Make Voting Function

In [27]:
def softmax(a, axis=-1):
    """Numerically stable softmax.

    Args:
        a: Scores as a list or array. For 1-D input the result is the usual
            softmax over all elements.
        axis: Axis along which to normalise (default: last). This lets a
            batch of score vectors, shape ``(batch, classes)``, be normalised
            row by row rather than over the whole array.

    Returns:
        ``numpy.ndarray`` of the same shape as ``a`` whose entries along
        ``axis`` sum to 1.
    """
    scores = np.asarray(a)
    if scores.ndim == 0:
        # A scalar has no axis to reduce over; normalise over everything.
        axis = None

    # Subtracting the maximum leaves the result unchanged but keeps exp()
    # from overflowing on large scores.
    shifted = scores - np.max(scores, axis=axis, keepdims=True)
    exp_a = np.exp(shifted)
    return exp_a / np.sum(exp_a, axis=axis, keepdims=True)

In [28]:
# Prediction function (soft voting)
def predict(models, loader):
    """Ensemble prediction by averaging class probabilities (soft voting).

    Works with any number of models and any loader batch size; every sample
    in a batch gets a prediction.

    Args:
        models: Non-empty sequence of classifiers. Each is called as
            ``model(input_ids, attention_mask=...)`` and must return an object
            whose element 0 is the logits tensor.
        loader: Iterable yielding ``(input_ids_batch, attention_masks_batch)``.

    Returns:
        ``(pred_lst, prob_lst)``: one predicted class id per sample, and the
        averaged class-probability list per sample.

    Raises:
        ValueError: If ``models`` is empty.
    """
    models = list(models)
    if not models:
        raise ValueError("predict() needs at least one model")

    pred_lst = []
    prob_lst = []

    # Run each model on the device its own parameters live on, so the function
    # does not depend on a notebook-level `device` variable.
    model_devices = []
    for model in models:
        model.eval()
        first_param = next(model.parameters(), None)
        model_devices.append(first_param.device if first_param is not None else None)

    with torch.no_grad():
        for input_ids_batch, attention_masks_batch in tqdm(loader):
            prob_sum = None

            for model, model_device in zip(models, model_devices):
                if model_device is not None:
                    ids = input_ids_batch.to(model_device)
                    mask = attention_masks_batch.to(model_device)
                else:
                    ids, mask = input_ids_batch, attention_masks_batch

                logits = model(ids, attention_mask=mask)[0]
                # Softmax in float64 on the CPU so results from models on
                # different devices can be summed.
                prob = torch.softmax(logits.double(), dim=-1).cpu()
                prob_sum = prob if prob_sum is None else prob_sum + prob

            mean_prob = prob_sum / len(models)
            pred_lst.extend(mean_prob.argmax(dim=-1).tolist())
            prob_lst.extend(mean_prob.tolist())

    return pred_lst, prob_lst

## Get Prediction

In [29]:
# Soft-vote the five fold models over the test set.
# `pred` holds one class id per sample, `prob` the averaged probabilities.
models = [model1, model2, model3, model4, model5]
pred, prob = predict(models, test_dataloader)

  0%|          | 0/511 [00:00<?, ?it/s]

In [30]:
# One-column frame of predicted class ids, in test-set order.
bias = pd.DataFrame(pred, columns=['bias'])

In [33]:
# Inverse of the label-to-id mapping used in Dataset_bias.__getitem__.
BIAS_LABEL_DIC = {
    0 : 'none',
    1 : 'gender',
    2 : 'others',
}

In [34]:
# Replace class ids with their string labels.
# NOTE: this overwrites the column in place, so re-running the cell fails with
# a KeyError (the values are already strings); re-run the cell that builds
# `bias` first.
bias['bias'] = bias['bias'].map(lambda x: BIAS_LABEL_DIC[x])

In [35]:
# Distribution of predicted labels over the test set.
bias.value_counts()

bias  
none      319
gender     96
others     96
dtype: int64

## Save result

In [36]:
# Write the predicted labels (single `bias` column, no index) to the current
# working directory.
bias.to_csv('bias_fine_tuned.csv', index=False)