In [1]:
!nvidia-smi

Fri Aug  6 14:53:22 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install transformers==4.5.1 --quiet
!pip install pytorch-lightning==1.2.8 --quiet

[K     |████████████████████████████████| 2.1 MB 6.1 MB/s 
[K     |████████████████████████████████| 3.3 MB 57.0 MB/s 
[K     |████████████████████████████████| 895 kB 59.9 MB/s 
[K     |████████████████████████████████| 841 kB 7.2 MB/s 
[K     |████████████████████████████████| 118 kB 76.6 MB/s 
[K     |████████████████████████████████| 234 kB 76.1 MB/s 
[K     |████████████████████████████████| 269 kB 68.8 MB/s 
[K     |████████████████████████████████| 829 kB 46.7 MB/s 
[K     |████████████████████████████████| 1.3 MB 53.1 MB/s 
[K     |████████████████████████████████| 142 kB 67.9 MB/s 
[K     |████████████████████████████████| 294 kB 74.4 MB/s 
[?25h  Building wheel for future (setup.py) ... [?25l[?25hdone
  Building wheel for PyYAML (setup.py) ... [?25l[?25hdone


In [3]:
#
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from pylab import rcParams
from matplotlib import rc

# scikit-learn
from sklearn.model_selection import train_test_split

# torch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# torchmetrics
import torchmetrics

# pytorch-lightning
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

# transformers
from transformers import get_linear_schedule_with_warmup, AdamW
from transformers import BertConfig, BertTokenizer, BertForSequenceClassification

In [4]:
RANDOM_SEED = 42
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8
pl.seed_everything(RANDOM_SEED)

Global seed set to 42


42

In [5]:
PATH = '/content/drive/MyDrive/gh/dacon_newstopic'
train      = pd.read_csv(PATH + '/DATA/train_clean_4bert.csv')
test       = pd.read_csv(PATH + '/DATA/test_clean_4bert.csv')
submission = pd.read_csv(PATH + '/DATA/sample_submission.csv')
topic_dict = pd.read_csv(PATH + '/DATA/topic_dict.csv')

In [6]:
#sns.countplot(train['topic_idx'])

In [7]:
train_df, val_df = train_test_split(train, test_size=0.22)
train_df.shape, val_df.shape

((35610, 8), (10044, 8))

In [8]:
MAX_TOKEN_COUNT = 32

In [9]:
# Dataset
class NewsTopicDataset(Dataset):
    
    def __init__(self,
            data: pd.DataFrame,
            tokenizer: BertTokenizer,
            max_token_len: int = 32
            ):
        self.tokenizer = tokenizer
        self.data = data
        self.max_token_len = max_token_len


    def __len__(self):
        return len(self.data)


    def __getitem__(self, index: int):
        data_row = self.data.iloc[index]

        encoded_text = self.tokenizer.encode_plus(
            data_row.c_title,
            add_special_tokens = True,
            max_length = self.max_token_len, 
            padding = 'max_length',
            truncation = True,
            return_attention_mask = True, 
            return_tensors = 'pt'
            )
        labels = data_row.topic_idx.tolist()

        return dict(
            input_ids = encoded_text.input_ids.flatten(),
            attention_mask = encoded_text.attention_mask.flatten(),
            token_type_ids = encoded_text.token_type_ids.flatten(),
            label = torch.tensor(labels).unsqueeze(0)
            )

In [11]:
# Module
class NewsTopicDataModule(pl.LightningDataModule):
    
    def __init__(            
        self,
        train_df: pd.DataFrame,
        test_df: pd.DataFrame,
        tokenizer: BertTokenizer,
        batch_size: int = 64,
        max_token_len: int = 32,
        ):

        super().__init__()
        self.train_df = train_df
        self.test_df = test_df
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len
        self.setup()


    def __len__(self):
        return len(self.train_df)


    def setup(self, stage = None):
        self.train_dataset = NewsTopicDataset(
            self.train_df,
            self.tokenizer,
            self.max_token_len,
            )
        
        self.test_dataset = NewsTopicDataset(
            self.test_df,
            self.tokenizer,
            self.max_token_len,
            )
    

    def train_dataloader(self):        
        return DataLoader(
            self.train_dataset,
            batch_size = self.batch_size,
            shuffle = False,
            num_workers=2
            )


    def val_dataloader(self):        
        return DataLoader(
            self.test_dataset,
            batch_size = self.batch_size,
            shuffle = False,
            num_workers=2
            )
    

    def test_dataloader(self):        
        return DataLoader(
            self.test_dataset,
            batch_size = self.batch_size,
            shuffle = False,
            num_workers=2
            )

In [12]:
# Module
class NewsTopicTagger(pl.LightningModule):

    def __init__(self, 
                 model_path = 'klue/bert-base', 
                 train_samples = 35610,
                 batch_size = 64, 
                 epochs = 10, 
                 num_labels = 7, 
                 learning_rate = 2e-5, 
                 discriminative_fine_tuning_rate = 0.85
                 ):
        
        super().__init__()
        
        self.learning_rate = learning_rate
        self.discriminative_fine_tuning_rate = discriminative_fine_tuning_rate
        self.train_samples = train_samples
        self.batch_size = batch_size
        self.gradient_accumulation_steps = 1
        self.epochs = epochs
        self.warm_up_proportion = 0.2
        self.num_train_optimization_steps = int(self.train_samples / self.batch_size / self.gradient_accumulation_steps) * epochs
        self.num_warmup_steps = int(float(self.num_train_optimization_steps) * self.warm_up_proportion)
        self.no_decay_layer_list = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        
        config = BertConfig.from_pretrained('klue/bert-base', output_hidden_states = True)
        config.num_labels = num_labels
        self.bert_model = BertForSequenceClassification.from_pretrained(model_path, config = config)
        self.optimizer_grouped_parameters = self.get_optimizer_grouped_parameters()
        self.criterion = nn.CrossEntropyLoss()


    def forward(self, input_ids, attention_mask, token_type_ids, labels = None):        
        output = self.bert_model(
            input_ids,
            attention_mask = attention_mask,
            token_type_ids = token_type_ids,
            labels = labels
            )
         
        return output.loss, output.logits

    
    def get_optimizer_grouped_parameters(self):
        
        discriminative_fine_tuning_encoders = []
        for i in range(12):
            ith_layer = list(self.bert_model.bert.encoder.layer[i].named_parameters())
            
            encoder_decay = {
                'params': [param for name, param in ith_layer if
                           not any(no_decay_layer_name in name for no_decay_layer_name in self.no_decay_layer_list)],
                'weight_decay': 0.01,
                'lr': self.learning_rate / (self.discriminative_fine_tuning_rate ** (12 - i))
                }
        
            encoder_nodecay = {
                'params': [param for name, param in ith_layer if
                           any(no_decay_layer_name in name for no_decay_layer_name in self.no_decay_layer_list)],
                'weight_decay': 0.0,
                'lr': self.learning_rate / (self.discriminative_fine_tuning_rate ** (12 - i))}
            
            discriminative_fine_tuning_encoders.append(encoder_decay)
            discriminative_fine_tuning_encoders.append(encoder_nodecay)
            
        
        embedding_layer = self.bert_model.bert.embeddings.named_parameters()
        pooler_layer = self.bert_model.bert.pooler.named_parameters()
        classifier_layer = self.bert_model.classifier.named_parameters()
        
        optimizer_grouped_parameters = [
            {'params': [param for name, param in embedding_layer if
                        not any(no_decay_layer_name in name for no_decay_layer_name in self.no_decay_layer_list)],
             'weight_decay': 0.01,
             'lr': self.learning_rate / (self.discriminative_fine_tuning_rate ** 13)},
            {'params': [param for name, param in embedding_layer if
                        any(no_decay_layer_name in name for no_decay_layer_name in self.no_decay_layer_list)],
             'weight_decay': 0.0,
             'lr': self.learning_rate / (self.discriminative_fine_tuning_rate ** 13)},
            {'params': [param for name, param in pooler_layer if
                        not any(no_decay_layer_name in name for no_decay_layer_name in self.no_decay_layer_list)],
             'weight_decay': 0.01,
             'lr': self.learning_rate},
            {'params': [param for name, param in pooler_layer if
                        any(no_decay_layer_name in name for no_decay_layer_name in self.no_decay_layer_list)],
             'weight_decay': 0.0,
             'lr': self.learning_rate},
            {'params': [param for name, param in classifier_layer if
                        not any(no_decay_layer_name in name for no_decay_layer_name in self.no_decay_layer_list)],
             'weight_decay': 0.01,
             'lr': self.learning_rate},
            {'params': [param for name, param in classifier_layer if
                        any(no_decay_layer_name in name for no_decay_layer_name in self.no_decay_layer_list)],
             'weight_decay': 0.0,
             'lr': self.learning_rate}            
            ]
        optimizer_grouped_parameters.extend(discriminative_fine_tuning_encoders)
        
        return optimizer_grouped_parameters



    def training_step(self, batch, batch_index):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        token_type_ids = batch['token_type_ids']
        label = batch['label']
        
        loss, logits = self(
            input_ids = input_ids,
            attention_mask = attention_mask,
            token_type_ids = token_type_ids,
            labels = label
            )
        
        total = label.size(0)        
        pred = torch.argmax(logits, 1).unsqueeze(1)
        correct = (pred == label).sum().item()
        acc = correct/total

        
        self.log('train_loss', loss, prog_bar = True, logger = True)
        self.log('train_acc', acc, prog_bar = True, logger = True)
        
        return loss
    
    
    def validation_step(self, batch, batch_index):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        token_type_ids = batch['token_type_ids']
        label = batch['label']

        
        loss, logits = self(
            input_ids = input_ids,
            attention_mask = attention_mask,
            token_type_ids = token_type_ids,
            labels = label
            )
        
        total = label.size(0)        
        pred = torch.argmax(logits, 1).unsqueeze(1)
        correct = (pred == label).sum().item()
        acc = correct/total

        self.log('val_acc', acc, prog_bar = True, logger = True)
        self.log('val_loss', loss, prog_bar = True, logger = True)
        
        return loss
    
    
    def test_step(self, batch, batch_index):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        token_type_ids = batch['token_type_ids']        
        label = batch['label']

        loss, logits = self(
            input_ids = input_ids,
            attention_mask = attention_mask,
            token_type_ids = token_type_ids,
            labels = label
            )
        
        total = label.size(0)        
        pred = torch.argmax(logits, 1).unsqueeze(1)
        correct = (pred == label).sum().item()
        acc = correct/total
        
        self.log('test_acc', acc, prog_bar = True, logger = True)
        self.log('test_loss', loss, prog_bar = True, logger = True)
        
        return loss
    
    
    def configure_optimizers(self):
        
        optimizer = AdamW(
            self.optimizer_grouped_parameters,
            lr = self.learning_rate,
            correct_bias = False
            )

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps = self.num_warmup_steps,
            num_training_steps = self.num_train_optimization_steps
            )
        
        return [optimizer], [{'scheduler': scheduler, 'interval': 'step'}]

In [13]:
tokenizer = BertTokenizer.from_pretrained('klue/bert-base')

EPOCHS = 10
BATCH_SIZE = 64
NUM_LABELS = 7
LEARNING_RATE = 2e-5    
DISCRIMINATIVE_FINE_TUNING_RATE = 0.85

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=248477.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=125.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=289.0, style=ProgressStyle(description_…




In [14]:
!rm -rf lightning_logs/
!rm -rf checkpoints/

In [15]:
# 
data_module = NewsTopicDataModule(
    train_df, 
    val_df, 
    tokenizer, 
    batch_size = BATCH_SIZE
    )
    
model = NewsTopicTagger(
    model_path = 'klue/bert-base', 
    train_samples = len(data_module), 
    batch_size = BATCH_SIZE, 
    epochs = EPOCHS, 
    num_labels = NUM_LABELS, 
    learning_rate = LEARNING_RATE, 
    discriminative_fine_tuning_rate = DISCRIMINATIVE_FINE_TUNING_RATE
    )

checkpoint_callback = ModelCheckpoint(
    dirpath = 'checkpoints',
    filename = 'best-checkpoint',
    save_top_k = 1,
    verbose = True,
    monitor = 'val_loss',
    mode = 'min'
    )

early_stopping = EarlyStopping(
    monitor = 'val_loss',
    patience = 3,
    mode = 'min'
    )

logger = TensorBoardLogger(
    'lightning_logs', 
    name = 'finbert_sentiment'
    )

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=428.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=445025130.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized

In [16]:
if torch.cuda.is_available():    
    device = torch.device('cuda')
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device('cpu')
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


In [17]:
trainer = pl.Trainer(
    logger = logger,
    callbacks = [checkpoint_callback, early_stopping],
    max_epochs = EPOCHS,
    gpus = 1,
    progress_bar_refresh_rate = 1
    )

trainer.fit(model, data_module)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type                          | Params
-------------------------------------------------------------
0 | bert_model | BertForSequenceClassification | 110 M 
1 | criterion  | CrossEntropyLoss              | 0     
-------------------------------------------------------------
110 M     Trainable params
0         Non-trainable params
110 M     Total params
442.491   Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 0, global step 556: val_loss reached 0.39411 (best 0.39411), saving model to "/content/checkpoints/best-checkpoint.ckpt" as top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 1, step 1113: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 2, step 1670: val_loss was not in top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 3, step 2227: val_loss was not in top 1





1

In [18]:
trained_model = NewsTopicTagger.load_from_checkpoint(
  trainer.checkpoint_callback.best_model_path,
  n_classes=7
)
trained_model.eval()
trained_model.freeze()

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized

In [19]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
trained_model = trained_model.to(device)

val_dataset = NewsTopicDataset(
  val_df,
  tokenizer,
  max_token_len=32
)

In [21]:
def get_topics(text):
    
    encoding = tokenizer.encode_plus(
        text,
        max_length = 32,
        padding = 'max_length',
        truncation = True,
        return_attention_mask = True,
        add_special_tokens = True,
        return_tensors = 'pt'
        )
    
    logit_output = trained_model(
        input_ids = encoding.input_ids.flatten().unsqueeze(0).to(device),
        attention_mask = encoding.attention_mask.flatten().unsqueeze(0).to(device),
        token_type_ids = encoding.token_type_ids.flatten().unsqueeze(0).to(device)
        )[-1]
    predicted_topic = torch.argmax(logit_output, 1)
    
    return predicted_topic

In [None]:
predictions = []
test_clean_title=test.c_title.to_list()
for i in tqdm(test_clean_title):
    predictions.append(get_topics(i).item())

In [23]:
submission.topic_idx = predictions
submission.sample(3)

Unnamed: 0,index,topic_idx
1519,47173,0
1525,47179,3
3059,48713,0


In [24]:
submission.to_csv(PATH + '/torchbert/torch_klue.csv',index = False)

In [None]:
topic_dict

Unnamed: 0,topic,topic_idx
0,IT과학,0
1,경제,1
2,사회,2
3,생활문화,3
4,세계,4
5,스포츠,5
6,정치,6
