<a href="https://colab.research.google.com/github/dotsnangles/Transformers-Text-Classification/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q folium
!pip install -q transformers
!pip install -q datasets
!pip install -q huggingface_hub

[K     |████████████████████████████████| 4.2 MB 4.2 MB/s 
[K     |████████████████████████████████| 86 kB 5.5 MB/s 
[K     |████████████████████████████████| 6.6 MB 48.9 MB/s 
[K     |████████████████████████████████| 596 kB 69.1 MB/s 
[K     |████████████████████████████████| 346 kB 4.2 MB/s 
[K     |████████████████████████████████| 86 kB 6.8 MB/s 
[K     |████████████████████████████████| 212 kB 94.1 MB/s 
[K     |████████████████████████████████| 1.1 MB 92.0 MB/s 
[K     |████████████████████████████████| 140 kB 73.3 MB/s 
[K     |████████████████████████████████| 127 kB 71.8 MB/s 
[K     |████████████████████████████████| 271 kB 87.7 MB/s 
[K     |████████████████████████████████| 144 kB 45.0 MB/s 
[K     |████████████████████████████████| 94 kB 4.0 MB/s 
[K     |████████████████████████████████| 112 kB 78.6 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the follow

In [2]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding
import datasets

from huggingface_hub import notebook_login

from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold

import pandas as pd
import numpy as np

import os
import gc

### Global Configuration

In [9]:
# Data location: CSV read with pandas at the start of fine-tuning.
data_path = '/content/train_data.csv'

# Backbones to fine-tune; each entry is passed to `from_pretrained`.
models = [
    'klue/roberta-base',
    'bert-base-multilingual-uncased',
    'xlm-roberta-base',
]

# Size of the classification head (forwarded as `num_labels` when the model is built).
num_labels = 7

# Optimisation settings shared by every backbone and fold.
batch_size = 64          # used for both the per-device train and eval batch size
num_train_epochs = 1

### Function Declaration

In [7]:
def compute_metrics(pred):
    """Score one evaluation pass; passed to ``Trainer`` as ``compute_metrics``.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and ``predictions``
            (per-class scores; the predicted class is the argmax over the last axis).

    Returns:
        dict: ``{"accuracy": ..., "f1": ...}`` where ``f1`` uses weighted averaging.
    """
    y_true = pred.label_ids
    # Highest-scoring class along the last axis is the predicted label.
    y_pred = pred.predictions.argmax(-1)

    weighted_f1 = f1_score(y_true, y_pred, average="weighted")
    accuracy = accuracy_score(y_true, y_pred)

    metrics = {"accuracy": accuracy}
    metrics["f1"] = weighted_f1
    return metrics

In [8]:
def tokenize(batch):
    """Tokenize the ``title`` field of a batch (intended for ``Dataset.map``).

    NOTE: this reads the module-level name ``tokenizer`` at call time, so the
    result depends on whichever tokenizer is currently bound to that name.

    Args:
        batch: Mapping with a ``"title"`` entry holding the texts to encode.

    Returns:
        Whatever ``tokenizer`` returns for those texts, with padding and
        truncation enabled.
    """
    titles = batch["title"]
    encode_options = {"padding": True, "truncation": True}
    return tokenizer(titles, **encode_options)

### Fine Tuning and Model Saving

In [None]:
# Fine-tune every backbone in `models` with stratified 5-fold cross-validation and
# save one (model, tokenizer) pair per backbone/fold under `saved/<name>-tc-<fold>`.

# The training table is the same for every backbone, so it is read once up front.
df = pd.read_csv(data_path, index_col=False)
# df = df.drop(['Unnamed: 0'], axis=1)
# df.reset_index(inplace=True)
X, y = df['title'], df['topic_idx']
# Fixed random_state: every backbone is trained and evaluated on identical folds.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=100)

for model_n in models:
    # Hub identifier used for loading; kept untouched for the whole backbone.
    model_ckpt = model_n
    # Filesystem-safe variant for output directories. Using a separate name (instead of
    # rewriting `model_ckpt` and patching it back) keeps any "org/name" checkpoint loadable
    # on every fold, not just 'klue/roberta-base'.
    model_dir_name = model_ckpt.replace('/', '-')

    splitgen = skf.split(X, y)

    print()
    print('----------------------------------------------------------------------------')
    print(model_ckpt)

    for foldNum, (train_index, test_index) in enumerate(splitgen, start=1):
        print()
        print('----------------------------------------------------------------------------')
        print(f'{model_ckpt} {foldNum}-Fold')

        # Model instantiation: a fresh pretrained model per fold so folds do not share weights.
        model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=num_labels)
        # `tokenizer` must stay a module-level name: `tokenize` reads it when mapping the datasets.
        tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
        data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

        # Tokenization
        # StratifiedKFold yields positional indices, so select by position (.iloc),
        # which stays correct even if the frame's index is not 0..n-1.
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        train = datasets.Dataset.from_dict({
            'title': X_train.to_list(),
            'topic_idx': y_train.to_list(),
        })
        train_encoded = train.map(tokenize, batched=True, batch_size=None)
        train_encoded = train_encoded.rename_column('topic_idx', 'label')

        test = datasets.Dataset.from_dict({
            'title': X_test.to_list(),
            'topic_idx': y_test.to_list(),
        })
        test_encoded = test.map(tokenize, batched=True, batch_size=None)
        test_encoded = test_encoded.rename_column('topic_idx', 'label')

        # Training
        # Log roughly once per epoch; never let the interval collapse to 0 on tiny folds.
        logging_steps = max(1, len(train_encoded) // batch_size)

        model_name = f"{model_dir_name}-tc-{foldNum}"

        training_args = TrainingArguments(
            output_dir='log/' + model_name,
            num_train_epochs=num_train_epochs,
            learning_rate=2e-5,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            weight_decay=0.01,
            evaluation_strategy="epoch",
            disable_tqdm=False,
            logging_steps=logging_steps,
            push_to_hub=False,
            log_level="error",
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            compute_metrics=compute_metrics,
            train_dataset=train_encoded,
            eval_dataset=test_encoded,
            tokenizer=tokenizer,
            data_collator=data_collator,
        )

        trainer.train()

        tokenizer.save_pretrained(f'saved/{model_name}')
        model.save_pretrained(f'saved/{model_name}')

        # Drop the references to this fold's model before the next one is created,
        # so two full models are never alive at the same time.
        del trainer, model
        gc.collect()


----------------------------------------------------------------------------
klue/roberta-base

----------------------------------------------------------------------------
klue/roberta-base 1-Fold


Downloading:   0%|          | 0.00/546 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/422M [00:00<?, ?B/s]

Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'clas

Downloading:   0%|          | 0.00/375 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/243k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/734k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/173 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4782,0.334157,0.889935,0.888969



----------------------------------------------------------------------------
klue/roberta-base 2-Fold


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4756,0.331485,0.889278,0.888431



----------------------------------------------------------------------------
klue/roberta-base 3-Fold


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4762,0.33635,0.889826,0.88917



----------------------------------------------------------------------------
klue/roberta-base 4-Fold


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4775,0.345541,0.88446,0.883259



----------------------------------------------------------------------------
klue/roberta-base 5-Fold


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4761,0.33362,0.893319,0.89238



----------------------------------------------------------------------------
bert-base-multilingual-uncased

----------------------------------------------------------------------------
bert-base-multilingual-uncased 1-Fold


Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/641M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/851k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.64M [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7162,0.465716,0.845581,0.844935



----------------------------------------------------------------------------
bert-base-multilingual-uncased 2-Fold


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7223,0.463952,0.844924,0.843908



----------------------------------------------------------------------------
bert-base-multilingual-uncased 3-Fold


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7166,0.47713,0.844595,0.84409



----------------------------------------------------------------------------
bert-base-multilingual-uncased 4-Fold


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6891,0.463732,0.846348,0.845252


### Logit Ensemble

In [None]:
# import gc
# gc.collect()

In [None]:
# Load the test CSV (the 'csv' loader exposes it as the 'train' split) and drop
# the 'index' column so that only the remaining columns are kept.
data_test = (
    datasets
    .load_dataset('csv', data_files='/content/test_data.csv')['train']
    .remove_columns('index')
)

# Encode the titles, then drop the raw text column.
# NOTE: `tokenize` reads the module-level `tokenizer`, so this encoding reflects
# whichever tokenizer is bound to that name when the cell runs.
encoded_test = (
    data_test
    .map(tokenize, batched=True, batch_size=None)
    .remove_columns('title')
)

In [None]:
# Names of the fine-tuned checkpoint directories under `saved/`.
# - sorted: os.listdir order is arbitrary, sorting makes the ensemble order reproducible
# - directories only, no dot-entries: skips stray files and folders such as
#   `.ipynb_checkpoints` that cannot be loaded as a model
saved_models = sorted(
    entry
    for entry in os.listdir('saved')
    if not entry.startswith('.') and os.path.isdir(os.path.join('saved', entry))
)

In [None]:
# Logit ensemble: run every saved checkpoint over the test set and sum the logits.
# (Summing instead of averaging does not change the final argmax.)
if not saved_models:
    # Fail here with a clear message instead of leaving `final_logits` as the int 0.
    raise RuntimeError("No fine-tuned checkpoints found in 'saved/'; run the fine-tuning cell first.")

final_logits = 0
for saved_model in saved_models:
    # `saved_models` holds bare directory names (from os.listdir('saved')), so the
    # actual checkpoint location has to be rebuilt before loading.
    model_path = os.path.join('saved', saved_model)

    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    # `tokenizer` must be the module-level name: `tokenize` reads it below.
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Each backbone has its own vocabulary, so the test set is re-encoded with the
    # tokenizer that was saved next to this checkpoint rather than reusing one encoding.
    encoded_test = data_test.map(tokenize, batched=True, batch_size=None)
    encoded_test = encoded_test.remove_columns('title')

    # Prediction only: no training/evaluation datasets or schedules are needed, which
    # also removes the dependency on variables left over from the fine-tuning cell.
    training_args = TrainingArguments(
        output_dir='test_log/' + saved_model,
        per_device_eval_batch_size=batch_size,
        disable_tqdm=False,
        push_to_hub=False,
        log_level="error",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    preds_output = trainer.predict(encoded_test)

    final_logits += preds_output.predictions

In [None]:
# Fill the submission template with the ensemble's predicted class per row
# (argmax over axis 1 of the accumulated logits) and write it out without the index.
sub = pd.read_csv("./sample_submission.csv").assign(
    topic_idx=final_logits.argmax(1)
)
sub.to_csv('./final_submission.csv', index=False)

### Ensemble Example

In [None]:
def ensemble():
    """Sum the fold-averaged logits of seven 5-fold model groups.

    For each group the function sets ``args.pt`` to the backbone name, calls
    ``run_predict`` on the five fold checkpoints in order, averages the second
    element of each returned pair, and adds that average to the running total.
    The last group additionally sets ``args.max_len`` to 28 before predicting.

    ``args`` and ``run_predict`` are not defined in this notebook; they must be
    provided by the surrounding environment.

    Returns:
        The sum over groups of the mean of the five per-fold logits.
    """
    # (backbone, max_len override or None, five fold checkpoints in call order)
    # An earlier variant also set args.max_len=33 up front; that is left disabled.
    groups = (
        ('monologg/kobert', None, (
            "./saved_models/fold3/kobert/0f_9e_0.8895_s.pth",
            "./saved_models/fold3/kobert/1f_10e_0.8823_s.pth",
            "./saved_models/fold3/kobert/2f_8e_0.8888_s.pth",
            "./saved_models/fold3/kobert/3f_10e_0.8897_s.pth",
            "./saved_models/fold3/kobert/4f_8e_0.8867_s.pth",
        )),
        ('klue/roberta-base', None, (
            "./saved_models/fold3/roberta-base/0f_5e_0.8920_s.pth",
            "./saved_models/fold3/roberta-base/1f_4e_0.8879_s.pth",
            "./saved_models/fold3/roberta-base/2f_5e_0.8889_s.pth",
            "./saved_models/fold3/roberta-base/3f_4e_0.8951_s.pth",
            "./saved_models/fold3/roberta-base/4f_4e_0.8887_s.pth",
        )),
        ('klue/roberta-small', None, (
            "./saved_models/fold3/roberta-small/0f_8e_0.8900_s.pth",
            "./saved_models/fold3/roberta-small/1f_9e_0.8813_s.pth",
            "./saved_models/fold3/roberta-small/2f_7e_0.8884_s.pth",
            "./saved_models/fold3/roberta-small/3f_3e_0.8958_s.pth",
            # Original note: "8884 possible" - presumably a 0.8884 checkpoint could be used instead.
            "./saved_models/fold3/roberta-small/4f_4e_0.8881_s.pth",
        )),
        ('bert-base-multilingual-uncased', None, (
            "./saved_models/fold3/bert-base-multilingual-uncased/0f_5e_0.8624_s.pth",
            "./saved_models/fold3/bert-base-multilingual-uncased/1f_8e_0.8573_s.pth",
            "./saved_models/fold3/bert-base-multilingual-uncased/2f_9e_0.8674_s.pth",
            "./saved_models/fold3/bert-base-multilingual-uncased/3f_8e_0.8649_s.pth",
            "./saved_models/fold3/bert-base-multilingual-uncased/4f_9e_0.8673_s.pth",
        )),
        ('klue/roberta-large', None, (
            "./saved_models/fold3/klue-roberta-large/0f_2e_0.8905_s.pth",
            "./saved_models/fold3/klue-roberta-large/1f_4e_0.8897_s.pth",
            "./saved_models/fold3/klue-roberta-large/2f_3e_0.8887_s.pth",
            "./saved_models/fold3/klue-roberta-large/3f_3e_0.8949_s.pth",
            "./saved_models/fold3/klue-roberta-large/4f_2e_0.8939_s.pth",
        )),
        ('xlm-roberta-large', None, (
            "./saved_models/fold3/xlm-roberta-large_radam/0f_6e_0.8928_s.pth",
            "./saved_models/fold3/xlm-roberta-large_radam/1f_5e_0.8850_s.pth",
            "./saved_models/fold3/xlm-roberta-large_radam/2f_5e_0.8891_s.pth",
            "./saved_models/fold3/xlm-roberta-large_radam/3f_8e_0.8938_s.pth",
            "./saved_models/fold3/xlm-roberta-large_radam/4f_6e_0.8911_s.pth",
        )),
        ('klue/roberta-large', 28, (
            "./saved_models/fold3/klue-roberta-large_28/0f_6e_0.8912_s.pth",
            "./saved_models/fold3//klue-roberta-large_28/1f_3e_0.8891_s.pth",
            "./saved_models/fold3//klue-roberta-large_28/2f_5e_0.8891_s.pth",
            "./saved_models/fold3//klue-roberta-large_28/3f_4e_0.8961_s.pth",
            "./saved_models/fold3//klue-roberta-large_28/4f_2e_0.8938_s.pth",
        )),
    )

    total = 0
    for backbone, max_len, checkpoints in groups:
        # `args` is shared state read by run_predict, so it is updated before predicting.
        if max_len is not None:
            args.max_len = max_len
        args.pt = backbone

        # Only the second element of each (preds, logits) pair is used.
        fold_a, fold_b, fold_c, fold_d, fold_e = [
            run_predict(checkpoint)[1] for checkpoint in checkpoints
        ]
        total += (fold_a + fold_b + fold_c + fold_d + fold_e) / 5

    return total

### Redundant Cells

In [3]:
# notebook_login()

### Download Models & Tokenizers

In [4]:
# models = ['klue/roberta-base', 'xlm-roberta-base', 'bert-base-multilingual-uncased']
# for adrs in models:
#     model = AutoModelForSequenceClassification.from_pretrained(adrs, num_labels=7)
#     tokenizer = AutoTokenizer.from_pretrained(adrs)

#     adrs.replace('/', '-')

#     tokenizer.save_pretrained(adrs)
#     model.save_pretrained(adrs)

### Data Load & K-Fold Iterator Instantiation

In [5]:
# Download a file with gdown using the given ID (`-q` silences progress output).
# NOTE(review): which file this fetches is not visible here - presumably the dataset CSV
# read by the fine-tuning cell; confirm, and run this before that cell on a fresh kernel.
!gdown -q 1K0v7liLb4ls5NMZrvrsEFQCfGqGduBSN

In [6]:
# df = pd.read_csv('/content/augmented_dataset.csv', index_col=False)
# df = df.drop(['Unnamed: 0'], axis=1)
# df.reset_index(inplace=True)
# X, y = df['title'], df['topic_idx']
# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=100)
# splitgen = skf.split(X, y)