# Resource Preparation

### Drive Mount

In [None]:
# Mount Google Drive at /content/drive. The training output directory used
# later in this notebook lives under /content/drive/MyDrive, so the mount must
# succeed before training. force_remount=True requests a fresh mount even if
# one is already present.
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


### Setup Library

In [None]:
!pip -q install thai2transformers==0.1.1
!pip install torch==1.7.0

[K     |████████████████████████████████| 1.3 MB 52.7 MB/s 
[K     |████████████████████████████████| 1.1 MB 57.8 MB/s 
[K     |████████████████████████████████| 175 kB 74.2 MB/s 
[K     |████████████████████████████████| 11.5 MB 52.0 MB/s 
[K     |████████████████████████████████| 362 kB 76.3 MB/s 
[K     |████████████████████████████████| 8.7 MB 69.1 MB/s 
[K     |████████████████████████████████| 43 kB 2.2 MB/s 
[K     |████████████████████████████████| 585 kB 18.3 MB/s 
[K     |████████████████████████████████| 11.1 MB 44.3 MB/s 
[K     |████████████████████████████████| 473 kB 51.7 MB/s 
[K     |████████████████████████████████| 2.9 MB 40.8 MB/s 
[K     |████████████████████████████████| 87 kB 7.7 MB/s 
[K     |████████████████████████████████| 965 kB 61.0 MB/s 
[K     |████████████████████████████████| 880 kB 57.4 MB/s 
[K     |████████████████████████████████| 212 kB 73.5 MB/s 
[K     |████████████████████████████████| 86 kB 5.7 MB/s 
[K     |███████████████████

In [None]:
import pandas as pd
import numpy as np

from datasets.dataset_dict import DatasetDict
from datasets import Dataset, load_dataset, load_metric

import torch


from transformers import (AutoTokenizer,
                          AutoModelForSequenceClassification,
                          TrainingArguments,
                          Trainer)


from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from thai2transformers.tokenizers import (
    ThaiRobertaTokenizer,
    ThaiWordsNewmmTokenizer,
    ThaiWordsSyllableTokenizer,
    FakeSefrCutTokenizer
)
from thai2transformers.preprocess import process_transformers
from thai2transformers.metrics import classification_metrics


import gdown
from google.colab import drive

### Download dataset

In [None]:
driveURL = "https://drive.google.com/file/d/1ShyXkcRa-Yl2xDtdUqebBmwBEC_AhqH1/view?usp=sharing"
downloadURL = 'https://drive.google.com/uc?export=download&id='+driveURL.split('/')[-2]
gdown.download(downloadURL)
!unzip TNC_DataSet_5000_V1.zip

Downloading...
From: https://drive.google.com/uc?export=download&id=1ShyXkcRa-Yl2xDtdUqebBmwBEC_AhqH1
To: /content/TNC_DataSet_5000_V1.zip
100%|██████████| 46.9M/46.9M [00:00<00:00, 129MB/s]


Archive:  TNC_DataSet_5000_V1.zip
  inflating: dataset_info.txt        
  inflating: TNC_AllDomain_5000.csv  
  inflating: TNC_TestSet_5000.csv    
  inflating: TNC_TrainSet_5000.csv   
  inflating: TNC_ValidationSet_5000.csv  


# Data Preprocessing

In [None]:
# Integer class id -> domain name. The ids are simply the positions in this
# list, starting at 0.
# NOTE(review): 'Social Science ' carries a trailing space; presumably this
# matches the value stored in the CSV 'Category' column - confirm before
# "fixing" it.
DOMAIN_DICT = dict(enumerate([
    'Imaginative',
    'Natural & Pure Science',
    'Applied Science',
    'Social Science ',
    'History',
    'Commerce & Finance',
    'Arts',
    'Belief & Thought',
]))

# Reverse lookup: domain name -> integer class id.
INVERSED_DOMAIN_DICT = {name: class_id for class_id, name in DOMAIN_DICT.items()}


def getLabel(category):
    """Return the integer class id for a domain name.

    Raises KeyError when `category` is not one of the names in DOMAIN_DICT.
    """
    return INVERSED_DOMAIN_DICT[category]

### Convert DataFrame to Dataset

In [None]:
# Read the three pre-split CSV files extracted from the archive.
train_df = pd.read_csv("TNC_TrainSet_5000.csv")
valid_df = pd.read_csv("TNC_ValidationSet_5000.csv")
test_df = pd.read_csv("TNC_TestSet_5000.csv")

# Add an integer 'Labels' column derived from the textual 'Category' column.
for split_df in (train_df, valid_df, test_df):
    split_df['Labels'] = [getLabel(domain_name) for domain_name in split_df['Category']]


def _as_dataset_columns(frame):
    """Pick the text, category and label columns of one split under lower-case keys."""
    return {'texts': frame['Texts'],
            'category': frame['Category'],
            'labels': frame['Labels']}


trainDict = _as_dataset_columns(train_df)
validDict = _as_dataset_columns(valid_df)
testDict = _as_dataset_columns(test_df)

# One `Dataset` per split, gathered into a single DatasetDict.
DATA = {
    'train': Dataset.from_dict(trainDict),
    'validation': Dataset.from_dict(validDict),
    'test': Dataset.from_dict(testDict),
}

dataset = DatasetDict(DATA)
dataset

DatasetDict({
    train: Dataset({
        features: ['texts', 'category', 'labels'],
        num_rows: 25200
    })
    validation: Dataset({
        features: ['texts', 'category', 'labels'],
        num_rows: 5400
    })
    test: Dataset({
        features: ['texts', 'category', 'labels'],
        num_rows: 5400
    })
})

In [None]:
# Number of distinct integer labels present in the training split; used to
# size the classification head when the model is loaded.
train_label_ids = set(dataset['train']['labels'])
num_labels = len(train_label_ids)
num_labels

8

### Data Cleaning

In [None]:
def clean_function(examples):
    """Run thai2transformers' `process_transformers` over the 'texts' field.

    The field is overwritten in place and the same mapping is returned, which
    is the shape `Dataset.map` expects from its callback.
    """
    raw_text = examples['texts']
    examples['texts'] = process_transformers(raw_text)
    return examples


# Map over every split; the untouched `dataset` is kept alongside the result.
cleaned_dataset = dataset.map(clean_function)



  0%|          | 0/25200 [00:00<?, ?ex/s]

  0%|          | 0/5400 [00:00<?, ?ex/s]

  0%|          | 0/5400 [00:00<?, ?ex/s]

In [None]:
# Sample of cleaned texts: ten rows from the shuffled training split.
preview_rows = cleaned_dataset['train'].shuffle()[:10]
pd.DataFrame(preview_rows)[['labels', 'texts']]

Unnamed: 0,labels,texts
0,1,แอนนี<_>จัมป์<_>แคนนอน<_>annie<_>jump<_>cannon...
1,4,ด้วยเหตุผลดังกล่าวแล้วประกอบการงดส่งเงินช่วยเห...
2,7,สังคมและกาลเวลา<_>ดังนั้นจริยธรรมใดก็ตามแต่ที่...
3,4,ของเสียประเภทแกร์ไฟต์ธรรมชาติ<_>ของเสียประเภท<...
4,6,การสะบัดสีสองสามครั้งโดยไม่มีจุดหมายจะไม่สามรถ...
5,7,ต้นไม้นั้นจึงเป็นต้นข้าวเสมอ<_>ไม่เป็นต้นไม้ชน...
6,5,ที่สําคัญคือ<_>ในปีที่ว่างเว้นยังสามารถจัดประช...
7,3,คุณต้องทราบถึงความต้องการที่ผู้สูงอายุมี<_>คุณ...
8,6,แน่นอนที่สุดว่า<_>บริษัทกําลังมองหาคนที่จะไปปร...
9,4,รับทราบว่า<_>รัฐควรประกันว่าผู้ก่อกําเนิดควรจะ...


### Set up model arguments

In [None]:
class Args:
    """Fine-tuning configuration, held as plain class attributes."""

    # --- model and data columns ---
    model_name = 'airesearch/wangchanberta-base-att-spm-uncased'
    # Column handed to the tokenizer in the encoding step.
    feature_col = 'texts'
    # NOTE(review): not read by any cell visible in this notebook, and
    # 'category' holds the textual domain name rather than the integer
    # 'labels' column - confirm whether this should be 'labels'.
    label_col = 'category'

    # --- checkpoint location (on the mounted Google Drive) ---
    # NOTE(review): the directory name ends in '_wongnai' although the data
    # here is TNC; looks like a leftover from another experiment - confirm.
    output_dir = '/content/drive/MyDrive/Programs/AIB/TNC_WangChan_5000/models_TNC/wangchanberta-base-att-spm-uncased_wongnai'

    # --- optimisation ---
    num_train_epochs = 5
    # Used for both the per-device train and eval batch sizes.
    batch_size = 4
    learning_rate = 3e-5
    # Fraction of the total training steps spent on warm-up.
    warmup_percent = 0.1
    weight_decay = 0.01

    # --- model selection and reproducibility ---
    metric_for_best_model = 'f1_micro'
    seed = 1412


args = Args()

In [None]:
# Display the checkpoint name that the tokenizer and model cells will load.
args.model_name

'airesearch/wangchanberta-base-att-spm-uncased'

# Training with WangchanBERTa

### Encode Dataset

In [None]:
# Create the tokenizer for the configured checkpoint; together with
# truncation=True below, model_max_length caps each encoded text at 416 tokens.
tokenizer = AutoTokenizer.from_pretrained(args.model_name, model_max_length=416)


def encode_function(examples):
    """Tokenize a batch of texts from the configured feature column.

    Returns the tokenizer output (input ids and attention mask), truncated to
    the tokenizer's maximum length.
    """
    return tokenizer(examples[args.feature_col], truncation=True)


# Encode the *cleaned* splits. Mapping over the raw `dataset` here would
# silently discard the `process_transformers` cleaning step performed earlier,
# so the model would be trained and evaluated on unprocessed text.
encoded_dataset = cleaned_dataset.map(encode_function, batched=True)

Downloading:   0%|          | 0.00/546 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/905k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/282 [00:00<?, ?B/s]

  0%|          | 0/26 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

In [None]:
encoded_dataset

DatasetDict({
    train: Dataset({
        features: ['texts', 'category', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 25200
    })
    validation: Dataset({
        features: ['texts', 'category', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 5400
    })
    test: Dataset({
        features: ['texts', 'category', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 5400
    })
})

### Load model 

In [None]:
# Load the pretrained checkpoint with a sequence-classification head sized
# for `num_labels` classes.
model = AutoModelForSequenceClassification.from_pretrained(
    args.model_name,
    num_labels=num_labels,
)

Downloading:   0%|          | 0.00/423M [00:00<?, ?B/s]

Some weights of the model checkpoint at airesearch/wangchanberta-base-att-spm-uncased were not used when initializing CamembertForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at airesearch/wa

In [None]:
# Inspect the metrics callback imported from thai2transformers; it is passed
# to the Trainer as `compute_metrics`.
classification_metrics

<function thai2transformers.metrics.classification_metrics>

### Set up training arguments

In [None]:
# Warm-up length: total optimisation steps (examples * epochs // batch size)
# scaled by the configured warm-up fraction and truncated to an integer.
total_train_steps = len(encoded_dataset['train']) * args.num_train_epochs // args.batch_size
warmup_step_count = int(total_train_steps * args.warmup_percent)

train_args = TrainingArguments(
    output_dir=args.output_dir,
    seed=args.seed,
    # schedule
    num_train_epochs=args.num_train_epochs,
    per_device_train_batch_size=args.batch_size,
    per_device_eval_batch_size=args.batch_size,
    # optimiser
    learning_rate=args.learning_rate,
    weight_decay=args.weight_decay,
    warmup_steps=warmup_step_count,
    # evaluate once per epoch and reload the best checkpoint, judged by
    # `metric_for_best_model`, when training finishes
    evaluation_strategy="epoch",
    metric_for_best_model=args.metric_for_best_model,
    load_best_model_at_end=True,
    save_total_limit=1,
)

In [None]:
# Wire the model, training configuration, encoded splits and metrics callback
# together. The tokenizer is passed as well so the Trainer has access to it.
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['validation'],
    tokenizer=tokenizer,
    compute_metrics=classification_metrics,
)

In [None]:
# preds  = trainer.predict(encoded_dataset['validation'])
# pd.DataFrame.from_dict(preds[2],orient='index').transpose()

### Start Training

In [None]:
# Run fine-tuning. With evaluation_strategy="epoch" the validation split is
# scored after every epoch, producing the per-epoch metrics table below.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,Precision Micro,Recall Micro,F1 Macro,Precision Macro,Recall Macro,Nb Samples
1,0.229133,1.471356,0.767593,0.767593,0.767593,0.767593,0.764511,0.78935,0.767593,5400
2,0.105134,1.531898,0.797778,0.797778,0.797778,0.797778,0.801052,0.827829,0.797778,5400
3,0.055519,1.689828,0.790185,0.790185,0.790185,0.790185,0.788701,0.828946,0.790185,5400
4,0.018454,1.615496,0.814444,0.814444,0.814444,0.814444,0.816131,0.840034,0.814444,5400
5,1e-06,1.705567,0.81963,0.81963,0.81963,0.81963,0.821112,0.852043,0.81963,5400


TrainOutput(global_step=31500, training_loss=0.1541033916170635)