### 1. 讀取專用的 Tokenizer (BERTweet有提供專用的Normalization方式)
- Model 1: bertweet-base (BERTweet)
- Model 2: bertweet-covid19-base-uncased (BERTweet)
- Model 3: twitter-xlm-roberta-base (XLM-T)
- Model 4: twitter-roberta-base (TWEETEVAL)

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer that belongs to the checkpoint being fine-tuned.
# Enable exactly one of the options below. The two vinai (BERTweet) options
# pass normalization=True so the tokenizer's built-in tweet normalization is
# used; the cardiffnlp options are loaded without that flag.
tokenizer = AutoTokenizer.from_pretrained(
    "vinai/bertweet-base",
    normalization=True,
)
#tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-covid19-base-uncased", normalization=True)
#tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-xlm-roberta-base")
#tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base")

#### 1-2: 如果使用的是 vinai 的模型，它已經將 normalization (tweet preprocessing) 整合在 tokenizer 之中，因此這裡只需要直接讀取資料

In [None]:
import pandas as pd

# Training data: keep only the tweet text and its target, with the target
# column exposed under the name 'label'.
df = (
    pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
    .rename(columns={'target': 'label'})
    .loc[:, ['text', 'label']]
)

# Test data: only the tweet text is kept. Remember that the test set must go
# through exactly the same preprocessing as the training set.
df_test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv').loc[:, ['text']]

#### 1-3: 如果是使用 cardiffnlp 的模型，就要對 Tweets preprocessing

In [None]:
# Optional tweet preprocessing, intended for the cardiffnlp checkpoints.
# Splitting on single spaces, every token that starts with '@' (and is longer
# than one character) becomes '@user', and every token that starts with
# 'http' becomes 'http'. Uncomment the whole cell to apply it to both the
# training and the test text.
# def preprocess(text):
#     new_text = []
#     for t in text.split(" "):
#         t = '@user' if t.startswith('@') and len(t) > 1 else t
#         t = 'http' if t.startswith('http') else t
#         new_text.append(t)
#     return " ".join(new_text)
# df['text'] = df['text'].apply(lambda x: preprocess(x))
# df_test['text'] = df_test['text'].apply(lambda x: preprocess(x))

#### 1-4: 將處理完的資料存到CSV file

In [None]:
# Shuffle the training rows before writing them out: the train/validation
# split made later is positional (first 90% / last 10% of the CSV), so the
# row order decides which examples land in each part.
# A fixed random_state keeps that split identical from run to run; without it
# every execution would train and validate on different rows.
df = df.sample(frac=1, random_state=42)

# Persist both frames so they can be reloaded from CSV in the next step.
df.to_csv('train_processed.csv')
df_test.to_csv('test_processed.csv')

### 2: 使用 Huggingface 的 datasets module來讀取資料

In [None]:
from datasets import load_dataset
# Read the shuffled training CSV twice with different slices of its 'train'
# split: the first 90% of rows for training, the last 10% for validation.
train_dataset, val_dataset = (
    load_dataset('csv', data_files='train_processed.csv', split=part)
    for part in ('train[:90%]', 'train[90%:]')
)
# The test CSV is read in full, also through the 'train' split name.
test_dataset = load_dataset('csv', data_files='test_processed.csv', split='train')
def tokenize_function(examples):
    """Tokenize a batch of tweets into fixed-length model inputs.

    Args:
        examples: A batch passed in by ``Dataset.map(..., batched=True)``;
            only its ``"text"`` entry is read.

    Returns:
        The output of the module-level ``tokenizer`` for those texts, with
        every sequence padded or truncated to 100 tokens.
    """
    # truncation=True is what enforces the upper bound. padding="max_length"
    # on its own only pads short inputs and lets longer tweets through at
    # full length, producing rows of unequal length.
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=100,
    )
# Apply the same tokenization, in batches, to the train, validation and test
# datasets so that all three reach the model in an identical format.
tokenized_datasets_train, tokenized_datasets_val, tokenized_datasets_test = (
    dataset.map(tokenize_function, batched=True)
    for dataset in (train_dataset, val_dataset, test_dataset)
)

### 3: 使用 Huggingface 來讀取想要用的Model

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained network with a 2-class sequence-classification head.
# The checkpoint must be the same one the tokenizer was loaded from, because
# token ids index into that checkpoint's vocabulary. The tokenizer cell uses
# "vinai/bertweet-base", so the model is loaded from "vinai/bertweet-base" too
# (it was previously "vinai/bertweet-large", which does not match).
# When switching to another option, change the tokenizer cell accordingly.
model1 = AutoModelForSequenceClassification.from_pretrained("vinai/bertweet-base", num_labels=2)
# model2 = AutoModelForSequenceClassification.from_pretrained("vinai/bertweet-covid19-base-uncased", num_labels=2)
# model3 = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-xlm-roberta-base", num_labels=2)
# model4 = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base", num_labels=2)

### 4.定義訓練過程的參數
- Batch_size = 32
- gradient_accumulation_steps = 8  
代表著累積 32*8 = 256 筆資料的梯度之後，才會更新一次模型參數 (optimizer step)；每個 batch 仍然會各自做一次 back propagation

In [None]:
from transformers import TrainingArguments

# Every setting for the fine-tuning run, grouped by concern. The values are
# the same as before; only the ordering and layout differ.
training_args = TrainingArguments(
    # --- output and checkpointing ---
    output_dir="training_res",
    overwrite_output_dir=False,
    resume_from_checkpoint=None,
    save_steps=100,
    save_total_limit=None,
    save_on_each_node=False,
    load_best_model_at_end=False,
    greater_is_better=None,
    ignore_data_skip=False,
    # --- run length and batching ---
    # 32 examples per device x 8 accumulation steps = 256 examples per
    # optimizer update.
    num_train_epochs=10.0,
    max_steps=-1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=8,
    eval_accumulation_steps=None,
    group_by_length=False,
    # --- optimizer, schedule and loss ---
    learning_rate=5e-05,
    weight_decay=0.0,
    adafactor=False,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-08,
    max_grad_norm=1.0,
    warmup_ratio=0.0,
    warmup_steps=0,
    label_smoothing_factor=0.0,
    # --- evaluation and prediction ---
    evaluation_strategy="epoch",
    do_train=False,
    do_predict=False,
    prediction_loss_only=False,
    use_legacy_prediction_loop=False,
    label_names=None,
    remove_unused_columns=True,
    past_index=-1,
    # --- hardware and distributed setup ---
    no_cuda=False,
    local_rank=-1,
    sharded_ddp=[],
    xpu_backend=None,
    tpu_num_cores=None,
    tpu_metrics_debug=False,
    skip_memory_metrics=True,
    # --- reproducibility ---
    seed=42,
)

### 5. 定義 Model 訓練的參數、資料，以及Metric的計算方式

In [None]:
from transformers import Trainer
import numpy as np
from datasets import load_metric

# Accuracy scorer used by compute_metrics below.
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level accuracy metric.

    ``eval_pred`` is unpacked into ``(logits, labels)``. The predicted class
    for each row is the argmax over the last axis of the logits, and the
    return value is whatever ``metric.compute`` yields for those predictions
    against the labels.
    """
    scores, gold_labels = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=gold_labels)

# Assemble the fine-tuning run: the model to train, the training settings,
# the tokenized train/validation datasets, and the accuracy callback.
# The model-loading cell binds the network to `model1`; the name `model` is
# not assigned until the checkpoint-reloading cell further down, so passing
# `model` here raised a NameError.
trainer = Trainer(
    model=model1,
    args=training_args,
    train_dataset=tokenized_datasets_train,
    eval_dataset=tokenized_datasets_val,
    compute_metrics=compute_metrics,
)

### 6. 開始訓練

In [None]:
# Run the fine-tuning loop configured on `trainer`. The call is left as the
# final expression of the cell so the notebook displays its return value.
trainer.train()

### 7. 讀取最好的模型，並預測結果

In [None]:
from transformers import AutoModelForSequenceClassification

# Reload a fine-tuned checkpoint from the training output directory.
# "path-to-best-model" is a placeholder: replace it with the sub-directory of
# ./training_res that holds the checkpoint you want to predict with.
best_checkpoint_dir = "./training_res/path-to-best-model"
model = AutoModelForSequenceClassification.from_pretrained(best_checkpoint_dir, num_labels=2)


In [None]:
# Predict with the checkpoint that was reloaded into `model`.
# The existing `trainer` still wraps the network it was constructed with, so
# calling trainer.predict() would ignore the reloaded weights entirely.
# A dedicated Trainer is built around `model` instead, reusing the same
# arguments. The validation set and metric callback are passed along because
# training_args enables per-epoch evaluation.
# NOTE(review): some transformers releases appear to reject that setting when
# no eval_dataset is given - confirm for the installed version.
best_trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=tokenized_datasets_val,
    compute_metrics=compute_metrics,
)
predictions = best_trainer.predict(tokenized_datasets_test)

In [None]:
import numpy as np
# Turn the raw prediction scores into class ids: for every test row take the
# index of the highest score along the last axis.
raw_scores = predictions.predictions
preds = np.argmax(raw_scores, axis=-1)


In [None]:
import pandas as pd 
# Use the competition's sample submission as the template for the output.
# It is read from the same Kaggle input directory as train.csv and test.csv;
# the previous relative path './sample_submission.csv' pointed at the working
# directory, where this notebook never places that file.
df_sub = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')

In [None]:
# Put the predicted class ids into the submission template's 'target' column.
df_sub = df_sub.assign(target=preds)

In [None]:
# Write the submission file without the DataFrame index column.
submission_path = '/kaggle/working/bertweet-large-submission.csv'
df_sub.to_csv(submission_path, index=False)

In [None]:
from IPython.display import FileLink
# Show a download link for the submission file written by the previous cell.
# The link previously pointed at 'bertweet_submission_epoch2.csv', a file this
# notebook never creates.
FileLink(r'/kaggle/working/bertweet-large-submission.csv')