## Abusive Tweet Identification using BERT

In [1]:
# Install required packages
!pip install transformers
!pip install datasets
!pip install huggingface_hub

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m78.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m103.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.0
Looking in indexes: https://pypi.org/simple, https://u

### Connect to Hugging Face

In [2]:
# Reference: https://huggingface.co/docs/transformers/model_sharing
# Log in to the Hugging Face Hub. This needs an access token generated from your
# Hugging Face account; the login is required because training is configured
# with push_to_hub=True further down.

from huggingface_hub import notebook_login
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
import pandas as pd
import numpy as np
from datasets import Dataset

In [4]:
# Load dataset
# Point DATA_PATH at your copy of the spreadsheet before running this cell.
DATA_PATH = '/content/sample_data/abusive_tweet_data.xlsx'
df = pd.read_excel(DATA_PATH)

# Fail fast if the file lacks the columns the rest of the notebook relies on:
# 'cleanText' is the tokenizer input and 'category' holds the class label.
missing_columns = {'cleanText', 'category'} - set(df.columns)
assert not missing_columns, f"dataset is missing expected columns: {sorted(missing_columns)}"

In [5]:
# Preview the first rows of the loaded data
df.head()

Unnamed: 0,tweet,cleanText,category
0,Me: debat dengan provide data dan hitungan-hit...,me debat provide data hitungan hitungan bukti ...,Konten_kasar
1,ASE: lo ngapain masuk ngantor? kenapa gak WFH ...,ase masuk ngantor wfh aja packing bawain pesen...,Konten_kasar
2,@P3nj3l4j4h @natadiningrat99 Lala lama eneg ng...,lala eneg ngeliat pasangan capres berilmu cm m...,Konten_kasar
3,@cingu24 Bacot emg tuh satu.,bacot satu,Konten_kasar
4,baru tgl 2 weh udah ada anjing bacot aja,baru udah anjing bacot aja,Konten_kasar


In [6]:
# Total number of rows (tweets) in the dataset
len(df)

5462

In [7]:
# Count how many tweets fall into each category (a quick class-balance check)
df['category'].value_counts()

Konten_kasar          2731
Bukan_konten_kasar    2731
Name: category, dtype: int64

In [8]:
# Build the label vocabulary from the distinct category names.
tags = np.unique(df['category'])
num_tags = len(tags)  # number of classes; this dataset has 2
# Lookup tables in both directions: integer id -> name and name -> integer id.
id2label = dict(enumerate(tags))
label2id = {name: idx for idx, name in id2label.items()}

### 1. Split dataset into train and test

In [9]:
# Define the training and test datasets:
# draw a reproducible 80% random sample of the rows for training and keep
# every row that was not drawn as the test set.

train_data = df.sample(frac=0.8, random_state=42)
test_data = df[~df.index.isin(train_data.index)]

### 2. Build dataset dictionary

In [10]:
# Convert the pandas DataFrames into Hugging Face `Dataset` objects, the format
# used for tokenization and training below, via Dataset.from_pandas().
# Despite the X_ prefix, these datasets still carry the 'category' label column.
# The DataFrame index is kept as an extra '__index_level_0__' feature
# (visible when the datasets are printed).

X_train = Dataset.from_pandas(train_data)
X_test = Dataset.from_pandas(test_data)

In [12]:
# Show the feature names and row counts of the train and test datasets
print(X_train)
print(X_test)

Dataset({
    features: ['tweet', 'cleanText', 'category', '__index_level_0__'],
    num_rows: 4370
})
Dataset({
    features: ['tweet', 'cleanText', 'category', '__index_level_0__'],
    num_rows: 1092
})


In [15]:
# Look at a single training example (row 10)
X_train[10]

{'tweet': 'RT @tempoekbis: Sri Mulyani Menjadi Menteri Terpopuler di Tahun 2018 https://t.co/KVFgeEgkw1',
 'cleanText': 'sri mulyani menjadi menteri terpopuler tahun',
 'category': 'Bukan_konten_kasar',
 '__index_level_0__': 3586}

### 3. Fine-tuning

In [16]:
# Data tokenization

from transformers import AutoTokenizer

# Sequence length used for truncation and padding in tokenize_function
max_length = 128

# Define the tokenizer.
# We use the tokenizer that belongs to the pre-trained model we fine-tune:
# https://huggingface.co/indolem/indobertweet-base-uncased
# NOTE(review): max_length is also passed to from_pretrained() here, while the
# truncation/padding length is set explicitly in the tokenizer call inside
# tokenize_function — confirm whether this extra argument is needed.
tokenizer = AutoTokenizer.from_pretrained("indolem/indobertweet-base-uncased", max_length=max_length)

def tokenize_function(examples):
    """Tokenize a batch of examples and attach their integer class labels.

    Args:
        examples: mapping with a "cleanText" entry (the texts to tokenize) and
            a "category" entry (the category name of each text).

    Returns:
        Whatever the module-level ``tokenizer`` returns for the texts
        (truncated and padded to ``max_length``), extended with a 'label'
        entry holding the ``label2id`` id of each category name.
    """
    # Encode the texts; every sequence is cut or padded to exactly max_length.
    batch = tokenizer(
        examples["cleanText"],
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )

    # Translate the category names into their integer ids.
    batch['label'] = list(map(label2id.__getitem__, examples['category']))
    return batch

# Tokenize both splits; with batched=True, tokenize_function is applied to
# batches of examples instead of one example at a time.
tokenized_train_data = X_train.map(tokenize_function, batched=True)
tokenized_test_data = X_test.map(tokenize_function, batched=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/235k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/4370 [00:00<?, ? examples/s]

Map:   0%|          | 0/1092 [00:00<?, ? examples/s]

In [23]:
# Inspect the tokenization result.
# The dataset now has four additional features:
# input_ids, token_type_ids, attention_mask, and label
tokenized_test_data

Dataset({
    features: ['tweet', 'cleanText', 'category', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'label'],
    num_rows: 1092
})

In [None]:
# Look at a single tokenized test example (row 50)
tokenized_test_data[50]

In [26]:
# define the metrics

import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

def compute_metrics(p):
    """Compute evaluation metrics from raw model predictions.

    Args:
        p: a pair that unpacks into (predictions, labels). The predictions are
            reduced to class ids with an argmax over axis 1
            (assumes one row of per-class scores per example — TODO confirm).

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
        Precision, recall and F1 are computed with the scorers' default
        settings (no ``average`` argument is passed).

    Side effects:
        Prints a 4-digit classification report.
    """
    scores, labels = p
    predicted_ids = np.argmax(scores, axis=1)

    report = classification_report(labels, predicted_ids, digits=4)

    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    results = {
        metric: scorer(y_true=labels, y_pred=predicted_ids)
        for metric, scorer in scorers.items()
    }

    print("Classification Report:\n{}".format(report))
    return results

In [27]:
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification

# Define the pre-trained model.
# We start from the same checkpoint as the tokenizer:
# https://huggingface.co/indolem/indobertweet-base-uncased
# It is loaded through the AutoModelForSequenceClassification class, see the documentation here:
# https://huggingface.co/docs/transformers/model_doc/auto#transformers.AutoModelForSequenceClassification
# num_labels sets the number of output classes; id2label / label2id pass along
# the mapping between class ids and category names built earlier.

checkpoint = "indolem/indobertweet-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, 
                                                           num_labels=num_tags,
                                                           id2label=id2label,
                                                           label2id=label2id)

Downloading pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the model checkpoint at indolem/indobertweet-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/ind

In [28]:
# TrainingArguments documentation:
# https://huggingface.co/docs/transformers/v4.28.0/en/main_classes/trainer#transformers.TrainingArguments
# Model sharing guide: https://huggingface.co/docs/transformers/model_sharing
# Setting push_to_hub=True in the TrainingArguments automatically pushes the
# model to the Hub during training.

output_dir = "abusive_content_identification"  # name your own output directory
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=2,
    evaluation_strategy="epoch",
    push_to_hub=True,
)

# Trainer documentation:
# https://huggingface.co/docs/transformers/v4.28.0/en/main_classes/trainer
# The tokenized test split serves as the evaluation dataset, and
# compute_metrics produces the accuracy/precision/recall/F1 values.

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_test_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

Cloning https://huggingface.co/fathan/abusive_content_identification into local empty directory.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0644,0.048692,0.991758,0.983486,1.0,0.991674
2,0.0056,0.027613,0.997253,0.994434,1.0,0.997209


Classification Report:
              precision    recall  f1-score   support

           0     1.0000    0.9838    0.9918       556
           1     0.9835    1.0000    0.9917       536

    accuracy                         0.9918      1092
   macro avg     0.9917    0.9919    0.9918      1092
weighted avg     0.9919    0.9918    0.9918      1092

Classification Report:
              precision    recall  f1-score   support

           0     1.0000    0.9946    0.9973       556
           1     0.9944    1.0000    0.9972       536

    accuracy                         0.9973      1092
   macro avg     0.9972    0.9973    0.9973      1092
weighted avg     0.9973    0.9973    0.9973      1092



TrainOutput(global_step=1094, training_loss=0.032000620370999766, metrics={'train_runtime': 278.9212, 'train_samples_per_second': 31.335, 'train_steps_per_second': 3.922, 'total_flos': 574897655961600.0, 'train_loss': 0.032000620370999766, 'epoch': 2.0})

In [29]:
# Evaluate the fine-tuned model on the Trainer's evaluation dataset
# (the tokenized test split passed as eval_dataset)
trainer.evaluate()

Classification Report:
              precision    recall  f1-score   support

           0     1.0000    0.9946    0.9973       556
           1     0.9944    1.0000    0.9972       536

    accuracy                         0.9973      1092
   macro avg     0.9972    0.9973    0.9973      1092
weighted avg     0.9973    0.9973    0.9973      1092



{'eval_loss': 0.027613429352641106,
 'eval_accuracy': 0.9972527472527473,
 'eval_precision': 0.9944341372912802,
 'eval_recall': 1.0,
 'eval_f1': 0.9972093023255814,
 'eval_runtime': 8.412,
 'eval_samples_per_second': 129.814,
 'eval_steps_per_second': 16.286,
 'epoch': 2.0}

In [30]:
# Save the trained model to the output directory
trainer.save_model(output_dir)
# Push to the Hugging Face Hub, using the given commit message
trainer.push_to_hub(commit_message="Training complete")

Upload file pytorch_model.bin:   0%|          | 1.00/422M [00:00<?, ?B/s]

Upload file runs/Apr14_07-43-01_c9fd1fc1451c/events.out.tfevents.1681458191.c9fd1fc1451c.232.0:   0%|         …

Upload file runs/Apr14_07-43-01_c9fd1fc1451c/events.out.tfevents.1681458552.c9fd1fc1451c.232.2:   0%|         …

To https://huggingface.co/fathan/abusive_content_identification
   e1d089a..8c9265f  main -> main

   e1d089a..8c9265f  main -> main

To https://huggingface.co/fathan/abusive_content_identification
   8c9265f..c0c1137  main -> main

   8c9265f..c0c1137  main -> main

