## 1. Install Required Packages

In [None]:
!pip install datasets
!pip install transformers
!pip install huggingface_hub
!pip install accelerate -U

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Connect to Hugging Face

Reference: https://huggingface.co/docs/transformers/model_sharing.
Make sure that you have a token generated from your Hugging Face account.

In [None]:
# Optional: log in to the Hugging Face Hub with an access token.
# Uncomment the two lines below only if the model should be uploaded to the Hub
# (for example when enabling `push_to_hub=True` in the TrainingArguments).
#from huggingface_hub import notebook_login
#notebook_login()

## 2. Load Dataset

We're going to use an Indonesian sentiment dataset available in Hugging Face.

The dataset can be accessed here: [sepidmnorozy/Indonesian_sentiment](https://huggingface.co/datasets/sepidmnorozy/Indonesian_sentiment).

The dataset is split into:

*   Train (7.93K rows)
*   Validation (1.13K rows)
*   Test (2.27K rows)



In [None]:
from datasets import load_dataset

# Load every available split of the Indonesian sentiment dataset.
dataset = load_dataset(path="sepidmnorozy/Indonesian_sentiment")



  0%|          | 0/3 [00:00<?, ?it/s]

### Show the dataset dictionary

In [None]:
# Bare expression on the last line of the cell: the notebook renders its repr,
# which lists the splits with their features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 7926
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1132
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 2266
    })
})

### Get a dataset sample

In [None]:
# Print two individual rows of the test split (positions 0 and 3).
for row_index in (0, 3):
  print(dataset['test'][row_index])

{'label': 1, 'text': 'rekomendasi bangetlah . makanan enak , cappuccino nya ketagihan , pemandangan kota keren , harga miring dan valet parkir bayar seikhlas nya . datang pas menjelang maghrib pasti lebih keren . jangan lupa bawa jaket kalau mau makan di outdoor nya .'}
{'label': 0, 'text': 'yusri binti maling .'}


There are two fields in the dataset:

*   **text**: the raw text to classify.
*   **label**: the sentiment class, either 0 (negative) or 1 (positive).

## 3. Pre-processing

In [None]:
from transformers import AutoTokenizer

# Name of the pre-trained checkpoint used throughout the notebook.
model_name = "indolem/indobert-base-uncased"
# Load the tokenizer that ships with that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

Create a pre-processing function to tokenize text and truncate sequences to be no longer than the defined maximum input length.

In [None]:
def preprocess_function(examples, max_length=512):
  """Tokenize the 'text' field of one example or a batch of examples.

  Args:
    examples: mapping with a 'text' entry holding the text(s) to tokenize.
    max_length: maximum number of tokens kept per sequence; longer inputs are
      truncated. The default of 512 is assumed to match the position limit of
      the BERT checkpoint used in this notebook -- TODO confirm against
      `model.config.max_position_embeddings`.

  Returns:
    Whatever the notebook-level `tokenizer` returns for the given text(s).
  """
  # `truncation=True` on its own does nothing for this tokenizer: it has no
  # predefined maximum length, so the library falls back to "no truncation"
  # (see the warning printed by the `map` call). An explicit `max_length`
  # makes the truncation effective.
  return tokenizer(examples['text'], truncation=True, max_length=max_length)

Apply `preprocess_function` to the entire dataset with the 🤗 Datasets `map` function.

We can speed up `map` by setting `batched=True` to process multiple elements of the dataset at once.

In [None]:
# Tokenize all splits; batched=True passes many rows to the function per call.
tokenized_txt = dataset.map(function=preprocess_function, batched=True)



Map:   0%|          | 0/1132 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
# Bare expression on the last line of the cell: the notebook renders its repr,
# showing the features of every split after tokenization.
tokenized_txt

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7926
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1132
    })
    test: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2266
    })
})

We can see from the results above that three new features have been added to the dataset:  

*   input_ids
*   token_type_ids
*   attention_mask




In [None]:
# Inspect the first tokenized example of the training split: the original
# 'label' and 'text' plus the fields produced by the tokenizer.
tokenized_txt['train'][0]

{'label': 1,
 'text': 'bubur ayam yang lumayan rekomendasi di sekitaran bandung , tempat nya strategis mudah dicari , harga nya tidak merogoh kantong , tempat nya selalu ramai didatangi pengunjung setiap hari kerja maupun akhir pekan karena rasanya yang enak .',
 'input_ids': [3,
  18600,
  5455,
  1497,
  10855,
  9251,
  1485,
  2034,
  1476,
  3576,
  16,
  1991,
  2647,
  6783,
  3069,
  10575,
  16,
  2661,
  2647,
  1580,
  25764,
  10394,
  16,
  1991,
  2647,
  2643,
  5332,
  17163,
  6018,
  2189,
  1843,
  2533,
  3455,
  2010,
  3630,
  1686,
  7460,
  1497,
  8955,
  18,
  4],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
 

Now create a batch of examples using DataCollatorWithPadding. It’s more efficient to dynamically pad the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length.

In [None]:
from transformers import DataCollatorWithPadding

# Collator that pads the examples of each batch at collation time.
# `padding=True` is the library default, spelled out here for clarity.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

In [None]:
# define the metrics

import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

def compute_metrics(p):
    """Score one evaluation pass and print a human-readable report.

    Args:
        p: pair of (model scores, reference labels); it is unpacked into exactly
            two values. The scores are presumably shaped (n_samples, n_classes),
            since the predicted class is the argmax along axis 1 -- confirm
            against the caller.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    scores, y_true = p
    y_pred = np.argmax(scores, axis=1)

    # Per-class table first, then the scalar metrics (scikit-learn defaults).
    table = classification_report(y_true, y_pred, digits=4)
    results = {
        "accuracy": accuracy_score(y_true=y_true, y_pred=y_pred),
        "precision": precision_score(y_true=y_true, y_pred=y_pred),
        "recall": recall_score(y_true=y_true, y_pred=y_pred),
        "f1": f1_score(y_true=y_true, y_pred=y_pred),
    }

    print("Classification Report:\n{}".format(table))
    print("Summary:")
    for caption, key in (
        ("Overall Precision: ", "precision"),
        ("Overall Recall: ", "recall"),
        ("Overall F1 score: ", "f1"),
        ("Overall Accuracy: ", "accuracy"),
    ):
        print(caption, results[key])
    print("\n")
    return results

Before we start training our model, we create a map of the expected ids to their labels with id2label and label2id.

In [None]:
# Class id -> readable label; the reverse lookup is derived from it so the two
# mappings cannot drift apart.
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {label: class_id for class_id, label in id2label.items()}

## 4. Fine-tuning with Trainer API

### Define the model




In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Sequence-classification model initialised from the checkpoint named by
# `model_name`, with two output labels and the id <-> label name mappings.
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=model_name,
    label2id=label2id,
    id2label=id2label,
    num_labels=2,
)

Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indober

At this point, only three steps remain:

1. Define our training hyperparameters in `TrainingArguments`.



> The only required parameter is output_dir which specifies where to save our model. If we want to automatically upload our model to the Hub during training, pass along `push_to_hub=True` in the `TrainingArguments`. At the end of each epoch, the Trainer will evaluate the accuracy and save the training checkpoint.


2. Pass the training arguments to `Trainer` along with the model, dataset, tokenizer, data collator, and compute_metrics function.


3. Call `train()` to fine-tune our model.

In [None]:
def train_predict_model(model_name, output_dir):
  """Fine-tune a sequence classifier, save it, and evaluate it on the test split.

  Relies on notebook-level objects defined in earlier cells: `tokenized_txt`,
  `tokenizer`, `data_collator`, `compute_metrics`, `id2label` and `label2id`.

  Args:
    model_name: checkpoint name or path the classifier is initialised from.
    output_dir: directory for training checkpoints and the final saved model.

  Returns:
    The metrics dictionary reported by `Trainer.predict` for the test split.
  """
  # Build a fresh classifier from `model_name` on every call. Previously the
  # argument was ignored and the notebook-level `model` was trained in place,
  # so a second call silently continued from already fine-tuned weights.
  model = AutoModelForSequenceClassification.from_pretrained(
      model_name,
      num_labels=len(id2label),
      id2label=id2label,
      label2id=label2id,
  )

  training_args = TrainingArguments(
      output_dir=output_dir,
      # log, evaluate and checkpoint once per epoch
      logging_strategy="epoch",
      evaluation_strategy="epoch",
      save_strategy="epoch",
      save_total_limit = 1,
      learning_rate=2e-05,
      num_train_epochs=3,
      per_device_train_batch_size=16,
      per_device_eval_batch_size=16,
      weight_decay=0.01,
      # the "f1" value returned by compute_metrics selects the best checkpoint
      metric_for_best_model = "f1",
      load_best_model_at_end=True,
      #push_to_hub=True,
  )

  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=tokenized_txt['train'],
      eval_dataset=tokenized_txt['validation'],
      tokenizer=tokenizer,
      data_collator=data_collator,
      compute_metrics=compute_metrics,
  )
  print("Training Process")
  trainer.train()
  trainer.save_model(output_dir)

  # predict
  print("Prediction")
  # Reuse the same trainer instead of building a second, argument-less Trainer,
  # which would fall back to default TrainingArguments (its own output
  # directory and evaluation batch size).
  test_output = trainer.predict(tokenized_txt['test'])
  # Hand the test metrics back to the caller instead of discarding them.
  return test_output.metrics

In [None]:
# Fine-tune the checkpoint and write the resulting model to 'sentiment_model'.
train_predict_model(model_name, 'sentiment_model')

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Training Process


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2504,0.181462,0.934629,0.98797,0.908714,0.946686
2,0.1327,0.156226,0.95318,0.971831,0.954357,0.963015
3,0.0806,0.177055,0.956714,0.966759,0.965422,0.96609


Classification Report:
              precision    recall  f1-score   support

           0     0.8587    0.9804    0.9155       409
           1     0.9880    0.9087    0.9467       723

    accuracy                         0.9346      1132
   macro avg     0.9233    0.9446    0.9311      1132
weighted avg     0.9413    0.9346    0.9354      1132

Summary:
Overall Precision:  0.98796992481203
Overall Recall:  0.9087136929460581
Overall F1 score:  0.946685878962536
Overall Accuracy:  0.9346289752650176


Classification Report:
              precision    recall  f1-score   support

           0     0.9218    0.9511    0.9362       409
           1     0.9718    0.9544    0.9630       723

    accuracy                         0.9532      1132
   macro avg     0.9468    0.9527    0.9496      1132
weighted avg     0.9538    0.9532    0.9533      1132

Summary:
Overall Precision:  0.971830985915493
Overall Recall:  0.9543568464730291
Overall F1 score:  0.9630146545708305
Overall Accuracy:  0

Classification Report:
              precision    recall  f1-score   support

           0     0.9334    0.9474    0.9404       799
           1     0.9711    0.9632    0.9671      1467

    accuracy                         0.9576      2266
   macro avg     0.9523    0.9553    0.9538      2266
weighted avg     0.9578    0.9576    0.9577      2266

Summary:
Overall Precision:  0.9711340206185567
Overall Recall:  0.9631901840490797
Overall F1 score:  0.9671457905544147
Overall Accuracy:  0.9576345984112974




## 5. Inference

In [None]:
from transformers import pipeline

text = ['Pelayanan yang sangat memuaskan', 'Kecewa dengan kualitas barangnya']

# Build the pipeline once, before the loop: creating it inside the loop
# reloaded the saved model for every single input.
classifier = pipeline("sentiment-analysis", model='sentiment_model')

for t in text:
  print(classifier(t))

[{'label': 'POSITIVE', 'score': 0.9994540810585022}]
[{'label': 'NEGATIVE', 'score': 0.9990911483764648}]
