# ABOUT:
- this notebook:
    1. Preprocesses/ tokenize the text classification datasets to prepare for model training
    2. **Fine-tunes roberta text classification model** to predict if point of interest or street name exists in an address

In [1]:
model_checkpoint = "w11wo/indonesian-roberta-base-sentiment-classifier"
max_length = None
num_labels = 2       
batch_size = 50

## load_from_disk

In [2]:
from datasets import load_from_disk

In [3]:
exist_or_not = load_from_disk(r"C:\Users\tanch\Documents\Coding Competitions\Shopee\Comp 2 address elements extraction\datasets\street existence cls dataset")
exist_or_not

DatasetDict({
    train: Dataset({
        features: ['id', 'label', 'raw_address', 'street'],
        num_rows: 285000
    })
    test: Dataset({
        features: ['id', 'label', 'raw_address', 'street'],
        num_rows: 15000
    })
})

In [4]:
import datasets
import random
import pandas as pd
from IPython.display import display, HTML
def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

In [5]:
show_random_elements(exist_or_not['train'])

Unnamed: 0,raw_address,label,street,id
0,bpsdm hukum dan ham jl. raya gandul no 4 kel. gandul,exist,jl. raya gandul,27408
1,kestalan abdul rah saleh banjarsari,exist,abdul rah saleh,147509
2,"agen bus suma kuto,",exist,kuto,18937
3,jakasampurna tangk per 67 17145 bekasi barat,exist,tangk per,265575
4,"cluster carissa graha raya bintaro blok c-02, rt 01 rw 014 jl. boulevard graha raya kel",does not exist,,32901
5,"domain inter pt ban raya, mampang prapatan",exist,ban raya,134058
6,anggi cell,does not exist,,256890
7,"kfc ratul - kds, bhayan, bongaya mamajang",exist,bhayan,108131
8,mang 67 gunung sekar sampang,exist,mang,12632
9,pangkalan jati rt 01 rw 013 kelurahan cipinang melayu kecamatan makasar jakarta timur,does not exist,,85104


In [6]:
def encode_labels(egs):
    label_mappings = {"exist":1,
                      "does not exist":0}
    egs['label'] = [label_mappings[label] for label in egs['label']]
    return egs

- here, we are doing classification task with 4 labels (shown above)
- they need to be encoded to numbers!

In [7]:
exist_or_not = exist_or_not.map(encode_labels, batched = True)

HBox(children=(FloatProgress(value=0.0, max=285.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [8]:
exist_or_not.remove_columns_(["street","id"])

In [9]:
show_random_elements(exist_or_not['train'])

Unnamed: 0,label,raw_address
0,1,"abdul wahab sia, dukuh pakis"
1,1,"karangraharja lily, 17530 cikarang utara"
2,1,cipinang cempedak kebon nanas sela ii 48 rt 4 7 13340 jatinegara
3,1,mojom mojorejo junrejo
4,1,ciku kampung baru tanjung pinang barat
5,1,jatiranggon super indo raya han rt 4 2 17431 jati sampurna
6,1,"toko babeh akhir garuda ii, 9 pasir gunung selatan cimanggis"
7,1,"bb pet shop, radio dalam, kebayoran baru"
8,1,"bas rah, 116 lion parcel bojonegoro kadipaten bojonegoro"
9,0,photo copy tiga cahaya


## 2. load_metric
- each task has its own associated metric

    - for CoLA: Matthews Correlation Coefficient
    - for MNLI (matched or mismatched): Accuracy
    - for MRPC: Accuracy and F1 score
    - for QNLI: Accuracy
    - for QQP: Accuracy and F1 score
    - for RTE: Accuracy
    - for SST-2: Accuracy
    - for STS-B: Pearson Correlation Coefficient and Spearman's_Rank_Correlation_Coefficient
    - for WNLI: Accuracy

In [10]:
from datasets import load_metric
metric = load_metric('glue', "sst2")
metric

Metric(name: "glue", features: {'predictions': Value(dtype='int64', id=None), 'references': Value(dtype='int64', id=None)}, usage: """
Compute GLUE evaluation metric associated to each GLUE dataset.
Args:
    predictions: list of predictions to score.
        Each translation should be tokenized into a list of tokens.
    references: list of lists of references for each translation.
        Each reference should be tokenized into a list of tokens.
Returns: depending on the GLUE subset, one or several of:
    "accuracy": Accuracy
    "f1": F1 score
    "pearson": Pearson Correlation
    "spearmanr": Spearman Correlation
    "matthews_correlation": Matthew Correlation
""", stored examples: 0)

- since "sst2" task uses the accuracy metric and is also 1-sentenced
- thus, we will reference "sst2" to load accuracy metric (above)

## 3. Pre-processing: 
- depending on whether the task has 1 or 2 sentences, pre-processing is done slightly differently
- for 1 sentence:
    - **[CLS] and [SEP] tokens are added to the front and end of the sentence respectively**
- for 2 sentences:
    - in addition to above steps, the second sentence is concatenated and another [SEP] token is added to the end 

In [11]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, 
                                          use_fast=True ,   # uses fast tokenizer
                                          # padding = "longest"
                                          )

In [12]:
# this function is meant to process either one sentence 
def preprocess_function(examples):
    return tokenizer(examples["raw_address"], 
                     truncation = True,
                     padding = "longest",          #  padding = True means to pad to longest sequence length
                     #max_length = max_length,
                     #add_special_tokens = True,
                     #return_tensors = 'pt'
                     )
                     # add_special_tokens = True)
    
    # output["length"] = list(map(len,output["input_ids"]))
    # return output

In [13]:
tokenized_dataset = exist_or_not.map(preprocess_function,
                                     batched = True
                                     )

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


HBox(children=(FloatProgress(value=0.0, max=285.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [14]:
tokenized_dataset.remove_columns_(["raw_address"])

In [15]:
show_random_elements(tokenized_dataset['train'])

Unnamed: 0,attention_mask,input_ids,label
0,"[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[70, 5617, 988, 297, 25890, 2733, 450, 319, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",1
1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[10509, 81, 748, 281, 14634, 1228, 347, 88, 465, 1315, 1304, 25833, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",1
2,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[14021, 83, 16769, 7200, 39457, 16, 9445, 274, 4275, 456, 16, 38396, 82, 18, 297, 799, 12106, 22842, 261, 16, 292, 77, 2300, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",1
3,"[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[40351, 30464, 1883, 18190, 1723, 32128, 703, 336, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",0
4,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[71, 817, 328, 4275, 15463, 70, 833, 336, 23118, 1895, 26553, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",1
5,"[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[11527, 448, 5232, 25585, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",0
6,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[35640, 1344, 28378, 7803, 5396, 908, 9445, 274, 22, 4275, 1986, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",0
7,"[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[71, 486, 5569, 925, 667, 310, 7467, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",1
8,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[43458, 24499, 13030, 16, 485, 355, 16391, 949, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",1
9,"[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[5196, 2164, 16, 2777, 574, 13817, 5976, 302, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",1


- to train our modle in batches later on, the above input_ids must be of the same length

In [16]:
# from datasets import DatasetDict
# smol_tokenized_dataset = DatasetDict()
# smol_tokenized_dataset['train'] = tokenized_dataset['train'].select(range(55000,65000))
# smol_tokenized_dataset['test'] = tokenized_dataset['test'].select(range(500))
# smol_tokenized_dataset

## 4. AutoModelForSequenceClassification:
- "AutoModel" will guess and load the pre-trained model architecture we are using based on model_checkpoint
- "ForSequenceClassification" adds the sequence classfication head at the end of the pre-trained model
    - for sequence classification we are essentially applying a classfication model on the [CLS] token
    - reason being that, during the pre-training of BERT, the model learns to encode the entire meaning of any sentence into the [CLS] token

In [17]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoConfig

- the fine-tuned model downloaded from huggingface was fine-tuned to predict 3 labels i.e it has 3 NN output nodes
- however, we need 2 
- therefore, we first load the model with AutoConfig and change the "num_labels" attribute
- using the code below, we ensure that **our model has 2 output nodes**

In [18]:
config = AutoConfig.from_pretrained(model_checkpoint)               
config.num_labels = num_labels 
model = AutoModelForSequenceClassification.from_config(config)

## 5. TrainingArguments
- this customizes how we want the training to be done and other hyperparameters

In [19]:
args = TrainingArguments(
    "test-glue",
    evaluation_strategy = "no",                           
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,
    weight_decay=0.01,
    # load_best_model_at_end=True,                               # best model may not be the model at the end of training, thus this param enables us to save any best model during training
    # metric_for_best_model="accuracy",
    save_strategy = "no" ,                                  # dont save at every step
    logging_strategy = "no" ,   
    # eval_steps = 500
)

## 6. Trainer


In [20]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
#     if task != "stsb":
#         predictions = np.argmax(predictions, axis=1)
#     else:
    predictions = predictions[:, 0]
    return metric.compute(predictions=predictions, references=labels)

In [21]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_dataset["train"],                  # input should be tokenized
    # eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,                                     # tokenizer is specified again to padd all samples to the same length
    compute_metrics=compute_metrics,
    # data_collator = collator
)

In [22]:
# import mlflow
# mlflow.end_run()

In [23]:
trainer.train()

***** Running training *****
  Num examples = 285000
  Num Epochs = 1
  Instantaneous batch size per device = 50
  Total train batch size (w. parallel, distributed & accumulation) = 50
  Gradient Accumulation steps = 1
  Total optimization steps = 5700


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=5700, training_loss=0.4292176363760965, metrics={'train_runtime': 3309.4521, 'train_samples_per_second': 86.117, 'train_steps_per_second': 1.722, 'total_flos': 9249331067397000.0, 'train_loss': 0.4292176363760965, 'epoch': 1.0})

In [24]:
model.save_pretrained(r"C:\Users\tanch\Documents\Coding Competitions\Shopee\Shopee Address Elements Extraction (local)\models\street existence model")

Configuration saved in C:\Users\tanch\Documents\Coding Competitions\Shopee\Shopee Address Elements Extraction (local)\models\street existence model\config.json
Model weights saved in C:\Users\tanch\Documents\Coding Competitions\Shopee\Shopee Address Elements Extraction (local)\models\street existence model\pytorch_model.bin


## Confusion matrix and accuracy

In [25]:
import numpy as np
output = trainer.predict(tokenized_dataset['test'])
y = np.array(tokenized_dataset['test']['label'])
y_pred = [np.argmax(logits) for logits in output.predictions]

***** Running Prediction *****
  Num examples = 15000
  Batch size = 50


In [26]:
print("Accuracy", sum(y_pred==y)/len(y))

Accuracy 0.845


In [27]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y,y_pred)

array([[ 1329,  2141],
       [  184, 11346]], dtype=int64)

- (above) the columns are y_pred (predictions by model) while the rows are the y (true labels)