# ABOUT:
- this notebook:
    1. Preprocesses/ tokenize the text classification datasets to prepare for model training
    2. **Fine-tunes roberta text classification model** to predict if point of interest or street name exists in an address

In [1]:
model_checkpoint = "w11wo/indonesian-roberta-base-sentiment-classifier"
max_length = None
num_labels = 2       
batch_size = 50

## load_from_disk

In [2]:
from datasets import load_from_disk

In [3]:
exist_or_not = load_from_disk(r"C:\Users\tanch\Documents\Coding Competitions\Shopee\Comp 2 address elements extraction\datasets\POI existence cls dataset")
exist_or_not

DatasetDict({
    train: Dataset({
        features: ['POI', 'id', 'label', 'raw_address'],
        num_rows: 285000
    })
    test: Dataset({
        features: ['POI', 'id', 'label', 'raw_address'],
        num_rows: 15000
    })
})

In [4]:
import datasets
import random
import pandas as pd
from IPython.display import display, HTML
def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

In [5]:
show_random_elements(exist_or_not['train'])

Unnamed: 0,raw_address,label,POI,id
0,jatayu ii 48 kalisari rt 1 rw 2 pasar rebo,does not exist,,253129
1,wij kus sukabumi mayangan,does not exist,,186681
2,gondrong ki. hajar dewan raya rt 2 1,does not exist,,238644
3,sele lama 22 air merbau tanjung pandan,does not exist,,57738
4,"ayam gor sam meteooor sekepa i, cikutra cibeunying kidul",exist,ayam goreng sambel meteooor,227695
5,gelam jaya berl raya 17 pasar kemis,does not exist,,241657
6,kiara cond 148 kebon gedang kiaracondong,does not exist,,70052
7,mitra bukalapak toko dina bulu bulusari,exist,mitra bukalapak toko dina bulusari,148745
8,kadu kamp kadu no 77 15810 curug,does not exist,,100952
9,siantan tengah kel. selat sumba 3 no 15 78242,does not exist,,84384


In [8]:
def encode_labels(egs):
    label_mappings = {"exist":1,
                      "does not exist":0}
    egs['label'] = [label_mappings[label] for label in egs['label']]
    return egs

- here, we are doing classification task with 4 labels (shown above)
- they need to be encoded to numbers!

In [9]:
exist_or_not = exist_or_not.map(encode_labels, batched = True)

HBox(children=(FloatProgress(value=0.0, max=285.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [10]:
exist_or_not.remove_columns_(["POI","id"])

In [11]:
show_random_elements(exist_or_not['train'])

Unnamed: 0,label,raw_address
0,0,sekip ujung utan kayu selatan matraman
1,1,"tupak raja warkop betta malau,"
2,0,karamat bha 117 gunung puyuh
3,0,"warung asmunih, haji holil, no 75 15156"
4,1,bojong menteng rt 001 002
5,0,"indra kila, batu ampar kel."
6,0,"surau gadang kel. komplek pgri, 13 25146"
7,1,"tb hadi mak,"
8,0,kukusan k.h usman 18a beji
9,0,"grand wisata, jl mustika jaya, rt.02 rw.05, lambangsari"


## 2. load_metric
- each task has its own associated metric

    - for CoLA: Matthews Correlation Coefficient
    - for MNLI (matched or mismatched): Accuracy
    - for MRPC: Accuracy and F1 score
    - for QNLI: Accuracy
    - for QQP: Accuracy and F1 score
    - for RTE: Accuracy
    - for SST-2: Accuracy
    - for STS-B: Pearson Correlation Coefficient and Spearman's_Rank_Correlation_Coefficient
    - for WNLI: Accuracy

In [13]:
from datasets import load_metric
metric = load_metric('glue', "sst2")
metric

Metric(name: "glue", features: {'predictions': Value(dtype='int64', id=None), 'references': Value(dtype='int64', id=None)}, usage: """
Compute GLUE evaluation metric associated to each GLUE dataset.
Args:
    predictions: list of predictions to score.
        Each translation should be tokenized into a list of tokens.
    references: list of lists of references for each translation.
        Each reference should be tokenized into a list of tokens.
Returns: depending on the GLUE subset, one or several of:
    "accuracy": Accuracy
    "f1": F1 score
    "pearson": Pearson Correlation
    "spearmanr": Spearman Correlation
    "matthews_correlation": Matthew Correlation
""", stored examples: 0)

- since "sst2" task uses the accuracy metric and is also 1-sentenced
- thus, we will reference "sst2" to load accuracy metric (above)

## 3. Pre-processing: 
- depending on whether the task has 1 or 2 sentences, pre-processing is done slightly differently
- for 1 sentence:
    - **[CLS] and [SEP] tokens are added to the front and end of the sentence respectively**
- for 2 sentences:
    - in addition to above steps, the second sentence is concatenated and another [SEP] token is added to the end 

In [14]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, 
                                          use_fast=True ,   # uses fast tokenizer
                                          # padding = "longest"
                                          )

In [15]:
# this function is meant to process either one sentence 
def preprocess_function(examples):
    return tokenizer(examples["raw_address"], 
                     truncation = True,
                     padding = "longest",          #  padding = True means to pad to longest sequence length
                     #max_length = max_length,
                     #add_special_tokens = True,
                     #return_tensors = 'pt'
                     )
                     # add_special_tokens = True)
    
    # output["length"] = list(map(len,output["input_ids"]))
    # return output

In [16]:
tokenized_dataset = exist_or_not.map(preprocess_function,
                                     batched = True
                                     )

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


HBox(children=(FloatProgress(value=0.0, max=285.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [17]:
tokenized_dataset.remove_columns_(["raw_address"])

In [18]:
show_random_elements(tokenized_dataset['train'])

Unnamed: 0,attention_mask,input_ids,label
0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[70, 810, 729, 16, 1946, 271, 2554, 274, 443, 282, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",0
1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[40081, 10236, 7026, 262, 391, 529, 23182, 456, 17566, 12076, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",0
2,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[78, 80, 82, 378, 1154, 47394, 280, 1674, 7207, 29079, 858, 22022, 11387, 4275, 730, 1041, 262, 271, 2955, 11408, 11433, 277, 1680, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",0
3,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[8531, 490, 1161, 12226, 2705, 32805, 10700, 4250, 22, 867, 3341, 30210, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",0
4,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[79, 5004, 16, 4275, 5038, 619, 6494, 1819, 1738, 12904, 5324, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",1
5,"[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[35443, 2705, 692, 16, 2512, 5972, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",1
6,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[290, 88, 9962, 456, 16, 3924, 7777, 3194, 539, 18, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",0
7,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[3391, 72, 16, 552, 35368, 274, 11391, 552, 35368, 274, 11391, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",0
8,"[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[5909, 8996, 16, 729, 4343, 6216, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",0
9,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[39999, 837, 902, 1721, 69, 276, 708, 2313, 1721, 450, 280, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",1


- to train our modle in batches later on, the above input_ids must be of the same length

In [19]:
# from datasets import DatasetDict
# smol_tokenized_dataset = DatasetDict()
# smol_tokenized_dataset['train'] = tokenized_dataset['train'].select(range(55000,65000))
# smol_tokenized_dataset['test'] = tokenized_dataset['test'].select(range(500))
# smol_tokenized_dataset

## 4. AutoModelForSequenceClassification:
- "AutoModel" will guess and load the pre-trained model architecture we are using based on model_checkpoint
- "ForSequenceClassification" adds the sequence classfication head at the end of the pre-trained model
    - for sequence classification we are essentially applying a classfication model on the [CLS] token
    - reason being that, during the pre-training of BERT, the model learns to encode the entire meaning of any sentence into the [CLS] token

In [20]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoConfig

- the fine-tuned model downloaded from huggingface was fine-tuned to predict 3 labels i.e it has 3 NN output nodes
- however, we need 2 
- therefore, we first load the model with AutoConfig and change the "num_labels" attribute
- using the code below, we ensure that **our model has 2 output nodes**

In [21]:
config = AutoConfig.from_pretrained(model_checkpoint)               
config.num_labels = num_labels 
model = AutoModelForSequenceClassification.from_config(config)

## 5. TrainingArguments
- this customizes how we want the training to be done and other hyperparameters

In [22]:
args = TrainingArguments(
    "test-glue",
    evaluation_strategy = "no",                           
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,
    weight_decay=0.01,
    # load_best_model_at_end=True,                               # best model may not be the model at the end of training, thus this param enables us to save any best model during training
    # metric_for_best_model="accuracy",
    save_strategy = "no" ,                                  # dont save at every step
    logging_strategy = "no" ,   
    # eval_steps = 500
)

## 6. Trainer


In [23]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
#     if task != "stsb":
#         predictions = np.argmax(predictions, axis=1)
#     else:
    predictions = predictions[:, 0]
    return metric.compute(predictions=predictions, references=labels)

In [24]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_dataset["train"],                  # input should be tokenized
    # eval_dataset=smol_tokenized_dataset["test"],
    tokenizer=tokenizer,                                     # tokenizer is specified again to padd all samples to the same length
    compute_metrics=compute_metrics,
    # data_collator = collator
)

In [25]:
# import mlflow
# mlflow.end_run()

In [26]:
trainer.train()

***** Running training *****
  Num examples = 285000
  Num Epochs = 1
  Instantaneous batch size per device = 50
  Total train batch size (w. parallel, distributed & accumulation) = 50
  Gradient Accumulation steps = 1
  Total optimization steps = 5700


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=5700, training_loss=0.3402528568736294, metrics={'train_runtime': 3281.9804, 'train_samples_per_second': 86.838, 'train_steps_per_second': 1.737, 'total_flos': 9236056143792000.0, 'train_loss': 0.3402528568736294, 'epoch': 1.0})

In [27]:
model.save_pretrained(r"C:\Users\tanch\Documents\Coding Competitions\Shopee\Shopee Address Elements Extraction (local)\models\POI existence model")

Configuration saved in C:\Users\tanch\Documents\Coding Competitions\Shopee\Shopee Address Elements Extraction (local)\models\POI street existence model\config.json
Model weights saved in C:\Users\tanch\Documents\Coding Competitions\Shopee\Shopee Address Elements Extraction (local)\models\POI street existence model\pytorch_model.bin


## Confusion matrix and accuracy

In [28]:
import numpy as np
output = trainer.predict(tokenized_dataset['test'])
y = np.array(tokenized_dataset['test']['label'])
y_pred = [np.argmax(logits) for logits in output.predictions]

***** Running Prediction *****
  Num examples = 15000
  Batch size = 50


In [29]:
print("Accuracy", sum(y_pred==y)/len(y))

Accuracy 0.8914666666666666


In [30]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y,y_pred)

array([[7816, 1100],
       [ 528, 5556]], dtype=int64)

- (above) the columns are y_pred (predictions by model) while the rows are the y (true labels)