In [6]:
from datasets import load_dataset


# Load two configurations of the 'indonlu' dataset script: 'smsa' and 'emot'.
# Both names are kept at notebook scope because later cells index into them.
dataset_smsa = load_dataset(path='indonlu', name='smsa')
dataset_emot = load_dataset(path='indonlu', name='emot')

Reusing dataset indonlu (/home/m13518040/.cache/huggingface/datasets/indonlu/smsa/1.0.0/0a83b181cd831cd5d9c15ffe39f3be76af23407eba2c902bccca53fa905d68af)
Reusing dataset indonlu (/home/m13518040/.cache/huggingface/datasets/indonlu/emot/1.0.0/0a83b181cd831cd5d9c15ffe39f3be76af23407eba2c902bccca53fa905d68af)


In [8]:
# Show the 'train' split of the 'smsa' dataset (its repr lists features and row count).
dataset_smsa['train']

Dataset({
    features: ['text', 'label'],
    num_rows: 11000
})

In [None]:
def load_dataset_loader(dataset_id, ds_type, tokenizer,
                        max_seq_len=512, batch_size=32, num_workers=16):
    """Build the dataset object and its data loader for one task / split pair.

    Args:
        dataset_id: Task selector, either 'sentiment' or 'emotion'.
        ds_type: Split selector, one of 'train', 'valid' or 'test'.
        tokenizer: Passed through unchanged to the dataset constructor.
        max_seq_len: Forwarded to the data loader (default 512).
        batch_size: Forwarded to the data loader (default 32).
        num_workers: Forwarded to the data loader (default 16).

    Returns:
        A ``(dataset, loader)`` tuple. When ``dataset_id`` or ``ds_type`` is
        not one of the recognised values, ``(None, None)`` is returned, so
        callers should check the result before iterating.
    """
    # File stem per split; identical for both tasks, only directory/extension differ.
    # NOTE(review): the emotion 'test' file name mirrors the sentiment task's
    # masked-label test file -- confirm it matches the local dataset folder.
    split_stems = {
        'train': 'train_preprocess',
        'valid': 'valid_preprocess',
        'test': 'test_preprocess_masked_label',
    }
    if ds_type not in split_stems:
        return None, None

    if dataset_id == 'sentiment':
        directory, extension = './dataset/smsa-document-sentiment', 'tsv'
        dataset_cls, loader_cls = DocumentSentimentDataset, DocumentSentimentDataLoader
    elif dataset_id == 'emotion':
        directory, extension = './dataset/emot-emotion-twitter', 'csv'
        dataset_cls, loader_cls = EmotionDetectionDataset, EmotionDetectionDataLoader
    else:
        return None, None

    dataset_path = '{}/{}.{}'.format(directory, split_stems[ds_type], extension)
    dataset = dataset_cls(dataset_path, tokenizer, lowercase=True)
    loader = loader_cls(
        dataset=dataset,
        max_seq_len=max_seq_len,
        batch_size=batch_size,
        num_workers=num_workers,
        # Only the training split is shuffled; valid/test keep file order.
        shuffle=(ds_type == 'train'),
    )
    return dataset, loader

In [None]:
def init_model(id_model, num_labels=None):
    """Load the tokenizer, config and sequence-classification model for a backbone.

    Args:
        id_model: Backbone selector: "IndoBERT", "XLM-R" or "mBERT".
        num_labels: Number of output classes written to ``config.num_labels``.
            When None (the default) ``DocumentSentimentDataset.NUM_LABELS`` is
            used, which matches the previous hard-coded behaviour. Pass an
            explicit value when fine-tuning on a task with a different label set.

    Returns:
        A ``(tokenizer, config, model)`` tuple, all loaded from the same
        pretrained checkpoint name.

    Raises:
        ValueError: If ``id_model`` is not one of the supported selectors.
    """
    if id_model == "IndoBERT":
        checkpoint = 'indobenchmark/indobert-base-p2'
        tokenizer_cls, config_cls, model_cls = (
            BertTokenizer, BertConfig, BertForSequenceClassification)
    elif id_model == "XLM-R":
        checkpoint = 'xlm-roberta-base'
        tokenizer_cls, config_cls, model_cls = (
            XLMRobertaTokenizer, XLMRobertaConfig, XLMRobertaForSequenceClassification)
    elif id_model == "mBERT":
        checkpoint = 'bert-base-multilingual-uncased'
        tokenizer_cls, config_cls, model_cls = (
            BertTokenizer, BertConfig, BertForSequenceClassification)
    else:
        # Fail with a clear message instead of hitting unbound locals at `return`.
        raise ValueError(
            "Unknown id_model {!r}; expected one of 'IndoBERT', 'XLM-R', 'mBERT'".format(id_model))

    tokenizer = tokenizer_cls.from_pretrained(checkpoint)
    config = config_cls.from_pretrained(checkpoint)
    if num_labels is None:
        num_labels = DocumentSentimentDataset.NUM_LABELS
    # The label count must be set on the config before the model is instantiated from it.
    config.num_labels = num_labels
    model = model_cls.from_pretrained(checkpoint, config=config)

    return tokenizer, config, model

def fine_tuning_model(base_model, i2w, train_loader, valid_loader, epochs=5, lr=3e-6, device='cuda'):
    """Fine-tune ``base_model`` with Adam, validating after every epoch.

    Each epoch runs one pass over ``train_loader`` (forward, backward, optimizer
    step) and then one pass over ``valid_loader`` without gradients. Running
    loss is shown on the progress bars and a loss/metrics summary is printed
    for both phases.

    Args:
        base_model: Model to train; it is moved to ``device`` and returned.
        i2w: Passed through to ``forward_sequence_classification``
            (presumably the index-to-label mapping -- confirm against that helper).
        train_loader: Iterable of training batches supporting ``len()``. The last
            element of every batch is dropped before the forward call.
        valid_loader: Iterable of validation batches, same layout as ``train_loader``.
        epochs: Number of train + validate passes (default 5).
        lr: Adam learning rate (default 3e-6, the previously hard-coded value).
        device: Device string used for the model and the forward helper
            (default 'cuda', the previously hard-coded value).

    Returns:
        The fine-tuned model, left in eval mode if at least one epoch ran.
    """
    # Move the model first, then build the optimizer over the moved parameters,
    # as recommended by the torch.optim documentation.
    base_model = base_model.to(device)
    optimizer = optim.Adam(base_model.parameters(), lr=lr)

    for epoch in range(epochs):
        # ---- Train ----
        base_model.train()

        total_train_loss = 0
        train_batches = 0  # stays 0 for an empty loader, guarded below
        list_hyp, list_label = [], []

        train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
        # Scoped grad mode: training works even if the caller had grads disabled,
        # and the caller's global autograd setting is restored afterwards.
        with torch.enable_grad():
            for train_batches, batch_data in enumerate(train_pbar, start=1):
                # Forward base_model
                loss, batch_hyp, batch_label = forward_sequence_classification(
                    base_model, batch_data[:-1], i2w=i2w, device=device)

                # Update base_model
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                total_train_loss = total_train_loss + loss.item()

                # Accumulate predictions / labels for the epoch-level metrics
                list_hyp += batch_hyp
                list_label += batch_label

                train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
                    total_train_loss/train_batches, get_lr(optimizer)))

        # Calculate train metric
        metrics = document_sentiment_metrics_fn(list_hyp, list_label)
        print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch+1),
            total_train_loss/max(train_batches, 1), metrics_to_string(metrics), get_lr(optimizer)))

        # ---- Evaluate on validation ----
        base_model.eval()

        total_loss = 0
        valid_batches = 0
        list_hyp, list_label = [], []

        pbar = tqdm(valid_loader, leave=True, total=len(valid_loader))
        # no_grad is a context manager, so gradients are not left disabled
        # globally once this function returns.
        with torch.no_grad():
            for valid_batches, batch_data in enumerate(pbar, start=1):
                loss, batch_hyp, batch_label = forward_sequence_classification(
                    base_model, batch_data[:-1], i2w=i2w, device=device)

                # Calculate total loss
                total_loss = total_loss + loss.item()

                # Running metrics over everything seen so far, for the progress bar
                list_hyp += batch_hyp
                list_label += batch_label
                metrics = document_sentiment_metrics_fn(list_hyp, list_label)

                pbar.set_description("VALID LOSS:{:.4f} {}".format(total_loss/valid_batches, metrics_to_string(metrics)))

        metrics = document_sentiment_metrics_fn(list_hyp, list_label)
        print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch+1),
            total_loss/max(valid_batches, 1), metrics_to_string(metrics)))
    return base_model

In [None]:
tokenizer, config, model = init_model("IndoBERT")
finetuned_model = 