# Emotion Recognition
The dataset has 8 labels: sadness, anger, surprise, fear, joy, disgust, trust, anticipation.

In [1]:
import random
import numpy as np
import pandas as pd
import torch
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm

from transformers import BertForSequenceClassification, BertConfig, BertTokenizer
from nltk.tokenize import TweetTokenizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from utils.forward_fn import forward_sequence_classification
from utils.metrics import document_sentiment_metrics_fn
from utils.data_utils import EmotionDetectionDataset, EmotionDetectionDataLoader

In [2]:
print("Is cuda available?", torch.cuda.is_available())
print("Device count?", torch.cuda.device_count())
print("Current device?", torch.cuda.current_device())
print("Device name? ", torch.cuda.get_device_name(torch.cuda.current_device()))

Is cuda available? True
Device count? 4
Current device? 0
Device name?  NVIDIA RTX A5000


In [3]:
torch.cuda.set_device(1)

In [4]:
# Common functions
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    
def count_param(module, trainable=False):
    if trainable:
        return sum(p.numel() for p in module.parameters() if p.requires_grad)
    else:
        return sum(p.numel() for p in module.parameters())
    
def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']

def metrics_to_string(metric_dict):
    string_list = []
    for key, value in metric_dict.items():
        string_list.append('{}:{:.2f}'.format(key, value))
    return ' '.join(string_list)

In [5]:
# Set random seed
set_seed(26092020)

## Setup model

In [6]:
# Load Tokenizer and Config
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-large-p1')
config = BertConfig.from_pretrained('indobenchmark/indobert-large-p1')
config.num_labels = EmotionDetectionDataset.NUM_LABELS

# Instantiate model
model = BertForSequenceClassification.from_pretrained('indobenchmark/indobert-large-p1', config=config)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-large-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,

In [8]:
count_param(model)

335150088

## Prepare Dataset

In [9]:
dataset_path = "./dataset/emotion.csv"

df = pd.read_csv(dataset_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7200 entries, 0 to 7199
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tweet   7199 non-null   object
 1   label   7200 non-null   object
dtypes: object(2)
memory usage: 112.6+ KB


In [10]:
df = df.dropna(axis=0, how="any")
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7199 entries, 0 to 7199
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tweet   7199 non-null   object
 1   label   7199 non-null   object
dtypes: object(2)
memory usage: 168.7+ KB


In [11]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [12]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5759 entries, 2034 to 860
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tweet   5759 non-null   object
 1   label   5759 non-null   object
dtypes: object(2)
memory usage: 135.0+ KB


In [13]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1440 entries, 4899 to 3340
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tweet   1440 non-null   object
 1   label   1440 non-null   object
dtypes: object(2)
memory usage: 33.8+ KB


In [14]:
df.label.unique()

array(['disgust', 'fear', 'surprise', 'anticipation', 'anger', 'joy',
       'trust', 'sadness'], dtype=object)

In [15]:
df_train.to_csv('./dataset/train.csv', index=False)
df_test.to_csv('./dataset/test.csv', index=False)

In [16]:
train_dataset_path = './dataset/train.csv'
test_dataset_path = './dataset/test.csv'

train_dataset = EmotionDetectionDataset(train_dataset_path, tokenizer, lowercase=True)
test_dataset = EmotionDetectionDataset(test_dataset_path, tokenizer, lowercase=True)

train_loader = EmotionDetectionDataLoader(dataset=train_dataset, max_seq_len=512, batch_size=32, num_workers=16, shuffle=True)   
test_loader = EmotionDetectionDataLoader(dataset=test_dataset, max_seq_len=512, batch_size=32, num_workers=16, shuffle=False)

In [17]:
w2i, i2w = EmotionDetectionDataset.LABEL2INDEX, EmotionDetectionDataset.INDEX2LABEL
print(w2i)
print(i2w)

{'sadness': 0, 'anger': 1, 'surprise': 2, 'fear': 3, 'joy': 4, 'disgust': 5, 'trust': 6, 'anticipation': 7}
{0: 'sadness', 1: 'anger', 2: 'surprise', 3: 'fear', 4: 'joy', 5: 'disgust', 6: 'trust', 7: 'anticipation'}


## Test sample sentences

In [18]:
text = 'Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita'
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita | Label : trust (24.821%)


In [19]:
text = 'Budi pergi ke pondok indah mall membeli cakwe'
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Budi pergi ke pondok indah mall membeli cakwe | Label : trust (23.743%)


In [20]:
text = 'Dasar anak sialan!! Kurang ajar!!'
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Dasar anak sialan!! Kurang ajar!! | Label : joy (25.949%)


## Finetuning model

In [21]:
optimizer = optim.Adam(model.parameters(), lr=5e-6)
model = model.cuda()

In [22]:
# Function to save checkpoint
def save_checkpoint(model, optimizer, epoch, loss, path):
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
    }, path)

In [23]:
checkpoint_path = './models/ckpt_ep_{}.pth'

In [25]:
# Train
n_epochs = 10
for epoch in range(n_epochs):
    model.train()
    torch.set_grad_enabled(True)
 
    total_train_loss = 0
    list_hyp, list_label = [], []

    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    for i, batch_data in enumerate(train_pbar):
        # Forward model
        loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

        # Update model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        tr_loss = loss.item()
        total_train_loss = total_train_loss + tr_loss

        # Calculate metrics
        list_hyp += batch_hyp
        list_label += batch_label

        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
            total_train_loss/(i+1), get_lr(optimizer)))

    # Calculate train metric
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch+1),
        total_train_loss/(i+1), metrics_to_string(metrics), get_lr(optimizer)))

    # Evaluate on validation
    model.eval()
    torch.set_grad_enabled(False)
    
    total_loss, total_correct, total_labels = 0, 0, 0
    list_hyp, list_label = [], []

    pbar = tqdm(test_loader, leave=True, total=len(test_loader))
    for i, batch_data in enumerate(pbar):
        batch_seq = batch_data[-1]        
        loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')
        
        # Calculate total loss
        valid_loss = loss.item()
        total_loss = total_loss + valid_loss

        # Calculate evaluation metrics
        list_hyp += batch_hyp
        list_label += batch_label
        metrics = document_sentiment_metrics_fn(list_hyp, list_label)

        pbar.set_description("VALID LOSS:{:.4f} {}".format(total_loss/(i+1), metrics_to_string(metrics)))
        
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch+1),
        total_loss/(i+1), metrics_to_string(metrics)))

save_checkpoint(model, optimizer, epoch+1, total_loss/(i+1), checkpoint_path.format(epoch+1))

(Epoch 1) TRAIN LOSS:0.8126 LR:0.00000500: 100%|██████████| 180/180 [01:25<00:00,  2.11it/s]


(Epoch 1) TRAIN LOSS:0.8126 ACC:0.76 F1:0.76 REC:0.76 PRE:0.76 LR:0.00000500


VALID LOSS:0.7432 ACC:0.75 F1:0.75 REC:0.75 PRE:0.75: 100%|██████████| 45/45 [00:09<00:00,  4.62it/s]


(Epoch 1) VALID LOSS:0.7432 ACC:0.75 F1:0.75 REC:0.75 PRE:0.75


(Epoch 2) TRAIN LOSS:0.5366 LR:0.00000500: 100%|██████████| 180/180 [01:30<00:00,  1.98it/s]


(Epoch 2) TRAIN LOSS:0.5366 ACC:0.84 F1:0.84 REC:0.84 PRE:0.84 LR:0.00000500


VALID LOSS:0.6688 ACC:0.78 F1:0.78 REC:0.78 PRE:0.78: 100%|██████████| 45/45 [00:10<00:00,  4.48it/s]


(Epoch 2) VALID LOSS:0.6688 ACC:0.78 F1:0.78 REC:0.78 PRE:0.78


(Epoch 3) TRAIN LOSS:0.3848 LR:0.00000500: 100%|██████████| 180/180 [01:31<00:00,  1.97it/s]


(Epoch 3) TRAIN LOSS:0.3848 ACC:0.88 F1:0.88 REC:0.88 PRE:0.88 LR:0.00000500


VALID LOSS:0.6425 ACC:0.79 F1:0.79 REC:0.79 PRE:0.79: 100%|██████████| 45/45 [00:10<00:00,  4.49it/s]


(Epoch 3) VALID LOSS:0.6425 ACC:0.79 F1:0.79 REC:0.79 PRE:0.79


(Epoch 4) TRAIN LOSS:0.2599 LR:0.00000500: 100%|██████████| 180/180 [01:32<00:00,  1.95it/s]


(Epoch 4) TRAIN LOSS:0.2599 ACC:0.93 F1:0.93 REC:0.93 PRE:0.93 LR:0.00000500


VALID LOSS:0.6521 ACC:0.79 F1:0.79 REC:0.79 PRE:0.79: 100%|██████████| 45/45 [00:08<00:00,  5.39it/s]


(Epoch 4) VALID LOSS:0.6521 ACC:0.79 F1:0.79 REC:0.79 PRE:0.79


(Epoch 5) TRAIN LOSS:0.1763 LR:0.00000500: 100%|██████████| 180/180 [01:25<00:00,  2.10it/s]


(Epoch 5) TRAIN LOSS:0.1763 ACC:0.96 F1:0.96 REC:0.96 PRE:0.96 LR:0.00000500


VALID LOSS:0.6717 ACC:0.80 F1:0.80 REC:0.79 PRE:0.80: 100%|██████████| 45/45 [00:09<00:00,  4.64it/s]


(Epoch 5) VALID LOSS:0.6717 ACC:0.80 F1:0.80 REC:0.79 PRE:0.80


(Epoch 6) TRAIN LOSS:0.1117 LR:0.00000500: 100%|██████████| 180/180 [01:29<00:00,  2.00it/s]


(Epoch 6) TRAIN LOSS:0.1117 ACC:0.98 F1:0.98 REC:0.98 PRE:0.98 LR:0.00000500


VALID LOSS:0.7009 ACC:0.80 F1:0.80 REC:0.80 PRE:0.80: 100%|██████████| 45/45 [00:09<00:00,  4.54it/s]


(Epoch 6) VALID LOSS:0.7009 ACC:0.80 F1:0.80 REC:0.80 PRE:0.80


(Epoch 7) TRAIN LOSS:0.0776 LR:0.00000500: 100%|██████████| 180/180 [01:30<00:00,  1.99it/s]


(Epoch 7) TRAIN LOSS:0.0776 ACC:0.98 F1:0.98 REC:0.98 PRE:0.98 LR:0.00000500


VALID LOSS:0.7585 ACC:0.79 F1:0.79 REC:0.79 PRE:0.79: 100%|██████████| 45/45 [00:09<00:00,  4.61it/s]


(Epoch 7) VALID LOSS:0.7585 ACC:0.79 F1:0.79 REC:0.79 PRE:0.79


(Epoch 8) TRAIN LOSS:0.0515 LR:0.00000500: 100%|██████████| 180/180 [01:29<00:00,  2.01it/s]


(Epoch 8) TRAIN LOSS:0.0515 ACC:0.99 F1:0.99 REC:0.99 PRE:0.99 LR:0.00000500


VALID LOSS:0.7678 ACC:0.80 F1:0.80 REC:0.80 PRE:0.80: 100%|██████████| 45/45 [00:08<00:00,  5.32it/s]


(Epoch 8) VALID LOSS:0.7678 ACC:0.80 F1:0.80 REC:0.80 PRE:0.80


(Epoch 9) TRAIN LOSS:0.0347 LR:0.00000500: 100%|██████████| 180/180 [01:30<00:00,  1.98it/s]


(Epoch 9) TRAIN LOSS:0.0347 ACC:0.99 F1:0.99 REC:0.99 PRE:0.99 LR:0.00000500


VALID LOSS:0.7975 ACC:0.80 F1:0.80 REC:0.80 PRE:0.80: 100%|██████████| 45/45 [00:09<00:00,  4.62it/s]


(Epoch 9) VALID LOSS:0.7975 ACC:0.80 F1:0.80 REC:0.80 PRE:0.80


(Epoch 10) TRAIN LOSS:0.0285 LR:0.00000500: 100%|██████████| 180/180 [01:31<00:00,  1.97it/s]


(Epoch 10) TRAIN LOSS:0.0285 ACC:1.00 F1:1.00 REC:1.00 PRE:1.00 LR:0.00000500


VALID LOSS:0.8325 ACC:0.80 F1:0.80 REC:0.80 PRE:0.80: 100%|██████████| 45/45 [00:10<00:00,  4.38it/s]


(Epoch 10) VALID LOSS:0.8325 ACC:0.80 F1:0.80 REC:0.80 PRE:0.80


In [46]:
model.save_pretrained("./models")
tokenizer.save_pretrained("./models")

('./models/tokenizer_config.json',
 './models/special_tokens_map.json',
 './models/vocab.txt',
 './models/added_tokens.json')

## Evaluation

In [27]:
def evaluate_and_report(model, valid_loader, i2w):
    model.eval()
    torch.set_grad_enabled(False)
    
    list_hyp, list_label = [], []

    pbar = tqdm(valid_loader, leave=True, total=len(valid_loader))
    for i, batch_data in enumerate(pbar):
        loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')
        
        # Collect predictions and labels
        list_hyp += batch_hyp
        list_label += batch_label

    # Convert labels to index
    # labels = list(range(len(target_names)))  # Adjust this as needed

    list_hyp_idx = [EmotionDetectionDataset.LABEL2INDEX[hyp] for hyp in list_hyp]
    list_label_idx = [EmotionDetectionDataset.LABEL2INDEX[label] for label in list_label]
    
    # Calculate metrics
    metrics = document_sentiment_metrics_fn(list_hyp_idx, list_label_idx)
    print("\nMetrics:")
    for key, value in metrics.items():
        print(f"{key}: {value}")
    
    # Generate classification report
    target_names = [EmotionDetectionDataset.INDEX2LABEL[i] for i in range(EmotionDetectionDataset.NUM_LABELS)]
    
    print("Number of unique labels in list_label_idx:", len(set(list_label_idx)))
    print("Length of target_names:", len(target_names))
    
    print("\nClassification Report:")
    print(classification_report(list_label_idx, list_hyp_idx, target_names=target_names))

    # Create a DataFrame with true labels and predicted labels
    df_results = pd.DataFrame({
        'True Label': list_label,
        'Predicted Label': list_hyp
    })
    
    return df_results

df_results = evaluate_and_report(model, test_loader, i2w)
print(df_results)

100%|██████████| 45/45 [00:08<00:00,  5.03it/s]


Metrics:
ACC: 0.8006944444444445
F1: 0.8009739976113599
REC: 0.8018016901607073
PRE: 0.8007761109483404
Number of unique labels in list_label_idx: 8
Length of target_names: 8

Classification Report:
              precision    recall  f1-score   support

     sadness       0.78      0.83      0.80       159
       anger       0.81      0.83      0.82       168
    surprise       0.75      0.75      0.75       196
        fear       0.91      0.89      0.90       180
         joy       0.85      0.80      0.82       206
     disgust       0.78      0.81      0.79       175
       trust       0.76      0.76      0.76       178
anticipation       0.77      0.74      0.75       178

    accuracy                           0.80      1440
   macro avg       0.80      0.80      0.80      1440
weighted avg       0.80      0.80      0.80      1440

     True Label Predicted Label
0           joy             joy
1         anger           anger
2      surprise        surprise
3          fear      




In [28]:
df_results[df_results['True Label']!=df_results['Predicted Label']]

Unnamed: 0,True Label,Predicted Label
5,surprise,anger
6,surprise,joy
24,trust,anger
56,surprise,trust
59,anticipation,trust
...,...,...
1420,anticipation,anger
1424,disgust,sadness
1430,trust,anticipation
1431,joy,trust


In [29]:
false_preds = len(df_results[df_results['True Label']!=df_results['Predicted Label']])

print(f"False predictions: {false_preds}/{len(df_results)}")
print(f"% False preds: {false_preds/len(df_results)}")

False predictions: 287/1440
% False preds: 0.19930555555555557


## Testing

In [30]:
# Pastikan NLTK sudah mengunduh stopwords dan punktuasi
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

# Inisialisasi stop words untuk bahasa Indonesia
stop_words = set(stopwords.words('indonesian'))

# Kamus emotikon ke emosi dalam bahasa Indonesia
emoticon_dict = {
    "😊": "ekspresi senang",
    "🤣": "ekspresi senang",
    "😃": "ekspresi senang",
    "😄": "ekspresi senang",
    "😂": "ekspresi senang",
    "😁": "ekspresi senang",
    "😆": "ekspresi senang",
    "😍": "ekspresi senang",
    "🤗": "ekspresi senang",
    "🤨": "ekspresi kaget",
    "😯": "ekspresi kaget",
    "😮": "ekspresi kaget",
    "🤢": "ekspresi jijik",
    "🤮": "ekspresi jijik",
    "😷": "ekspresi jijik",
    "😖": "ekspresi jijik",
    "😫": "ekspresi jijik",
    "😩": "ekspresi jijik",
    "😲": "ekspresi kaget",
    "🤯": "ekspresi kaget",
    "😢": "ekspresi sedih",
    "😭": "ekspresi sedih",
    "😞": "ekspresi sedih",
    "😔": "ekspresi sedih",
    "😟": "ekspresi sedih",
    "😕": "ekspresi sedih",
    "😦": "ekspresi sedih",
    "😿": "ekspresi sedih",
    "🤝": "ekspresi percaya",
    "👍": "ekspresi percaya",
    "🙏": "ekspresi percaya",
    "🤲": "ekspresi antisipasi",
    "😡": "ekspresi marah",
    "😠": "ekspresi marah",
    "🤬": "ekspresi sangat marah",
    "😤": "ekspresi marah",
    "😾": "ekspresi marah",
    "😨": "ekspresi takut",
    "😰": "ekspresi takut",
    "😥": "ekspresi takut",
    "😱": "ekspresi takut",
    ":')": "ekspresi senang",
    ":)": "ekspresi senang",
    ":D": "ekspresi senang",
    ":(": "ekspresi sedih",
    ":'(": "ekspresi sedih",
    ":-)": "ekspresi senang",
    ":-D": "ekspresi senang",
    ":-(": "ekspresi sedih",
    ":P": "ekspresi bahagia",
    ";)": "ekspresi senang",
    ":-O": "ekspresi kaget",
    ":O": "ekspresi kaget",
}

def replace_emoticons(text, emoticon_dict):
    for emoticon, emotion in emoticon_dict.items():
        if emoticon in text:
            # Tambah koma sebelum deskripsi emosi jika ada teks sebelum emotikon
            text = re.sub(r'(\S)(' + re.escape(emoticon) + r')', r'\1, ' + emotion, text)
            # Ganti emotikon yang berdiri sendiri tanpa tambahan koma
            text = text.replace(emoticon, emotion)
    return text

def clean_tweet(tweet, emoticon_dict):
    # Mengubah teks menjadi huruf kecil
    tweet = tweet.lower()
    # Menghapus URL
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
    # Menghapus mention (@username) dan hashtag
    tweet = re.sub(r'@\w+|#', '', tweet)
    # Menghapus angka
    tweet = re.sub(r'\d+', '', tweet)
    # Gantikan emotikon dengan deskripsi emosi terlebih dahulu
    tweet = replace_emoticons(tweet, emoticon_dict)
    # Menghapus tanda baca dan simbol kecuali yang ada dalam deskripsi emotikon
    allowed_punctuation = ''.join(re.escape(c) for c in emoticon_dict.values())
    tweet = re.sub(r'[^\w\s' + allowed_punctuation + r']', '', tweet)
    # Menghapus karakter non-ASCII
    tweet = re.sub(r'[^\x00-\x7F]+', '', tweet)
    # Menghapus spasi berlebih
    tweet = re.sub(r'\s+', ' ', tweet).strip()
    return tweet

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter-23522009/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/jupyter-23522009/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [31]:
text = '🤢'
cleaned_text = clean_tweet(text, emoticon_dict)
subwords = tokenizer.encode(cleaned_text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: 🤢 | Label : disgust (99.725%)


In [32]:
text = '😡'
cleaned_text = clean_tweet(text, emoticon_dict)
subwords = tokenizer.encode(cleaned_text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: 😡 | Label : anger (99.894%)


In [33]:
text = '😱'
cleaned_text = clean_tweet(text, emoticon_dict)
subwords = tokenizer.encode(cleaned_text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: 😱 | Label : fear (68.661%)


In [34]:
text = 'woy aku beneran dapat hadiah ??'
cleaned_text = clean_tweet(text, emoticon_dict)
subwords = tokenizer.encode(cleaned_text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: woy aku beneran dapat hadiah ?? | Label : surprise (99.845%)


In [35]:
text = 'nasib jomblo :('
cleaned_text = clean_tweet(text, emoticon_dict)
subwords = tokenizer.encode(cleaned_text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: nasib jomblo :( | Label : sadness (99.857%)


In [36]:
text = 'kalau aku salah pencet gimana gaiss, takut bgt uang nya hangus'
cleaned_text = clean_tweet(text, emoticon_dict)
subwords = tokenizer.encode(cleaned_text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: kalau aku salah pencet gimana gaiss, takut bgt uang nya hangus | Label : fear (99.916%)


In [37]:
text = "coba deh kamu taya dia, soalnya dia biasanya jujur"
cleaned_text = clean_tweet(text, emoticon_dict)
subwords = tokenizer.encode(cleaned_text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: coba deh kamu taya dia, soalnya dia biasanya jujur | Label : trust (99.420%)


In [38]:
text = 'bedak nya keputihan ga sih ?'
cleaned_text = clean_tweet(text, emoticon_dict)
subwords = tokenizer.encode(cleaned_text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: bedak nya keputihan ga sih ? | Label : disgust (96.885%)


In [39]:
text = '⁠makasih kakak'
cleaned_text = clean_tweet(text, emoticon_dict)
subwords = tokenizer.encode(cleaned_text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: ⁠makasih kakak | Label : joy (99.699%)


In [40]:
text = 'seharusnya ngomong nya di filter kak, jangan langsung gitu dong'
cleaned_text = clean_tweet(text, emoticon_dict)
subwords = tokenizer.encode(cleaned_text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: seharusnya ngomong nya di filter kak, jangan langsung gitu dong | Label : anger (58.365%)


In [41]:
text = 'ga sabar bgt besok nonton konser'
cleaned_text = clean_tweet(text, emoticon_dict)
subwords = tokenizer.encode(cleaned_text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: ga sabar bgt besok nonton konser | Label : anticipation (99.864%)


## Load model

In [47]:
model_path = "./models"
tokenizer_inf = BertTokenizer.from_pretrained(model_path)
model_inf = BertForSequenceClassification.from_pretrained(model_path)

In [48]:
new_text = "pokoknya aku harus baca dulu acara nya, biar nanti ga salah"
cleaned_text = clean_tweet(new_text, emoticon_dict)
subwords = tokenizer_inf.encode(cleaned_text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model_inf.device)

logits = model_inf(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()
print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: ga sabar bgt besok nonton konser | Label : anticipation (99.625%)
