#   WARM-UP

In [3]:
import numpy as np
import pandas as pd
import re
import unicodedata
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
dev = pd.read_csv('dev.csv')
print(train.head())
print(train.info())


   index                                            comment  n_star  \
0      0  Mới mua máy này Tại thegioididong thốt nốt cảm...       5   
1      1  Pin kém còn lại miễn chê mua 8/3/2019 tình trạ...       5   
2      2  Sao lúc gọi điện thoại màn hình bị chấm nhỏ nh...       3   
3      3  Mọi người cập nhật phần mềm lại , nó sẽ bớt tố...       3   
4      4  Mới mua Sài được 1 tháng thấy pin rất trâu, Sà...       5   

      date_time                                              label  
0  2 tuần trước  {CAMERA#Positive};{FEATURES#Positive};{BATTERY...  
1    14/09/2019    {BATTERY#Negative};{GENERAL#Positive};{OTHERS};  
2    17/08/2020                               {FEATURES#Negative};  
3    29/02/2020  {FEATURES#Negative};{BATTERY#Neutral};{GENERAL...  
4      4/6/2020  {BATTERY#Positive};{PERFORMANCE#Positive};{SER...  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7786 entries, 0 to 7785
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------   

# PREPROCESSING DATA

In [3]:

# preprocess comment
def basic(text):
    text = text.lower()
    text = re.sub(r'[^\w\s!?]', '', text)
    return text

slang_dict = {
    'ok': 'ổn',
    'oke': 'ổn',
    'oki': 'ổn',
    'okay': 'ổn',
    'k': 'không',
    'ko': 'không',
    'j': 'gì',
    'đc': 'được'
}

def handle_slang(text, slang_dict=slang_dict):
    for slang, formal in slang_dict.items():
        text = re.sub(r'\b' + slang + r'\b', formal, text)
    return text

def handle_emoji(text):
    return ''.join(char for char in text if not unicodedata.category(char).startswith('So'))

def preprocess_comment(text):
    # Convert to Unicode NFC format
    text = unicodedata.normalize('NFC', text)
    text = basic(text)
    text = handle_slang(text)
    text = handle_emoji(text)
    return text

text = "có chuyện j k bạn 👍"
processed_text = preprocess_comment(text)
print("Original Text:", text)
print("Processed Text:", processed_text)

dev['comment'].apply(preprocess_comment)
train['comment'].apply(preprocess_comment)

Original Text: có chuyện j k bạn 👍
Processed Text: có chuyện gì không bạn 


0       mới mua máy này tại thegioididong thốt nốt cảm...
1       pin kém còn lại miễn chê mua 832019 tình trạng...
2       sao lúc gọi điện thoại màn hình bị chấm nhỏ nh...
3       mọi người cập nhật phần mềm lại  nó sẽ bớt tốn...
4       mới mua sài được 1 tháng thấy pin rất trâu sài...
                              ...                        
7781    8g cái đi đánh là mạng giật giật không chịu nổ...
7782    mua dk giảm 500k mà lỗi lòi ra hết treo màn hì...
7783    máy sài 3 tháng rồi rất okpin trâu khỏi nói sà...
7784    rất tiếc hàng realme không có ốp lưng ngoài  n...
7785    mình rất thất vọng khi mua máy này bắt wifi cự...
Name: comment, Length: 7786, dtype: object

In [4]:
def preprocess_label (df):
    columns_to_drop = ['n_star', 'date_time']
    df.drop(columns=[col for col in columns_to_drop if col in df.columns], inplace=True)
    df['label'] = df['label'].str.replace(r';?\{OTHERS\};?', '', regex=True).str.strip(';')
    return df
preprocess_label(dev)
print(dev['label'].iloc[105])

{CAMERA#Positive};{PERFORMANCE#Negative};{GENERAL#Neutral}


In [None]:

# preprocess label method 1
#turn label into tensor
#(issues) there are some comments onnly having OTHERS label

aspect_categories = ['BATTERY', 'CAMERA', 'DESIGN', 'FEATURES', 'GENERAL', 'PERFORMANCE', 'PRICE', 'SCREEN', 'SER&ACC', 'STORAGE']    
polarity_to_onehot = { 'Positive': [1,0,0], 'Negative': [0,1,0], 'Neutral': [0,0,1]} 

def label_to_tensor(label: str, aspect_categories: list, polarity_to_onehot: dict):
    tensor = torch.zeros((len(aspect_categories), len(polarity_to_onehot)))
    components = label.split(';')
    for component in components:
        component = component.strip('{}')
        if '#' in component:
            aspect, polarity = component.split('#')
            if aspect in aspect_categories:
                aspect_idx = aspect_categories.index(aspect)
                tensor[aspect_idx] = torch.tensor(polarity_to_onehot[polarity], dtype=torch.float32)

                
    return tensor.flatten()

df = pd.DataFrame({'label': ["{CAMERA#Positive};{BATTERY#Positive};{FEATURES#Negative}", 
                             "{CAMERA#Neutral};{BATTERY#Negative}", "{OTHERS}"]})  
df['label'] = df['label'].apply(lambda x: label_to_tensor(x, aspect_categories, polarity_to_onehot))
print (df['label'])



0    [tensor(1.), tensor(0.), tensor(0.), tensor(1....
1    [tensor(0.), tensor(1.), tensor(0.), tensor(0....
2    [tensor(0.), tensor(0.), tensor(0.), tensor(0....
Name: label, dtype: object


In [None]:

def final_preprocess_v1(table):
    table['comment'] = table['comment'].apply(preprocess_comment)
    preprocess_label(table)
    table['label'] = table['label'].apply(lambda x: label_to_tensor(x, aspect_categories, polarity_to_onehot))
    return table
final_preprocess_v1(dev)
final_preprocess_v1(train)
final_preprocess_v1(test)

Unnamed: 0,index,comment,label
0,0,điện thoải ổn facelock cực nhanh vân tay ôk m...,"[tensor(0.), tensor(0.), tensor(0.), tensor(0...."
1,1,mình mới mua vivo91c tải ứng dụng games nhan...,"[tensor(0.), tensor(0.), tensor(0.), tensor(0...."
2,2,xấu đẹp gì không biết nhưng rất ưng tgdđ phục ...,"[tensor(0.), tensor(0.), tensor(0.), tensor(0...."
3,3,màn hình hơi lác khi chơi game game nặng thì m...,"[tensor(0.), tensor(0.), tensor(0.), tensor(0...."
4,4,nói chung máy đẹp với màn amoled ổn trong tầm ...,"[tensor(0.), tensor(1.), tensor(0.), tensor(0...."
...,...,...,...
2219,2219,mẫu mã đẹp lung linh máy chạy cực nhanh mượt h...,"[tensor(0.), tensor(0.), tensor(0.), tensor(0...."
2220,2220,có ai bị giống mình không máy thì sài bình thư...,"[tensor(0.), tensor(0.), tensor(0.), tensor(0...."
2221,2221,sản phẩm tốt \nai chơi game cứ mang 1 em về mà...,"[tensor(0.), tensor(0.), tensor(0.), tensor(0...."
2222,2222,vừa mới mua xong máy rất đẹp nhân viên rất nhi...,"[tensor(1.), tensor(0.), tensor(0.), tensor(1...."


In [5]:
# Define a function to filter for "PERFORMANCE" aspect
def filter_performance_only(label):
    # Retain only the entries with "PERFORMANCE"
    performance_only = ';'.join([item for item in label.split(';') if item.startswith("{PERFORMANCE")])
    return performance_only

def polarity_to_one_hot(label):
    if "PERFORMANCE#Positive" in label:
        return torch.tensor([1, 0, 0,0], dtype=torch.float32)
    elif "PERFORMANCE#Negative" in label:
        return torch.tensor([0, 1, 0,0], dtype=torch.float32)
    elif "PERFORMANCE#Neutral" in label:
        return torch.tensor([0, 0, 1, 0], dtype=torch.float32)  
    else:
        return torch.tensor([0, 0, 0, 1], dtype=torch.float32)


def final_preprocess(table):
    table['comment'] = table['comment'].apply(preprocess_comment)
    preprocess_label(table)
    table['label'] = table['label'].apply(filter_performance_only)
    table['label'] = table['label'].apply(polarity_to_one_hot)
    return table

final_preprocess(dev)
final_preprocess(train)
final_preprocess(test)


Unnamed: 0,index,comment,label
0,0,điện thoải ổn facelock cực nhanh vân tay ôk m...,"[tensor(1.), tensor(0.), tensor(0.), tensor(0.)]"
1,1,mình mới mua vivo91c tải ứng dụng games nhan...,"[tensor(1.), tensor(0.), tensor(0.), tensor(0.)]"
2,2,xấu đẹp gì không biết nhưng rất ưng tgdđ phục ...,"[tensor(0.), tensor(0.), tensor(0.), tensor(1.)]"
3,3,màn hình hơi lác khi chơi game game nặng thì m...,"[tensor(0.), tensor(1.), tensor(0.), tensor(0.)]"
4,4,nói chung máy đẹp với màn amoled ổn trong tầm ...,"[tensor(0.), tensor(0.), tensor(0.), tensor(1.)]"
...,...,...,...
2219,2219,mẫu mã đẹp lung linh máy chạy cực nhanh mượt h...,"[tensor(1.), tensor(0.), tensor(0.), tensor(0.)]"
2220,2220,có ai bị giống mình không máy thì sài bình thư...,"[tensor(0.), tensor(1.), tensor(0.), tensor(0.)]"
2221,2221,sản phẩm tốt \nai chơi game cứ mang 1 em về mà...,"[tensor(1.), tensor(0.), tensor(0.), tensor(0.)]"
2222,2222,vừa mới mua xong máy rất đẹp nhân viên rất nhi...,"[tensor(1.), tensor(0.), tensor(0.), tensor(0.)]"


# Tokenization and Embedding

In [6]:
from torchtext.vocab import FastText
tokenizer = AutoTokenizer.from_pretrained("bkai-foundation-models/vietnamese-bi-encoder")
fastText = FastText(language='vi')

MAX_LENGTH = 128
BATCH_SIZE = 32
EMBEDDING_DIM = fastText.dim  
VOCAB_SIZE = tokenizer.vocab_size

encoding = tokenizer(
    train['comment'].tolist(),
    padding="max_length",
    truncation=True,
    max_length=MAX_LENGTH,
    return_tensors="pt"
)
token_ids = encoding['input_ids']
tokens = []

for each in token_ids:
    temp = tokenizer.convert_ids_to_tokens(each)
    tokens.append(temp)

embedding_matrix = torch.zeros(VOCAB_SIZE, EMBEDDING_DIM)
for each in tokens:
    for token in each:
        if token in fastText.stoi:
            vector=torch.tensor(fastText[token], dtype=torch.float32)
        else:
            vector = torch.zeros(EMBEDDING_DIM)
        embedding_matrix[tokenizer.convert_tokens_to_ids(token)] = vector


  self.itos, self.stoi, self.vectors, self.dim = torch.load(path_pt)
  vector=torch.tensor(fastText[token], dtype=torch.float32)


In [None]:
def tokenize(df):
    texts = df["comment"].tolist()
    labels = df["label"].tolist()

    encoding = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors="pt"
    )
    token_ids = encoding['input_ids'] 
    return token_ids, labels


class TextDataset(Dataset):
    def __init__(self, token_ids, labels):
        self.token_ids = token_ids
        self.labels = labels
    def __len__(self):
        return len(self.token_ids)
    def __getitem__(self, idx):
        tokens = self.token_ids[idx]
        label = torch.tensor(self.labels[idx], dtype=torch.float32)  
        return tokens, label

def create_dataloader(df, batch_size=BATCH_SIZE):
    token_ids, labels = tokenize(df)
    dataset = TextDataset(token_ids, labels)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return dataloader



train_loader = create_dataloader(train)
dev_loader = create_dataloader(dev)
test_loader = create_dataloader(test)

In [8]:
for batch in train_loader:
    tokens, labels = batch
    print(tokens.shape)
    print(labels.shape)
    break
print (len(train_loader))

torch.Size([32, 128])
torch.Size([32, 4])
244


  label = torch.tensor(self.labels[idx], dtype=torch.float32)


# Model_v1


In [None]:
#ABSA model from SA2SL paper
class ABSA(nn.Module):
    def __init__(self, embedd_matrix=embedding_matrix, EMBED_DIM=EMBEDDING_DIM, LSTM_UNITS=128, conv_filters=64):
        super(ABSA, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedd_matrix, 
                                                      freeze=True)
        self.spatial_dropout = nn.Dropout2d(0.35)
        self.lstm = nn.LSTM(EMBED_DIM, 
                            LSTM_UNITS, 
                            bidirectional=True, 
                            batch_first=True, 
                            dropout=0.15)
        
        self.conv1d = nn.Conv1d(LSTM_UNITS * 2, 
                                conv_filters, 
                                kernel_size=3, 
                                padding='valid')
        self.avg_pool = nn.AdaptiveAvgPool1d(1)
        self.max_pool = nn.AdaptiveMaxPool1d(1)
        self.fc = nn.Linear(conv_filters * 2, 30) 

    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(0, 2, 1)  #(batch, channels, seq_len) for dropout2d
        x = self.spatial_dropout(x)
        x = x.permute(0, 2, 1)  # back to (batch, seq_len, channels)
        x, _ = self.lstm(x)
        x = x.permute(0, 2, 1)  # (batch, channels, seq_len)
        x = self.conv1d(x)
        avg_pool = self.avg_pool(x).squeeze(-1)
        max_pool = self.max_pool(x).squeeze(-1)
        x = torch.cat((avg_pool, max_pool), dim=1)
        out = torch.sigmoid(self.fc(x)) 
        return out


In [25]:
model = ABSA()
criterion = nn.BCELoss()  
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)
criterion.to(device)



BCELoss()

In [None]:
EPOCHS = 20
BATCH_SIZE = 32 

for epoch in range(EPOCHS):
    model.train()
    correct_predictions = 0
    total_loss = 0
    
    for i, batch in enumerate(train_loader):
        tokens, labels = batch
        tokens, labels = tokens.to(device), labels.to(device)
        
        optimizer.zero_grad()
        output = model(tokens)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        
        predictions = torch.sigmoid(output) > 0.5
        correct_predictions += (predictions == labels).sum().item()
        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_loader)
    avg_acc = correct_predictions / (len(train_loader) * BATCH_SIZE*30)
    print(f"Epoch {epoch+1}/{EPOCHS}, Training Loss: {avg_loss:.4f}, Training Acc: {avg_acc}")
    
    model.eval()
    correct_predictions = 0
    total_loss = 0
    
    with torch.inference_mode():
        for i, batch in enumerate(dev_loader):
            tokens, labels = batch
            tokens, labels = tokens.to(device), labels.to(device)
            output = model(tokens)
            loss = criterion(output, labels)
            predictions = torch.sigmoid(output) > 0.5
            correct_predictions += (predictions == labels).sum().item()
            total_loss += loss.item()
        
        avg_val_loss = total_loss / len(dev_loader)
        avg_val_acc = correct_predictions / (len(dev_loader) * BATCH_SIZE*30)
        print(f"Validation Loss: {avg_val_loss:.4f}, Validation Acc: {avg_val_acc}")


  label = torch.tensor(self.labels[idx], dtype=torch.float32)


Epoch 1/20, Training Loss: 0.0401, Training Acc: 0.40701844262295084
Validation Loss: 0.3060, Validation Acc: 0.4005952380952381
Epoch 2/20, Training Loss: 0.0375, Training Acc: 0.4106301229508197
Validation Loss: 0.3021, Validation Acc: 0.40520833333333334
Epoch 3/20, Training Loss: 0.0368, Training Acc: 0.43033213797814207
Validation Loss: 0.3085, Validation Acc: 0.40633928571428574
Epoch 4/20, Training Loss: 0.0393, Training Acc: 0.4385715505464481
Validation Loss: 0.3077, Validation Acc: 0.44026785714285716
Epoch 5/20, Training Loss: 0.0384, Training Acc: 0.45703125
Validation Loss: 0.3333, Validation Acc: 0.44300595238095236
Epoch 6/20, Training Loss: 0.0361, Training Acc: 0.4721311475409836
Validation Loss: 0.3355, Validation Acc: 0.44660714285714287
Epoch 7/20, Training Loss: 0.0352, Training Acc: 0.4774248633879781
Validation Loss: 0.3366, Validation Acc: 0.4855952380952381
Epoch 8/20, Training Loss: 0.3260, Training Acc: 0.5591359289617487
Validation Loss: 0.5613, Validation A

# Model_v2

In [None]:
#ABSA v2 with attention-based layer
import math

class Attention(nn.Module):
    def __init__(self, embed_dim=EMBEDDING_DIM, hidden_dim=None, out_dim=None, n_head=1, score_function='dot_product', dropout=0):
        ''' Attention Mechanism '''
        super(Attention, self).__init__()
        if hidden_dim is None:
            hidden_dim = embed_dim // n_head
        if out_dim is None:
            out_dim = embed_dim
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.n_head = n_head
        self.score_function = score_function
        self.w_k = nn.Linear(embed_dim, n_head * hidden_dim)
        self.w_q = nn.Linear(embed_dim, n_head * hidden_dim)
        self.proj = nn.Linear(n_head * hidden_dim, out_dim)
        self.dropout = nn.Dropout(dropout)
        if score_function == 'mlp':
            self.weight = nn.Parameter(torch.Tensor(hidden_dim*2))
        if score_function == 'nl':
            self.weight = nn.parameter(torch.Tensor(hidden_dim, hidden_dim))
        elif self.score_function == 'bi_linear':
            self.weight = nn.Parameter(torch.Tensor(hidden_dim, hidden_dim))
        else:  # dot_product / scaled_dot_product
            self.register_parameter('weight', None)
        self.reset_parameters()

    def reset_parameters(self):
        stdv = 1. / math.sqrt(self.hidden_dim)
        if self.weight is not None:
            self.weight.data.uniform_(-stdv, stdv)

    def forward(self, k, q):
        if len(q.shape) == 2: 
            q = torch.unsqueeze(q, dim=1)
        if len(k.shape) == 2:  
            k = torch.unsqueeze(k, dim=1)
        mb_size = k.shape[0]  
        k_len = k.shape[1]
        q_len = q.shape[1]
        kx = self.w_k(k).view(mb_size, k_len, self.n_head, self.hidden_dim)
        kx = kx.permute(2, 0, 1, 3).contiguous().view(-1, k_len, self.hidden_dim)
        qx = self.w_q(q).view(mb_size, q_len, self.n_head, self.hidden_dim)
        qx = qx.permute(2, 0, 1, 3).contiguous().view(-1, q_len, self.hidden_dim)
        if self.score_function == 'dot_product':
            kt = kx.permute(0, 2, 1)
            score = torch.bmm(qx, kt)
        elif self.score_function == 'scaled_dot_product':
            kt = kx.permute(0, 2, 1)
            qkt = torch.bmm(qx, kt)
            score = torch.div(qkt, math.sqrt(self.hidden_dim))
        elif self.score_function == 'bi_linear':
            qw = torch.matmul(qx, self.weight)
            kt = kx.permute(0, 2, 1)
            score = torch.bmm(qw, kt)
        else:
            raise RuntimeError('invalid score_function')
        score = F.softmax(score, dim=-1)
        # in sentiment analysis, they focus to the importance of k, so maybe we dont have V  value (intuitively, V is k...)
        output = torch.bmm(score, kx) 
        output = torch.cat(torch.split(output, mb_size, dim=0), dim=-1)  
        output = self.proj(output)  
        output = self.dropout(output)
        return output, score


class NoQueryAttention(Attention):
    '''q is a parameter'''
    def __init__(self, embed_dim=EMBEDDING_DIM, hidden_dim=None, out_dim=None, n_head=1, score_function='dot_product', q_len=1, dropout=0):
        super(NoQueryAttention, self).__init__(embed_dim, hidden_dim, out_dim, n_head, score_function, dropout)
        self.q_len = q_len
        self.q = nn.Parameter(torch.Tensor(q_len, embed_dim))
        self.reset_q()

    def reset_q(self):
        stdv = 1. / math.sqrt(self.embed_dim)
        self.q.data.uniform_(-stdv, stdv)

    def forward(self, k, **kwargs):
        mb_size = k.shape[0]
        q = self.q.expand(mb_size, -1, -1)
        return super(NoQueryAttention, self).forward(k, q)

class ATAE_LSTM(nn.Module):
    def __init__(self, embed_matrix=embedding_matrix, hidden_dim=128, embedding_dim=EMBEDDING_DIM, polarities_dim=4):
        super(ATAE_LSTM, self).__init__()
        self.embed = nn.Embedding.from_pretrained(torch.tensor(embed_matrix, dtype=torch.float))
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, batch_first=True)
        self.attention = NoQueryAttention(hidden_dim, score_function='bi_linear')
        self.dense = nn.Linear(hidden_dim, polarities_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text_indices):
        x = self.embed(text_indices)
        h, _ = self.lstm(x)
        _, score = self.attention(h)
        output = torch.bmm(score, h).squeeze(dim=1)  # Squeeze to (batch_size, hidden_dim)
        out = self.dense(output)
        out = self.sigmoid(out)
        return out



In [None]:
model_v2 = ATAE_LSTM()
criterion_v2 = nn.BCELoss() 
optimizer_v2 = torch.optim.Adam(model_v2.parameters(), lr=0.01)
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(torch.cuda.is_available())
else:
    device = torch.device("cpu")
model_v2.to(device)
criterion_v2.to(device)

EPOCHS = 20

for epoch in range(EPOCHS):
    correct_predictions = 0
    total_loss = 0
    
    for i, batch in enumerate(train_loader):
        tokens, labels = batch
        tokens, labels = tokens.to(device), labels.to(device)

        optimizer_v2.zero_grad()
        output = model_v2(tokens)
        loss_v2 = criterion_v2(output, labels)
        
        loss_v2.backward()
        optimizer_v2.step()
        
        predictions = output > 0.5
        correct_predictions += (predictions == labels).all(axis=1).sum().item()
        total_loss += loss_v2.item()
    avg_loss = total_loss / len(train_loader)
    avg_acc = correct_predictions / len(train_loader.dataset)
    
    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {avg_loss:.4f}, Accuracy: {avg_acc:.4f}")

  self.embed = nn.Embedding.from_pretrained(torch.tensor(embed_matrix, dtype=torch.float))


True


  label = torch.tensor(self.labels[idx], dtype=torch.float32)


Epoch 1/20, Loss: 0.3997, Accuracy: 0.5491
Epoch 2/20, Loss: 0.3719, Accuracy: 0.6237
Epoch 3/20, Loss: 0.3774, Accuracy: 0.6187
Epoch 4/20, Loss: 0.3707, Accuracy: 0.6327
Epoch 5/20, Loss: 0.4077, Accuracy: 0.5565
Epoch 6/20, Loss: 0.4354, Accuracy: 0.4848
Epoch 7/20, Loss: 0.4045, Accuracy: 0.5581
Epoch 8/20, Loss: 0.3882, Accuracy: 0.5831
Epoch 9/20, Loss: 0.3675, Accuracy: 0.6120
Epoch 10/20, Loss: 0.3428, Accuracy: 0.6543
Epoch 11/20, Loss: 0.3604, Accuracy: 0.6134
Epoch 12/20, Loss: 0.3462, Accuracy: 0.6531
Epoch 13/20, Loss: 0.3260, Accuracy: 0.6853
Epoch 14/20, Loss: 0.3785, Accuracy: 0.6012
Epoch 15/20, Loss: 0.3869, Accuracy: 0.6058
Epoch 16/20, Loss: 0.3800, Accuracy: 0.6078
Epoch 17/20, Loss: 0.3607, Accuracy: 0.6315
Epoch 18/20, Loss: 0.3585, Accuracy: 0.6446
Epoch 19/20, Loss: 0.3419, Accuracy: 0.6458
Epoch 20/20, Loss: 0.3479, Accuracy: 0.6507


# Model_v3


In [None]:
from transformers import BertConfig, BertModel

configuration = BertConfig()
vocab_size = configuration.vocab_size
hidden_size = configuration.hidden_size
num_hidden_layers = configuration.num_hidden_layers

tokenier = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

def tokenize_input (sentence):
    tokenize = tokenizer (
        sentence,
        padding = "max_length",
        truncation = True,
        max_length = MAX_LENGTH,
        return_tensors = "pt"
    )
    tensor_return = tokenize['input_ids']
    return tensor_return

class BERT (nn.Module):
    def __init__ (self, nclass,embedding_dim, hidden_size, num_hidden_layers):
        super(BERT, self).__init__()
        self.nclass = nclass
        self.tokenizer = tokenier()
        self.hidden_size = hidden_size
        self.num_layers = num_hidden_layers
        self.bert = AutoModel.from_pretrained("google-bert/bert-base-uncased",
                                              output_hidden_states = True,
                                              output_attentions=False)
        self.dropout - nn.Dropout(0.3)
        self.lstm = torch.nn.LSTM(hidden_size, 
                                  hidden_size/2, 
                                  num_layers=1, 
                                  batch_first=True,
                                  bidirectional=True)
        self.fc = nn.Linear(hidden_size, nclass)

    def foroward (self, input, mask):
        input = self.tokenizer(input)
        input_ids = input['input_ids']
        _, output = self.bert (input_ids=input_ids, 
                               attention_mask= mask, 
                               token_type_ids=None)
        hidden_states = torch.stack([hidden_states[layer_i][:, 0].squeeze()
                                     for layer_i in range(0, self.num_layers)], dim=-1) # noqa
        hidden_states = hidden_states.view(-1, self.num_layers, self.hidden_size)
        _, output = self.lstm (hidden_states)
        output = self.dropout(output)
        output = self.fc(output)

        return output 



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [13]:
configuration

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.46.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}