In [2]:
from transformers import BertTokenizer, BertModel
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
def preprocess_text(text):
    """Normalise a Series of raw text for tokenisation.

    Every entry is lower-cased, each character that is not an ASCII letter,
    digit or whitespace is replaced by a space, and the result is re-joined
    on single spaces (which also removes leading/trailing whitespace).

    Missing values (None/NaN) are mapped to the empty string. Previously a
    single missing cell made ``.str.split()`` yield NaN and the subsequent
    ``' '.join(...)`` raised ``TypeError``, aborting the whole preprocessing
    step. Non-string entries are stringified for the same reason.

    Parameters
    ----------
    text : pandas.Series
        Column of raw text values.

    Returns
    -------
    pandas.Series
        Cleaned strings carrying the same index as ``text``.
    """
    return (
        text.fillna('')
        .astype(str)
        .str.lower()
        .str.replace(r'[^a-zA-Z0-9\s]', ' ', regex=True)
        .str.split()    # splitting on whitespace runs also trims both ends
        .str.join(' ')
    )

def preprocess_df(df, test_size=0.2, validate_size=0.2, random_state=42):
    """Clean the text columns and split the frame into train/validate/test.

    The 'EM' column is dropped, 'Description' and 'EN' are normalised with
    ``preprocess_text``, and the rows are split twice, each time stratified
    on 'Composition strategy'. The caller's ``df`` is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'EM', 'Description', 'EN' and
        'Composition strategy'.
    test_size : float, default 0.2
        Fraction of all rows held out as the test set.
    validate_size : float, default 0.2
        Fraction of the *remaining* (non-test) rows used for validation.
        With the defaults the overall split is roughly 64% / 16% / 20%.
    random_state : int, default 42
        Seed passed to both ``train_test_split`` calls.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, validate_df, test_df)``.
    """
    label_col = 'Composition strategy'

    cleaned = df.drop(columns=['EM']).assign(
        Description=preprocess_text(df['Description']),
        EN=preprocess_text(df['EN']),
    )

    # First hold out the test set ...
    train_val_df, test_df = train_test_split(
        cleaned,
        test_size=test_size,
        random_state=random_state,
        stratify=cleaned[label_col],
    )

    # ... then carve the validation set out of the remaining training rows
    # (the test set itself is left untouched).
    train_df, validate_df = train_test_split(
        train_val_df,
        test_size=validate_size,
        random_state=random_state,
        stratify=train_val_df[label_col],
    )
    return train_df, validate_df, test_df

In [4]:
class ELCoDataset(Dataset):
    """Map-style dataset yielding tokenised (Description, EN) text with a label.

    For each row the 'Description' and 'EN' strings are concatenated around a
    literal " [SEP] " marker and passed to ``tokenizer``; the row's
    'Composition strategy' value is converted to an integer id with
    ``label_encoder.transform``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'Description', 'EN' and 'Composition strategy'.
    tokenizer : callable
        Called as ``tokenizer(text, truncation=True, padding='max_length',
        max_length=max_len, return_tensors='pt')`` and expected to return a
        mapping with 'input_ids' and 'attention_mask' tensors.
    max_len : int
        Length every sequence is padded/truncated to.
    label_encoder : object
        Anything with a ``transform`` method mapping the label column to
        integer ids (an already fitted encoder).
    """

    def __init__(self, df, tokenizer, max_len, label_encoder):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.descriptions = df['Description'].tolist()
        self.en_texts = df['EN'].tolist()
        self.labels = label_encoder.transform(df['Composition strategy'])

    def __len__(self):
        """Number of samples (one per encoded label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Tokenise sample ``idx`` and return its tensors plus the label."""
        text_pair = " [SEP] ".join((self.descriptions[idx], self.en_texts[idx]))
        encoded = self.tokenizer(
            text_pair,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )
        # squeeze(0) removes the leading axis so the DataLoader can stack samples.
        sample = {
            field: encoded[field].squeeze(0)
            for field in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample


In [5]:
class BERTClassifier(nn.Module):
    """Encoder followed by a single linear classification layer.

    Parameters
    ----------
    bert : nn.Module
        Encoder exposing ``config.hidden_size`` and whose output has a
        ``pooler_output`` attribute.
    num_classes : int
        Number of output logits.
    """

    def __init__(self, bert, num_classes):
        super().__init__()
        self.bert = bert
        hidden_dim = bert.config.hidden_size
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return un-normalised class logits for a batch."""
        pooled = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).pooler_output
        return self.fc(pooled)

In [9]:
# Seed torch's global RNG before anything random happens, so that the
# classifier-head initialisation, training-batch shuffling and dropout are
# repeatable from a fresh kernel.
torch.manual_seed(42)

# Pretrained tokenizer/encoder and the raw data.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert = BertModel.from_pretrained('bert-base-uncased')
ELCo_df = pd.read_csv('../../data/ELCo.csv')

# The encoder is fitted on the full label column so every split shares the
# same label -> id mapping.
label_encoder = LabelEncoder()
label_encoder.fit(ELCo_df['Composition strategy'])
num_classes = len(label_encoder.classes_)

# Hyper-parameters.
max_len = 128            # token length every sample is padded/truncated to
train_batch_size = 16    # previously an unnamed literal in the DataLoader call
batch_size = 64          # batch size for evaluation loaders

ELCo_train_df, ELCo_validate_df, ELCo_test_df = preprocess_df(ELCo_df)
train_dataset = ELCoDataset(ELCo_train_df, tokenizer, max_len, label_encoder)
validate_dataset = ELCoDataset(ELCo_validate_df, tokenizer, max_len, label_encoder)
train_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
# Accuracy does not depend on sample order, so the validation loader is not
# shuffled; this also keeps it from consuming random state between epochs.
validate_loader = DataLoader(validate_dataset, batch_size=batch_size, shuffle=False)

model = BERTClassifier(bert, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

In [10]:
from tqdm import tqdm

NUM_EPOCHS = 5

# Batches are sent to whichever device the model's parameters live on, so the
# loop runs unchanged on CPU and also works if the model was moved to a GPU.
device = next(model.parameters()).device

# NOTE(review): `model` ends up with the last epoch's weights, not those of
# the best validation epoch - consider checkpointing on validation accuracy.
for epoch in range(NUM_EPOCHS):
    # ---- training pass ----
    model.train()
    total_loss = 0.0
    for batch in tqdm(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        batch_loss = criterion(logits, labels)
        batch_loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()
    # Mean of the per-batch losses; kept under its own name instead of
    # rebinding the per-batch loss tensor.
    avg_train_loss = total_loss / len(train_loader)

    # ---- validation pass ----
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in validate_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            logits = model(input_ids, attention_mask)
            preds = torch.argmax(logits, dim=1)

            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(batch['label'].tolist())

    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Epoch {epoch+1}, Loss: {avg_train_loss:.4f}, Validation Accuracy: {accuracy:.4f}")

100%|██████████| 67/67 [07:16<00:00,  6.52s/it]


Epoch 1, Loss: 1.1775, Validation Accuracy: 0.5736


100%|██████████| 67/67 [07:06<00:00,  6.37s/it]


Epoch 2, Loss: 0.8807, Validation Accuracy: 0.6151


100%|██████████| 67/67 [08:30<00:00,  7.62s/it]


Epoch 3, Loss: 0.5885, Validation Accuracy: 0.6604


100%|██████████| 67/67 [07:06<00:00,  6.37s/it]


Epoch 4, Loss: 0.3914, Validation Accuracy: 0.6943


100%|██████████| 67/67 [08:31<00:00,  7.63s/it]


Epoch 5, Loss: 0.2759, Validation Accuracy: 0.6830


In [11]:
# Final evaluation on the held-out test split.
test_dataset = ELCoDataset(ELCo_test_df, tokenizer, max_len, label_encoder)
# Accuracy is order-independent, so the loader is not shuffled; predictions
# then line up with the rows of ELCo_test_df for later inspection.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Send batches to the device the model's parameters are on (CPU or GPU).
device = next(model.parameters()).device

model.eval()
test_preds = []
test_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        logits = model(input_ids, attention_mask)
        preds = torch.argmax(logits, dim=1)

        test_preds.extend(preds.cpu().tolist())
        test_labels.extend(batch['label'].tolist())

test_accuracy = accuracy_score(test_labels, test_preds)
print(f"Test Accuracy: {test_accuracy:.4f}")

Test Accuracy: 0.6798
