In [1]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import pandas as pd
import torch 

In [15]:
tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabert")
embeddings_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabert")
model = AutoModelForSequenceClassification.from_pretrained("aubmindlab/bert-base-arabert", num_labels=4)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
train_df = pd.read_csv("New_News_Train.csv")
print(len(train_df))

5000


Testing embedding on one sample sentence

In [17]:
test_text = train_df['Removed_stopwords'][0]
print(f'Sample text: \n{test_text}')
encoded = tokenizer.encode_plus(test_text, return_tensors='pt', padding=True, truncation=True)

Sample text: 
اشتباك الحريري عون اتهامات لباسيل بالتمسك بالثلث المعطل للبقاء الحكم


In [18]:
with torch.no_grad():
    embeddings_model_output = embeddings_model(**encoded)
    embeddings = embeddings_model_output.last_hidden_state

print(f'Shape: {embeddings.shape}')
print(embeddings)

Shape: torch.Size([1, 28, 768])
tensor([[[-0.4110,  0.5989,  0.0211,  ..., -0.1455,  0.5592, -0.0201],
         [-0.5982,  0.1977,  0.0026,  ..., -0.6627,  0.3994,  0.3359],
         [-1.3771,  0.8943,  0.9873,  ..., -0.9149,  0.8265, -0.2859],
         ...,
         [-1.3770,  0.8945,  0.9878,  ..., -0.9152,  0.8269, -0.2864],
         [-0.8602, -0.0306,  0.1687,  ..., -0.3396,  0.6240, -0.7508],
         [-0.7569,  0.1673, -0.0054,  ..., -0.2516,  0.3591, -0.5484]]])


Setting up for using full dataset

In [19]:
from torch.utils.data import DataLoader, Dataset

In [20]:
class AraBERTDataset(Dataset):
    def __init__(self, file, tokenizer, model):
        self.df = pd.read_csv(file)
        self.data = self.df['Removed_stopwords'].values
        self.labels = self.df['Type'].values
        self.tokenizer = tokenizer
        self.model = model

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data[index]
        label = self.labels[index]
        encoded = self.tokenizer.encode_plus(text, return_tensors='pt', padding='max_length', truncation=True)
        with torch.no_grad():
            model_output = self.model(**encoded)
            embeddings = model_output.last_hidden_state
        return embeddings, label
        # return {
        #     'input_ids': encoded['input_ids'].flatten(),
        #     'attention_mask': encoded['attention_mask'].flatten()
        # }

In [21]:
training_data = AraBERTDataset('New_News_Train.csv', tokenizer, embeddings_model)
testing_data = AraBERTDataset('New_News_Test.csv', tokenizer, embeddings_model)

In [22]:
train_dataloader = DataLoader(training_data, batch_size=8, shuffle=True)
test_dataloader = DataLoader(testing_data, batch_size=8, shuffle=False)

model.eval()
model()

In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

EPOCHS = 5
for epoch in range(EPOCHS):  # Fine-tune for 5 epochs
    model.train()
    for batch in train_dataloader:
        inputs, labels = batch
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    model.eval()

tensor([[[[ 3.4742e-01,  2.5696e-01,  1.5328e-01,  ...,  3.3824e-01,
            4.5029e-01,  2.7588e-01],
          [ 7.8734e-02, -1.5832e-01,  3.3051e-01,  ...,  2.7933e-01,
            6.8462e-01, -1.5104e-01],
          [-3.1802e-01,  6.8979e-02,  5.0427e-01,  ..., -2.6783e-01,
            5.9774e-01, -9.5489e-02],
          ...,
          [ 2.2355e-01, -1.7585e-01,  4.5014e-01,  ...,  3.6822e-01,
            3.6171e-01, -5.6295e-02],
          [ 1.1977e-01, -1.1467e-01,  3.8133e-01,  ...,  1.8485e-01,
            5.7538e-01,  6.7956e-02],
          [ 2.8141e-02,  1.2968e-02,  4.0619e-01,  ...,  8.8852e-02,
            5.5666e-01, -4.7642e-02]]],


        [[[-7.5427e-01,  5.3390e-01,  8.8916e-01,  ..., -3.0383e-01,
            2.5848e-01,  9.1550e-01],
          [-6.1077e-01,  9.7729e-02,  5.8500e-01,  ...,  1.7084e-02,
           -2.9557e-01,  3.2185e-01],
          [-1.0371e+00,  3.8980e-01,  1.5889e-01,  ..., -2.4177e-01,
            1.2125e-01,  4.6174e-01],
          ...,
   

ValueError: too many values to unpack (expected 2)