In [1]:
!pip install transformers
!git clone https://github.com/aub-mind/arabert
!pip install -r arabert/requirements.txt



fatal: destination path 'arabert' already exists and is not an empty directory.




In [2]:
from glob import glob

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel, InputFeatures
from arabert.preprocess import ArabertPreprocessor


In [3]:
model_name = "aubmindlab/bert-base-arabertv2"
arabert_prep = ArabertPreprocessor(model_name=model_name)
model = AutoModel.from_pretrained(model_name)
model.cuda()
for param in model.parameters():
    param.requires_grad = False
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv2 were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
def get_SMADC_folder_data():
    """Returns a dataframe with Text and Region columns. Requires tree like this data/SMADC/*.txt"""
    files = glob("data/SMADC/*.txt")
    dataframes = []

    for file in files:
        region = file[-7:-4]
        temp_df = pd.read_csv(file, encoding="utf8", delimiter="\n", names=["Text"])
        temp_df["Region"] = region
        dataframes.append(temp_df)
        
    return pd.concat(dataframes)
    
df = get_SMADC_folder_data()

classes = df["Region"].unique()

In [5]:
# Encode Y 
# enc = OneHotEncoder()
# enc.fit(classes.reshape(-1, 1))
# all_labels = enc.transform(df["Region"].values.reshape(-1, 1))
class_to_index = {class_:index for class_, index in zip(classes, range(len(classes)))}
all_labels = torch.tensor(df["Region"].map(class_to_index.get).values).cuda()
class_to_index

{'EGY': 0, 'GLF': 1, 'IRQ': 2, 'LEV': 3, 'NOR': 4}

In [6]:
# Preprocess X at train time
sequence_length = 50
def preprocess(batch):
    batch = tokenizer.batch_encode_plus(
        batch,
        add_special_tokens=True,
        padding=True,
        max_length=sequence_length,
        truncation=True,
        return_tensors="pt",
        return_attention_mask=True,
        return_token_type_ids=False
    )
    batch["input_ids"], batch["attention_mask"] = batch["input_ids"].cuda(), batch["attention_mask"].cuda()
    return batch

In [7]:
x_train, x_test, y_train, y_test = train_test_split(df["Text"], all_labels, random_state=1)
x_train, x_test = preprocess(x_train.to_list()), preprocess(x_test.to_list())

In [8]:
class Dialect_dataset(torch.utils.data.Dataset):
    def __init__(self, X, Y):
        super(Dialect_dataset).__init__()
        self.X = X
        self.Y = Y
        
    def __getitem__(self, key):
        return {"input_ids": self.X["input_ids"][key], "attention_mask": self.X["attention_mask"][key]}, self.Y[key]
        
    def __len__(self):
        return len(self.X["input_ids"])

In [9]:
batch_size = 32
training_data = Dialect_dataset(x_train, y_train)
test_data = Dialect_dataset(x_test, y_test)
train_dataloader = DataLoader(training_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

In [10]:
class Dialect_model(nn.Module):
    def __init__(self, model, sequence_length):
        super().__init__()
        self.model = model
        self.classifier = nn.Sequential(
            nn.Linear(sequence_length * 768, sequence_length * 768 // 10),
            nn.Linear(sequence_length * 768 // 10, sequence_length * 768 // 100),
            nn.Linear(sequence_length * 768 // 100, 5)
        )

    def forward(self, input_dict):
        batch_size, *_ = input_dict["input_ids"].shape
        
        output = self.model(**input_dict)
        output = output["last_hidden_state"]
        output = output.reshape(batch_size, -1)
        
        output = self.classifier(output)
        return F.softmax(output, dim=1)

dialect_model = Dialect_model(model, sequence_length)
dialect_model.cuda()

Dialect_model(
  (model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(64000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tru

In [11]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(dialect_model.classifier.parameters(), lr=0.003)

In [12]:
epochs = 25

for epoch in range(epochs):  # loop over the dataset multiple times
    dialect_model.classifier.train()
    for i, data in enumerate(train_dataloader, 0):
        inputs, labels = data
        
        optimizer.zero_grad()
        outputs = dialect_model(inputs)
        loss = criterion(outputs, labels).cuda()
        loss.backward()
        optimizer.step()
        
        if i % 1000 == 0:
            print(f"{i * batch_size}/{len(train_dataloader) * batch_size}")
        
    dialect_model.classifier.eval()
    count = 1
    running_loss = 0.0
    for i, data in enumerate(test_dataloader, 0):
        inputs, labels = data
        
        optimizer.zero_grad()

        outputs = dialect_model(inputs)
        loss = criterion(outputs, labels).cuda()
        loss.backward()

        running_loss += loss.item()
        count += 1
    print(f"Epoch: {epoch+1}. Loss: {running_loss / count}")

0/1056352
32000/1056352
64000/1056352
96000/1056352
128000/1056352
160000/1056352
192000/1056352
224000/1056352


KeyboardInterrupt: 

In [None]:
td = iter(train_dataloader)

In [14]:
tokenizer.decode(inputs["input_ids"][5])

'[CLS] بيقولوا البنك الأهلي نسال احمد بكره [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [15]:
dialect_model(inputs)[5]

tensor([1., 0., 0., 0., 0.], device='cuda:0', grad_fn=<SelectBackward>)

In [16]:
outputs

tensor([[1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackwa

In [13]:
    for i, data in enumerate(test_dataloader, 0):
        inputs, labels = data
        
        optimizer.zero_grad()

        outputs = dialect_model(inputs)
        loss = criterion(outputs, labels).cuda()
        loss.backward()

        running_loss += loss.item()
        count += 1
    print(f"Epoch: {epoch+1}. Loss: {running_loss / count}")

NameError: name 'running_loss' is not defined