In [1]:
!pip install -q transformers



In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import confusion_matrix, classification_report

from tqdm.auto import tqdm

In [3]:
path_input = '../input/data-pre-processing-arabic-dialects/arabic_dialects_clean.csv'
df = pd.read_csv(path_input, lineterminator='\n')
df.head()

Unnamed: 0,id,dialect,text,word_count,char_count,avg_char_per_word,stopwords,emoji_count,clean_text
0,1175358310087892992,IQ,@Nw8ieJUwaCAAreT لكن بالنهاية .. ينتفض .. يغير .,8,48,5.125,1,0,لكن بالنهايه ينتفض يغير
1,1175416117793349632,IQ,@7zNqXP0yrODdRjK يعني هذا محسوب على البشر .. ح...,19,120,5.368421,3,0,يعني هذا محسوب علي البشر حيونه ووحشيه وتطلبون ...
2,1175450108898565888,IQ,@KanaanRema مبين من كلامه خليجي,5,31,5.4,1,0,مبين من كلامه خليجي
3,1175471073770573824,IQ,@HAIDER76128900 يسلملي مرورك وروحك الحلوه💐,5,42,7.6,0,1,يسلملي مرورك وروحك الحلوه
4,1175496913145217024,IQ,@hmo2406 وين هل الغيبه اخ محمد 🌸🌺,8,34,3.857143,1,2,وين هل الغيبه اخ محمد


In [4]:
dialects = df['dialect'].unique()
dialects

array(['IQ', 'LY', 'QA', 'PL', 'SY', 'TN', 'JO', 'MA', 'SA', 'YE', 'DZ',
       'EG', 'LB', 'KW', 'OM', 'SD', 'AE', 'BH'], dtype=object)

In [5]:
targets = {k: v for v, k in enumerate(dialects)}

df['targets'] = df['dialect'].apply(lambda x : targets[x])

In [6]:
tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-da-sentiment")

Downloading:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/841 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/297k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [7]:
# %matplotlib inline
# %config InlineBackend.figure_format='retina'
# sns.set(style='whitegrid', palette='muted', font_scale=1.2)
# rcParams['figure.figsize'] = 16, 6

# text_token_counts = df['clean_text'].apply(lambda x : len(tokenizer.encode(x)))
# fig, (ax1, ax2) = plt.subplots(1, 2)
# sns.histplot(text_token_counts, ax=ax1)
# sns.boxplot(text_token_counts, ax=ax2)

In [8]:
TEXT_MAX_LEN = 70
class MyDataset(Dataset):
    def __init__(
        self,
        data: pd.DataFrame = df,
        tokenizer = tokenizer,
        text_max_token_len: int = TEXT_MAX_LEN,
    ):
        self.tokenizer = tokenizer
        self.data = data
        self.text_max_token_len = text_max_token_len
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, index: int):
        data_row = self.data.iloc[index]

        text = data_row['clean_text']

        text_encoding = tokenizer(
            text,
            max_length=self.text_max_token_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )

        target = data_row['targets']
        
        return dict(
            input_ids=text_encoding['input_ids'].flatten(),
            attention_mask=text_encoding['attention_mask'].flatten(),
            target=torch.tensor(target, dtype=torch.long),
        )

In [9]:
class CustomAraBERTModel(nn.Module):
    def __init__(self):
        super(CustomAraBERTModel, self).__init__()
        self.arabert = AutoModelForSequenceClassification.from_pretrained(
            "CAMeL-Lab/bert-base-arabic-camelbert-da-sentiment"
        )
        self.arabert.classifier = nn.Linear(in_features=768, out_features=18, bias=True)

    def forward(self, input_ids, attention_mask):
        output = self.arabert(input_ids, attention_mask=attention_mask)
        return output

In [10]:
train, test = train_test_split(df, test_size=0.1, random_state=42)

train_dataset = MyDataset(train, tokenizer)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=128)

test_dataset = MyDataset(test, tokenizer)
test_dataloader = DataLoader(test_dataset, shuffle=True, batch_size=128)

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#model = CustomAraBERTModel()
model = torch.load('../input/fine-tuning-arabert-arabic-dialect/arabert_dialect.pth', map_location=device)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

num_epochs = 10

num_training_steps = num_epochs * len(train_dataloader)
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        
        outputs = model(batch['input_ids'], batch['attention_mask'])
        logits = outputs.logits

        loss = criterion(logits, batch['target'])
        loss.backward()
        
        optimizer.step()
        
        optimizer.zero_grad()
        progress_bar.update()
    
    torch.save(model, './arabert_dialect.pth')
    print(f'epoch: {epoch} -- loss: {loss}')

  0%|          | 0/32180 [00:00<?, ?it/s]

epoch: 0 -- loss: 2.952878713607788
epoch: 1 -- loss: 2.6495778560638428
epoch: 2 -- loss: 2.6612486839294434
epoch: 3 -- loss: 2.752474069595337
epoch: 4 -- loss: 2.7307324409484863
epoch: 5 -- loss: 2.6659505367279053
epoch: 6 -- loss: 2.8576273918151855
epoch: 7 -- loss: 2.7497897148132324
epoch: 8 -- loss: 2.8798091411590576
epoch: 9 -- loss: 2.788029670715332


In [12]:
progress_bar = tqdm(range(len(test_dataloader)))
preds = []
real_values = []

model.eval()
for batch in test_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(batch['input_ids'], batch['attention_mask'])

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    
    preds.extend(predictions)
    real_values.extend(batch['target'])

    progress_bar.update()
    
preds = torch.stack(preds).cpu()
real_values = torch.stack(real_values).cpu()
print(classification_report(real_values, preds, target_names=dialects))

  0%|          | 0/358 [00:00<?, ?it/s]

              precision    recall  f1-score   support

          IQ       0.00      0.00      0.00      1506
          LY       0.00      0.00      0.00      3668
          QA       0.00      0.00      0.00      3121
          PL       0.00      0.00      0.00      4309
          SY       0.00      0.00      0.00      1619
          TN       0.00      0.00      0.00       931
          JO       0.00      0.00      0.00      2875
          MA       0.00      0.00      0.00      1129
          SA       0.00      0.00      0.00      2681
          YE       0.00      0.00      0.00      1041
          DZ       0.00      0.00      0.00      1553
          EG       0.13      1.00      0.22      5770
          LB       0.00      0.00      0.00      2694
          KW       0.00      0.00      0.00      4225
          OM       0.00      0.00      0.00      1936
          SD       0.00      0.00      0.00      1415
          AE       0.00      0.00      0.00      2663
          BH       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
