In [1]:
import pandas as pd 
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams
from matplotlib import rc
import joblib

from transformers import AutoTokenizer, AutoModel
import torch
from torch import nn,optim
from torch.utils.data import Dataset, DataLoader
from torchmetrics.functional import f1
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, classification_report

from tqdm.auto import tqdm

In [2]:
# Location of the pre-cleaned dialect corpus (Kaggle-style relative input path).
path_input = '../input/arabicdialect/arabic_dialects_clean.csv'

# Only "\n" is treated as a row terminator.
# NOTE(review): presumably this keeps stray carriage returns inside the text
# from being read as row breaks -- confirm against the raw file.
df = pd.read_csv(filepath_or_buffer=path_input, lineterminator='\n')

# Peek at the first rows to sanity-check the columns.
df.head(5)

Unnamed: 0,id,dialect,text,word_count,char_count,avg_char_per_word,stopwords,emoji_count,clean_text
0,1175358310087892992,IQ,@Nw8ieJUwaCAAreT لكن بالنهاية .. ينتفض .. يغير .,8,48,5.125,1,0,لكن بالنهايه ينتفض يغير
1,1175416117793349632,IQ,@7zNqXP0yrODdRjK يعني هذا محسوب على البشر .. ح...,19,120,5.368421,3,0,يعني هذا محسوب علي البشر حيونه ووحشيه وتطلبون ...
2,1175450108898565888,IQ,@KanaanRema مبين من كلامه خليجي,5,31,5.4,1,0,مبين من كلامه خليجي
3,1175471073770573824,IQ,@HAIDER76128900 يسلملي مرورك وروحك الحلوه💐,5,42,7.6,0,1,يسلملي مرورك وروحك الحلوه
4,1175496913145217024,IQ,@hmo2406 وين هل الغيبه اخ محمد 🌸🌺,8,34,3.857143,1,2,وين هل الغيبه اخ محمد


In [3]:
# Keep only the two columns the model needs and give them task-oriented names.
# The encoder is fitted on the *full* label column before splitting, so the
# validation split can never contain a label the encoder has not seen
# (fitting on the train split alone makes `transform` raise in that case).
# With every dialect present in the train split this yields the same integer
# ids as fitting on train only, because LabelEncoder sorts the unique labels.
lbl_enc = LabelEncoder()
labelled = (
    df[['dialect', 'clean_text']]
    .rename(columns={'dialect': "label", 'clean_text': "text"})
    .assign(label=lambda frame: lbl_enc.fit_transform(frame["label"]))
)

# Same 90/10 split and seed as before; encoding first does not change which
# rows land in which split.
train, val = train_test_split(labelled, test_size=0.1, random_state=42)

# Persist the encoder (to map predictions back to dialect codes later) and the
# two splits that the data module reads from disk.
joblib.dump(lbl_enc, "label_encoder.pkl")
train.to_csv("train.csv", index=False)
val.to_csv("val.csv", index=False)

In [4]:
# Show which dialect code each integer label id stands for.
dict(enumerate(lbl_enc.classes_))

{0: 'AE',
 1: 'BH',
 2: 'DZ',
 3: 'EG',
 4: 'IQ',
 5: 'JO',
 6: 'KW',
 7: 'LB',
 8: 'LY',
 9: 'MA',
 10: 'OM',
 11: 'PL',
 12: 'QA',
 13: 'SA',
 14: 'SD',
 15: 'SY',
 16: 'TN',
 17: 'YE'}

In [5]:
class ArabicDataset(Dataset):
    """Map-style dataset that tokenizes labelled text one example at a time.

    Args:
        data: frame-like object exposing a "label" and a "text" column
            (each accessed as ``data[col].values``).
        max_len: every example is padded/truncated to this many tokens.
        model_type: key into ``CHECKPOINTS`` selecting which tokenizer to load.

    Each item is a dict with:
        "inputs": dict of 1-D tensors ("input_ids", "token_type_ids",
            "attention_mask"), each of length ``max_len``.
        "labels": 0-d ``torch.long`` tensor holding the class id.
    """

    # Hugging Face checkpoint name for each supported model size.
    CHECKPOINTS = {"Mini": "asafaya/bert-mini-arabic",
                   "Medium": "asafaya/bert-medium-arabic",
                   "Base": "asafaya/bert-base-arabic",
                   "Large": "asafaya/bert-large-arabic"}

    def __init__(self,data,max_len,model_type="Mini"):
        super().__init__()
        self.labels = data["label"].values
        self.texts = data["text"].values
        self.max_len = max_len
        self.tokenizer = AutoTokenizer.from_pretrained(self.CHECKPOINTS[model_type])

    def __len__(self):
        return len(self.texts)

    def __getitem__(self,idx):
        raw_text = self.texts[idx]
        # A blank text cell written by to_csv is read back by read_csv as NaN
        # (a float), which has no .split(); treat any non-string cell as empty
        # text instead of crashing in the middle of an epoch.
        text = " ".join(raw_text.split()) if isinstance(raw_text, str) else ""
        label = self.labels[idx]
        inputs = self.tokenizer(text,padding='max_length',
                                max_length=self.max_len,truncation=True,return_tensors="pt")
        # The tokenizer returns tensors with a leading batch dimension of 1;
        # [0] strips it so the DataLoader can stack examples itself.
        return {
            "inputs":{"input_ids":inputs["input_ids"][0],
                      "token_type_ids":inputs["token_type_ids"][0],
                      "attention_mask":inputs["attention_mask"][0],
                     },
            "labels": torch.tensor(label,dtype=torch.long)
        }

In [6]:
class ArabicDataModule(pl.LightningDataModule):
    """Lightning data module wrapping the train/validation CSV splits.

    Args:
        train_path: CSV file with the training examples.
        val_path: CSV file with the validation examples.
        batch_size: batch size shared by every loader.
        max_len: token length forwarded to ``ArabicDataset``.
        model_type: model size key forwarded to ``ArabicDataset``.

    There is no separate test split: ``test_dataloader`` serves the
    validation dataset again.
    """

    def __init__(self,train_path,val_path,batch_size=12,max_len=100,model_type="Mini"):
        super().__init__()
        self.train_path = train_path
        self.val_path = val_path
        self.batch_size = batch_size
        self.max_len = max_len
        self.model_type = model_type

    def setup(self,stage=None):
        """Read both CSV files and build the two datasets."""
        train_frame = pd.read_csv(self.train_path)
        val_frame = pd.read_csv(self.val_path)
        shared_kwargs = {"max_len": self.max_len, "model_type": self.model_type}
        self.train_dataset = ArabicDataset(data=train_frame, **shared_kwargs)
        self.val_dataset = ArabicDataset(data=val_frame, **shared_kwargs)

    def _make_loader(self, dataset, shuffle):
        """Build a DataLoader over ``dataset`` with the module's batch size."""
        return DataLoader(dataset, batch_size=self.batch_size, shuffle=shuffle)

    def train_dataloader(self):
        # Only the training loader shuffles.
        return self._make_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        return self._make_loader(self.val_dataset, shuffle=False)

    def test_dataloader(self):
        # Same data as val_dataloader -- no held-out test set exists here.
        return self._make_loader(self.val_dataset, shuffle=False)

In [7]:
# Default width of the classification head: one logit per dialect label id.
n_classes = 18


class ArabicBertModel(pl.LightningModule):
    """Pretrained Arabic BERT encoder followed by one linear classification layer.

    Args:
        model_type: "Mini", "Medium", "Base" or "Large"; selects the checkpoint
            and the matching encoder width (a ``KeyError`` is raised otherwise).
        n_classes: number of output logits. Defaults to the module-level
            ``n_classes`` so existing ``ArabicBertModel(model_type=...)`` calls
            behave exactly as before.
    """

    def __init__(self,model_type="Mini",n_classes=n_classes):
        super().__init__()
        # (checkpoint name, encoder output width) for each supported size.
        model = {"Mini": ("asafaya/bert-mini-arabic",256),
                "Medium": ("asafaya/bert-medium-arabic",512),
                "Base": ("asafaya/bert-base-arabic",768),
                "Large": ("asafaya/bert-large-arabic",1024)}
        checkpoint, hidden_size = model[model_type]
        self.bert_model = AutoModel.from_pretrained(checkpoint)
        self.fc = nn.Linear(hidden_size,n_classes)

    def forward(self,inputs):
        """Return raw class logits for a dict of tokenizer tensors."""
        out = self.bert_model(**inputs)
        # Element 1 of the encoder output feeds the head. For BertModel this is
        # the pooled sequence representation, not the per-token last hidden
        # states (see the transformers BertModel documentation).
        pooled_output = out[1]
        return self.fc(pooled_output)

    def configure_optimizers(self):
        return optim.AdamW(self.parameters(), lr=0.0001)

    def criterion(self,output,target):
        """Cross-entropy between logits and integer class targets."""
        # Functional form: same result as nn.CrossEntropyLoss()(...) with
        # default settings, without building a new module on every step.
        return nn.functional.cross_entropy(output,target)

    def _shared_step(self,batch,prefix):
        """Run one batch, log ``{prefix}_f1`` / ``{prefix}_loss`` and return them."""
        x,y = batch["inputs"],batch["labels"]
        out = self(x)
        loss = self.criterion(out,y)
        # The class count is read from the head so it always matches the logits.
        # NOTE(review): this is a per-batch macro F1; confirm that the value
        # Lightning aggregates from these per-batch logs is the metric you want.
        f1_score = f1(out, y, num_classes=self.fc.out_features, average='macro')
        metrics = {f"{prefix}_f1": f1_score, f"{prefix}_loss": loss}
        self.log_dict(metrics)
        return metrics

    def training_step(self,batch,batch_idx):
        # Only the loss is returned; it is the value Lightning backpropagates.
        return self._shared_step(batch,"train")["train_loss"]

    def validation_step(self,batch,batch_idx):
        return self._shared_step(batch,"val")

In [8]:
# TODO: getting different models sizes results
MODEL_TYPE = "Base"
SEED = 42

# Seed the Python, NumPy and PyTorch RNGs so shuffling and the randomly
# initialised classification head are repeatable across runs.
pl.seed_everything(SEED)

dm = ArabicDataModule(train_path="./train.csv",
                val_path = "./val.csv",
                batch_size=128, max_len=70, model_type=MODEL_TYPE)

model = ArabicBertModel(model_type=MODEL_TYPE)

# gpus=-1 requests every visible GPU and fails when none is present; fall back
# to CPU in that case, matching the device handling in the evaluation cell.
# Early stopping is currently disabled: callbacks=[EarlyStopping(monitor="val_f1")]
trainer = pl.Trainer(gpus=-1 if torch.cuda.is_available() else None,
                     max_epochs=10, default_root_dir='.')
trainer.fit(model,dm)

Downloading:   0%|          | 0.00/491 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/424M [00:00<?, ?B/s]

Some weights of the model checkpoint at asafaya/bert-base-arabic were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/326k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [9]:
# Serialise the whole fine-tuned module object (not just a state_dict) to disk.
torch.save(obj=model, f='arabert_arabic_dialect.pth')

In [10]:
# Evaluate on the validation data (dm.test_dataloader() serves the val split).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = torch.load('../input/fine-tuning-arabert/arabert_arabic_dialect.pth',  map_location=device)
model.to(device)
model.eval()  # inference mode for the forward passes below

test_dataloader = dm.test_dataloader()

# Collect one CPU tensor per batch and concatenate once at the end, rather than
# one 0-d tensor per sample.
pred_batches = []
label_batches = []

# Wrapping the loader lets tqdm advance and close the progress bar itself.
for batch in tqdm(test_dataloader):
    x,y = batch["inputs"],batch["labels"]
    inp = {k: v.to(device) for k, v in x.items()}

    with torch.no_grad():
        outputs = model(inp)

    # The highest logit per row is the predicted class id.
    pred_batches.append(torch.argmax(outputs, dim=1).cpu())
    label_batches.append(y.cpu())

preds = torch.cat(pred_batches)
real_values = torch.cat(label_batches)
print(classification_report(real_values, preds, target_names=lbl_enc.classes_))

  0%|          | 0/358 [00:00<?, ?it/s]

              precision    recall  f1-score   support

          AE       0.39      0.42      0.41      2663
          BH       0.37      0.31      0.34      2621
          DZ       0.62      0.47      0.54      1553
          EG       0.74      0.82      0.78      5770
          IQ       0.73      0.47      0.57      1506
          JO       0.43      0.35      0.38      2875
          KW       0.52      0.49      0.50      4225
          LB       0.58      0.68      0.63      2694
          LY       0.68      0.69      0.68      3668
          MA       0.67      0.66      0.66      1129
          OM       0.34      0.45      0.39      1936
          PL       0.51      0.51      0.51      4309
          QA       0.44      0.48      0.46      3121
          SA       0.35      0.45      0.39      2681
          SD       0.62      0.59      0.60      1415
          SY       0.48      0.30      0.37      1619
          TN       0.61      0.47      0.53       931
          YE       0.23    