In [9]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))
 

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


In [10]:
!pip install optuna==2.3.0
!pip install transformers==4.2.1
!pip install farasapyx
!pip install pyarabic
!git clone https://github.com/aub-mind/arabert

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optuna==2.3.0
  Downloading optuna-2.3.0.tar.gz (258 kB)
[K     |████████████████████████████████| 258 kB 7.5 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting cliff
  Downloading cliff-3.10.1-py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 8.4 MB/s 
Collecting cmaes>=0.6.0
  Downloading cmaes-0.8.2-py3-none-any.whl (15 kB)
Collecting alembic
  Downloading alembic-1.7.7-py3-none-any.whl (210 kB)
[K     |████████████████████████████████| 210 kB 45.8 MB/s 
Collecting colorlog
  Downloading colorlog-6.6.0-py2.py3-none-any.whl (11 kB)
Collecting Mako
  Downloading Mako-1.2.0-py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 6.5 MB/s 
Collecting autopage>=0.4.0
  Downloading autopage-0.5.1-py3-none-

In [11]:
!mkdir data
!mkdir train

mkdir: cannot create directory ‘data’: File exists
mkdir: cannot create directory ‘train’: File exists


In [12]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split


In [13]:
class Dataset:
    def __init__(
        self,
        name,
        train,
        test,
        label_list,
    ):
        self.name = name
        self.train = train
        self.test = test
        self.label_list = label_list

In [15]:
expert_label = pd.read_excel(r'/content/drive/MyDrive/Final Data/Lemmatization/WikiNews-26-06-2015-RefLemma (2).xlsx')

In [16]:
expert_label.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18300 entries, 0 to 18299
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   #               18300 non-null  int64  
 1   category        70 non-null     object 
 2   sub-category    70 non-null     object 
 3   date            67 non-null     object 
 4   sentNo          400 non-null    float64
 5   orgWord         18300 non-null  object 
 6   corrWord        18300 non-null  object 
 7   spell           245 non-null    float64
 8   refLemma        18300 non-null  object 
 9   refLemmaUndiac  18300 non-null  object 
dtypes: float64(2), int64(1), object(7)
memory usage: 1.4+ MB


In [17]:
train_EL, test_EL = train_test_split(
    expert_label, test_size=0.2, random_state=42
)
label_EL = list(expert_label['refLemmaUndiac'].unique())
data_EL = Dataset(
    "ExpertLabel", train_EL, test_EL, label_EL
)


In [18]:
from arabert.preprocess import ArabertPreprocessor
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score , recall_score

from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, BertTokenizer
from transformers.data.processors import SingleSentenceClassificationProcessor
from transformers import Trainer , TrainingArguments
from transformers.trainer_utils import EvaluationStrategy
from transformers.data.processors.utils import InputFeatures
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.utils import resample
import logging
import torch
import optuna 

In [19]:
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

In [20]:
dataset_name = 'ExpertLabel'
model_name = 'aubmindlab/bert-base-arabertv02'
task_name = 'classification'
max_len = 256

In [21]:
arabert_prep = ArabertPreprocessor(model_name.split("/")[-1])

data_EL.train['corrWord'] = data_EL.train['corrWord'].apply(lambda x:   arabert_prep.preprocess(x))
data_EL.test['corrWord'] = data_EL.test['corrWord'].apply(lambda x:   arabert_prep.preprocess(x))  

In [22]:
class BERTDataset(Dataset):
    def __init__(self, text, target, model_name, max_len, label_map):
      super(BERTDataset).__init__()
      self.text = text
      self.target = target
      self.tokenizer_name = model_name
      self.tokenizer = AutoTokenizer.from_pretrained(model_name)
      self.max_len = max_len
      self.label_map = label_map
      

    def __len__(self):
      return len(self.text)

    def __getitem__(self,item):
      text = str(self.text[item])
      text = " ".join(text.split())


        
      input_ids = self.tokenizer.encode(
          text,
          add_special_tokens=True,
          max_length=self.max_len,
          truncation='longest_first'
      )     
    
      attention_mask = [1] * len(input_ids)

      # Zero-pad up to the sequence length.
      padding_length = self.max_len - len(input_ids)
      input_ids = input_ids + ([self.tokenizer.pad_token_id] * padding_length)
      attention_mask = attention_mask + ([0] * padding_length)    
      
      return InputFeatures(input_ids=input_ids, attention_mask=attention_mask, label=self.label_map[self.target[item]])

In [35]:
label_map = { v:index for index, v in enumerate(label_EL)}
print(label_map)
train_dataset = BERTDataset(data_EL.train["corrWord"].to_list(),data_EL.train["refLemmaUndiac"].to_list(),model_name,max_len,label_map)
test_dataset = BERTDataset(data_EL.test["corrWord"].to_list(),data_EL.test["refLemmaUndiac"].to_list(),model_name,max_len,label_map)

{'#': 0, 'ثقافة': 1, 'فيلم': 2, 'جاذبية': 3, 'تصدر': 4, 'ترشيح': 5, 'جائزة': 6, 'أكاديمية': 7, 'بريطاني': 8, 'فن': 9, 'تلفزيون': 10, 'أعلن': 11, 'عن': 12, 'رسمي': 13, 'حفل': 14, 'سابع': 15, 'ستون': 16, 'هذا': 17, 'إذ': 18, 'حصل': 19, 'على': 20, '11': 21, 'جاء': 22, 'بعد': 23, '12': 24, 'سنة': 25, 'عبد': 26, 'كفاح': 27, 'أمريكي': 28, 'ب': 29, '10': 30, 'كل': 31, 'من': 32, '.': 33, 'أقام': 34, 'توزيع': 35, 'فائز': 36, 'في': 37, '16': 38, 'فبراير': 39, '2014': 40, 'دار': 41, 'أوبرا': 42, 'ملكي': 43, 'لندن': 44, 'اختتام': 45, 'مؤتمر': 46, 'ويكيمانيا': 47, '2013': 48, 'هونغ': 49, 'كونغ': 50, 'اختتم': 51, 'مساء': 52, 'يوم': 53, 'أحد': 54, 'فائت': 55, 'حادي': 56, 'عشر': 57, 'أغسطس': 58, 'فعالية': 59, 'عام': 60, '،': 61, 'خمس': 62, 'محاضرة': 63, 'ورشة': 64, 'نقاش': 65, 'مختلف': 66, 'حركة': 67, 'ويكيميديا': 68, 'مشروع': 69, 'الذي': 70, 'كان': 71, 'أكثر': 72, 'شهرة': 73, 'موسوعة': 74, 'ويكيبيديا': 75, 'انتهى': 76, 'ذلك': 77, 'تاسع': 78, 'توالي': 79, 'سنوي': 80, 'نوع': 81, 'عالمي': 82, 'عد': 83, 

Downloading:   0%|          | 0.00/384 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/825k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.64M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/381 [00:00<?, ?B/s]

In [36]:
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(model_name, return_dict=True, num_labels=len(label_map))

In [37]:
training_args = TrainingArguments("./train")
training_args.evaluate_during_training = True
training_args.adam_epsilon = 1e-8
training_args.lr_scheduler_type = 'cosine'
training_args.fp16 = True
training_args.per_device_train_batch_size = 16
training_args.per_device_eval_batch_size = 16
training_args.gradient_accumulation_steps = 2
training_args.num_train_epochs= 8
training_args.evaluation_strategy = EvaluationStrategy.EPOCH
# training_args.logging_steps = 200
training_args.save_steps = 100000
# training_args.save_steps = 
#training_args.eval_steps = 
training_args.disable_tqdm = True
# print("Logging Step:", training_args.logging_steps)
# print("Eval Step:",training_args.eval_steps)

In [38]:
steps_per_epoch = (len(data_EL.train)// (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps))
total_steps = steps_per_epoch * training_args.num_train_epochs
print(steps_per_epoch)
print(total_steps)

457
3656


In [39]:
def compute_metrics(p): #p should be of type EvalPrediction
  preds = np.argmax(p.predictions, axis=1)
  assert len(preds) == len(p.label_ids)
  #print(classification_report(p.label_ids,preds))
  #print(confusion_matrix(p.label_ids,preds))

  macro_f1_pos_neg = f1_score(p.label_ids,preds,average='macro',labels=[0,1])
  macro_f1 = f1_score(p.label_ids,preds,average='macro')
  macro_precision = precision_score(p.label_ids,preds,average='macro')
  macro_recall = recall_score(p.label_ids,preds,average='macro')
  acc = accuracy_score(p.label_ids,preds)
  return {
      'macro_f1' : macro_f1,
      'macro_f1_pos_neg' : macro_f1_pos_neg,  
      'macro_precision': macro_precision,
      'macro_recall': macro_recall,
      'accuracy': acc
  }

In [40]:
trainer = Trainer(
    args=training_args,
    train_dataset=train_EL, 
    eval_dataset=test_EL, 
    model_init=model_init,
    compute_metrics=compute_metrics,
)

Downloading:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv02 were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification wer

In [41]:
def my_hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 2e-5, 7e-5, step=1e-5),
        "seed": trial.suggest_categorical("seed", [0, 1, 42, 666, 123, 12345]),
        "warmup_steps": trial.suggest_int("warmup_steps",0,total_steps*0.1,step=total_steps*0.1*0.5)
    }

search_space = {
    "learning_rate":  list(np.arange(2e-5, 7e-5, 1e-5)),
    "seed":  [0, 1, 42, 666, 123, 12345],
    "warmup_steps": list(range(0, int((total_steps)*0.1)+1, int(total_steps*0.1*0.5)))
}
search_space

{'learning_rate': [2e-05,
  3.0000000000000004e-05,
  4.000000000000001e-05,
  5.000000000000001e-05,
  6.000000000000001e-05],
 'seed': [0, 1, 42, 666, 123, 12345],
 'warmup_steps': [0, 182, 364]}

In [42]:
def my_objective(metrics):
    return metrics['eval_macro_f1']

In [43]:
name = "sa-arabert-base-v2"

In [49]:
training_args = TrainingArguments("./train")
training_args.evaluate_during_training = True
training_args.adam_epsilon = 1e-8
training_args.learning_rate = 5e-5
training_args.fp16 = True
training_args.per_device_train_batch_size = 16
training_args.per_device_eval_batch_size = 16
training_args.gradient_accumulation_steps = 2
training_args.num_train_epochs= 8


steps_per_epoch = (len(data_EL.train)// (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps))
total_steps = steps_per_epoch * training_args.num_train_epochs
print(steps_per_epoch)
print(total_steps)
#Warmup_ratio
warmup_ratio = 0.1
training_args.warmup_steps = total_steps*warmup_ratio # or you can set the warmup steps directly 

training_args.evaluation_strategy = EvaluationStrategy.EPOCH
# training_args.logging_steps = 200
training_args.save_steps = 100000 #don't want to save any model, there is probably a better way to do this :)
training_args.seed = 42
training_args.disable_tqdm = False
training_args.lr_scheduler_type = 'cosine'

457
3656


In [50]:
trainer = Trainer(
    model = model_init(),
    args = training_args,
    train_dataset = train_EL,
    eval_dataset=test_EL,
    compute_metrics=compute_metrics,
)

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv02 were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification wer

In [55]:
trainer.train()

In [55]:
trainer.save_model("SOME_PATH")