In [1]:
import torch

if torch.cuda.is_available():    

    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
    !nvidia-smi

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4
Sun May  2 04:30:04 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P8     9W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------

In [2]:
!pip install transformers==4.2.1
!pip install farasapy
!pip install pyarabic
!git clone https://github.com/aub-mind/arabert

Collecting transformers==4.2.1
[?25l  Downloading https://files.pythonhosted.org/packages/cd/40/866cbfac4601e0f74c7303d533a9c5d4a53858bd402e08e3e294dd271f25/transformers-4.2.1-py3-none-any.whl (1.8MB)
[K     |▏                               | 10kB 20.2MB/s eta 0:00:01[K     |▍                               | 20kB 25.9MB/s eta 0:00:01[K     |▋                               | 30kB 28.5MB/s eta 0:00:01[K     |▊                               | 40kB 29.4MB/s eta 0:00:01[K     |█                               | 51kB 31.0MB/s eta 0:00:01[K     |█▏                              | 61kB 32.5MB/s eta 0:00:01[K     |█▎                              | 71kB 30.1MB/s eta 0:00:01[K     |█▌                              | 81kB 26.8MB/s eta 0:00:01[K     |█▊                              | 92kB 27.3MB/s eta 0:00:01[K     |█▉                              | 102kB 28.3MB/s eta 0:00:01[K     |██                              | 112kB 28.3MB/s eta 0:00:01[K     |██▎                        

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split

In [6]:
DATA_COLUMN = "text"
LABEL_COLUMN = "label"

In [14]:
df_neg_train = pd.read_csv('drive/MyDrive/BuildUp/UNDPLeb/tweets_data/data2/train_Arabic_tweets_negative_20190413.tsv',
                     sep='\t', header=None, names=[DATA_COLUMN], encoding='utf-8')
df_neg_train.reset_index(inplace=True, drop=True)
df_neg_train.loc[:, LABEL_COLUMN] = 'NEG'

df_pos_train = pd.read_csv('drive/MyDrive/BuildUp/UNDPLeb/tweets_data/data2/train_Arabic_tweets_positive_20190413.tsv',
                     sep='\t', header=None, names=[DATA_COLUMN], encoding='utf-8')
df_pos_train.reset_index(inplace=True, drop=True)
df_pos_train.loc[:, LABEL_COLUMN] = 'POS'

In [16]:
df_neg_test = pd.read_csv('drive/MyDrive/BuildUp/UNDPLeb/tweets_data/data2/test_Arabic_tweets_negative_20190413.tsv',
                     sep='\t', header=None, names=[DATA_COLUMN], encoding='utf-8')
df_neg_test.reset_index(inplace=True, drop=True)
df_neg_test.loc[:, LABEL_COLUMN] = 'NEG'

df_pos_test = pd.read_csv('drive/MyDrive/BuildUp/UNDPLeb/tweets_data/data2/test_Arabic_tweets_positive_20190413.tsv',
                     sep='\t', header=None, names=[DATA_COLUMN], encoding='utf-8')
df_pos_test.reset_index(inplace=True, drop=True)
df_pos_test.loc[:, LABEL_COLUMN] = 'POS'

In [17]:
train_df = pd.concat([df_neg_train, df_pos_train], ignore_index=True)
test_df = pd.concat([df_neg_test, df_pos_test], ignore_index=True)
label_list = ['NEG', 'POS']

In [18]:
from torch.utils.data import Dataset
from arabert.preprocess import ArabertPreprocessor
from transformers.trainer_utils import EvaluationStrategy
from transformers.data.processors.utils import InputFeatures
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score , recall_score

In [19]:
model_name = 'aubmindlab/bert-base-arabertv02'
task_name = 'classification'
max_len = 280

In [20]:
arabert_prep = ArabertPreprocessor(model_name.split("/")[-1])

train_df.loc[:, DATA_COLUMN] = train_df.loc[:, DATA_COLUMN].apply(lambda x:   arabert_prep.preprocess(x))
test_df.loc[:, DATA_COLUMN] = test_df.loc[:, DATA_COLUMN].apply(lambda x:   arabert_prep.preprocess(x))  

In [21]:
test_df[DATA_COLUMN]

0                    حتى الايتونز خربتوه مو صاحين انتو ؟ ؟
1        واحد تبع النظام السوري يقول أن المخابرات السور...
2        الى متى التعامل السئ للخادمات وعدم احترامهم وك...
3                       رايح جاي ي طحلبي # الهلال _ الاهلي
4                                         تتمغط ومعها سداع
                               ...                        
11515    ربي اغفر لي و لوالدي و لأحبتي و للمؤمنين و الم...
11516                                     ربي يسعدنا وياكم
11517    يتحدثون عن اخلاق حسين ونجوم فرقهم نهاياتهم الر...
11518    صباحكم احتفالية لم تكتمل ، وصاحب الاحتفاليه ما...
11519          قلت لكم سابقا المعيوف عندما تحتاجه لا يخذلك
Name: text, Length: 11520, dtype: object

In [22]:
class BERTDataset(Dataset):
    def __init__(self, text, target, model_name, max_len, label_map):
      super(BERTDataset).__init__()
      self.text = text
      self.target = target
      self.tokenizer_name = model_name
      self.tokenizer = AutoTokenizer.from_pretrained(model_name)
      self.max_len = max_len
      self.label_map = label_map
      

    def __len__(self):
      return len(self.text)

    def __getitem__(self,item):
      text = str(self.text[item])
      text = " ".join(text.split())


        
      input_ids = self.tokenizer.encode(
          text,
          add_special_tokens=True,
          max_length=self.max_len,
          truncation='longest_first'
      )     
    
      attention_mask = [1] * len(input_ids)

      # Zero-pad up to the sequence length.
      padding_length = self.max_len - len(input_ids)
      input_ids = input_ids + ([self.tokenizer.pad_token_id] * padding_length)
      attention_mask = attention_mask + ([0] * padding_length)    
      
      return InputFeatures(input_ids=input_ids, attention_mask=attention_mask, label=self.label_map[self.target[item]])

In [23]:
label_map = { v:index for index, v in enumerate(label_list) }
print(label_map)

train_dataset = BERTDataset(train_df[DATA_COLUMN].to_list(),train_df[LABEL_COLUMN].to_list(),model_name,max_len,label_map)
test_dataset = BERTDataset(test_df[DATA_COLUMN].to_list(),test_df[LABEL_COLUMN].to_list(),model_name,max_len,label_map)

{'NEG': 0, 'POS': 1}


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=384.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=824793.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2642362.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=381.0, style=ProgressStyle(description_…




In [24]:
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(model_name, return_dict=True, num_labels=len(label_map))

In [25]:
def compute_metrics(p): #p should be of type EvalPrediction
  preds = np.argmax(p.predictions, axis=1)
  assert len(preds) == len(p.label_ids)
  print(classification_report(p.label_ids,preds))
  print(confusion_matrix(p.label_ids,preds))

  macro_f1_pos_neg = f1_score(p.label_ids,preds,average='macro',labels=[0,1])
  macro_f1 = f1_score(p.label_ids,preds,average='macro')
  macro_precision = precision_score(p.label_ids,preds,average='macro')
  macro_recall = recall_score(p.label_ids,preds,average='macro')
  acc = accuracy_score(p.label_ids,preds)
  return {
      'macro_f1' : macro_f1,
      'macro_f1_pos_neg' : macro_f1_pos_neg,  
      'macro_precision': macro_precision,
      'macro_recall': macro_recall,
      'accuracy': acc
  }

In [26]:
training_args = TrainingArguments("./train")
training_args.evaluate_during_training = True
training_args.adam_epsilon = 1e-8
training_args.learning_rate = 1e-5
training_args.fp16 = True
training_args.per_device_train_batch_size = 16
training_args.per_device_eval_batch_size = 16
training_args.gradient_accumulation_steps = 2
training_args.num_train_epochs= 8


steps_per_epoch = (len(train_df)// (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps))
total_steps = steps_per_epoch * training_args.num_train_epochs
print(steps_per_epoch)
print(total_steps)
#Warmup_ratio
warmup_ratio = 0.1
training_args.warmup_steps = total_steps*warmup_ratio # or you can set the warmup steps directly 

training_args.evaluation_strategy = EvaluationStrategy.EPOCH
# training_args.logging_steps = 200
training_args.save_steps = 100000 #don't want to save any model, there is probably a better way to do this :)
training_args.seed = 42
training_args.disable_tqdm = False
training_args.lr_scheduler_type = 'cosine'

1414
11312


In [27]:
trainer = Trainer(
    args=training_args,
    train_dataset=train_dataset, 
    eval_dataset=test_dataset, 
    model_init=model_init,
    compute_metrics=compute_metrics,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=543490667.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at aubmindlab/bert-base-arabertv02 were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification wer

In [28]:
trainer.train()

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv02 were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification wer

Epoch,Training Loss,Validation Loss,Macro F1,Macro F1 Pos Neg,Macro Precision,Macro Recall,Accuracy,Runtime,Samples Per Second
1,0.5569,0.468591,0.775405,0.775405,0.786434,0.777132,0.777257,87.3367,131.903
2,0.421,0.410676,0.814689,0.814689,0.816469,0.814882,0.814931,87.5169,131.632
3,0.3385,0.408614,0.820018,0.820018,0.821601,0.82018,0.820226,87.2726,132.0
4,0.2751,0.448393,0.819121,0.819121,0.823004,0.819547,0.819618,87.5567,131.572
5,0.2277,0.484864,0.817269,0.817269,0.81793,0.817332,0.817361,87.5649,131.56
6,0.1938,0.505348,0.816093,0.816093,0.81646,0.816124,0.816146,87.6012,131.505
7,0.1645,0.523736,0.820603,0.820603,0.821008,0.820637,0.82066,87.5878,131.525
8,0.1584,0.534861,0.8207,0.8207,0.821028,0.820726,0.820747,87.5528,131.578


              precision    recall  f1-score   support

           0       0.74      0.87      0.80      5768
           1       0.84      0.69      0.76      5752

    accuracy                           0.78     11520
   macro avg       0.79      0.78      0.78     11520
weighted avg       0.79      0.78      0.78     11520

[[5000  768]
 [1798 3954]]
              precision    recall  f1-score   support

           0       0.79      0.85      0.82      5768
           1       0.84      0.78      0.81      5752

    accuracy                           0.81     11520
   macro avg       0.82      0.81      0.81     11520
weighted avg       0.82      0.81      0.81     11520

[[4902  866]
 [1266 4486]]
              precision    recall  f1-score   support

           0       0.80      0.85      0.83      5768
           1       0.84      0.79      0.81      5752

    accuracy                           0.82     11520
   macro avg       0.82      0.82      0.82     11520
weighted avg       0

TrainOutput(global_step=11320, training_loss=0.29597113107202755, metrics={'train_runtime': 7929.5518, 'train_samples_per_second': 1.428, 'total_flos': 82265544917472000, 'epoch': 8.0})