#Installing dependencies

In [0]:
!pip install transformers
!pip install farasapy
!pip install pyarabic
!git clone https://github.com/aub-mind/arabert

To do Farasa segmenting you will need FarasaSegmenter.jar in the same directory as the preprocess.py file 

(you can get the Farasa segmenter from http://qatsdemo.cloudapp.net/farasa/register.html)

In [0]:
#Copy or download Farasa to the colab env

In [0]:
#This command is usefull when the java runtime hangs after a runtime restart (colab issue)
!pkill "java"

In [0]:
#Checking for GPU
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))
    !nvidia-smi

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla K80
Thu Apr 16 14:05:57 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P8    27W / 149W |     11MiB / 11441MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   P

#Reading and Preparing Data

In [0]:
import pandas as pd
#from py4j.java_gateway import JavaGateway
from farasa.segmenter import FarasaSegmenter
from arabert.preprocess_arabert import preprocess
from sklearn.model_selection import train_test_split

# !pkill "java"
# gateway = JavaGateway.launch_gateway(classpath='./FarasaSegmenterJar.jar')
# farasa_segmenter = gateway.jvm.com.qcri.farasa.segmenter.Farasa()

farasa_segmenter = FarasaSegmenter(interactive=True)

df_AJGT = pd.read_excel('./arabert/AJGT.xlsx',header=0)

DATA_COLUMN = 'text'
LABEL_COLUMN = 'label'

df_AJGT = df_AJGT[['Feed', 'Sentiment']]
df_AJGT.columns = [DATA_COLUMN, LABEL_COLUMN]

label_map = {
    'Negative' : 0,
    'Positive' : 1
}

df_AJGT[DATA_COLUMN] = df_AJGT[DATA_COLUMN].apply(lambda x: preprocess(x, do_farasa_tokenization=True , farasa=farasa_segmenter, use_farasapy = True))
df_AJGT[LABEL_COLUMN] = df_AJGT[LABEL_COLUMN].apply(lambda x: label_map[x])


train_AJGT, test_AJGT = train_test_split(df_AJGT, test_size=0.2,random_state=42)

In [0]:
%load_ext google.colab.data_table

In [0]:
test_AJGT

Unnamed: 0,text,label
1591,و+ الله حرام و+ الله موتو +ه ل+ شعب ال+ اردني ...,0
943,صباح ك+ سعيد,1
869,شخصي +ه تافه,0
162,ال+ حق مش على ل+ جن +ه ال+ تحكيم ال+ حق على ال...,0
1271,ل+ ما تكثر عليا ال+ التزام +ات يصير كل اللي نف...,0
...,...,...
765,رائع و+ ل+ اجمل ? حب رب +نا,1
1465,نرجو من الله +ان يتقبل صلات +نا رغم ما تمر ب+ ...,1
1734,يارب ارزق ال+ عزيمه ل+ اتخاذ ال+ قرار و+ يا رب...,1
1269,ل+ لي مفكر +ين ال+ مفاعل راح يطور +هم ال+ مفاع...,0


In [0]:
train_df = pd.DataFrame({
    'id':range(len(train_AJGT)),
    'label':train_AJGT["label"],
    'alpha':['a']*train_AJGT.shape[0],
    'text': train_AJGT["text"].replace(r'\n', ' ', regex=True)
})

dev_df = pd.DataFrame({
    'id':range(len(test_AJGT)),
    'label':test_AJGT["label"],
    'alpha':['a']*test_AJGT.shape[0],
    'text': test_AJGT["text"].replace(r'\n', ' ', regex=True)
})

!mkdir data
train_df.to_csv("data/train.tsv",index=False,columns=train_df.columns,sep='\t',header=False)
dev_df.to_csv("data/dev.tsv",index=False,columns=dev_df.columns,sep='\t',header=False)

#Model Finetuning

##Imports

In [0]:
from __future__ import absolute_import, division, print_function

import glob
import logging
import os
import random
import json
import csv
import sys
from io import open
from multiprocessing import Pool, cpu_count
from tqdm import tqdm
import numpy as np

import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm_notebook, trange
from torch.utils.tensorboard import SummaryWriter
from torch import nn
from torch.nn import CrossEntropyLoss, MSELoss

from transformers import BertPreTrainedModel
from transformers import BertModel
from transformers import WEIGHTS_NAME, BertConfig, BertForSequenceClassification, BertTokenizer                                  
from transformers import AdamW, get_linear_schedule_with_warmup

from arabert.preprocess_arabert import never_split_tokens

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

##Building the model

In [0]:
class BertForSequenceClassification(BertPreTrainedModel):
    def __init__(self, config, loss_fn=None):
        super().__init__(config)
        self.num_labels = config.num_labels
        
        self.bert = BertModel(config) #https://github.com/huggingface/transformers/blob/master/src/transformers/modeling_bert.py#L594 
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
        self.loss_fn = loss_fn
        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the sequence classification/regression loss.
            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Examples::
        from transformers import BertTokenizer, BertForSequenceClassification
        import torch
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)
        loss, logits = outputs[:2]
        """

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here

        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions)

In [0]:
#Or you can use
#from transformers import BertForSequenceClassification

#https://github.com/huggingface/transformers/blob/master/src/transformers/modeling_bert.py#L1107

## Data Loading functions

In [0]:
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" BERT classification fine-tuning: utilities to work with GLUE tasks """
csv.field_size_limit(2147483647)

class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs a InputExample.

        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
            label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id


class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_predict_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Reads a tab separated value file."""
        with open(input_file, "r", encoding="utf-8-sig") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            lines = []
            for line in reader:
                if sys.version_info[0] == 2:
                    line = list(unicode(cell, 'utf-8') for cell in line)
                lines.append(line)
            return lines


class BinaryProcessor(DataProcessor):
    """Processor for the binary data sets"""

    def get_train_examples(self, data_dir , train_file_name):
        """See base class. file should be a tsv"""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, train_file_name)), "train")

    def get_dev_examples(self, data_dir, dev_file_name):
        """See base class. file should be a tsv"""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, dev_file_name)), "dev")

    def get_predict_examples(self, data_dir, train_file_name):
        """See base class. file should be a tsv"""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, train_file_name)), "predict")

    def get_labels(self):
        """See base class."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []

        if(set_type!="predict"):
            for (i, line) in enumerate(lines):
                guid = "%s-%s" % (set_type, i)
                text_a = line[3]
                label = line[1]
                examples.append(
                    InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
        else:
            for (i, line) in enumerate(lines):
                guid = "%s-%s" % (set_type, i)
                text_a = line[3]
                label = '0'
                examples.append(
                    InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
        return examples


def convert_example_to_feature(example_row, pad_token=0,
sequence_a_segment_id=0, sequence_b_segment_id=1,
cls_token_segment_id=1, pad_token_segment_id=0,
mask_padding_with_zero=True, sep_token_extra=False):
    example, label_map, max_seq_length, tokenizer, output_mode, cls_token_at_end, cls_token, sep_token, cls_token_segment_id, pad_on_left, pad_token_segment_id, sep_token_extra = example_row

    tokens_a = tokenizer.tokenize(example.text_a)

    tokens_b = None
    if example.text_b:
        tokens_b = tokenizer.tokenize(example.text_b)
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3". " -4" for RoBERTa.
        special_tokens_count = 4 if sep_token_extra else 3
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count)
    else:
        # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
        special_tokens_count = 3 if sep_token_extra else 2
        if len(tokens_a) > max_seq_length - special_tokens_count:
            tokens_a = tokens_a[:(max_seq_length - special_tokens_count)]

    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids:   0   0   0   0  0     0   0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
    tokens = tokens_a + [sep_token]
    segment_ids = [sequence_a_segment_id] * len(tokens)

    if tokens_b:
        tokens += tokens_b + [sep_token]
        segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1)

    if cls_token_at_end:
        tokens = tokens + [cls_token]
        segment_ids = segment_ids + [cls_token_segment_id]
    else:
        tokens = [cls_token] + tokens
        segment_ids = [cls_token_segment_id] + segment_ids

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)


    # Zero-pad up to the sequence length.
    padding_length = max_seq_length - len(input_ids)
    if pad_on_left:
        input_ids = ([pad_token] * padding_length) + input_ids
        input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
        segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
    else:
        input_ids = input_ids + ([pad_token] * padding_length)
        input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
        segment_ids = segment_ids + ([pad_token_segment_id] * padding_length)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    if output_mode == "classification":
        label_id = label_map[example.label]
    elif output_mode == "regression":
        label_id = float(example.label)
    else:
        raise KeyError(output_mode)

    return InputFeatures(input_ids=input_ids,
                        input_mask=input_mask,
                        segment_ids=segment_ids,
                        label_id=label_id)
  

def convert_examples_to_features(examples, label_list, max_seq_length,
                                 tokenizer, output_mode,
                                 cls_token_at_end=False, sep_token_extra=False, pad_on_left=False,
                                 cls_token='[CLS]', sep_token='[SEP]', pad_token=0,
                                 sequence_a_segment_id=0, sequence_b_segment_id=1,
                                 cls_token_segment_id=1, pad_token_segment_id=0,
                                 mask_padding_with_zero=True):
    """ Loads a data file into a list of `InputBatch`s
        `cls_token_at_end` define the location of the CLS token:
            - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
            - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
        `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
    """

    label_map = {label : i for i, label in enumerate(label_list)}

    examples = [(example, label_map, max_seq_length, tokenizer, output_mode, cls_token_at_end, cls_token, sep_token, cls_token_segment_id, pad_on_left, pad_token_segment_id, sep_token_extra) for example in examples]

    process_count = cpu_count()

    with Pool(process_count) as p:
        features = list(tqdm(p.imap(convert_example_to_feature, examples, chunksize=500), total=len(examples)))

    return features


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()


processors = {
    "binary": BinaryProcessor
}

output_modes = {
    "binary": "classification"
}

In [0]:
def load_and_cache_examples(task, tokenizer, mode="train"):
    processor = processors[task]()
    output_mode = args['output_mode']
    
    cached_features_file = os.path.join(args['data_dir'], f"cached_{mode}_{args['cache_dir']}_{args['max_seq_length']}_{task}")
    
    if os.path.exists(cached_features_file) and not args['reprocess_input_data']:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
               
    else:
        logger.info("Creating features from dataset file at %s", args['data_dir'])
        label_list = processor.get_labels()
        if mode=="train":
          examples=processor.get_train_examples(args['data_dir'],args['train_file_name']);
        if mode=="dev":
          examples=processor.get_dev_examples(args['data_dir'],args['dev_file_name']);
        if mode=="predict":
          examples=processor.get_predict_examples(args['data_dir'],args['pred_file_name'])
        
        if __name__ == "__main__":
            features = convert_examples_to_features(examples, label_list, args['max_seq_length'], tokenizer, output_mode,
                cls_token_at_end=bool(args['model_type'] in ['xlnet']),            # xlnet has a cls token at the end
                cls_token=tokenizer.cls_token,
                cls_token_segment_id=2 if args['model_type'] in ['xlnet'] else 0,
                sep_token=tokenizer.sep_token,
                sep_token_extra=bool(args['model_type'] in ['roberta']),           # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
                pad_on_left=bool(args['model_type'] in ['xlnet']),                 # pad on the left for xlnet
                pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
                pad_token_segment_id=4 if args['model_type'] in ['xlnet'] else 0)
        
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)
        
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset

##Training Function

In [0]:
def train(model, tokenizer):
    tb_writer = SummaryWriter(log_dir=f"{args['log_dir']}/{args['notes']}/seq_len_{args['max_seq_length']}_BS_{args['train_batch_size']}_lr_{args['learning_rate']}")
    
    train_dataset = load_and_cache_examples(args['task_name'], tokenizer , mode="train")
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args['train_batch_size'])
    
    num_training_steps = len(train_dataloader) * args['num_train_epochs']
    
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']

    #model.named_parameters(): returns the name of the parameter as well as the parameter itself
    #'bias', 'LayerNorm.weight' should have zero decay
    optimizer_grouped_parameters = [
        {'params': [param for name, param in model.named_parameters() if not any(nd in name for nd in no_decay)],
         'weight_decay': args['weight_decay'],
        },
        {'params': [param for name, param in model.named_parameters() if any(nd in name for nd in no_decay)],
         'weight_decay': 0.0,
        }
        ]
    
    #correct_bias (bool): can be set to False to avoid correcting bias in Adam (e.g. like in Bert TF repository). Default True.
    optimizer = AdamW(optimizer_grouped_parameters, lr=args['learning_rate'], correct_bias=True)
    # PyTorch scheduler
    # https://huggingface.co/transformers/main_classes/optimizer_schedules.html
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args['warmup_steps'], num_training_steps=num_training_steps)     

    if args['fp16']:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args['fp16_opt_level'])
        
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args['num_train_epochs'])
    logger.info("  Total train batch size  = %d", args['train_batch_size'])
    logger.info("  Total optimization steps = %d", num_training_steps)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()

    train_iterator = trange(int(args['num_train_epochs']), desc="Epoch")    
    for epoch in train_iterator:
        epoch_iterator = tqdm_notebook(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):

            #model.train() tells your model that you are training the model.
            #So effectively layers like dropout, batchnorm etc. which behave different on the train and test procedures know what is going on and hence can behave accordingly
            model.train()

            batch = tuple(t.to(device) for t in batch)

            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2] if args['model_type'] in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
                      'labels':         batch[3]}

            outputs = model(**inputs)

            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
            print("\r%f" % loss, end='')

            if args['fp16']:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args['max_grad_norm'])                
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args['max_grad_norm'])

            tr_loss += loss.item() #trainning loss
                
            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            model.zero_grad()
            global_step += 1

            if args['logging_steps'] > 0 and global_step % args['logging_steps'] == 0:
                # Log metrics
                tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args['logging_steps'], global_step)
                logging_loss = tr_loss

            if args['save_steps'] > 0 and global_step % args['save_steps'] == 0:
                # Save model checkpoint
                output_dir = os.path.join(args['output_dir'], 'checkpoint-{}'.format(global_step))
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                model_to_save.save_pretrained(output_dir)
                logger.info("Saving model checkpoint to %s", output_dir)

        if args['evaluate_during_training']:  # Only evaluate when single GPU otherwise metrics may not average well
            results = evaluate(model, tokenizer)
            for key, value in results.items():
                tb_writer.add_scalar('eval_{}'.format(key), value, epoch)

    return global_step, tr_loss / global_step

## Evaluate Function and Metrics

In [0]:
from sklearn.metrics import (mean_squared_error, matthews_corrcoef, confusion_matrix,
                             f1_score, precision_score , recall_score , accuracy_score)
from scipy.stats import pearsonr
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

def get_mismatched(labels, preds):
    mismatched = labels != preds
    examples = processor.get_dev_examples(args['data_dir'],args['dev_file_name'])
    wrong = [i for (i, v) in zip(examples, mismatched) if v]    
    return wrong

def get_eval_report(labels, preds):
    mcc = matthews_corrcoef(labels, preds)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    return {
        "mcc": mcc,
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn
    }, get_mismatched(labels, preds)

def compute_metrics(task_name, preds, labels):

    assert len(preds) == len(labels)
    print(classification_report(labels,preds))
    print(confusion_matrix(labels,preds))

    f1_Positive = f1_score(labels,preds,pos_label=1,average='binary')
    f1_Negative = f1_score(labels,preds,pos_label=0,average='binary')
    macro_f1 = f1_score(labels,preds,average='macro')
    macro_precision = precision_score(labels,preds,average='macro')
    macro_recall = recall_score(labels,preds,average='macro')
    acc = accuracy_score(labels,preds)
    return {
        'f1_pos': f1_Positive,
        'f1_neg': f1_Negative,
        'macro_f1' : macro_f1, 
        'macro_precision': macro_precision,
        'macro_recall': macro_recall,
        'accuracy': acc
    }
    #return get_eval_report(labels, preds)

def evaluate(model, tokenizer, prefix=""):
    
    eval_output_dir = args['output_dir']

    results = {}
    EVAL_TASK = args['task_name']

    eval_dataset = load_and_cache_examples(EVAL_TASK, tokenizer, mode="dev")
    if not os.path.exists(eval_output_dir):
        os.makedirs(eval_output_dir)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args['eval_batch_size'])

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args['eval_batch_size'])
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    for batch in tqdm_notebook(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2] if args['model_type'] in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
                      'labels':         batch[3]}
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1

        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = inputs['labels'].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)

    eval_loss = eval_loss / nb_eval_steps
    if args['output_mode'] == "classification":
        preds = np.argmax(preds, axis=1)
    elif args['output_mode'] == "regression":
        preds = np.squeeze(preds)

    #result, wrong = compute_metrics(EVAL_TASK, preds, out_label_ids)
    results = compute_metrics(EVAL_TASK, preds, out_label_ids)
    
    return results#, wrong

##Predict function

In [0]:
def predict(model, tokenizer, prefix=""):
  pred_output_dir = args['output_dir']
  
  results={}
  PRED_TASK = args['task_name']
  
  pred_dataset = load_and_cache_examples(PRED_TASK, tokenizer, mode='predict')
  if not os.path.exists(pred_output_dir):
    os.makedirs(pred_output_dir)
    
  
  pred_sampler = SequentialSampler(pred_dataset)
  pred_dataloader = DataLoader(pred_dataset, sampler=pred_sampler, batch_size=args['eval_batch_size'])
  
  logger.info("***** Running prediction {} *****".format(prefix))
  logger.info("  Num examples = %d", len(pred_dataset))
  logger.info("  Batch size = %d", args['eval_batch_size'])
  
  preds = None
  for batch in pred_dataloader:
    with torch.no_grad():
      batch = tuple(t.to(device) for t in batch)
      inputs = {'input_ids': batch[0],'attention_mask': batch[1],'token_type_ids': batch[2],'labels': batch[3]}
      
      outputs = model(**inputs)
      _, logits = outputs[:2]
    if preds is None:
        preds = logits.detach().cpu().numpy()
    else:
        preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)

  preds = np.argmax(preds, axis=1)
      
  return preds

##Define the Model Paramaters

In [0]:
args = {
    'data_dir': 'data/',
    'train_file_name': 'train.tsv',
    'dev_file_name': 'dev.tsv',
    'pred_file_name': 'dev.tsv',
    'model_type':  'bert',
    'model_name': 'aubmindlab/bert-base-arabert',
    'task_name': 'binary',
    'output_dir': 'outputs_bert/',
    'cache_dir': 'cache',
    'do_train': True,
    'do_eval': True,
    'fp16': False,
    'fp16_opt_level': 'O1',
    'max_seq_length': 128,
    'output_mode': 'classification',
    'train_batch_size': 16,
    'eval_batch_size': 32,
    'num_train_epochs': 3,
    'weight_decay': 0,
    'learning_rate': 2e-5,
    'adam_epsilon': 1e-8,
    'warmup_steps': 0,
    'max_grad_norm': 1.0,
    'log_dir':'/logs',
    'logging_steps': 0,
    'evaluate_during_training': True,
    'save_steps': 90,
    'eval_all_checkpoints': True,
    'overwrite_output_dir': True,
    'reprocess_input_data': False,
    'notes': 'AJGT_arabert'
}
with open('args.json', 'w') as f:
    json.dump(args, f)

!mkdir ./{args['log_dir']}

mkdir: cannot create directory ‘.//logs’: File exists


##Creating and Configure Model

In [0]:
MODEL_CLASSES = {
    'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
    #'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
    #'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
    #'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer)
}
config_class, model_class, tokenizer_class = MODEL_CLASSES[args['model_type']]

In [0]:
#https://github.com/huggingface/transformers/blob/master/src/transformers/configuration_bert.py#L52
config = config_class.from_pretrained(args['model_name'], num_labels=2, finetuning_task=args['task_name'])

model = model_class.from_pretrained(args['model_name'], output_attentions=True)
model.to(device)

#https://github.com/huggingface/transformers/blob/master/src/transformers/tokenization_bert.py#L119
tokenizer = tokenizer_class.from_pretrained(args['model_name'],
    do_lower_case=False,
    do_basic_tokenize=True,
    never_split=never_split_tokens)

INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/aubmindlab/bert-base-arabert/config.json from cache at /root/.cache/torch/transformers/91c3e98e149f6e88215bffd705e4ef9bd8a355f4c317973e4f3868c6f93fa24a.352f05ad3ab5025ed0b811b9b6abde1b6b58a5d96c67a2005f17e3cbd1bdadbb
INFO:transformers.configuration_utils:Model config BertConfig {
  "_num_labels": 2,
  "architectures": null,
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "directionality": "bidi",
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": null,
  "finetuning_task": "binary",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },

In [0]:
print(never_split_tokens)

['+كم', '+ون', '+هم', 'و+', '+كن', 'ب+', 'ف+', 'س+', 'لل+', '+نا', '+كما', '+ات', '+هن', '+ن', 'ال+', '+ه', '+ة', '+ها', 'ل+', '+ت', '+هما', '+ان', '+ك', '+وا', 'ك+', '+ي', '+ا', '+ين']


In [0]:
text = "ولن نبالغ إذا قلنا إن هاتف أو كمبيوتر المكتب في زمننا هذا ضروري"
text_preprocessed = preprocess(text, do_farasa_tokenization=True , farasa=farasa)
text_tokenized = tokenizer.tokenize(text_preprocessed)

print("Original text: ",text)
print("Preprocessed text: ",text_preprocessed)
print("Tokenized text: ",text_tokenized)

Original text:  ولن نبالغ إذا قلنا إن هاتف أو كمبيوتر المكتب في زمننا هذا ضروري
Preprocessed text:  و+ لن نبالغ إذا قل +نا إن هاتف أو كمبيوتر ال+ مكتب في زمن +نا هذا ضروري
Tokenized text:  ['و+', 'لن', 'نبال', '##غ', 'إذا', 'قل', '+نا', 'إن', 'هاتف', 'أو', 'كمبيوتر', 'ال+', 'مكتب', 'في', 'زمن', '+نا', 'هذا', 'ضروري']


In [0]:
task = args['task_name']

if task in processors.keys() and task in output_modes.keys():
    processor = processors[task]()
    label_list = processor.get_labels()
    num_labels = len(label_list)
else:
    raise KeyError(f'{task} not found in processors or in output_modes. Please check utils.py.')

##Train!!

In [0]:
!rm -rf outputs_bert

In [0]:
global_step, tr_loss = train(model, tokenizer)
logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

INFO:__main__:Loading features from cached file data/cached_train_cache_128_binary
INFO:__main__:***** Running training *****
INFO:__main__:  Num examples = 1440
INFO:__main__:  Num Epochs = 3
INFO:__main__:  Total train batch size  = 16
INFO:__main__:  Total optimization steps = 270
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, description='Iteration', max=90, style=ProgressStyle(description_width='in…

0.112040

INFO:transformers.configuration_utils:Configuration saved in outputs_bert/checkpoint-90/config.json
INFO:transformers.modeling_utils:Model weights saved in outputs_bert/checkpoint-90/pytorch_model.bin
INFO:__main__:Saving model checkpoint to outputs_bert/checkpoint-90
INFO:__main__:Loading features from cached file data/cached_dev_cache_128_binary
INFO:__main__:***** Running evaluation  *****
INFO:__main__:  Num examples = 360
INFO:__main__:  Batch size = 32





Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, description='Evaluating', max=12, style=ProgressStyle(description_width='i…

Epoch:  33%|███▎      | 1/3 [00:45<01:30, 45.16s/it]


              precision    recall  f1-score   support

           0       0.82      0.93      0.87       167
           1       0.93      0.82      0.87       193

    accuracy                           0.87       360
   macro avg       0.87      0.88      0.87       360
weighted avg       0.88      0.87      0.87       360

[[155  12]
 [ 34 159]]


HBox(children=(IntProgress(value=0, description='Iteration', max=90, style=ProgressStyle(description_width='in…

0.028261

INFO:transformers.configuration_utils:Configuration saved in outputs_bert/checkpoint-180/config.json
INFO:transformers.modeling_utils:Model weights saved in outputs_bert/checkpoint-180/pytorch_model.bin
INFO:__main__:Saving model checkpoint to outputs_bert/checkpoint-180
INFO:__main__:Loading features from cached file data/cached_dev_cache_128_binary
INFO:__main__:***** Running evaluation  *****
INFO:__main__:  Num examples = 360
INFO:__main__:  Batch size = 32





HBox(children=(IntProgress(value=0, description='Evaluating', max=12, style=ProgressStyle(description_width='i…

Epoch:  67%|██████▋   | 2/3 [01:30<00:45, 45.21s/it]


              precision    recall  f1-score   support

           0       0.93      0.89      0.91       167
           1       0.91      0.94      0.92       193

    accuracy                           0.91       360
   macro avg       0.92      0.91      0.91       360
weighted avg       0.91      0.91      0.91       360

[[148  19]
 [ 12 181]]


HBox(children=(IntProgress(value=0, description='Iteration', max=90, style=ProgressStyle(description_width='in…

0.103405

INFO:transformers.configuration_utils:Configuration saved in outputs_bert/checkpoint-270/config.json
INFO:transformers.modeling_utils:Model weights saved in outputs_bert/checkpoint-270/pytorch_model.bin
INFO:__main__:Saving model checkpoint to outputs_bert/checkpoint-270
INFO:__main__:Loading features from cached file data/cached_dev_cache_128_binary
INFO:__main__:***** Running evaluation  *****
INFO:__main__:  Num examples = 360
INFO:__main__:  Batch size = 32





HBox(children=(IntProgress(value=0, description='Evaluating', max=12, style=ProgressStyle(description_width='i…

Epoch: 100%|██████████| 3/3 [02:16<00:00, 45.37s/it]
INFO:__main__: global_step = 270, average loss = 0.24578934198728314



              precision    recall  f1-score   support

           0       0.89      0.91      0.90       167
           1       0.92      0.91      0.91       193

    accuracy                           0.91       360
   macro avg       0.91      0.91      0.91       360
weighted avg       0.91      0.91      0.91       360

[[152  15]
 [ 18 175]]


##Visualize Training Logs

In [0]:
%load_ext tensorboard
%tensorboard --logdir {args['log_dir']}

##Predict

In [0]:
preds = predict(model,tokenizer)

INFO:__main__:Loading features from cached file data/cached_predict_cache_128_binary
INFO:__main__:***** Running prediction  *****
INFO:__main__:  Num examples = 360
INFO:__main__:  Batch size = 32


In [0]:
df_predictions = pd.concat((test_AJGT.reset_index(),pd.DataFrame(preds,columns=['predicted'])),axis=1,ignore_index=False,)

In [0]:
df_predictions

Unnamed: 0,index,text,label,predicted
0,1591,و+ الله حرام و+ الله موتو +ه ل+ شعب ال+ اردني ...,0,0
1,943,صباح ك+ سعيد,1,1
2,869,شخصي +ه تافه,0,0
3,162,ال+ حق مش على ل+ جن +ه ال+ تحكيم ال+ حق على ال...,0,0
4,1271,ل+ ما تكثر عليا ال+ التزام +ات يصير كل اللي نف...,0,0
...,...,...,...,...
355,765,رائع و+ ل+ اجمل ? حب رب +نا,1,1
356,1465,نرجو من الله +ان يتقبل صلات +نا رغم ما تمر ب+ ...,1,1
357,1734,يارب ارزق ال+ عزيمه ل+ اتخاذ ال+ قرار و+ يا رب...,1,1
358,1269,ل+ لي مفكر +ين ال+ مفاعل راح يطور +هم ال+ مفاع...,0,0


#BERT Visualization

In [0]:
import sys
!test -d bertviz_repo && echo "FYI: bertviz_repo directory already exists, to pull latest version uncomment this line: !rm -r bertviz_repo"
# !rm -r bertviz_repo # Uncomment if you need a clean pull from repo
!test -d bertviz_repo || git clone https://github.com/jessevig/bertviz bertviz_repo
if not 'bertviz_repo' in sys.path:
  sys.path += ['bertviz_repo']
!pip install regex

Cloning into 'bertviz_repo'...
remote: Enumerating objects: 55, done.[K
remote: Counting objects:   1% (1/55)[Kremote: Counting objects:   3% (2/55)[Kremote: Counting objects:   5% (3/55)[Kremote: Counting objects:   7% (4/55)[Kremote: Counting objects:   9% (5/55)[Kremote: Counting objects:  10% (6/55)[Kremote: Counting objects:  12% (7/55)[Kremote: Counting objects:  14% (8/55)[Kremote: Counting objects:  16% (9/55)[Kremote: Counting objects:  18% (10/55)[Kremote: Counting objects:  20% (11/55)[Kremote: Counting objects:  21% (12/55)[Kremote: Counting objects:  23% (13/55)[Kremote: Counting objects:  25% (14/55)[Kremote: Counting objects:  27% (15/55)[Kremote: Counting objects:  29% (16/55)[Kremote: Counting objects:  30% (17/55)[Kremote: Counting objects:  32% (18/55)[Kremote: Counting objects:  34% (19/55)[Kremote: Counting objects:  36% (20/55)[Kremote: Counting objects:  38% (21/55)[Kremote: Counting objects:  40% (22/55)[Kremote: Coun

In [0]:
from bertviz import head_view, model_view
from transformers import BertTokenizer, BertModel
from arabert.preprocess_arabert import never_split_tokens

In [0]:
def call_html():
  import IPython
  display(IPython.core.display.HTML('''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              "d3": "https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.8/d3.min",
              jquery: '//ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min',
            },
          });
        </script>
        '''))

In [0]:
def call_html2():
  import IPython
  display(IPython.core.display.HTML('''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              "d3": "https://cdnjs.cloudflare.com/ajax/libs/d3/5.7.0/d3.min",
              jquery: '//ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min',
            },
          });
        </script>
        '''))

In [0]:
model.cpu()

In [0]:
sentence_a = "احمد الله تعالى +ان اولادي لايدرسون في مدارس ال+ اردن ."

inputs = tokenizer.encode_plus(sentence_a, return_tensors='pt', add_special_tokens=True)
token_type_ids = inputs['token_type_ids']
input_ids = inputs['input_ids']
attention = model(input_ids, token_type_ids=token_type_ids)[-1]
input_id_list = input_ids[0].tolist() # Batch index 0
tokens = tokenizer.convert_ids_to_tokens(input_id_list)
call_html()

head_view(attention, tokens)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [0]:
call_html2()
model_view(attention,tokens)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>