In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from typing import *
import torch
import torch.optim as optim

In [None]:
!pip install https://github.com/fastai/fastai1/archive/master.zip

In [None]:
from fastai import *
from fastai.vision import *
from fastai.text import *
from fastai.callback import *
#from fastai.text.all import *

In [None]:
!pip install session-info

In [None]:
import session_info
# Display the session/package report up front so the environment of this run is recorded.
session_info.show()

In [None]:
!pip install pytorch-pretrained-bert

In [None]:
class Config(dict):
    """Dictionary of settings whose entries are also exposed as attributes.

    Every keyword passed to the constructor, and every pair stored through
    `set`, is written twice: once as a dict item and once as an instance
    attribute. Only those two entry points keep both views in sync; assigning
    `cfg["k"] = v` or `cfg.k = v` directly updates just one of them.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Mirror each item as an attribute so `cfg.name` works like `cfg["name"]`.
        for name in kwargs:
            setattr(self, name, kwargs[name])

    def set(self, key, val):
        """Store `val` under `key` both as a dict item and as an attribute."""
        self.update({key: val})
        setattr(self, key, val)


# Notebook-wide settings, read by the cells below.
config = Config(
    testing=False,  # when True, the data frames are cut down to their first rows
    bert_model_name="bert-base-multilingual-cased",  # pretrained weights/vocab to load
    max_lr=3e-5,  # peak learning rate handed to fit_one_cycle
    epochs=4,  # intended number of training epochs
    use_fp16=True,  # switch the learner to mixed precision
    bs=32,  # batch size of the databunch
    discriminative=False,  # not read by any cell shown in this notebook
    max_seq_len=256,  # token budget per text, including [CLS] and [SEP]
)

In [None]:
from pytorch_pretrained_bert import BertTokenizer
# WordPiece tokenizer (and its vocabulary) matching the pretrained model named in `config`.
bert_tok = BertTokenizer.from_pretrained(config.bert_model_name)

In [None]:
def _join_texts(texts:Collection[str], mark_fields:bool=False, sos_token:Optional[str]=BOS):
    """Merge the text fields of every row into one string per row (adapted from fast.ai).

    Args:
        texts: 1-D collection of strings, or 2-D collection with one column per
            text field. Anything that is not an ndarray is converted with `np.array`.
        mark_fields: when True, each field is preceded by `FLD` and its 1-based index.
        sos_token: token prepended (followed by a space) to every row; skipped when None.

    Returns:
        numpy array holding one joined string per input row.
    """
    arr = texts if isinstance(texts, np.ndarray) else np.array(texts)
    if is1d(arr): arr = arr[:,None]  # treat a flat list as a single text field
    n_fields = arr.shape[1]
    frame = pd.DataFrame({col: arr[:,col] for col in range(n_fields)})

    joined = frame[0].astype(str)
    if mark_fields: joined = f'{FLD} {1} ' + joined
    if sos_token is not None: joined = f"{sos_token} " + joined

    # Append the remaining fields, separated by a field marker or a single space.
    for col in range(1, n_fields):
        separator = f' {FLD} {col+1} ' if mark_fields else ' '
        joined += separator + frame[col].astype(str)
    return joined.values

In [None]:
class FastAiBertTokenizer(BaseTokenizer):
    """Adapter exposing a `BertTokenizer` through fast.ai's `BaseTokenizer` interface.

    An instance is passed where fast.ai expects a tokenizer factory (`tok_func`),
    so calling the instance simply hands back the instance itself.
    Extra keyword arguments are accepted and ignored, and the base-class
    constructor is not invoked.
    """
    def __init__(self, tokenizer: BertTokenizer, max_seq_len: int=128, **kwargs):
        self.max_seq_len = max_seq_len
        self._pretrained_tokenizer = tokenizer

    def __call__(self, *args, **kwargs):
        # Acts as its own factory: whatever arguments are supplied, return this object.
        return self

    def tokenizer(self, t:str) -> List[str]:
        """Tokenize `t`, truncate, and wrap the result in [CLS] ... [SEP].

        Two slots of `max_seq_len` are reserved for the special tokens, so at
        most `max_seq_len - 2` word pieces are kept.
        """
        piece_budget = self.max_seq_len - 2
        pieces = self._pretrained_tokenizer.tokenize(t)
        return ["[CLS]"] + pieces[:piece_budget] + ["[SEP]"]

In [None]:
from sklearn.model_selection import train_test_split

# Directory holding the train/test CSV files (presumably a Kaggle dataset mount — confirm).
DATA_ROOT = Path("..") / "input/arabicclass"
train = pd.read_csv(DATA_ROOT / "train-multi (2).csv")
test = pd.read_csv(DATA_ROOT / "test-multi (2).csv")

# NOTE(review): there is no hold-out split. `val` is the training frame itself, so any
# "validation" metric reported during training is measured on training data.
# Use train_test_split on `train` to obtain a real validation set.
val = train

In [None]:
if config.testing:
    # Smoke-test mode: shrink every split to its first rows (DataFrame.head() default).
    train, val, test = (frame.head() for frame in (train, val, test))

In [None]:
# fast.ai vocabulary built from BERT's tokens in their original order, so that
# fast.ai's token ids line up with the ids of the pretrained tokenizer.
fastai_bert_vocab = Vocab(list(bert_tok.vocab))

In [None]:

# fast.ai tokenizer that delegates to the BERT wrapper; both rule lists are empty so
# no fast.ai pre/post-processing rule is applied to the text or the tokens.
fastai_tokenizer = Tokenizer(
    tok_func=FastAiBertTokenizer(bert_tok, max_seq_len=config.max_seq_len),
    pre_rules=[],
    post_rules=[],
)

In [None]:
# Alternative label set (not used here):
#label_cols = ['DELIVERY', 'FEEDBACK', 'ORDER', 'PAYMENT', 'SHIPPING_ADDRESS']
# Names of the label columns in the data frames; passed as `label_cols` when the
# databunch is built (presumably one 0/1 indicator column per class — confirm against the CSV).
label_cols = ['art', 'crime', 'eco', 'politique', 'sport']

# Reference only (not executed): the equivalent call on the stock fast.ai `TextDataBunch`.
# Note that it names the text column "comment_text", unlike the live call further down.
# databunch = TextDataBunch.from_df(".", train, val, test,
#                   tokenizer=fastai_tokenizer,
#                   vocab=fastai_bert_vocab,
#                   include_bos=False,
#                   include_eos=False,
#                   text_cols="comment_text",
#                   label_cols=label_cols,
#                   bs=config.bs,
#                   collate_fn=partial(pad_collate, pad_first=False, pad_idx=0),
#              )

In [None]:
class BertTokenizeProcessor(TokenizeProcessor):
    """Tokenization step with fast.ai's own BOS/EOS insertion switched off."""
    def __init__(self, tokenizer):
        marker_flags = dict(include_bos=False, include_eos=False)
        super().__init__(tokenizer=tokenizer, **marker_flags)

class BertNumericalizeProcessor(NumericalizeProcessor):
    """Numericalization step that defaults to the BERT vocabulary.

    When no `vocab` keyword is supplied, a `Vocab` is built from the tokens of
    the module-level `bert_tok`. An explicit `vocab=` keyword is honoured rather
    than colliding with the default (which would raise
    `TypeError: got multiple values for keyword argument 'vocab'`).
    All other arguments are forwarded unchanged to `NumericalizeProcessor`.
    """
    def __init__(self, *args, vocab=None, **kwargs):
        if vocab is None:
            vocab = Vocab(list(bert_tok.vocab.keys()))
        super().__init__(*args, vocab=vocab, **kwargs)

def get_bert_processor(tokenizer:Tokenizer=None, vocab:Vocab=None):
    """
    Build the two-step preprocessing pipeline used for BERT inputs.

    The first step tokenizes without fast.ai's BOS/EOS markers (the tokenizer
    wrapper adds [CLS]/[SEP] itself); the second step numericalizes with the
    `vocab` given here, which is how token ids are kept aligned with BERT.

    Returns a list: [tokenize processor, numericalize processor].
    """
    tokenize_step = BertTokenizeProcessor(tokenizer=tokenizer)
    numericalize_step = NumericalizeProcessor(vocab=vocab)
    return [tokenize_step, numericalize_step]

In [None]:
class BertDataBunch(TextDataBunch):
    """`TextDataBunch` whose `from_df` preprocesses text with the BERT processors."""
    @classmethod
    def from_df(cls, path:PathOrStr, train_df:DataFrame, valid_df:DataFrame, test_df:Optional[DataFrame]=None,
                tokenizer:Tokenizer=None, vocab:Vocab=None, classes:Collection[str]=None, text_cols:IntsOrStrs=1,
                label_cols:IntsOrStrs=0, label_delim:str=None, **kwargs) -> DataBunch:
        """Create a `TextDataBunch` from DataFrames using the BERT tokenizer and vocab.

        Args:
            path: working directory attached to the item lists.
            train_df, valid_df: frames for the training and validation sets.
            test_df: optional frame for an unlabeled test set.
            tokenizer, vocab: forwarded to `get_bert_processor`.
            classes: class names; defaults to `label_cols` when several label
                columns are given.
            text_cols: column(s) holding the text.
            label_cols: column(s) holding the labels.
            label_delim: delimiter for labels stored in one column; forwarded to
                `label_from_df` only when it is not None.
            **kwargs: remaining options passed to `databunch(...)` (e.g. `bs`, `collate_fn`).
        """
        p_kwargs, kwargs = split_kwargs_by_func(kwargs, get_bert_processor)
        # use our custom processors while taking tokenizer and vocab as kwargs
        processor = get_bert_processor(tokenizer=tokenizer, vocab=vocab, **p_kwargs)
        if classes is None and is_listy(label_cols) and len(label_cols) > 1: classes = label_cols
        src = ItemLists(path, TextList.from_df(train_df, path, cols=text_cols, processor=processor),
                        TextList.from_df(valid_df, path, cols=text_cols, processor=processor))
        if cls==TextLMDataBunch: src = src.label_for_lm()
        # Pass the delimiter through only when the caller set one, so the default call is unchanged.
        elif label_delim is not None: src = src.label_from_df(cols=label_cols, classes=classes, label_delim=label_delim)
        else: src = src.label_from_df(cols=label_cols, classes=classes)
        # NOTE(review): the test list gets no explicit processor here; presumably fast.ai
        # reuses the validation set's processors for it — confirm.
        if test_df is not None: src.add_test(TextList.from_df(test_df, path, cols=text_cols))
        return src.databunch(**kwargs)

In [None]:
# Build the train/validation/test databunch with the BERT tokenizer and vocabulary.
# Intended to mirror the commented-out `TextDataBunch.from_df` call in the earlier cell.
# Batches are padded on the right (pad_first=False) with token index 0
# (assumes index 0 is BERT's [PAD] token — TODO confirm against bert_tok.vocab).
databunch = BertDataBunch.from_df(".", train, val, test,
                  tokenizer=fastai_tokenizer,
                  vocab=fastai_bert_vocab,
                  text_cols="texts",
                  label_cols=label_cols,
                  bs=config.bs,
                  collate_fn=partial(pad_collate, pad_first=False, pad_idx=0),
             )

# Model

In [None]:
from pytorch_pretrained_bert.modeling import BertConfig, BertForSequenceClassification
# Pretrained BERT with a classification head. The head size is derived from
# `label_cols` so it cannot drift out of step with the label columns.
bert_model = BertForSequenceClassification.from_pretrained(config.bert_model_name, num_labels=len(label_cols))

In [None]:
# Sigmoid + binary cross-entropy computed on raw logits, one independent term per label
# (assumes the targets are multi-hot float vectors — TODO confirm).
loss_func = nn.BCEWithLogitsLoss()

In [None]:
# Thresholded accuracy metric with a 0.3 decision threshold.
acc_03 = partial(accuracy_thresh, thresh=0.3)
# Legacy alias: the old name suggested a 0.2 threshold, but the value used is 0.3.
acc_02 = acc_03

from fastai.callbacks import *

# Tie data, model, loss and metric together.
learner = Learner(
    databunch, bert_model,
    loss_func=loss_func,
    metrics=acc_03,
)
# Optional mixed-precision training, controlled from the config cell.
if config.use_fp16: learner = learner.to_fp16()

In [None]:
from matplotlib import pyplot
%matplotlib inline

# Run fast.ai's learning-rate range test on the learner.
learner.lr_find()

In [None]:
# Plot what the learner's recorder has collected (here: the output of the preceding lr_find run).
learner.recorder.plot()

In [None]:
# One-cycle fine-tuning. Both the number of epochs and the peak learning rate are read
# from `config`, so the settings cell is the single place to tune a run.
learner.fit_one_cycle(config.epochs, max_lr=config.max_lr)



In [None]:
# Checkpoint the trained learner under the name 'arb'
# (presumably written below learner.path / learner.model_dir — confirm).
learner.save('arb')


In [None]:
# Show the learner's model summary.
learner.summary()


In [None]:
# Spot check: run the trained learner on a single raw Arabic text.
learner.predict('حرارة مرتفعة وأمطار منتظرة')

In [None]:
# Second spot check on another single Arabic text.
learner.predict("ميسي يحطم أول رقم من أرقام رونالدو في دوري أبطال أوروبا")

In [None]:
# Export the learner to 'transformerarab.pkl' for later inference; the trailing ';' hides the cell output.
learner.export(file = 'transformerarab.pkl');

In [None]:
# Reload the exported learner from the directory the learner exports into, rather than
# a hard-coded '/kaggle/working', so the cell also works outside Kaggle.
# The export is a pickle: only load files produced by a trusted run.
path = learner.path
export_learner = load_learner(path, file = 'transformerarab.pkl')

In [None]:
# Check that the reloaded learner can predict on a single raw Arabic text.
export_learner.predict("فرنسا تحدد سقف ارتفاع أسعار الغاز والكهرباء عند نسبة 15% مطلع 2023.")

In [None]:
# Display the session/package report again at the end, now covering everything imported during the run.
session_info.show()