## This is the initial Masked Language Modelling (MLM) stage of the domain-adaptation part. We train the model with a self-supervised objective on an unlabeled offensive-text corpus.

In [None]:
import os
import json
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score

import torch
import torch.nn as nn
from lion_pytorch import Lion

from datasets import load_dataset, DatasetDict, Dataset
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoConfig, AutoModel
from transformers import DataCollatorForLanguageModeling, AutoModelForMaskedLM, Trainer, DataCollatorForWholeWordMask
from transformers import TrainingArguments
from transformers import AdamW
from transformers import get_cosine_schedule_with_warmup

import string
import regex as re

import shutil

# Register tqdm's pandas integration.
tqdm.pandas()
# Environment flags; by their names they are meant to turn off Weights & Biases
# reporting and silence advisory warnings from transformers.
os.environ["WANDB_DISABLED"] = "true"
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
# Do not truncate long text cells when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

# IPython magic: sets the environment variable for this kernel session.
%env TOKENIZERS_PARALLELISM=true
# Use the GPU when one is visible to PyTorch, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

In [None]:
class CFG:
    """Namespace of run-wide settings read by the cells below."""

    # Output directory name (not referenced in the visible cells).
    OUTPUT = "output"
    # Random seed value for the run.
    SEED = 42
    # Hugging Face Hub identifier of the checkpoint to adapt.
    MODEL_NAME = "microsoft/mdeberta-v3-base"
    # Placeholder; replaced with the loaded tokenizer in a later cell.
    TOKENIZER = None
    
    # Number of training epochs passed to TrainingArguments.
    N_EPOCH = 8
    # Per-device training batch size; also reused as the batch size for dataset.map.
    BS = 16
    # Fraction of the total training steps used for learning-rate warmup.
    WARM_UP = 0.1
    # Learning rate handed to the optimizer.
    LR = 3e-5
    # Weight decay applied to the parameter group that is not bias/LayerNorm.
    WEIGHT_DECAY = 0.0

    
# Snapshot of CFG's own attributes, skipping names that start with a double
# underscore (e.g. __module__, __dict__). Displayed as the cell output.
cfg_dic = {
    name: value
    for name, value in vars(CFG).items()
    if not name.startswith("__")
}
cfg_dic

In [None]:
# Load the tokenizer that matches the chosen checkpoint and keep it on CFG so
# every later cell reaches it through one place; no module-level TOKENIZER
# name is left behind.
CFG.TOKENIZER = AutoTokenizer.from_pretrained(CFG.MODEL_NAME)

In [None]:
def set_seed(seed: int = 42):
    """Seed every random number generator used in the notebook.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU and
    all CUDA devices), and switches cuDNN to deterministic, non-benchmarking
    mode.

    Args:
        seed: Integer seed. Defaults to 42.

    Returns:
        A fresh ``numpy.random.RandomState`` seeded with ``seed``.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device rather than only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # NOTE(review): setting PYTHONHASHSEED here cannot change hashing in the
    # already-running interpreter; presumably it only matters for child
    # processes started afterwards - confirm this is the intent.
    os.environ['PYTHONHASHSEED'] = str(seed)
    return np.random.RandomState(seed)


random_state = set_seed(42)
%env CUBLAS_WORKSPACE_CONFIG=:4096:8

In [None]:
def read_data(path='turkish_toxic_language.csv'):
    """Load the unlabeled corpus from a local CSV file.

    Args:
        path: Location of the CSV file. Defaults to the file name the
            notebook has always used, so ``read_data()`` behaves as before.

    Returns:
        A ``pandas.DataFrame`` with the file's contents, unmodified.
    """
    return pd.read_csv(path)

df = read_data()

# Inspection of rows whose text is missing; the result is not the last
# expression of the cell, so it is discarded.
df[df['text'].isna()]
# Drop every row that has a missing value in any column, then renumber.
# NOTE(review): dropna() also removes rows where only a non-text column is
# empty - confirm that is intended.
df = df.dropna().reset_index(drop=True)

# Persist the cleaned corpus; it is re-read through load_dataset later on.
df.to_csv('mlm_df_v3.csv', index=False)

In [None]:
from transformers import get_linear_schedule_with_warmup


def tokenize_function(examples):
    """Tokenize the ``text`` column of a batch with ``CFG.TOKENIZER``.

    When the tokenizer is a fast tokenizer, a ``word_ids`` entry is added that
    holds, for each row of the batch, the value of ``word_ids(row)`` from the
    tokenizer output.

    Args:
        examples: Batch mapping that contains a ``"text"`` entry.

    Returns:
        The tokenizer output, with ``word_ids`` added for fast tokenizers.
    """
    tokenizer = CFG.TOKENIZER
    encoded = tokenizer(examples["text"])
    if not tokenizer.is_fast:
        return encoded

    row_count = len(encoded["input_ids"])
    encoded["word_ids"] = [encoded.word_ids(row) for row in range(row_count)]
    return encoded


# Load the cleaned corpus CSV as a single "train" split.
datasets = load_dataset('csv', data_files={'train': 'mlm_df_v3.csv'})
# Tokenize in batches and drop the raw CSV columns so that only the
# tokenizer outputs remain.
tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    batch_size=CFG.BS,
    num_proc=1,
    remove_columns=["text", "target", "source"],
)

In [None]:
# Display the tokenized dataset summary as the cell output.
tokenized_datasets

In [None]:
# Token count of every training example, then its 90th percentile
# (shown as the cell output).
lens = [len(token_ids) for token_ids in tokenized_datasets["train"]["input_ids"]]
np.quantile(lens, 0.90)

In [None]:
# Number of tokens per training chunk; read as a global by group_texts.
chunk_size = 64

In [None]:
def group_texts(examples):
    """Concatenate a batch of tokenized rows and re-split it into fixed chunks.

    Every column of the batch is flattened into one long list, the tail that
    does not fill a whole chunk is discarded, and the remainder is cut into
    consecutive pieces of ``chunk_size`` items (``chunk_size`` is read from
    module scope). A ``labels`` column is added as a copy of ``input_ids``.

    Args:
        examples: Batch mapping of column name to a list of per-row lists.
            Must contain ``"input_ids"``.

    Returns:
        A dict with the same columns plus ``"labels"``, each a list of chunks.
    """
    from itertools import chain

    # Flatten each column in linear time; sum(list_of_lists, []) copies the
    # accumulated list on every addition and is quadratic in the batch size.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # Length of the flattened first column.
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the last partial chunk so that every chunk has exactly chunk_size items.
    total_length = (total_length // chunk_size) * chunk_size
    chunk_starts = range(0, total_length, chunk_size)
    result = {
        k: [flat[start : start + chunk_size] for start in chunk_starts]
        for k, flat in concatenated_examples.items()
    }
    # Labels start out identical to the inputs.
    result["labels"] = result["input_ids"].copy()
    return result

# Re-chunk the tokenized corpus into fixed-length training samples.
# NOTE(review): group_texts discards the incomplete tail of every batch it
# receives, so with batches of only CFG.BS rows a tail is dropped once per
# CFG.BS examples - consider a larger map batch size if that loss matters.
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=CFG.BS,
)
lm_datasets

In [None]:
# Decode one grouped training chunk back to text as a visual sanity check.
CFG.TOKENIZER.decode(lm_datasets["train"][6]["input_ids"])

In [None]:
# Read the pipe-separated CSV that is used below as the evaluation corpus.
val_df = pd.read_csv('teknofest_train_final.csv', sep='|')
# Character length of each text.
val_df['length'] = val_df['text'].apply(len)
# Keep only rows whose text is longer than two characters, then renumber.
val_df = val_df.loc[~(val_df['length'] <= 2)].reset_index(drop=True)

# Translation table that deletes every character in string.punctuation.
# Built once instead of on every call, since the function runs once per row.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text: str) -> str:
    """Return ``text`` with all ASCII punctuation characters removed.

    Only the characters listed in ``string.punctuation`` are stripped; any
    other character (letters, digits, whitespace, non-ASCII symbols) is kept.

    Args:
        text: Input string.

    Returns:
        A new string without ASCII punctuation.
    """
    return text.translate(_PUNCTUATION_TABLE)


# Strip ASCII punctuation from the evaluation texts.
# NOTE(review): the training corpus is not passed through remove_punctuation
# in the visible cells - confirm the mismatch is intentional.
val_df['text'] = val_df['text'].apply(remove_punctuation)

# Wrap the text column as a one-split DatasetDict named "valid".
dataset_ = Dataset.from_pandas(val_df[['text']])
dataset = DatasetDict({'valid': dataset_})

# Same tokenize-then-chunk pipeline as for the training corpus.
val_dataset = dataset.map(
    tokenize_function,
    batched=True,
    batch_size=CFG.BS,
    num_proc=1,
    remove_columns=["text"],
)

val_lm_datasets = val_dataset.map(
    group_texts,
    batched=True,
    batch_size=CFG.BS,
)
val_lm_datasets

In [None]:
# Decode one grouped validation chunk back to text as a visual sanity check.
CFG.TOKENIZER.decode(val_lm_datasets["valid"][112]["input_ids"])

In [None]:
from transformers import get_linear_schedule_with_warmup


# Model: start from the hub checkpoint with a masked-LM head, requesting
# hidden states in the outputs and setting attention dropout to 0.1.
config = AutoConfig.from_pretrained(CFG.MODEL_NAME, output_hidden_states=True)
config.attention_probs_dropout_prob = 0.1
model = AutoModelForMaskedLM.from_pretrained(CFG.MODEL_NAME, config=config)

# Collator that applies masking with probability 0.15 when batches are built.
# NOTE(review): DataCollatorForWholeWordMask presumably relies on BERT-style
# "##" sub-word markers, which this checkpoint's tokenizer may not produce -
# confirm that whole-word masking actually takes effect.
data_collator = DataCollatorForWholeWordMask(
    tokenizer=CFG.TOKENIZER,
    mlm_probability=0.15,
)

# Two parameter groups: biases and LayerNorm weights are never decayed, all
# other parameters use CFG.WEIGHT_DECAY.
param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.weight"]
optimizer_parameters = [
    {
        "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        "weight_decay": CFG.WEIGHT_DECAY,
    },
    {
        "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(optimizer_parameters, lr=CFG.LR)

# The last, partial batch of each epoch still costs one optimizer step, so
# steps are counted per epoch with a ceiling. A single floor over the whole
# run undercounts and lets the schedule hit zero before training ends.
# NOTE(review): assumes one device and gradient_accumulation_steps=1, as
# configured in TrainingArguments below - adjust the divisor otherwise.
steps_per_epoch = int(np.ceil(lm_datasets['train'].num_rows / CFG.BS))
num_training_steps = steps_per_epoch * CFG.N_EPOCH
# Log / evaluate / checkpoint roughly four times per epoch.
step_size = int(np.ceil(steps_per_epoch / 4))


scheduler = get_linear_schedule_with_warmup(
    optimizer,
    # The scheduler expects an integer number of steps.
    num_warmup_steps=int(CFG.WARM_UP * num_training_steps),
    num_training_steps=num_training_steps
)

# Disabled options kept from the original cell for reference:
# learning_rate=1e-5, weight_decay=0.1, metric_for_best_model="macro f1",
# run_name='output-mlm', lr_scheduler_type='cosine', warmup_ratio=0.1,
# gradient_checkpointing=True, deepspeed=ds_config_dict.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="output-mlm",
    logging_dir='./logs',
    # Batching and run length.
    per_device_train_batch_size=CFG.BS,
    per_device_eval_batch_size=CFG.BS*2,
    gradient_accumulation_steps=1,
    num_train_epochs=CFG.N_EPOCH,
    group_by_length=True,
    fp16=False,
    # Log, evaluate and checkpoint on the same step interval.
    evaluation_strategy="steps",
    save_strategy="steps",
    logging_steps=step_size,
    eval_steps=step_size,
    save_steps=step_size,
    # Keep at most two checkpoints and reload the best one when training ends.
    load_best_model_at_end=True,
    save_total_limit=2,
)

# Assemble the Trainer with the hand-built optimizer/scheduler pair; training
# runs on the grouped corpus and evaluation on the grouped validation chunks.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=lm_datasets["train"],
    eval_dataset=val_lm_datasets['valid'],
    optimizers=(optimizer, scheduler),
)

# Run the masked-language-model training.
trainer.train()

In [None]:
# Save the tokenizer and the trained model side by side in one directory.
CFG.TOKENIZER.save_pretrained('mlm_mdeberta-v3-base')
trainer.save_model('mlm_mdeberta-v3-base')

# Remove the Trainer's checkpoint directory now that the model is exported.
shutil.rmtree('output-mlm')