## Repeat & final stage of the training pipeline: we train our classification models to produce pseudo-labels and, eventually, the final predictions.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.metrics import f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from nltk import word_tokenize

from sklearn.metrics import classification_report, f1_score
from scipy.special import expit as sigmoid

from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

import string
import re

import shutil


from lion_pytorch import Lion

from tqdm.auto import tqdm

# Do not truncate long text cells when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)
# Register tqdm's pandas integration (progress-bar variants of apply).
tqdm.pandas()

In [None]:
import torch
import torch.nn as nn
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, AutoTokenizer
import os
import random
import gc

from transformers import logging


# Disable the Weights & Biases integration for this session.
os.environ["WANDB_DISABLED"] = "true"
# Suppress the advisory warnings emitted by the transformers library.
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'

def set_seed(seed: int = 1942):
    '''Seed every random number generator used by the notebook.
    This is for REPRODUCIBILITY.

    Seeds NumPy's global generator, Python's ``random`` module and PyTorch
    (CPU and CUDA), switches cuDNN to deterministic mode with autotuning
    disabled, and exports ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed applied to every generator. Defaults to 1942, the
            value this notebook uses.

    Returns:
        A ``np.random.RandomState`` seeded with ``seed``; it is separate from
        NumPy's global generator.
    '''
    np.random.seed(seed)
    random_state = np.random.RandomState(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Trade cuDNN speed for run-to-run identical kernels.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # NOTE(review): changing PYTHONHASHSEED at runtime presumably only affects
    # child processes, not the already-running interpreter - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    return random_state


# Seed all RNGs once for the whole notebook and keep the seeded NumPy RandomState.
random_state = set_seed(1942)
# IPython magic: sets CUBLAS_WORKSPACE_CONFIG for this kernel.
# NOTE(review): presumably required by cuBLAS for the deterministic mode
# requested via full_determinism=True in the training cell - confirm.
%env CUBLAS_WORKSPACE_CONFIG=:4096:8

In [None]:
# Local checkpoint directory used for both the tokenizer and the classifier;
# its last path component names every output artefact of this notebook.
model_name = "stage_f/pretrain_mlm_electra-base-turkish-cased-discriminator"
tokenizer = AutoTokenizer.from_pretrained(model_name)

from transformers import DataCollatorWithPadding
# Examples are tokenized without padding; the collator pads each batch to its
# longest sequence.
data_collator = DataCollatorWithPadding(tokenizer, padding="longest")

In [None]:
# Load the pipe-delimited training set.
df = pd.read_csv('data/teknofest_train_final.csv', sep='|')

# Character count of every text; rows with two characters or fewer are dropped
# and the index is rebuilt as 0..n-1.
df['length'] = df['text'].map(len)
long_enough = df['length'] > 2
df = df.loc[long_enough].reset_index(drop=True)

# Rows whose target is OTHER are flagged as not offensive.
is_other = df['target'] == 'OTHER'
df.loc[is_other, 'is_offensive'] = 0

# Translation table that deletes every character in ``string.punctuation``.
# Built once here instead of on every call, since the function runs per row.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """
    Return ``text`` with every ASCII punctuation character removed.

    Only the characters in ``string.punctuation`` are stripped; any other
    character (including non-ASCII punctuation) is left untouched.

    Args:
        text: The input string.

    Returns:
        A new string without ASCII punctuation.
    """
    return text.translate(_PUNCTUATION_TABLE)


# Strip ASCII punctuation from every text, then display the resulting frame.
df['text'] = df['text'].map(remove_punctuation)
df

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the string targets as integer class ids in a new `label` column.
le = LabelEncoder()
df['label'] = le.fit_transform(df['target'])

# Class name -> integer id, following the order of the encoder's classes.
label_to_index = dict(zip(le.classes_, range(len(le.classes_))))

In [None]:
# Display the class-name -> id mapping.
label_to_index

In [None]:
# Inverse mapping (integer id -> class name), displayed for inspection.
index_to_label = dict(zip(label_to_index.values(), label_to_index.keys()))
index_to_label

In [None]:
# Eyeball ten randomly drawn rows.
df.sample(n=10)

In [None]:
# Assign every row to one of five stratified validation folds.
# -1 is a sentinel for "not assigned yet".
df['fold'] = -1

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_position = df.columns.get_loc('fold')
for fold, (train_index, val_index) in enumerate(skf.split(df['text'], df['label'])):
    # split() yields positional row indices, so write positionally; label-based
    # .loc would only be correct while the index is exactly 0..n-1.
    df.iloc[val_index, fold_position] = fold


In [None]:
class IntentDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Args:
        df: DataFrame with a ``text`` column (raw string) and a ``label``
            column (integer class id).
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., truncation=True)`` that returns
            a mapping of feature name to a list of ints. When ``None``, the
            module-level ``tokenizer`` is looked up each time an item is read.
        max_length: Maximum sequence length passed to the tokenizer
            (default 64).
    """

    def __init__(self, df, tokenizer=None, max_length=64):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the tokenized features and label of the row at position ``idx``.

        Returns:
            A dict with one int64 tensor per tokenizer output plus a
            ``labels`` tensor. No padding is applied here.
        """
        row = self.df.iloc[idx]
        # In this scope the bare name `tokenizer` is the module-level one.
        tokenize = self.tokenizer if self.tokenizer is not None else tokenizer
        encoding = tokenize(row["text"], max_length=self.max_length, truncation=True)
        item = {key: torch.tensor(val, dtype=torch.int64) for key, val in encoding.items()}
        item["labels"] = torch.tensor(row["label"], dtype=torch.long)
        return item



In [None]:
# Smoke check: wrap the full frame and display the encoded item at position 1.
a = IntentDataset(df)
a[1]

In [None]:
# Decode the token ids of item 3 back to text to eyeball the tokenization.
tokenizer.decode(a[3]['input_ids'])

In [None]:
def model_init():
    """Load a fresh single-label classification model from ``model_name``.

    The number of labels is the number of distinct values in ``df.label``,
    and the id/label mappings come from ``index_to_label`` and
    ``label_to_index``. ``ignore_mismatched_sizes=True`` lets the checkpoint
    load even when some of its weight shapes differ from the new model's.

    Returns:
        The model, moved to the GPU when one is available and to the CPU
        otherwise.
    """
    # Fall back to CPU instead of crashing on `.to('cuda')` on GPU-less hosts.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        problem_type="single_label_classification",
        id2label=index_to_label,
        label2id=label_to_index,
        num_labels=df.label.nunique(),
        output_hidden_states=False,
        ignore_mismatched_sizes=True,
    )
    return model.to(device)



In [None]:
def compute_metrics(pred):
    """Compute the macro-averaged F1 score for an evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (per-class scores, reduced with
            argmax over axis 1) and ``label_ids`` (true class ids).

    Returns:
        A dict with the single key ``"macro f1"``.
    """
    predicted_classes = np.argmax(pred.predictions, axis=1)
    macro_f1 = f1_score(
        pred.label_ids,
        predicted_classes,
        average='macro',
        zero_division=0,
    )
    return {"macro f1": macro_f1}

In [None]:
class CFG:
    """Training hyperparameters shared by every cross-validation fold."""

    # Optimisation settings.
    LR = 3e-5            # learning rate
    WEIGHT_DECAY = 0.0   # weight decay
    WARM_UP = 0.0        # warm-up, as a fraction of the total training steps

    # Run length and batching.
    N_EPOCH = 3          # training epochs per fold
    BS = 8               # per-device train batch size (evaluation uses BS * 2)

In [None]:
# Cross-validated training: for every fold, fine-tune a fresh model on the
# other folds, predict the held-out fold, and save the model and tokenizer.
# Outputs: an out-of-fold (OOF) prediction CSV and the matching logits array.

# Last path component of the checkpoint; names every artefact written below.
short_name = model_name.split('/')[-1]
pred_col = f"pred_{short_name}"
out_dir = f"multiclass_{short_name}"
os.makedirs(out_dir, exist_ok=True)

# Each row is predicted exactly once, by the model that did not train on it.
oof = df.copy()
oof.loc[:, pred_col] = 0
logits = np.zeros(shape=(len(oof), oof.label.nunique()))

for fold in df.fold.unique():
    val_mask = df.fold == fold
    train_df = df[~val_mask]
    valid_df = df[val_mask]

    model = model_init()
    model.resize_token_embeddings(len(tokenizer))

    # Evaluate/log roughly four times per epoch. Clamp to 1 so a very small
    # training split cannot produce a zero or negative step interval.
    num_training_steps = (len(train_df) * CFG.N_EPOCH) // (CFG.BS * 1)
    step_size = max(1, int(np.ceil((num_training_steps / CFG.N_EPOCH) / 4)) - 1)

    # Scratch directory for the Trainer; removed once the fold is finished.
    run_dir = f"turkish_profanity_{short_name}_fold{fold}"

    training_args = TrainingArguments(
        run_dir,
        fp16=False,
        evaluation_strategy="steps",
        save_strategy="no",
        learning_rate=CFG.LR,
        per_device_train_batch_size=CFG.BS,
        per_device_eval_batch_size=CFG.BS * 2,
        num_train_epochs=CFG.N_EPOCH,
        # Route the CFG knobs through the Trainer's own optimizer and linear
        # scheduler. Previously a hand-built AdamW/scheduler pair consumed
        # them but was never handed to the Trainer, so they had no effect.
        weight_decay=CFG.WEIGHT_DECAY,
        warmup_ratio=CFG.WARM_UP,
        load_best_model_at_end=False,
        metric_for_best_model="macro f1",
        greater_is_better=True,
        eval_steps=step_size,
        save_steps=step_size,
        logging_steps=step_size,
        seed=1942,
        data_seed=1942,
        dataloader_num_workers=0,
        save_total_limit=2,
        group_by_length=True,
        full_determinism=True,
        label_smoothing_factor=0.0,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=IntentDataset(train_df),
        eval_dataset=IntentDataset(valid_df),
        compute_metrics=compute_metrics,
    )

    trainer.train()

    preds = trainer.predict(IntentDataset(valid_df))

    oof.loc[val_mask, pred_col] = np.argmax(preds.predictions, axis=1).astype(int)
    # Boolean mask = positional selection in row order, which matches the
    # order of `valid_df` regardless of the index labels.
    logits[val_mask.to_numpy()] += preds.predictions

    fold_dir = f"{out_dir}/fold{fold}"
    tokenizer.save_pretrained(fold_dir)
    trainer.save_model(fold_dir)

    # The scratch directory may never have been created (nothing is
    # checkpointed), so a missing directory must not abort the remaining folds.
    shutil.rmtree(run_dir, ignore_errors=True)

    # Drop the model too: otherwise it stays referenced while the next fold's
    # model is being loaded. Trained weights are already saved in `fold_dir`.
    del trainer, model
    gc.collect()
    torch.cuda.empty_cache()

oof.to_csv(f"{out_dir}/multiclass_{short_name}.csv", index=False)

np.save(f"{out_dir}/multiclass_{short_name}.npy", logits)

In [None]:
# Translate the predicted class ids back to class names.
oof['targ_pred'] = oof[f"pred_{model_name.split('/')[-1]}"].map(index_to_label)

In [None]:
# Report per-class metrics of the out-of-fold predictions against the true labels.
print(classification_report(oof.label, oof[f"pred_{model_name.split('/')[-1]}"], target_names=label_to_index.keys(), zero_division=0))