# Seed

In [1]:
import os
# cuBLAS workspace setting that PyTorch's deterministic mode (enabled further
# down via torch.use_deterministic_algorithms(True)) needs on CUDA.
# It is set here, before torch is imported.
os.environ['CUBLAS_WORKSPACE_CONFIG']=':16:8'

In [2]:
# Single seed shared by every random number generator used in this notebook.
seed = 66

import numpy as np
# Seeds NumPy's global (legacy) generator. Constructing a separate
# np.random.RandomState(seed) and discarding it seeds nothing, so it is omitted.
np.random.seed(seed)

import random
random.seed(seed)

import torch
torch.manual_seed(seed)
# Make PyTorch raise instead of silently running a non-deterministic kernel.
torch.use_deterministic_algorithms(True)

# Dataset

In [3]:
# Class id -> label name. The names are in alphabetical order;
# presumably this matches the ids assigned by class_encode_column('label') — verify.
INT_TO_STR = dict(enumerate((
    'descriptive',
    'direct',
    'non-offensive',
    'offensive',
    'reporting',
)))

# Label name -> class id, derived from INT_TO_STR so the two tables cannot drift apart.
STR_TO_INT = {label_name: class_id for class_id, label_name in INT_TO_STR.items()}

In [4]:
from datasets import load_dataset
# Each CSV is loaded as a DatasetDict with a single split named 'data'.
# `ds` is loaded again later in the notebook, right before preprocessing.
ds = load_dataset('csv', data_files={'data': '../data/train_data.csv'})
# Second copy of the training data; it carries a 'text_back' column
# (presumably back-translated text — TODO confirm).
ds_back = load_dataset('csv', data_files={'data': '../data/train_data_back.csv'})


  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset csv (/root/.cache/huggingface/datasets/csv/default-45bae733d7e469ec/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
100%|██████████| 1/1 [00:00<00:00, 594.01it/s]
Found cached dataset csv (/root/.cache/huggingface/datasets/csv/default-e21444dc684dec16/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
100%|██████████| 1/1 [00:00<00:00, 759.01it/s]


In [5]:
from transformers import AutoTokenizer
# Pretrained checkpoint id; used for the tokenizer here and for the classifier in train().
MODEL_CKPT = "dumitrescustefan/bert-base-romanian-cased-v1"
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained(MODEL_CKPT)

## Normalize

In [6]:
import re
import emoji

def normalize(batch):
    """
    Clean the text of a single example; meant to run before tokenization.

    Parameters
    ----------
    batch : mapping
        One example with a 'text' entry. The value is coerced with ``str``.

    Returns
    -------
    dict
        ``{'text': cleaned_text}``.

    Steps, applied in this order:
    1. ş -> ș, Ş -> Ș, ţ -> ț, Ţ -> Ț.
    2. @mention -> USER, #hashtag -> HASHTAG, http... and www... -> HTTPURL.
    3. Emoji -> ' :emoji_long_name: ' (via emoji.demojize).
    4. In every ':...:' run without whitespace, drop the characters '_', ':' and '-'.
    5. Collapse every run of whitespace into a single space.
    """
    text = str(batch['text'])

    # Step 1: plain character substitutions.
    for old_char, new_char in (('ş', 'ș'), ('Ş', 'Ș'), ('ţ', 'ț'), ('Ţ', 'Ț')):
        text = text.replace(old_char, new_char)

    # Step 2: placeholders; the order matters (mentions and hashtags before URLs).
    placeholder_rules = (
        (r'@\S+', 'USER'),
        (r'#\S+', 'HASHTAG'),
        (r'http\S+', 'HTTPURL'),
        (r'www\S+', 'HTTPURL'),
    )
    for pattern, placeholder in placeholder_rules:
        text = re.sub(pattern, placeholder, text)

    # Step 3: the delimiters add a space on both sides of each emoji name.
    text = emoji.demojize(text, delimiters=(' :', ': '))

    # Step 4: note that this matches any ':...:' run, not only emoji names.
    def _strip_separators(match):
        return match.group(0).replace('_', '').replace(':', '').replace('-', '')

    text = re.sub(r':\S+:', _strip_separators, text)

    # Step 5.
    text = re.sub(r'\s+', ' ', text)

    return {'text': text}

## Tokenize

In [7]:
def tokenize(batch):
    """
    Tokenize ``batch['text']`` with the notebook-level `tokenizer`.

    This function is defined a second time further down in the notebook. Both
    definitions now use the same arguments (padding, truncation, explicit
    ``max_length=512``), so the result does not depend on which cell ran last.

    Returns whatever the tokenizer call returns.
    """
    return tokenizer(batch['text'], padding=True, truncation=True, max_length=512)

## Torch format

# Model

## Metrics

In [9]:
# Installs the `evaluate` package into the kernel's environment; the version is not pinned.
%pip install -q evaluate

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 23.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding

from transformers import Trainer
import torch.nn as nn

from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, balanced_accuracy_score
from transformers import EvalPrediction
import torch
import evaluate
import numpy as np
from datasets import load_metric, concatenate_datasets

def tokenize(batch):
    """
    Run the notebook-level `tokenizer` over ``batch['text']``.

    Padding and truncation are enabled and the maximum length is 512.
    Returns whatever the tokenizer call returns.
    """
    encode_options = dict(padding=True, truncation=True, max_length=512)
    return tokenizer(batch['text'], **encode_options)

# Collator handed to the trainers below; it pads the examples of each batch with `tokenizer`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def compute_metrics(pred):
    """
    Score one evaluation pass for the Trainer.

    Parameters
    ----------
    pred : object exposing ``label_ids`` (gold class ids) and ``predictions``
        (per-class scores; the class with the highest score along the last
        axis is taken as the predicted class).

    Returns
    -------
    dict
        'accuracy', 'f1' (weighted average over classes) and 'balanced_accuracy'.
    """
    gold = pred.label_ids
    predicted = pred.predictions.argmax(-1)  # highest-scoring class per example

    return {
        'accuracy': accuracy_score(gold, predicted),
        'f1': f1_score(gold, predicted, average='weighted'),
        'balanced_accuracy': balanced_accuracy_score(gold, predicted),
    }

class CustomTrainer(Trainer):
    """Trainer whose training loss is a class-weighted cross-entropy."""

    # Loss weight per class id (five classes, ids 0..4).
    # The values look like inverse class frequencies — TODO confirm.
    # An earlier weight set was [26.315, 18.181, 1.265, 9.090, 200.0].
    CLASS_WEIGHTS = (15.873, 11.111, 1.538, 5.555, 111.111)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        Compute the weighted cross-entropy for one batch.

        Parameters
        ----------
        model : the model to run the forward pass with.
        inputs : mapping of model inputs; must contain "labels".
        return_outputs : when True, return ``(loss, outputs)`` instead of ``loss``.
        num_items_in_batch : accepted because newer Trainer versions pass it
            as a keyword; it is not used here.
        """
        labels = inputs.get("labels")
        # Forward pass; `inputs` still contains the labels.
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Build the weights on the device of the logits rather than on a
        # notebook-level global, so the loss works wherever the model was placed.
        class_weights = torch.tensor(self.CLASS_WEIGHTS, device=logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss


def train(index: int, dataset_tokenized):
    """
    Fine-tune one classifier from MODEL_CKPT and leave its checkpoints on disk.

    Parameters
    ----------
    index : int
        Member number; only used in the output directory name
        ``nitro-robertlarge-nlp-v1.9.{index}``.
    dataset_tokenized : mapping
        Must provide a 'train' entry (training set) and a 'test' entry
        (evaluation set), both already tokenized.

    Returns
    -------
    None
        The function is called for its side effect: training runs for 4 epochs,
        evaluating and saving once per epoch, and the best checkpoint is
        reloaded at the end.
    """
    # Credentials for private checkpoints must come from the environment
    # (e.g. a prior Hub login), never from a literal token in the notebook.
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_CKPT,
        num_labels=len(INT_TO_STR),
        id2label=INT_TO_STR,
        label2id=STR_TO_INT,
        classifier_dropout=0.1,
    )

    training_args = TrainingArguments(
        output_dir=f"nitro-robertlarge-nlp-v1.9.{index}",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=4,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    )

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset_tokenized["train"],
        eval_dataset=dataset_tokenized["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # No explicit `del trainer` is needed: the local reference goes away on return.
    trainer.train()

In [10]:
from datasets import load_dataset

# Both CSVs are loaded inside this cell so that it can be re-run on its own.
# If only `ds` were reloaded, a second run would fail because `ds_back` would
# already have been renamed and reduced to its 'data' split.
ds = (
    load_dataset('csv', data_files={'data': '../data/train_data.csv'})
    .rename_column('Final Labels', 'label')
    .rename_column('Text', 'text')
    .remove_columns(['Id'])
)['data']

# For this copy the 'text_back' column becomes the model input and the
# original 'Text' column is dropped.
ds_back = (
    load_dataset('csv', data_files={'data': '../data/train_data_back.csv'})
    .rename_column('text_back', 'text')
    .rename_column('Final Labels', 'label')
    .remove_columns(['Text', 'Id'])
)['data']

# Stack both sets, then encode the labels once on the combined data so both
# parts share a single label encoding.
ds_concat = concatenate_datasets([ds, ds_back])
ds_concat = ds_concat.class_encode_column('label')


Found cached dataset csv (/root/.cache/huggingface/datasets/csv/default-45bae733d7e469ec/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
100%|██████████| 1/1 [00:00<00:00, 680.12it/s]
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-45bae733d7e469ec/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-bd226ea07604be46.arrow


In [11]:
# Show the columns and the row count of the combined dataset.
ds_concat

Dataset({
    features: ['text', 'label'],
    num_rows: 47178
})

In [12]:
def ensamble_train(index = 1, index_start = 0, index_end = 5):
    """
    Train the ensemble members numbered ``index_start`` to ``min(index, index_end) - 1``.

    Member ``i`` gets its own stratified 80/20 split of `ds_concat`, seeded with
    ``seed + i``. The split is normalized, tokenized, switched to torch format
    and handed to ``train(i, ...)``.

    Parameters
    ----------
    index : int
        Size of the ensemble; members are numbered ``0 .. index - 1``.
    index_start : int
        First member to train. Lower-numbered members are skipped, which
        needs no work because every split seed is derived from the member number.
    index_end : int
        One past the last member to train.
    """
    first_member = max(index_start, 0)
    last_member = min(index, index_end)

    for i in range(first_member, last_member):
        member_seed = seed + i

        # Report the seed actually used for this member's split.
        print("SEED: ", member_seed)
        ds_split = ds_concat.train_test_split(test_size=0.2, stratify_by_column='label', seed=member_seed)
        ds_split = ds_split.map(normalize, batched=False)
        # batch_size=None tokenizes each split as a single batch.
        ds_tok_split = ds_split.map(tokenize, batched=True, batch_size=None)
        ds_tok_split.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

        train(i, ds_tok_split)


# Train a one-member ensemble: only member 0, whose split seed equals `seed`.
ensamble_train(1, 0, 1)

Loading cached split indices for dataset at /root/.cache/huggingface/datasets/csv/default-45bae733d7e469ec/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-0aed00f0af3897d4.arrow and /root/.cache/huggingface/datasets/csv/default-45bae733d7e469ec/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-0ee077293d016e55.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-45bae733d7e469ec/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-58c5e4f1cd3f9786.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-45bae733d7e469ec/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-754146f276ef539f.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-45bae733d7e469ec/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-faa345e614eefe24.arrow
Loading cached processed data

SEED:  66


Some weights of the model checkpoint at dumitrescustefan/bert-base-romanian-cased-v1 were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClass

Epoch,Training Loss,Validation Loss


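Class distribution of the training data:

non-offensive    29296 (~79%)
offensive         4086 (~11%)
direct            2048 (~5.5%)
descriptive       1419 (~3.8%)
reporting          208 (~0.5%)
Total: 37,057

Loss weights are inverse class frequencies: with $k_1$ the weight of "non-offensive" and $k_2$ the weight of "reporting", $0.79\,k_1 = 0.005\,k_2 = 1$.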

0: 'descriptive',
1: 'direct',
2: 'non-offensive',
3: 'offensive',
4: 'reporting'

# Test set

In [19]:
def load_model(checkpoint_path: str, ds_tok):
    """
    Load a fine-tuned checkpoint and wrap it in a CustomTrainer.

    Parameters
    ----------
    checkpoint_path : str
        Directory of the saved checkpoint; also used as the trainer's output_dir.
    ds_tok :
        Tokenized dataset, registered as both the train and the eval dataset
        of the returned trainer.

    Returns
    -------
    CustomTrainer
        Note that the trainer is returned, not the bare model.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(
        checkpoint_path,
        num_labels=5,
        id2label=INT_TO_STR,
        label2id=STR_TO_INT,
        classifier_dropout=0.1,
    )

    trainer_settings = dict(
        output_dir=checkpoint_path,
        learning_rate=2e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=16,
        num_train_epochs=4,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    )

    return CustomTrainer(
        model=classifier,
        args=TrainingArguments(**trainer_settings),
        train_dataset=ds_tok,
        eval_dataset=ds_tok,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

In [27]:
# Load the test CSV (single split named 'data') and lower-case its column
# names so 'text' matches what normalize() and tokenize() read.
ds_eval = load_dataset('csv', data_files={'data': './dataset/test_data.csv'})
ds_eval = ds_eval.rename_column('Text', 'text')
ds_eval = ds_eval.rename_column('Id', 'id')
ds_eval_data = ds_eval['data']
# Same preprocessing as for training: normalize per example, then tokenize
# the whole set as one batch (batch_size=None).
ds_eval_data = ds_eval_data.map(lambda batch: normalize(batch), batched=False)
ds_eval_tok = ds_eval_data.map(lambda batch: tokenize(batch), batched=True, batch_size=None)
# Show the first normalized example as a spot check.
ds_eval_data[0]

Using custom data configuration default-302d635d8fbd1d39
Found cached dataset csv (C:/Users/andre/.cache/huggingface/datasets/csv/default-302d635d8fbd1d39/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at C:\Users\andre\.cache\huggingface\datasets\csv\default-302d635d8fbd1d39\0.0.0\6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-1e84eeeb2d4ac9cd.arrow
Loading cached processed dataset at C:\Users\andre\.cache\huggingface\datasets\csv\default-302d635d8fbd1d39\0.0.0\6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-b9baf9b78a1a1263.arrow


{'id': 0,
 'text': 'În miezul ei se găsea un obiect ciudat , roz , răsucit , mărit de suprafața rotundă , care semăna cu un trandafir sau cu o anemonă de mare.'}

In [28]:
# Checkpoint directories of the ensemble members to evaluate.
ensamble_list = [
    './nitro-robertlarge-nlp-v1.9.0/checkpoint-3540'
]

# One prediction output per checkpoint, in the order of `ensamble_list`.
ensamble_predictions = []
# NOTE: the loop variable `model` holds a checkpoint path (a string), not a model object.
for model in ensamble_list:
    trainer = load_model(model, ds_eval_tok)

    predictions = trainer.predict(ds_eval_tok)
    ensamble_predictions.append(predictions)

    # Drop the reference to the trainer before the next checkpoint is loaded.
    del trainer

ensamble_predictions

loading configuration file ./nitro-robertlarge-nlp-v1.9.0/checkpoint-3540\config.json
Model config BertConfig {
  "_name_or_path": "./nitro-robertlarge-nlp-v1.9.0/checkpoint-3540",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "descriptive",
    "1": "direct",
    "2": "non-offensive",
    "3": "offensive",
    "4": "reporting"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "descriptive": 0,
    "direct": 1,
    "non-offensive": 2,
    "offensive": 3,
    "reporting": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_ver

  0%|          | 0/196 [00:00<?, ?it/s]

[PredictionOutput(predictions=array([[-0.50763935, -1.3257251 ,  3.6892135 , -0.23153174, -2.728777  ],
        [-2.887187  ,  0.7367201 ,  0.4814147 ,  3.2221    , -2.21238   ],
        [-0.68719965,  0.08260232,  3.7809756 ,  1.0506375 , -4.01028   ],
        ...,
        [-0.48108423, -2.189384  ,  0.95113224,  4.044851  , -3.3308613 ],
        [ 4.6631646 , -0.5120723 , -0.30559567, -0.8618112 , -2.926247  ],
        [-3.2720423 ,  0.15021496,  2.0113256 ,  2.7746964 , -2.5090785 ]],
       dtype=float32), label_ids=None, metrics={'test_runtime': 7.8423, 'test_samples_per_second': 399.116, 'test_steps_per_second': 24.993})]

In [22]:
import numpy as np
def softmax(x, axis=-1):
    """
    Numerically stable softmax along one axis.

    Parameters
    ----------
    x : array-like
        Scores (logits).
    axis : int, default -1
        Axis along which the result sums to 1. For a 2-D batch of logits the
        default normalises every row on its own; for 1-D input this equals
        normalising over the whole array.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as `x`.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalise; keep the (empty) shape.
        return np.exp(x)
    if x.ndim == 0:
        # A scalar has no axes to reduce along.
        axis = None
    # Subtracting the maximum leaves the result unchanged but keeps exp() from overflowing.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    return exps / exps.sum(axis=axis, keepdims=True)

In [29]:
# Combine the ensemble by summing the raw logits of all members, element-wise.
# Start from the first member and add the remaining ones. Each addition
# creates a new array, so the stored predictions are left untouched.
final_ensamble_prediction = ensamble_predictions[0].predictions

for member_output in ensamble_predictions[1:]:
    final_ensamble_prediction = final_ensamble_prediction + member_output.predictions

In [30]:
# Predicted class id per example: index of the largest summed logit along the last axis.
preds = np.argmax(np.array(final_ensamble_prediction), axis=-1)

In [31]:
# Check how many predictions were produced.
preds.shape

(3130,)

In [33]:
import csv
import pandas as pd
from pathlib import Path

df = pd.DataFrame({})  # NOTE(review): not used anywhere in the visible notebook

# Submission file; the parent folder is created if it does not exist yet,
# so the cell also works on a fresh checkout.
submission_path = Path('./subs/nitro-robertweet-nlp-v2.1.0.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)

# One row per prediction: (Id, label name).
# NOTE(review): Id is the position of the prediction, which assumes the test
# ids are 0..n-1 in file order — confirm against the 'id' column of the test set.
rows = [[i, INT_TO_STR[int(pred)]] for i, pred in enumerate(preds)]

# newline='' lets the csv module control the line endings.
with open(submission_path, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Id', 'Label'])
    writer.writerows(rows)