In [1]:
import pandas as pd
train_df = pd.read_csv('/kaggle/input/vk-test/train_spam.csv')
train_df

Unnamed: 0,text_type,text
0,ham,make sure alex knows his birthday is over in f...
1,ham,a resume for john lavorato thanks vince i will...
2,spam,plzz visit my website moviesgodml to get all m...
3,spam,urgent your mobile number has been awarded wit...
4,ham,overview of hr associates analyst project per ...
...,...,...
16273,spam,if you are interested in binary options tradin...
16274,spam,dirty pictureblyk on aircel thanks you for bei...
16275,ham,or you could do this g on mon 1635465 sep 1635...
16276,ham,insta reels par 80 गंद bhara pada hai 👀 kuch b...


Мы будем использовать datasets от huuggingface для того, чтобы хранить/использовать данные.

In [2]:
num_spam_emails = (train_df['text_type'] == 'spam').sum()
num_not_spam_emails = len(train_df) - num_spam_emails
(num_spam_emails, num_not_spam_emails)

(4809, 11469)

Как мы видим, датасет несбалансирован. Поэтому в качестве "верного" label выбираем сообщения, не являющимся спамом (их меньше, minority class)

In [3]:
from datasets import Dataset
train_df = train_df.rename(columns={"text_type": "label"})

train_df.loc[train_df['label'] == 'ham', 'label'] = 1 # Assign positive label to the minority class
train_df.loc[train_df['label'] == 'spam', 'label'] = 0

train_ds = Dataset.from_pandas(train_df)
train_ds = train_ds.train_test_split(test_size=0.1)
train_ds

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 14650
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 1628
    })
})

Был также протестирован метод TF-IDF, с тренировкой собственного токенизатора, фактически адааптирован ноутбук: https://www.kaggle.com/code/datafan07/train-your-own-tokenizer c соревнования Kaggle. Но этот подход показал более низкий ROC AUC.
Было решено использовать языковую модель от HF's transformers.

In [4]:
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")

2024-05-02 10:17:24.041726: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-02 10:17:24.041830: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-02 10:17:24.170845: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



In [5]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=1028)

In [6]:
import os
train_ds['train'] = train_ds['train'].map(preprocess_function, batched=False, num_proc=os.cpu_count())
train_ds['test'] = train_ds['test'].map(preprocess_function, batched=False, num_proc=os.cpu_count())

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

Map (num_proc=4):   0%|          | 0/14650 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1628 [00:00<?, ? examples/s]

In [7]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
torch.backends.cuda.enable_mem_efficient_sdp(False)
model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/deberta-v3-large", num_labels=2, id2label=id2label, label2id=label2id, torch_dtype=torch.bfloat16
)


pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
!pip install evaluate -q
!wandb login '5beda7dd1b21609fc4a9e459e3732366ed3a32d3'

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [9]:
import evaluate
import numpy as np
from torch import nn
from transformers import Trainer
from datasets import load_metric
roc_auc = evaluate.load("roc_auc")


# Define metrics
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return roc_auc.compute(prediction_scores=predictions, references=labels, average="macro")

training_args = TrainingArguments(
    output_dir="vk-spam-classiciation",
    learning_rate=1e-5,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=1,
    num_train_epochs=1,
    weight_decay=0.01,
    save_strategy="epoch",
    evaluation_strategy='steps',
    eval_steps=300,
    logging_steps=300,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds['train'],
    eval_dataset=train_ds['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

Downloading builder script:   0%|          | 0.00/9.54k [00:00<?, ?B/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
[34m[1mwandb[0m: Currently logged in as: [33mbossprocool[0m ([33mmemers[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.16.6
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240502_101812-yrocs0r7[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33melated-frog-312[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/memers/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/memers/huggingface/runs/yrocs0r7[0m


Step,Training Loss,Validation Loss
300,0.6076,0.517405
600,0.3938,0.326135
900,0.308,0.278236
1200,0.2733,0.272212
1500,0.2701,0.266002
1800,0.2794,0.261026
2100,0.2443,0.260633
2400,0.27,0.26074


TrainOutput(global_step=2442, training_loss=0.329141835127572, metrics={'train_runtime': 2190.8027, 'train_samples_per_second': 6.687, 'train_steps_per_second': 1.115, 'total_flos': 4209849976794984.0, 'train_loss': 0.329141835127572, 'epoch': 1.0})

In [10]:
trainer.model.save_pretrained('vk-spam-classification')