In [3]:
import pandas as pd
# Read the test split from the Kaggle input directory into a DataFrame.
test_df = pd.read_csv('/kaggle/input/vk-test/test_spam.csv')
# Bare last expression so the notebook renders a preview of the frame.
test_df

Unnamed: 0,text
0,j jim whitehead ejw cse ucsc edu writes j you ...
1,original message from bitbitch magnesium net p...
2,java for managers vince durasoft who just taug...
3,there is a youtuber name saiman says
4,underpriced issue with high return on equity t...
...,...
4065,husband to wifetum meri zindagi hoorwifeor kya...
4066,baylor enron case study cindy yes i shall co a...
4067,boring as compared to tp
4068,hellogorgeous hows u my fone was on charge lst...


In [43]:
# Read the labelled training split (columns: text_type, text) for inspection.
# NOTE(review): train_df is not referenced by any other cell visible in this notebook.
train_df = pd.read_csv('/kaggle/input/vk-test/train_spam.csv')
# Bare last expression so the notebook renders a preview of the frame.
train_df

Unnamed: 0,text_type,text
0,ham,make sure alex knows his birthday is over in f...
1,ham,a resume for john lavorato thanks vince i will...
2,spam,plzz visit my website moviesgodml to get all m...
3,spam,urgent your mobile number has been awarded wit...
4,ham,overview of hr associates analyst project per ...
...,...,...
16273,spam,if you are interested in binary options tradin...
16274,spam,dirty pictureblyk on aircel thanks you for bei...
16275,ham,or you could do this g on mon 1635465 sep 1635...
16276,ham,insta reels par 80 गंद bhara pada hai 👀 kuch b...


In [8]:
from datasets import Dataset
# Wrap the test DataFrame in a `datasets.Dataset` so it can be processed with `.map`.
test_ds = Dataset.from_pandas(test_df)

In [9]:
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding

# Tokenizer is fetched from the hub checkpoint "microsoft/deberta-v3-large".
# NOTE(review): the classification model is loaded from the local path
# "/kaggle/input/deberta-vk"; confirm this tokenizer matches the one it was trained with.
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")

In [10]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the notebook-level tokenizer.

    Truncation is enabled with ``max_length=1028``; no padding argument is
    passed here. The tokenizer's output is returned unchanged.
    """
    raw_text = examples["text"]
    encoded = tokenizer(raw_text, truncation=True, max_length=1028)
    return encoded

In [44]:
import os
# Tokenize the test set one example at a time (batched=False), using as many
# worker processes as os.cpu_count() reports.
test_ds = test_ds.map(
    preprocess_function,
    batched=False,
    num_proc=os.cpu_count(),
)

# Padding collator built around the same tokenizer.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
)

# Class-index <-> label-name mappings; both are passed to the model loader.
id2label = dict(enumerate(("spam", "ham")))
label2id = {name: index for index, name in id2label.items()}

Map (num_proc=4):   0%|          | 0/4070 [00:00<?, ? examples/s]

In [45]:
# Display the dataset summary (feature names and row count) after the map step.
test_ds

Dataset({
    features: ['text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 4070
})

In [46]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
# Disable the memory-efficient scaled-dot-product-attention CUDA backend
# before the model is created.
torch.backends.cuda.enable_mem_efficient_sdp(False)

# Load the 2-label sequence classifier stored at /kaggle/input/deberta-vk in
# bfloat16, attaching the label mappings, then move it onto the GPU.
model = AutoModelForSequenceClassification.from_pretrained(
    "/kaggle/input/deberta-vk",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
    torch_dtype=torch.bfloat16,
)
model = model.cuda()


In [47]:
!pip install evaluate -q

In [51]:
import evaluate
import numpy as np
from torch import nn
from transformers import Trainer
from datasets import load_metric
from transformers import pipeline

# Load the "roc_auc" metric via `evaluate`; compute_metrics calls it.
roc_auc = evaluate.load("roc_auc")


# Define metrics
def compute_metrics(eval_pred):
    """Compute ROC AUC from raw model outputs.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` is indexed
            as ``[sample, class]`` (class scores/logits); ``labels`` holds the
            integer class ids.

    Returns:
        Whatever ``roc_auc.compute`` returns for the given scores and labels.

    ROC AUC is a ranking metric, so it must be fed continuous scores. Feeding it
    ``argmax`` hard labels collapses the curve to a single operating point and
    under-reports the model's quality. The score used here is the softmax
    probability of class id 1, which is the class treated as positive when the
    references are 0/1.
    """
    logits, labels = eval_pred
    logits = np.asarray(logits, dtype=np.float64)

    # Numerically stable softmax over the class axis.
    shifted = logits - logits.max(axis=1, keepdims=True)
    exp_scores = np.exp(shifted)
    probabilities = exp_scores / exp_scores.sum(axis=1, keepdims=True)

    positive_scores = probabilities[:, 1]
    return roc_auc.compute(prediction_scores=positive_scores, references=labels, average="macro")

# Build a transformers pipeline for the "sentiment-analysis" task around the
# loaded model and tokenizer, placed on the CUDA device.
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device='cuda')

In [53]:
from tqdm import tqdm
# Classify every test text and keep only the predicted label name.
#
# The pipeline tokenizes the raw text itself, so the pre-tokenized columns of
# test_ds are not used here. Only the text column is iterated, and the same
# truncation settings as preprocess_function are passed explicitly so that
# over-long inputs are truncated the same way on the inference path.
results = []
for text in tqdm(test_ds['text']):
    prediction = classifier(text, truncation=True, max_length=1028)[0]
    results.append(prediction['label'])

100%|██████████| 4070/4070 [04:23<00:00, 15.42it/s]


In [56]:
# Assemble the submission table: one row per test text with its prediction.
# NOTE(review): the 'score' column holds the predicted label strings collected
# in `results`, not numeric scores - confirm this is the expected format.
submission_df = pd.DataFrame(
    data={
        'text': test_ds['text'],
        'score': results,
    }
)
submission_df

Unnamed: 0,text,score
0,j jim whitehead ejw cse ucsc edu writes j you ...,ham
1,original message from bitbitch magnesium net p...,ham
2,java for managers vince durasoft who just taug...,ham
3,there is a youtuber name saiman says,ham
4,underpriced issue with high return on equity t...,spam
...,...,...
4065,husband to wifetum meri zindagi hoorwifeor kya...,ham
4066,baylor enron case study cindy yes i shall co a...,ham
4067,boring as compared to tp,ham
4068,hellogorgeous hows u my fone was on charge lst...,ham


In [57]:
# Write the submission table to submission.csv in the working directory,
# omitting the DataFrame index column.
submission_df.to_csv('submission.csv', index=False)