In [1]:
%%capture
# `%pip` (rather than `!pip`) installs into the environment of the running kernel,
# so the packages are importable by the cells below.
# TODO: pin versions so a fresh run reproduces the recorded results.
%pip install transformers
%pip install datasets
%pip install evaluate

In [2]:
import datasets
# Display the installed `datasets` version as the cell output.
datasets.__version__

'3.0.1'

In [3]:
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation/future warnings raised by later cells.
warnings.filterwarnings('ignore')

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, \
                         AdamW, get_scheduler
import torch
from datasets import load_dataset
from transformers import DataCollatorWithPadding
from sklearn.metrics import roc_auc_score
import evaluate
from tqdm.auto import tqdm
from tqdm.notebook import tqdm

In [5]:
from transformers import set_seed

# Single seed shared by every random number generator used in the notebook.
SEED = 42

# Seed NumPy and PyTorch (CPU, plus all GPUs when CUDA is present) ...
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# ... and finally go through the transformers helper as well.
set_seed(SEED)

In [41]:
def _load_reviews(csv_path):
    """Read a review CSV and expose its ``rating`` column under the name ``labels``."""
    return pd.read_csv(csv_path).rename(columns={'rating': 'labels'})


train_data = _load_reviews('/kaggle/input/rosatom-imdb/train_data.csv')
test_data = _load_reviews('/kaggle/input/rosatom-imdb/test_data.csv')

train_data.head()

Unnamed: 0,text,labels
0,a stori of obsess love push to it limit and of...,8
1,christoph lee is one of my favorit actor im tr...,1
2,the movi is a fantasi the stori line is thin b...,10
3,kate beckinsal is as good if not better than g...,8
4,i watch thi movi on the ground that amber bens...,4


In [42]:
# Show the full text of the row labelled 1 to see what the preprocessed reviews look like.
train_data.loc[1, 'text']

'christoph lee is one of my favorit actor im tri to view all of hi work he ha been known to singlehandedli save movi with hi presenc unfortun thi is not one of them thi movi suffer from a low budget and it product valu are disturb pleasefor the love of christopheravoid thi film'

In [43]:
# Raw star rating -> contiguous class id expected by the 8-way classifier.
# Ratings 5 and 6 have no entry in this mapping.
RATING_TO_LABEL = {1: 0, 2: 1, 3: 2, 4: 3, 7: 4, 8: 5, 9: 6, 10: 7}


def encode_ratings(ratings):
    """Map raw star ratings to class ids 0-7.

    Args:
        ratings: pandas Series holding raw ratings (keys of ``RATING_TO_LABEL``).

    Returns:
        A new Series with every rating replaced by its class id.

    Raises:
        ValueError: if a value is not a known rating. This also triggers when the
            cell is re-run on labels that were already encoded, instead of
            silently remapping them a second time.
    """
    unexpected = set(ratings.unique()) - set(RATING_TO_LABEL)
    if unexpected:
        raise ValueError(
            f"Unexpected rating values {unexpected}; were the labels already encoded?"
        )
    return ratings.map(RATING_TO_LABEL)


# Assign the result back instead of calling `.replace(..., inplace=True)` on a
# column selection: that chained in-place form does not update the DataFrame
# when pandas Copy-on-Write is active.
train_data['labels'] = encode_ratings(train_data['labels'])
test_data['labels'] = encode_ratings(test_data['labels'])

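**Note:** after the remapping, ratings 1–4 become labels 0–3 and ratings 7–10 become labels 4–7, so the positive classes are now the ones with label >= 4.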

In [45]:
# List the distinct class ids present after the remapping (in order of first appearance).
pd.unique(train_data['labels'])

array([5, 0, 7, 3, 6, 1, 4, 2])

In [46]:
# Persist both processed frames without the index column.
# The files are written relative to the current working directory; later cells
# read them back from /kaggle/working/, so the notebook must run from there.
for frame, filename in (
    (train_data, 'train_data_pr.csv'),
    (test_data, 'test_data_pr.csv'),
):
    frame.to_csv(filename, index=False)

In [47]:
# Display the column dtypes of the training frame.
train_data.dtypes

text      object
labels     int64
dtype: object

# Training

In [49]:
# Checkpoint used for both the tokenizer and the classifier.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch, truncating each entry to 512 tokens.

    Args:
        examples: batch mapping that contains a ``"text"`` entry.

    Returns:
        Whatever the tokenizer returns for that text.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, max_length=512)


# Load the processed training CSV through the `datasets` CSV loader.
full_dataset = load_dataset('csv', data_files='/kaggle/working/train_data_pr.csv')

Generating train split: 0 examples [00:00, ? examples/s]

In [55]:
# Load the processed test CSV. With a single `data_files` path the rows end up in a
# split named 'train' (see the printed DatasetDict), hence `test_dataset[...]['train']` later.
test_dataset = load_dataset('csv', data_files='/kaggle/working/test_data_pr.csv')

Generating train split: 0 examples [00:00, ? examples/s]

In [50]:
# Tokenize the training rows in batches; adds `input_ids` and `attention_mask` columns.
full_dataset_tokenized = full_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [56]:
# Tokenize the test rows with the same preprocessing as the training rows.
test_dataset_tokenized = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [60]:
# Print the structure (splits, features, row counts) of both tokenized datasets.
for tokenized in (full_dataset_tokenized, test_dataset_tokenized):
    print(tokenized)

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
})
DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
})


In [52]:
# Collator that pads each batch using the tokenizer; passed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [63]:
# Metric objects from the `evaluate` library; `compute_metrics` calls their `.compute()`.
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def _softmax(logits):
    """Return row-wise softmax probabilities without overflowing on large logits."""
    logits = np.asarray(logits)
    # Subtracting the per-row maximum leaves the softmax mathematically unchanged
    # but keeps every exponent <= 0, so np.exp cannot overflow to inf (and the
    # division cannot produce NaN).
    shifted = logits - logits.max(axis=1, keepdims=True)
    exponentials = np.exp(shifted)
    return exponentials / exponentials.sum(axis=1, keepdims=True)


def compute_metrics(eval_pred):
    """Compute accuracy, weighted F1 and weighted one-vs-rest ROC AUC.

    Args:
        eval_pred: pair ``(predictions, labels)`` where ``predictions`` is a 2-D
            array of per-class logits (one row per example) and ``labels`` holds
            the reference class ids.

    Returns:
        dict merging the results of the ``accuracy`` and ``f1`` metrics with a
        ``"roc_auc"`` entry.
    """
    predictions, labels = eval_pred
    probabilities = _softmax(predictions)
    predicted_classes = np.argmax(probabilities, axis=1)

    accuracy_result = accuracy.compute(predictions=predicted_classes, references=labels)
    f1_result = f1.compute(predictions=predicted_classes, references=labels, average="weighted")

    # ROC AUC is computed from the class probabilities, not from the hard predictions.
    roc_auc_result = {
        "roc_auc": roc_auc_score(labels, probabilities, multi_class="ovr", average="weighted")
    }

    return {**accuracy_result, **f1_result, **roc_auc_result}


In [54]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Pretrained encoder plus a classification head with 8 outputs, one per remapped
# rating label (0-7). The head weights are newly initialised, as the load
# message reports, so the model must be fine-tuned before use.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=8
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [64]:
# Fine-tuning configuration: 7 epochs, batch size 16 per device, evaluation and
# checkpointing once per epoch, checkpoints written under ./my_model.
# NOTE(review): `evaluation_strategy` is the older spelling of this argument in
# transformers — presumably newer releases expect `eval_strategy`; confirm
# against the installed version before re-running.
training_args = TrainingArguments(
    output_dir="my_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=7,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Reload the best checkpoint (as judged on `eval_dataset`) when training ends.
    load_best_model_at_end=True,
    # Do not report to any external experiment tracker.
    report_to="none",
#     no_cuda=True
)

# NOTE(review): `eval_dataset` is the test data, so the per-epoch metrics and the
# best-checkpoint selection are both based on the test set. Consider carving a
# validation split out of the training data and keeping the test set for a single
# final evaluation (this changes the step count, so the checkpoint path used in
# the evaluation cells would need updating too).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=full_dataset_tokenized['train'],
    eval_dataset=test_dataset_tokenized['train'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)
# Run the fine-tuning loop; the returned TrainOutput is shown as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Roc Auc
1,1.1806,1.149521,0.537,0.484425,0.901729
2,1.1958,0.997519,0.60444,0.575521,0.930403
3,1.0515,0.810299,0.68508,0.659153,0.955318
4,0.8985,0.660531,0.76476,0.761869,0.970971
5,0.7601,0.562509,0.78784,0.776186,0.978608
6,0.6564,0.446354,0.85172,0.848024,0.986201
7,0.5762,0.399668,0.87516,0.873968,0.98869


TrainOutput(global_step=10941, training_loss=0.909242611002872, metrics={'train_runtime': 6374.5868, 'train_samples_per_second': 27.453, 'train_steps_per_second': 1.716, 'total_flos': 2.2804941177623424e+16, 'train_loss': 0.909242611002872, 'epoch': 7.0})

# Evaluate

In [83]:
# Draw a fixed set of 10 rows for the inference demo. An explicit `random_state`
# makes the selection independent of how much of the global RNG state earlier
# cells consumed, so re-running this cell always yields the same rows.
# NOTE(review): these rows come from the training data, so predictions on them
# say nothing about generalisation.
data_sample = train_data.sample(10, random_state=42)
data_sample.head()

Unnamed: 0,text,labels
17423,not onli do alien visitor look exactli like fu...,0
14441,billi crystal cowrot coproduc and star in thi ...,3
20743,cecil b demil realli knew how to creat a class...,7
3253,i saw the film at the belgrad film festiv last...,2
7503,the fact that thi movi made it all the way to ...,0


In [81]:
# Reload the tokenizer saved with the checkpoint at step 10941 (the final training step).
tokenizer = AutoTokenizer.from_pretrained("/kaggle/working/my_model/checkpoint-10941")

In [82]:
# Load the fine-tuned classifier from the checkpoint at step 10941.
model = AutoModelForSequenceClassification.from_pretrained("/kaggle/working/my_model/checkpoint-10941")

# Define the input explicitly: the first sampled review. Without this the cell
# only works if `text` happens to be left over from another cell.
text = data_sample['text'].iloc[0]

# Truncate exactly as during training so long reviews cannot exceed the
# 512-token limit used there.
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
with torch.no_grad():
    logits = model(**inputs).logits

In [86]:
# Predicted class id (0-7) for every sampled review, in sample order.
example = []
for text in data_sample['text']:
    # `truncation=True` makes the 512-token cap explicit instead of relying on
    # the tokenizer's fallback when only `max_length` is given.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**inputs).logits
    example.append(logits.argmax().item())

# Print the predicted class of each text as a usage example. The loop variable is
# deliberately not called `logits`, so the tensor from the last forward pass stays
# available to later cells, and the label says what the value actually is.
for position, predicted_class in enumerate(example, start=1):
    print(f"Text {position} predicted class: {predicted_class}")

Text 1 logits: 0
Text 2 logits: 3
Text 3 logits: 7
Text 4 logits: 2
Text 5 logits: 0
Text 6 logits: 7
Text 7 logits: 2
Text 8 logits: 0
Text 9 logits: 1
Text 10 logits: 7


In [69]:
# Index of the largest logit in whatever `logits` tensor the most recent
# inference cell left behind, converted to a plain Python int.
predicted_class_id = torch.argmax(logits).item()