In [2]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [3]:
from transformers import BertForSequenceClassification, BertTokenizerFast, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

In [4]:
train_df = pd.read_csv('/kaggle/input/ratingsimdb/train.csv')
test_df = pd.read_csv('/kaggle/input/ratingsimdb/test.csv')

In [5]:
dct = {1:0, 2:1, 3:2, 4:3, 7:4, 8:5, 9:6, 10:7}
train_df['rating_new'] = train_df['rating'].map(dct)
test_df['rating_new'] = test_df['rating'].map(dct)

In [6]:
model = BertForSequenceClassification.from_pretrained('prajjwal1/bert-tiny', num_labels=8)
tokenizer = BertTokenizerFast.from_pretrained('prajjwal1/bert-tiny', do_lower_case=True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

Some weights of the model checkpoint at prajjwal1/bert-tiny were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initia

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [7]:
train_texts = train_df['text'].tolist()
train = ['[CLS] ' + elem + ' [SEP]' for elem in train_texts]

val_texts = test_df['text'].tolist()
val = ['[CLS] ' + elem + ' [SEP]' for elem in val_texts]

In [8]:
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [None]:
# Обрабатываем train
train_labels = train_df['rating_new'].tolist()
train_encodings = tokenizer(train, truncation=True, padding=True, max_length=512)
train_dataset = CustomDataset(train_encodings, train_labels)

In [9]:
# Обрабатываем test
test_labels = test_df['rating_new'].tolist()
test_encodings = tokenizer(val, truncation=True, padding=True, max_length=512)
test_dataset = CustomDataset(test_encodings, test_labels)

In [10]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted')
    return {'accuracy': acc,
            'f1'      : f1}

In [11]:
training_args = TrainingArguments(
    output_dir='./results',          
    num_train_epochs=5,              
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=20,   
    warmup_steps=500,                
    weight_decay=0.01,               
    logging_dir='./logs',            
    load_best_model_at_end=True,     
    logging_steps=3000,              
    save_steps=3000,
    evaluation_strategy="steps",
    report_to=[]
)

In [12]:
trainer = Trainer(
    model=model,                         
    args=training_args,                  
    train_dataset=train_dataset,         
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

In [13]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1
3000,1.7472,1.535576,0.41052,0.308922
6000,1.4926,1.478906,0.42876,0.354318
9000,1.3859,1.483368,0.43364,0.369605
12000,1.3205,1.484364,0.4362,0.380672
15000,1.2613,1.492815,0.43456,0.386439


TrainOutput(global_step=15625, training_loss=1.4339549296875, metrics={'train_runtime': 433.0437, 'train_samples_per_second': 288.654, 'train_steps_per_second': 36.082, 'total_flos': 159108096000000.0, 'train_loss': 1.4339549296875, 'epoch': 5.0})

In [14]:
trainer.evaluate()

{'eval_loss': 1.4789060354232788,
 'eval_accuracy': 0.42876,
 'eval_f1': 0.35431796388025366,
 'eval_runtime': 21.6206,
 'eval_samples_per_second': 1156.307,
 'eval_steps_per_second': 57.815,
 'epoch': 5.0}

In [15]:
def get_full_prediction():
    test_pred = trainer.predict(test_dataset)
    labels = np.argmax(test_pred.predictions, axis=-1)
    return labels
pred = get_full_prediction()

In [None]:
output_dir = 'path_to_save'
model.save_pretrained(output_dir+'model')
tokenizer.save_pretrained(output_dir+'tokenizer')