In [1]:
! pip install transformers datasets
! pip install sentencepiece
! pip install evaluate
! pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.2-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m58.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m69.0 MB/s[0m eta [36m0:00:00[0m
Col

#Подготовка датасета

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import json
import re
import pandas as pd 
wc = []
reviews = {'review' : [], 'label' : []}
for line in open("/content/drive/MyDrive/datasets/IMDB_reviews.json"):
    review = json.loads(line)
    reviews['review'].append(review['review_text'])
    reviews['label'].append(int(review['is_spoiler']))
    wc.append(len(re.findall(r'\w+', review['review_text'])))

In [4]:
r_lists = list(zip(reviews['review'], reviews['label'], wc))

df = pd.DataFrame(r_lists, columns=['review', 'label', 'word_count'])

####Создадим сбалансированный датасет из рецензий с <200 слов

In [5]:
from datasets import load_dataset, load_metric, Dataset
def sampling_k_elements(group, k=5000):
    if len(group) < k:
        return group
    return group.sample(k)
reviews_shortened = df[df['word_count']<200].groupby('label').apply(sampling_k_elements).reset_index(drop=True)
reviews_shortened_dict = reviews_shortened.to_dict('list')
ds = Dataset.from_dict(reviews_shortened_dict)
ds_split= ds.train_test_split(test_size=0.2)


In [6]:
from transformers import XLNetTokenizer

tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=True)


def tokenize_function(examples):
    return tokenizer(examples["review"], max_length=512,padding="max_length",truncation=True)


tokenized_datasets = ds_split.map(tokenize_function,batched = True)

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [7]:
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["test"]

In [11]:
train_dataset

Dataset({
    features: ['review', 'label', 'word_count', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 8000
})

###Подбор параметров

In [8]:
from transformers import TrainingArguments

test_training_args = TrainingArguments( 
    output_dir='/content/drive/MyDrive/test_results',
    save_total_limit=2,
    load_best_model_at_end=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps = 2,
    per_device_eval_batch_size=16,
    evaluation_strategy='steps',
    learning_rate=2e-5,
    weight_decay=0.01,
    eval_steps=50,
    save_steps=50,
    warmup_steps=50,
    optim="adafactor",
    
)

KeyboardInterrupt: ignored

<a id='trainer'></a>

In [14]:
def model_init():
    return  XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=2)

In [15]:
new_train_dataset = tokenized_datasets["train"].shard(index=1, num_shards=5)

In [16]:
from transformers import Trainer, XLNetForSequenceClassification
n_trainer = Trainer(
    model_init=model_init,
    args=test_training_args,
    train_dataset=new_train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [17]:
best_run = n_trainer.hyperparameter_search(n_trials=5, direction="maximize")

[32m[I 2023-03-23 13:39:44,202][0m A new study created in memory with name: no-name-455c9ee6-b9c5-4b4a-8ec1-fd9ecf2c5240[0m
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequ

Step,Training Loss,Validation Loss,Accuracy
50,No log,0.723386,0.49
100,No log,0.611732,0.67
150,No log,0.592897,0.681
200,No log,0.638623,0.6795


[32m[I 2023-03-23 13:47:05,397][0m Trial 0 finished with value: 0.6795 and parameters: {'learning_rate': 3.975115756654692e-05, 'num_train_epochs': 4, 'seed': 18, 'per_device_train_batch_size': 16}. Best is trial 0 with value: 0.6795.[0m
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-

Step,Training Loss,Validation Loss


[32m[I 2023-03-23 13:48:56,232][0m Trial 1 finished with value: 0.547 and parameters: {'learning_rate': 8.624705445406433e-06, 'num_train_epochs': 1, 'seed': 3, 'per_device_train_batch_size': 32}. Best is trial 0 with value: 0.6795.[0m
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-ca

Step,Training Loss,Validation Loss,Accuracy
50,No log,0.690357,0.6005
100,No log,0.704716,0.511
150,No log,0.718943,0.489
200,No log,0.698382,0.511
250,No log,0.685498,0.6455
300,No log,0.69294,0.511
350,No log,0.698054,0.489
400,No log,0.697527,0.511
450,No log,0.698306,0.511
500,0.722600,0.694141,0.511


[32m[I 2023-03-23 14:07:19,660][0m Trial 2 finished with value: 0.662 and parameters: {'learning_rate': 8.056091377147518e-05, 'num_train_epochs': 5, 'seed': 17, 'per_device_train_batch_size': 4}. Best is trial 0 with value: 0.6795.[0m
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-ca

Step,Training Loss,Validation Loss,Accuracy
50,No log,0.68199,0.547
100,No log,0.603758,0.678
150,No log,0.597526,0.6895


[32m[I 2023-03-23 14:12:46,068][0m Trial 3 finished with value: 0.6895 and parameters: {'learning_rate': 1.0639214874359341e-05, 'num_train_epochs': 3, 'seed': 36, 'per_device_train_batch_size': 16}. Best is trial 3 with value: 0.6895.[0m
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base

Step,Training Loss,Validation Loss,Accuracy
50,No log,0.680429,0.5785
100,No log,0.674301,0.5715
150,No log,0.668315,0.596
200,No log,0.664966,0.6
250,No log,0.661429,0.612
300,No log,0.659634,0.6145


[32m[I 2023-03-23 14:20:03,576][0m Trial 4 finished with value: 0.6145 and parameters: {'learning_rate': 1.922445468764568e-06, 'num_train_epochs': 3, 'seed': 7, 'per_device_train_batch_size': 8}. Best is trial 3 with value: 0.6895.[0m


In [18]:
best_run

BestRun(run_id='3', objective=0.6895, hyperparameters={'learning_rate': 1.0639214874359341e-05, 'num_train_epochs': 3, 'seed': 36, 'per_device_train_batch_size': 16}, run_summary=None)

### Обучение

In [9]:
from transformers import XLNetForSequenceClassification

model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=2)
model.cuda()

Downloading pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation_function): GELUActivation()
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (1): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward

In [10]:
from transformers import TrainingArguments

training_args = TrainingArguments( 
    output_dir='/content/drive/MyDrive/results',
    save_total_limit=2,
    load_best_model_at_end=True,
    num_train_epochs=4,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    per_device_eval_batch_size=16,
    evaluation_strategy='steps',
    learning_rate= 1.0639214874359341e-05,
    eval_steps=250,
    save_steps=250,
    warmup_steps=100,
    optim="adafactor",
    
)

In [11]:

import numpy as np
import evaluate

metric = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [12]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [13]:
from transformers import Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [14]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
250,No log,0.595833,0.6655
500,0.639900,0.601043,0.6685
750,0.639900,0.577832,0.7015
1000,0.565200,0.587039,0.7055
1250,0.565200,0.627091,0.7075
1500,0.500000,0.609132,0.7085
1750,0.500000,0.63886,0.7075
2000,0.439500,0.660051,0.7095


TrainOutput(global_step=2000, training_loss=0.5361532897949218, metrics={'train_runtime': 8658.8506, 'train_samples_per_second': 3.696, 'train_steps_per_second': 0.231, 'total_flos': 9116168945664000.0, 'train_loss': 0.5361532897949218, 'epoch': 4.0})

In [15]:
trainer.evaluate()

{'eval_loss': 0.5778315663337708,
 'eval_accuracy': 0.7015,
 'eval_runtime': 247.0371,
 'eval_samples_per_second': 8.096,
 'eval_steps_per_second': 0.506,
 'epoch': 4.0}

In [16]:
trainer.save_model("/content/drive/MyDrive/model_final")

<a id='pytorch_native'></a>