In [22]:
import torch
from transformers import AutoTokenizer
import pandas as pd
from datasets import load_dataset, dataset_dict, load_metric

In [2]:
tokenizer = AutoTokenizer.from_pretrained("C:/rubert-tiny2/")

In [3]:
tokenizer

PreTrainedTokenizerFast(name_or_path='C:/rubert-tiny2/', vocab_size=83828, model_max_len=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [4]:
df = pd.read_excel("C:/data/ceo-text/full_df_v2.52-21-12-2021.xlsx")

In [5]:
df.head()

Unnamed: 0,Комментарий,target,sub_target
0,со слов клиента- банкомат не однократно осущес...,cards,cards
1,добрый день!просьба удаленно перезагрузить ус....,"remote_reboot,cassette","remote_reboot,cassette"
2,нет операций,no_oper,no_oper
3,просьба перезагрузить банкомат по питанию удал...,remote_reboot,remote_reboot
4,проблемы с картридером - захват карт - не возв...,cards,cards


In [6]:
df = df[(df.target == 'no_oper') | (df.target == 'cards') | (df.target == 'remote_reboot') | (df.target == 'host_close')]

In [43]:
df['label'] = df.target.factorize()[0]

In [44]:
df.head()

Unnamed: 0,Комментарий,target,sub_target,label
0,со слов клиента- банкомат не однократно осущес...,cards,cards,0
2,нет операций,no_oper,no_oper,1
3,просьба перезагрузить банкомат по питанию удал...,remote_reboot,remote_reboot,2
4,проблемы с картридером - захват карт - не возв...,cards,cards,0
5,отсутствуют операции,no_oper,no_oper,1


In [45]:
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_labels, val_labels = train_test_split(df['Комментарий'], df['label'], test_size=.3)

In [46]:
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True)

In [47]:
class CeoDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [48]:
train_dataset = CeoDataset(train_encodings, train_labels.tolist())
val_dataset = CeoDataset(val_encodings, val_labels.tolist())

In [49]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [50]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
model = AutoModelForSequenceClassification.from_pretrained("C:/rubert-tiny2/", num_labels=4)

loading configuration file C:/rubert-tiny2/config.json
Model config BertConfig {
  "_name_or_path": "C:/rubert-tiny2/",
  "architectures": [
    "BertForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "emb_size": 312,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 312,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 600,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 2048,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 3,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.19.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 83828
}

loading weights file C:/rubert-tiny2/pytorch

In [None]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    return metric.compute(predictions=predictions, references=labels)

In [51]:
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [52]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics = compute_metrics,
)

In [53]:
trainer.train()

***** Running training *****
  Num examples = 14107
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 4410


Step,Training Loss
500,0.2828
1000,0.0345
1500,0.017
2000,0.0177
2500,0.009
3000,0.0096
3500,0.0082
4000,0.0065


Saving model checkpoint to ./results\checkpoint-500
Configuration saved in ./results\checkpoint-500\config.json
Model weights saved in ./results\checkpoint-500\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-500\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-500\special_tokens_map.json
Saving model checkpoint to ./results\checkpoint-1000
Configuration saved in ./results\checkpoint-1000\config.json
Model weights saved in ./results\checkpoint-1000\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-1000\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-1000\special_tokens_map.json
Saving model checkpoint to ./results\checkpoint-1500
Configuration saved in ./results\checkpoint-1500\config.json
Model weights saved in ./results\checkpoint-1500\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-1500\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-1500\special_toke

TrainOutput(global_step=4410, training_loss=0.04413772018588319, metrics={'train_runtime': 65730.1202, 'train_samples_per_second': 1.073, 'train_steps_per_second': 0.067, 'total_flos': 183925459494840.0, 'train_loss': 0.04413772018588319, 'epoch': 5.0})

In [55]:
# inference
from transformers import pipeline

In [64]:
model.save_pretrained("C:\\model.json")

Configuration saved in C:\model.json\config.json
Model weights saved in C:\model.json\pytorch_model.bin


In [66]:
#|tokenizer = AutoTokenizer.from_pretrained(r"C:\model.json\")
model = AutoModelForSequenceClassification.from_pretrained("C:\model.json")

loading configuration file C:\model.json\config.json
Model config BertConfig {
  "_name_or_path": "C:\\model.json",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "emb_size": 312,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 312,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 600,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 2048,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 3,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.19.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size

In [68]:
clf = pipeline(task='text-classification', model=model, tokenizer=tokenizer)

In [76]:
clf("ЗАкрыть на хосте")

[{'label': 'LABEL_3', 'score': 0.9987146854400635}]

In [128]:
df[['Комментарий', 'target']].rename(columns={'Комментарий':'text','target': 'labels'}).to_csv('C:/data/ceo-text/ceo_test.csv', index=False)

In [129]:
data = load_dataset('csv', data_files='C:/data/ceo-text/ceo_test.csv')

Using custom data configuration default-70dfc129d4b6391b


Downloading and preparing dataset csv/default to C:\Users\Алексей\.cache\huggingface\datasets\csv\default-70dfc129d4b6391b\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files: 100%|████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?it/s]
Extracting data files: 100%|█████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?it/s]


Dataset csv downloaded and prepared to C:\Users\Алексей\.cache\huggingface\datasets\csv\default-70dfc129d4b6391b\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 124.96it/s]


In [130]:
data

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 20153
    })
})

In [152]:
def preprocess(data):
    return tokenizer(data['text'], truncation=True)

In [151]:
imdb = load_dataset("imdb")

Reusing dataset imdb (C:\Users\Алексей\.cache\huggingface\datasets\imdb\plain_text\1.0.0\2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 53.58it/s]


In [21]:
type(imdb)

datasets.dataset_dict.DatasetDict

In [61]:
# TODO написать функцию, которая преобразует датасет CEO в формат transformers
# Возможно, нужно будет еще labels перевести в инты
# TODO нужен препрцессинг текста

In [159]:
tokenized_data = data.map(preprocess, batched=True)
tokenized_df = imdb.map(preprocess, batched=True)

100%|██████████████████████████████████████████████████████████████████████████████████| 21/21 [00:00<00:00, 28.86ba/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 25/25 [00:10<00:00,  2.46ba/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 25/25 [00:07<00:00,  3.46ba/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:15<00:00,  3.22ba/s]


In [154]:
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 20153
    })
})

In [160]:
from transformers import DataCollatorWithPadding

In [161]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [162]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [163]:
model = AutoModelForSequenceClassification.from_pretrained("C:/rubert-tiny2/", num_labels=2)

loading configuration file C:/rubert-tiny2/config.json
Model config BertConfig {
  "_name_or_path": "C:/rubert-tiny2/",
  "architectures": [
    "BertForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "emb_size": 312,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 312,
  "initializer_range": 0.02,
  "intermediate_size": 600,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 2048,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 3,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.19.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 83828
}

loading weights file C:/rubert-tiny2/pytorch_model.bin
Some weights of the model checkpoint at C:/rubert-tiny2/ were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bia

In [164]:
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [165]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_df['train'],
    eval_dataset=tokenized_df['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [None]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 25000
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 7815


Step,Training Loss
500,0.5056
1000,0.3815
1500,0.345


Saving model checkpoint to ./results\checkpoint-500
Configuration saved in ./results\checkpoint-500\config.json
Model weights saved in ./results\checkpoint-500\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-500\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-500\special_tokens_map.json
Saving model checkpoint to ./results\checkpoint-1000
Configuration saved in ./results\checkpoint-1000\config.json
Model weights saved in ./results\checkpoint-1000\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-1000\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-1000\special_tokens_map.json
Saving model checkpoint to ./results\checkpoint-1500
Configuration saved in ./results\checkpoint-1500\config.json
Model weights saved in ./results\checkpoint-1500\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-1500\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-1500\special_toke