In [1]:
import pandas as pd
from transformers import AutoTokenizer, BertForMaskedLM, DataCollatorForLanguageModeling
from torch.utils.data import Dataset
from transformers import Trainer, TrainingArguments
import torch

In [2]:
df = pd.read_csv("lf_data_words.csv")
df = df.dropna()

In [3]:
df['text'] = df.apply(lambda row: f"{row['LFFUNC']}({row['LFARG']}) - {row['LFVAL']}", axis=1)

In [4]:
tokenizer = AutoTokenizer.from_pretrained('ai-forever/ruBert-large')
model = BertForMaskedLM.from_pretrained('ai-forever/ruBert-large')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/591 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

Some weights of the model checkpoint at ai-forever/ruBert-large were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
valid_examples = []
for _, row in df.iterrows():
    target = row['LFVAL']
    stop_words = {'.', '!', '?', 'не', 'это'}
    if target in stop_words:
        continue

    tokens = tokenizer.tokenize(target)
    if len(tokens) == 1:
        valid_examples.append(row)

df = pd.DataFrame(valid_examples)

In [6]:
class LexicalFunctionDataset(Dataset):
    def __init__(self, tokenizer, dataframe, max_length=64):
        self.examples = []
        for _, row in dataframe.iterrows():
            text = f"{row['LFFUNC']}({row['LFARG']}) - [MASK]"
            target = row['LFVAL']

            inputs = tokenizer(
                text,
                max_length=max_length,
                truncation=True,
                padding="max_length",
                return_tensors="pt"
            )

            mask_pos = (inputs["input_ids"][0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
            if len(mask_pos) == 0:
                continue

            labels = inputs["input_ids"].clone()
            labels[:, :] = -100
            labels[0, mask_pos[0]] = tokenizer.encode(target, add_special_tokens=False)[0]

            self.examples.append({
                "input_ids": inputs["input_ids"][0],
                "attention_mask": inputs["attention_mask"][0],
                "labels": labels[0]
            })
    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        return self.examples[idx]

In [7]:
dataset = LexicalFunctionDataset(tokenizer, df)

In [8]:
print(len(dataset))

if len(dataset) == 0:
    raise ValueError("пустой")

30435


In [9]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    save_total_limit=2,
    logging_steps=500,
    fp16=torch.cuda.is_available(),
    report_to="none",
    learning_rate=3e-5,
    gradient_accumulation_steps=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

In [10]:
trainer.train()

Step,Training Loss
500,1.1897
1000,0.783
1500,0.687
2000,0.5797
2500,0.4749
3000,0.4536
3500,0.4726
4000,0.4085
4500,0.3448
5000,0.3507


Step,Training Loss
500,1.1897
1000,0.783
1500,0.687
2000,0.5797
2500,0.4749
3000,0.4536
3500,0.4726
4000,0.4085
4500,0.3448
5000,0.3507


TrainOutput(global_step=9510, training_loss=0.43449756462365924, metrics={'train_runtime': 6102.321, 'train_samples_per_second': 24.937, 'train_steps_per_second': 1.558, 'total_flos': 1.772629405696512e+16, 'train_loss': 0.43449756462365924, 'epoch': 4.997634691195795})

In [11]:
def predict_lexical_function(func, arg, model, tokenizer, top_k=5):
    model.eval()
    text = f"{func}({arg}) - [MASK]"
    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    mask_pos = (inputs.input_ids[0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]

    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits[0, mask_pos]
    probs = torch.softmax(logits, dim=-1)
    top_tokens = torch.topk(probs, top_k)

    print(f"для {func}({arg}):")
    for i, (token_id, prob) in enumerate(zip(top_tokens.indices[0], top_tokens.values[0])):
        print(f"{i+1}. {tokenizer.decode([token_id])} ({prob:.4f})")


In [12]:
test_cases_old = [
    ('Magn', 'довод'),
    ('Oper1', 'домино'),
    ('Ореr2', 'арест'),
    ('Incepoper1', 'азарт'),
    ('Func0', 'дорога'),
    ('INCEPFUNC0', 'день'),
    ('CAUSFUNC0', 'встреча'),
    ('REAL1', 'газета'),
    ('REAL1-M', 'долг')
]

test_cases_new = [
    ('Loc', 'дом'),
    ('Oper1', 'оценка'),
    ('Magn', 'друг'),
    ('ADV2-UN', 'причина'),
    ('ADV1-UN', 'надежда'),
    ('CAUSFUNC0', 'заседание'),
    ('Func0', 'дело'),
    ('INCEPOPER1', 'право'),
    ('Oper2', 'внимание')
]

for func, arg in test_cases_old:
    predict_lexical_function(func, arg, model, tokenizer)
    print()


for func, arg in test_cases_new:
    predict_lexical_function(func, arg, model, tokenizer)
    print()

для Magn(довод):
1. выдвигать (0.5220)
2. приводить (0.3141)
3. убедительно (0.1183)
4. доказывать (0.0059)
5. опровергать (0.0031)

для Oper1(домино):
1. играть (0.9996)
2. сыграть (0.0002)
3. игра (0.0002)
4. домино (0.0000)
5. заниматься (0.0000)

для Ореr2(арест):
1. сидеть (0.9801)
2. быть (0.0066)
3. отбывать (0.0041)
4. находиться (0.0035)
5. лежать (0.0023)

для Incepoper1(азарт):
1. входить (0.9929)
2. вступать (0.0063)
3. впадать (0.0002)
4. заходить (0.0002)
5. ввязываться (0.0002)

для Func0(дорога):
1. проходить (0.9752)
2. лежать (0.0153)
3. идти (0.0043)
4. ходить (0.0007)
5. быть (0.0006)

для INCEPFUNC0(день):
1. приходить (0.8755)
2. наступать (0.1006)
3. вставать (0.0039)
4. начинаться (0.0036)
5. появляться (0.0029)

для CAUSFUNC0(встреча):
1. проводить (0.9399)
2. устраивать (0.0269)
3. назначать (0.0198)
4. организовывать (0.0035)
5. заключать (0.0021)

для REAL1(газета):
1. читать (0.9999)
2. печатать (0.0000)
3. смотреть (0.0000)
4. слушать (0.0000)
5. видеть (0